net: tcp: Support SACK
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
57 enum {
58         QMAX = 64 * 1024 - 1,
59         IP_TCPPROTO = 6,
60
61         TCP4_IPLEN = 8,
62         TCP4_PHDRSIZE = 12,
63         TCP4_HDRSIZE = 20,
64         TCP4_TCBPHDRSZ = 40,
65         TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE,
66
67         TCP6_IPLEN = 0,
68         TCP6_PHDRSIZE = 40,
69         TCP6_HDRSIZE = 20,
70         TCP6_TCBPHDRSZ = 60,
71         TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE,
72
73         TcptimerOFF = 0,
74         TcptimerON = 1,
75         TcptimerDONE = 2,
76         MAX_TIME = (1 << 20),   /* Forever */
77         TCP_ACK = 50,   /* Delayed ACK timer period (ms) */
78         MAXBACKMS = 9 * 60 * 1000,      /* longest backoff time (ms) before hangup */
79
80         URG = 0x20,     /* Data marked urgent */
81         ACK = 0x10,     /* Acknowledge is valid */
82         PSH = 0x08,     /* Whole data pipe is pushed */
83         RST = 0x04,     /* Reset connection */
84         SYN = 0x02,     /* Synchronize sequence numbers */
85         FIN = 0x01,     /* Start close down */
86
87         EOLOPT = 0,
88         NOOPOPT = 1,
89         MSSOPT = 2,
90         MSS_LENGTH = 4, /* max segment size header option length */
91         WSOPT = 3,
92         WS_LENGTH = 3,  /* WS header option length */
93         MAX_WS_VALUE = 14,      /* RFC specified.  Limits available window to 2^30 */
94         TS_OPT = 8,
95         TS_LENGTH = 10,
96         TS_SEND_PREPAD = 2,     /* For non-SYNs, pre-pad 2 NOPs for 32-bit alignment */
97         SACK_OK_OPT = 4,
98         SACK_OK_LENGTH = 2,
99         SACK_OPT = 5,
100         MSL2 = 10,
101         MSPTICK = 50,   /* Milliseconds per timer tick */
102         DEF_MSS = 1460, /* Default maximum segment size */
103         DEF_MSS6 = 1280,        /* Default maximum segment size (min) for v6 */
104         SACK_SUPPORTED = TRUE,  /* SACK is on by default */
105         MAX_NR_SACKS_PER_PACKET = 4,    /* limited by TCP's opts size */
106         MAX_NR_SND_SACKS = 10,
107         MAX_NR_RCV_SACKS = 3,   /* We could try for 4, but don't need to */
108         DEF_RTT = 500,  /* Default round trip */
109         DEF_KAT = 120000,       /* Default time (ms) between keep alives */
110         TCP_LISTEN = 0, /* Listen connection */
111         TCP_CONNECT = 1,        /* Outgoing connection */
112         SYNACK_RXTIMER = 250,   /* ms between SYNACK retransmits */
113
114         TCPREXMTTHRESH = 3,     /* dupack threshold for recovery */
115         SACK_RETRANS_RECOVERY = 1,
116         FAST_RETRANS_RECOVERY = 2,
117         RTO_RETRANS_RECOVERY = 3,
118         CWIND_SCALE = 10,       /* initial CWIND will be MSS * this */
119
120         FORCE = 1,
121         CLONE = 2,
122         RETRAN = 4,
123         ACTIVE = 8,
124         SYNACK = 16,
125         TSO = 32,
126
127         LOGAGAIN = 3,
128         LOGDGAIN = 2,
129
130         Closed = 0,     /* Connection states */
131         Listen,
132         Syn_sent,
133         Established,
134         Finwait1,
135         Finwait2,
136         Close_wait,
137         Closing,
138         Last_ack,
139         Time_wait,
140
141         Maxlimbo = 1000,        /* maximum procs waiting for response to SYN ACK */
142         NLHT = 256,     /* hash table size, must be a power of 2 */
143         LHTMASK = NLHT - 1,
144
145         HaveWS = 1 << 8,
146 };
147
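/*
 * Worked example for the window-scale constants above (illustrative only):
 * the on-wire window field is 16 bits, so the largest window a peer can
 * advertise with a shift of MAX_WS_VALUE (14) is (2^16 - 1) << 14, roughly
 * 2^30 bytes.  A 65535-byte window sent with a shift of 3 goes on the wire
 * as 65535 >> 3 = 8191 and is reconstructed by the peer as 8191 << 3 =
 * 65528 bytes, so a little window can be lost to rounding.
 */
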
148 /* Must correspond to the enumeration above */
149 char *tcpstates[] = {
150         "Closed", "Listen", "Syn_sent",
151         "Established", "Finwait1", "Finwait2", "Close_wait",
152         "Closing", "Last_ack", "Time_wait"
153 };
154
155 typedef struct Tcptimer Tcptimer;
156 struct Tcptimer {
157         Tcptimer *next;
158         Tcptimer *prev;
159         Tcptimer *readynext;
160         int state;
161         uint64_t start;
162         uint64_t count;
163         void (*func) (void *);
164         void *arg;
165 };
166
167 /*
168  *  v4 and v6 pseudo headers used for
169  *  checksumming TCP
170  */
171 typedef struct Tcp4hdr Tcp4hdr;
172 struct Tcp4hdr {
173         uint8_t vihl;                           /* Version and header length */
174         uint8_t tos;                            /* Type of service */
175         uint8_t length[2];                      /* packet length */
176         uint8_t id[2];                          /* Identification */
177         uint8_t frag[2];                        /* Fragment information */
178         uint8_t Unused;
179         uint8_t proto;
180         uint8_t tcplen[2];
181         uint8_t tcpsrc[4];
182         uint8_t tcpdst[4];
183         uint8_t tcpsport[2];
184         uint8_t tcpdport[2];
185         uint8_t tcpseq[4];
186         uint8_t tcpack[4];
187         uint8_t tcpflag[2];
188         uint8_t tcpwin[2];
189         uint8_t tcpcksum[2];
190         uint8_t tcpurg[2];
191         /* Options segment */
192         uint8_t tcpopt[1];
193 };
194
195 typedef struct Tcp6hdr Tcp6hdr;
196 struct Tcp6hdr {
197         uint8_t vcf[4];
198         uint8_t ploadlen[2];
199         uint8_t proto;
200         uint8_t ttl;
201         uint8_t tcpsrc[IPaddrlen];
202         uint8_t tcpdst[IPaddrlen];
203         uint8_t tcpsport[2];
204         uint8_t tcpdport[2];
205         uint8_t tcpseq[4];
206         uint8_t tcpack[4];
207         uint8_t tcpflag[2];
208         uint8_t tcpwin[2];
209         uint8_t tcpcksum[2];
210         uint8_t tcpurg[2];
211         /* Options segment */
212         uint8_t tcpopt[1];
213 };
214
215 struct sack_block {
216         uint32_t left;
217         uint32_t right;
218 };
219
220 /*
221  *  this represents the control info
222  *  for a single packet.  It is derived from
223  *  a packet in ntohtcp{4,6}() and stuck into
224  *  a packet in htontcp{4,6}().
225  */
226 typedef struct Tcp Tcp;
227 struct Tcp {
228         uint16_t source;
229         uint16_t dest;
230         uint32_t seq;
231         uint32_t ack;
232         uint8_t flags;
233         uint16_t ws;                            /* window scale option (if not zero) */
234         uint32_t wnd;
235         uint16_t urg;
236         uint16_t mss;                           /* max segment size option (if not zero) */
237         uint16_t len;                           /* size of data */
238         uint32_t ts_val;                        /* timestamp val from sender */
239         uint32_t ts_ecr;                        /* timestamp echo response from sender */
240         bool sack_ok;                           /* header had/should have SACK_PERMITTED */
241         uint8_t nr_sacks;
242         struct sack_block sacks[MAX_NR_SACKS_PER_PACKET];
243 };
244
245 /*
246  *  this header is malloc'd to thread together fragments
247  *  waiting to be coalesced
248  */
249 typedef struct Reseq Reseq;
250 struct Reseq {
251         Reseq *next;
252         Tcp seg;
253         struct block *bp;
254         uint16_t length;
255 };
256
257 /*
258  *  the qlock in the Conv locks this structure
259  */
260 typedef struct Tcpctl Tcpctl;
261 struct Tcpctl {
262         uint8_t state;                          /* Connection state */
263         uint8_t type;                           /* Listening or active connection */
264         uint8_t code;                           /* Icmp code */
265         struct {
266                 uint32_t una;                   /* Left edge of unacked data region */
267                 uint32_t nxt;                   /* Next seq to send, right edge of unacked */
268                 uint32_t rtx;                   /* Next to send for retrans */
269                 uint32_t wnd;                   /* Tcp send window */
270                 uint32_t urg;                   /* Urgent data pointer */
271                 uint32_t wl2;
272                 int scale;                              /* how much to right shift window for xmit */
273                 uint32_t in_flight;             /* estimate of how much is in flight */
274                 uint8_t loss_hint;              /* number of loss hints rcvd */
275                 uint8_t sack_loss_hint; /* For detecting sack rxmit losses */
276                 bool flush_sacks;               /* Two timeouts in a row == dump sacks */
277                 uint8_t recovery;               /* loss recovery flag */
278                 uint32_t recovery_pt;   /* right window for recovery point */
279                 uint8_t nr_sacks;
280                 struct sack_block sacks[MAX_NR_SND_SACKS];
281         } snd;
282         struct {
283                 uint32_t nxt;                   /* Receive pointer to next uint8_t slot */
284                 uint32_t wnd;                   /* Receive window incoming */
285                 uint32_t urg;                   /* Urgent pointer */
286                 int blocked;
287                 int una;                                /* unacked data segs */
288                 int scale;                              /* how much to left shift window for rx */
289                 uint8_t nr_sacks;
290                 struct sack_block sacks[MAX_NR_RCV_SACKS];
291         } rcv;
292         uint32_t iss;                           /* Initial sequence number */
293         int sawwsopt;                           /* true if we saw a wsopt on the incoming SYN */
294         uint32_t cwind;                         /* Congestion window */
295         int scale;                                      /* desired snd.scale */
296         uint32_t ssthresh;                      /* Slow start threshold */
297         int irs;                                        /* Initial received sequence */
298         uint16_t mss;                           /* Max segment size */
299         uint16_t typical_mss;           /* MSS for most packets (< MSS for some opts) */
300         int rerecv;                                     /* Overlap of data re-received */
301         uint32_t window;                        /* Receive window */
302         uint8_t backoff;                        /* Exponential backoff counter */
303         int backedoff;                          /* ms we've backed off for rexmits */
304         uint8_t flags;                          /* State flags */
305         Reseq *reseq;                           /* Resequencing queue */
306         Tcptimer timer;                         /* Activity timer */
307         Tcptimer acktimer;                      /* Acknowledge timer */
308         Tcptimer rtt_timer;                     /* Round trip timer */
309         Tcptimer katimer;                       /* keep alive timer */
310         uint32_t rttseq;                        /* Round trip sequence */
311         int srtt;                                       /* Smoothed round trip estimate */
312         int mdev;                                       /* Mean deviation of round trip */
313         int kacounter;                          /* count down for keep alive */
314         uint64_t sndsyntime;            /* time syn sent */
315         uint64_t time;                          /* time Finwait2 was sent */
316         int nochecksum;                         /* non-zero means don't send checksums */
317         int flgcnt;                                     /* number of flags in the sequence (FIN,SYN) */
318         uint32_t ts_recent;                     /* timestamp received around last_ack_sent */
319         uint32_t last_ack_sent;         /* to determine when to update timestamp */
320         bool sack_ok;                           /* Can use SACK for this connection */
321
322         union {
323                 Tcp4hdr tcp4hdr;
324                 Tcp6hdr tcp6hdr;
325         } protohdr;                                     /* prototype header */
326 };
327
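/*
 * Illustrative sketch of the send sequence space tracked in snd above: in
 * sequence-number order, typically snd.una <= snd.rtx <= snd.nxt.
 *
 *         acked             outstanding              unsent
 *      ----------|--------------------------------|---------->
 *             snd.una        ...snd.rtx...      snd.nxt
 *
 * Bytes below snd.una have been acked, [snd.una, snd.nxt) is outstanding,
 * snd.rtx is where a retransmission resumes, and snd.in_flight estimates how
 * much of the outstanding region is still unacknowledged on the wire (see
 * set_in_flight()).
 */
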
328 /*
329  *  New calls are put in limbo rather than having a conversation structure
330  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
331  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
332  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
333  *
334  *  In particular they aren't on a listener's queue so that they don't figure
335  *  in the input queue limit.
336  *
337  *  If half of a T3 were carrying attack SYN packets, we'd have a permanent
338  *  queue of 70000 limbo'd calls.  Not great for one linear list, so the calls
339  *  are spread across the lht hash table below to keep each scan short.
340  */
341 typedef struct Limbo Limbo;
342 struct Limbo {
343         Limbo *next;
344
345         uint8_t laddr[IPaddrlen];
346         uint8_t raddr[IPaddrlen];
347         uint16_t lport;
348         uint16_t rport;
349         uint32_t irs;                           /* initial received sequence */
350         uint32_t iss;                           /* initial sent sequence */
351         uint16_t mss;                           /* mss from the other end */
352         uint16_t rcvscale;                      /* how much to scale rcvd windows */
353         uint16_t sndscale;                      /* how much to scale sent windows */
354         uint64_t lastsend;                      /* last time we sent a synack */
355         uint8_t version;                        /* v4 or v6 */
356         uint8_t rexmits;                        /* number of retransmissions */
357         bool sack_ok;                           /* other side said SACK_OK */
358         uint32_t ts_val;                        /* timestamp val from sender */
359 };
360
361 int tcp_irtt = DEF_RTT;                 /* Initial guess at round trip time */
362 uint16_t tcp_mss = DEF_MSS;             /* Maximum segment size to be sent */
363
364 enum {
365         /* MIB stats */
366         MaxConn,
367         ActiveOpens,
368         PassiveOpens,
369         EstabResets,
370         CurrEstab,
371         InSegs,
372         OutSegs,
373         RetransSegs,
374         RetransTimeouts,
375         InErrs,
376         OutRsts,
377
378         /* non-MIB stats */
379         CsumErrs,
380         HlenErrs,
381         LenErrs,
382         OutOfOrder,
383
384         Nstats
385 };
386
387 static char *statnames[] = {
388         [MaxConn] "MaxConn",
389         [ActiveOpens] "ActiveOpens",
390         [PassiveOpens] "PassiveOpens",
391         [EstabResets] "EstabResets",
392         [CurrEstab] "CurrEstab",
393         [InSegs] "InSegs",
394         [OutSegs] "OutSegs",
395         [RetransSegs] "RetransSegs",
396         [RetransTimeouts] "RetransTimeouts",
397         [InErrs] "InErrs",
398         [OutRsts] "OutRsts",
399         [CsumErrs] "CsumErrs",
400         [HlenErrs] "HlenErrs",
401         [LenErrs] "LenErrs",
402         [OutOfOrder] "OutOfOrder",
403 };
404
405 typedef struct Tcppriv Tcppriv;
406 struct tcppriv {
407         /* List of active timers */
408         qlock_t tl;
409         Tcptimer *timers;
410
411         /* hash table for matching conversations */
412         struct Ipht ht;
413
414         /* calls in limbo waiting for an ACK to our SYN ACK */
415         int nlimbo;
416         Limbo *lht[NLHT];
417
418         /* for keeping track of tcpackproc */
419         qlock_t apl;
420         int ackprocstarted;
421
422         uint32_t stats[Nstats];
423 };
424
425 /*
426  *  Setting tcpporthogdefense to non-zero enables Dong Lin's
427  *  solution to hijacked systems staking out ports as a form
428  *  of DoS attack.
429  *
430  *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
431  *  that number gets acked by the other end, we shut down the connection.
432  *  Look for tcpporthogdefense in the code.
433  */
434 int tcpporthogdefense = 0;
435
436 int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, uint16_t);
437 void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
438 void localclose(struct conv *, char *unused_char_p_t);
439 void procsyn(struct conv *, Tcp *);
440 void tcpiput(struct Proto *, struct Ipifc *, struct block *);
441 void tcpoutput(struct conv *);
442 int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
443 void tcpstart(struct conv *, int);
444 void tcptimeout(void *);
445 void tcpsndsyn(struct conv *, Tcpctl *);
446 void tcprcvwin(struct conv *);
447 void tcpacktimer(void *);
448 void tcpkeepalive(void *);
449 void tcpsetkacounter(Tcpctl *);
450 void tcprxmit(struct conv *);
451 void tcpsettimer(Tcpctl *);
452 void tcpsynackrtt(struct conv *);
453 void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
454 static void tcp_loss_event(struct conv *s, Tcpctl *tcb);
455 static uint16_t derive_payload_mss(Tcpctl *tcb);
456 static int seq_within(uint32_t x, uint32_t low, uint32_t high);
457 static int seq_lt(uint32_t x, uint32_t y);
458 static int seq_le(uint32_t x, uint32_t y);
459 static int seq_gt(uint32_t x, uint32_t y);
460 static int seq_ge(uint32_t x, uint32_t y);
461 static uint32_t seq_max(uint32_t x, uint32_t y);
462 static uint32_t seq_min(uint32_t x, uint32_t y);
463 static void set_in_flight(Tcpctl *tcb);
464
465 static void limborexmit(struct Proto *);
466 static void limbo(struct conv *, uint8_t * unused_uint8_p_t, uint8_t *, Tcp *,
467                                   int);
468
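/*
 * Illustrative sketch of the wrap-around comparisons behind the seq_*
 * helpers declared above (hypothetical name, not the definition used here):
 * TCP sequence numbers wrap at 2^32, so plain '<' is wrong and the usual
 * trick is a signed 32-bit difference:
 *
 *	static int sketch_seq_lt(uint32_t x, uint32_t y)
 *	{
 *		return (int32_t)(x - y) < 0;
 *	}
 *
 * e.g. x = 0xfffffff0, y = 0x00000010: x - y = 0xffffffe0, which is negative
 * as an int32_t, so x compares as "before" y even though the counter wrapped.
 */
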
469 void tcpsetstate(struct conv *s, uint8_t newstate)
470 {
471         Tcpctl *tcb;
472         uint8_t oldstate;
473         struct tcppriv *tpriv;
474
475         tpriv = s->p->priv;
476
477         tcb = (Tcpctl *) s->ptcl;
478
479         oldstate = tcb->state;
480         if (oldstate == newstate)
481                 return;
482
483         if (oldstate == Established)
484                 tpriv->stats[CurrEstab]--;
485         if (newstate == Established)
486                 tpriv->stats[CurrEstab]++;
487
488         /**
489         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
490                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
491         **/
492
493         switch (newstate) {
494                 case Closed:
495                         qclose(s->rq);
496                         qclose(s->wq);
497                         qclose(s->eq);
498                         break;
499
500                 case Close_wait:        /* Remote closes */
501                         qhangup(s->rq, NULL);
502                         break;
503         }
504
505         tcb->state = newstate;
506
507         if (oldstate == Syn_sent && newstate != Closed)
508                 Fsconnected(s, NULL);
509 }
510
511 static void tcpconnect(struct conv *c, char **argv, int argc)
512 {
513         Fsstdconnect(c, argv, argc);
514         tcpstart(c, TCP_CONNECT);
515 }
516
517 static int tcpstate(struct conv *c, char *state, int n)
518 {
519         Tcpctl *s;
520
521         s = (Tcpctl *) (c->ptcl);
522
523         return snprintf(state, n,
524                                         "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
525                                         tcpstates[s->state],
526                                         c->rq ? qlen(c->rq) : 0,
527                                         c->wq ? qlen(c->wq) : 0,
528                                         s->srtt, s->mdev,
529                                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
530                                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
531                                         s->katimer.start, s->katimer.count);
532 }
533
534 static int tcpinuse(struct conv *c)
535 {
536         Tcpctl *s;
537
538         s = (Tcpctl *) (c->ptcl);
539         return s->state != Closed;
540 }
541
542 static void tcpannounce(struct conv *c, char **argv, int argc)
543 {
544         Fsstdannounce(c, argv, argc);
545         tcpstart(c, TCP_LISTEN);
546         Fsconnected(c, NULL);
547 }
548
549 static void tcpbypass(struct conv *cv, char **argv, int argc)
550 {
551         struct tcppriv *tpriv = cv->p->priv;
552
553         Fsstdbypass(cv, argv, argc);
554         iphtadd(&tpriv->ht, cv);
555 }
556
557 static void tcpshutdown(struct conv *c, int how)
558 {
559         Tcpctl *tcb = (Tcpctl*)c->ptcl;
560
561         /* Do nothing for the read side */
562         if (how == SHUT_RD)
563                 return;
564         /* Sends a FIN.  If we're in another state (like Listen), we'll run into
565          * issues, since we'll never send the FIN.  We'll be shut down on our end,
566          * but we'll never tell the distant end.  Might just be an app issue. */
567         switch (tcb->state) {
568         case Established:
569                 tcb->flgcnt++;
570                 tcb->snd.nxt++;
571                 tcpsetstate(c, Finwait1);
572                 tcpoutput(c);
573                 break;
574         }
575 }
576
577 /*
578  *  tcpclose is always called with the q locked
579  */
580 static void tcpclose(struct conv *c)
581 {
582         Tcpctl *tcb;
583
584         tcb = (Tcpctl *) c->ptcl;
585
586         qhangup(c->rq, NULL);
587         qhangup(c->wq, NULL);
588         qhangup(c->eq, NULL);
589         qflush(c->rq);
590
591         switch (tcb->state) {
592                 case Listen:
593                         /*
594                          *  reset any incoming calls to this listener
595                          */
596                         Fsconnected(c, "Hangup");
597
598                         localclose(c, NULL);
599                         break;
600                 case Closed:
601                 case Syn_sent:
602                         localclose(c, NULL);
603                         break;
604                 case Established:
605                         tcb->flgcnt++;
606                         tcb->snd.nxt++;
607                         tcpsetstate(c, Finwait1);
608                         tcpoutput(c);
609                         break;
610                 case Close_wait:
611                         tcb->flgcnt++;
612                         tcb->snd.nxt++;
613                         tcpsetstate(c, Last_ack);
614                         tcpoutput(c);
615                         break;
616         }
617 }
618
619 void tcpkick(void *x)
620 {
621         ERRSTACK(1);
622         struct conv *s = x;
623         Tcpctl *tcb;
624
625         tcb = (Tcpctl *) s->ptcl;
626
627         qlock(&s->qlock);
628         if (waserror()) {
629                 qunlock(&s->qlock);
630                 nexterror();
631         }
632
633         switch (tcb->state) {
634                 case Syn_sent:
635                 case Established:
636                 case Close_wait:
637                         /*
638                          * Push data
639                          */
640                         tcprcvwin(s);
641                         tcpoutput(s);
642                         break;
643                 default:
644                         localclose(s, "Hangup");
645                         break;
646         }
647
648         qunlock(&s->qlock);
649         poperror();
650 }
651
652 void tcprcvwin(struct conv *s)
653 {       /* Call with tcb locked */
654         int w;
655         Tcpctl *tcb;
656
657         tcb = (Tcpctl *) s->ptcl;
658         w = tcb->window - qlen(s->rq);
659         if (w < 0)
660                 w = 0;
661         tcb->rcv.wnd = w;
662         if (w == 0)
663                 tcb->rcv.blocked = 1;
664 }
665
666 void tcpacktimer(void *v)
667 {
668         ERRSTACK(1);
669         Tcpctl *tcb;
670         struct conv *s;
671
672         s = v;
673         tcb = (Tcpctl *) s->ptcl;
674
675         qlock(&s->qlock);
676         if (waserror()) {
677                 qunlock(&s->qlock);
678                 nexterror();
679         }
680         if (tcb->state != Closed) {
681                 tcb->flags |= FORCE;
682                 tcprcvwin(s);
683                 tcpoutput(s);
684         }
685         qunlock(&s->qlock);
686         poperror();
687 }
688
689 static void tcpcreate(struct conv *c)
690 {
691         c->rq = qopen(QMAX, Qcoalesce, 0, 0);
692         c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
693 }
694
695 static void timerstate(struct tcppriv *priv, Tcptimer * t, int newstate)
696 {
697         if (newstate != TcptimerON) {
698                 if (t->state == TcptimerON) {
699                         // unchain
700                         if (priv->timers == t) {
701                                 priv->timers = t->next;
702                                 if (t->prev != NULL)
703                                         panic("timerstate1");
704                         }
705                         if (t->next)
706                                 t->next->prev = t->prev;
707                         if (t->prev)
708                                 t->prev->next = t->next;
709                         t->next = t->prev = NULL;
710                 }
711         } else {
712                 if (t->state != TcptimerON) {
713                         // chain
714                         if (t->prev != NULL || t->next != NULL)
715                                 panic("timerstate2");
716                         t->prev = NULL;
717                         t->next = priv->timers;
718                         if (t->next)
719                                 t->next->prev = t;
720                         priv->timers = t;
721                 }
722         }
723         t->state = newstate;
724 }
725
726 void tcpackproc(void *a)
727 {
728         ERRSTACK(1);
729         Tcptimer *t, *tp, *timeo;
730         struct Proto *tcp;
731         struct tcppriv *priv;
732         int loop;
733
734         tcp = a;
735         priv = tcp->priv;
736
737         for (;;) {
738                 kthread_usleep(MSPTICK * 1000);
739
740                 qlock(&priv->tl);
741                 timeo = NULL;
742                 loop = 0;
743                 for (t = priv->timers; t != NULL; t = tp) {
744                         if (loop++ > 10000)
745                                 panic("tcpackproc1");
746                         tp = t->next;
747                         if (t->state == TcptimerON) {
748                                 t->count--;
749                                 if (t->count == 0) {
750                                         timerstate(priv, t, TcptimerDONE);
751                                         t->readynext = timeo;
752                                         timeo = t;
753                                 }
754                         }
755                 }
756                 qunlock(&priv->tl);
757
758                 loop = 0;
759                 for (t = timeo; t != NULL; t = t->readynext) {
760                         if (loop++ > 10000)
761                                 panic("tcpackproc2");
762                         if (t->state == TcptimerDONE && t->func != NULL) {
763                                 /* discard error style */
764                                 if (!waserror())
765                                         (*t->func) (t->arg);
766                                 poperror();
767                         }
768                 }
769
770                 limborexmit(tcp);
771         }
772 }
773
774 void tcpgo(struct tcppriv *priv, Tcptimer * t)
775 {
776         if (t == NULL || t->start == 0)
777                 return;
778
779         qlock(&priv->tl);
780         t->count = t->start;
781         timerstate(priv, t, TcptimerON);
782         qunlock(&priv->tl);
783 }
784
785 void tcphalt(struct tcppriv *priv, Tcptimer * t)
786 {
787         if (t == NULL)
788                 return;
789
790         qlock(&priv->tl);
791         timerstate(priv, t, TcptimerOFF);
792         qunlock(&priv->tl);
793 }
794
795 int backoff(int n)
796 {
797         return 1 << n;
798 }
799
800 void localclose(struct conv *s, char *reason)
801 {       /* called with tcb locked */
802         Tcpctl *tcb;
803         Reseq *rp, *rp1;
804         struct tcppriv *tpriv;
805
806         tpriv = s->p->priv;
807         tcb = (Tcpctl *) s->ptcl;
808
809         iphtrem(&tpriv->ht, s);
810
811         tcphalt(tpriv, &tcb->timer);
812         tcphalt(tpriv, &tcb->rtt_timer);
813         tcphalt(tpriv, &tcb->acktimer);
814         tcphalt(tpriv, &tcb->katimer);
815
816         /* Flush reassembly queue; nothing more can arrive */
817         for (rp = tcb->reseq; rp != NULL; rp = rp1) {
818                 rp1 = rp->next;
819                 freeblist(rp->bp);
820                 kfree(rp);
821         }
822         tcb->reseq = NULL;
823
824         if (tcb->state == Syn_sent)
825                 Fsconnected(s, reason);
826
827         qhangup(s->rq, reason);
828         qhangup(s->wq, reason);
829
830         tcpsetstate(s, Closed);
831
832         /* listener will check the rq state */
833         if (s->state == Announced)
834                 rendez_wakeup(&s->listenr);
835 }
836
837 /* MTU of the first hop, less the TCP and IP header lengths */
838 int tcpmtu(struct Proto *tcp, uint8_t * addr, int version, int *scale,
839            uint8_t *flags)
840 {
841         struct Ipifc *ifc;
842         int mtu;
843
844         ifc = findipifc(tcp->f, addr, 0);
845         switch (version) {
846                 default:
847                 case V4:
848                         mtu = DEF_MSS;
849                         if (ifc != NULL)
850                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
851                         break;
852                 case V6:
853                         mtu = DEF_MSS6;
854                         if (ifc != NULL)
855                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
856                         break;
857         }
858         *flags &= ~TSO;
859
860         if (ifc != NULL) {
861                 if (ifc->mbps > 100)
862                         *scale = HaveWS | 3;
863                 else if (ifc->mbps > 10)
864                         *scale = HaveWS | 1;
865                 else
866                         *scale = HaveWS | 0;
867                 if (ifc->feat & NETF_TSO)
868                         *flags |= TSO;
869         } else
870                 *scale = HaveWS | 0;
871
872         return mtu;
873 }
874
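/*
 * Worked example for tcpmtu() (illustrative; assumes, as on Ethernet, that
 * ifc->maxtu includes the ifc->m->hsize bytes of link header): with
 * maxtu = 1514 and hsize = 14,
 *
 *	v4: 1514 - 14 - (TCP4_PKT + TCP4_HDRSIZE) = 1514 - 14 - 40 = 1460 = DEF_MSS
 *	v6: 1514 - 14 - (TCP6_PKT + TCP6_HDRSIZE) = 1514 - 14 - 60 = 1440
 *
 * The interface speed only selects the advertised window scale: > 100 Mbps
 * gets a shift of 3, > 10 Mbps a shift of 1, and anything else (or no
 * interface) a shift of 0.
 */
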
875 void inittcpctl(struct conv *s, int mode)
876 {
877         Tcpctl *tcb;
878         Tcp4hdr *h4;
879         Tcp6hdr *h6;
880         int mss;
881
882         tcb = (Tcpctl *) s->ptcl;
883
884         memset(tcb, 0, sizeof(Tcpctl));
885
886         tcb->ssthresh = UINT32_MAX;
887         tcb->srtt = tcp_irtt << LOGAGAIN;
888         tcb->mdev = 0;
889
890         /* setup timers */
891         tcb->timer.start = tcp_irtt / MSPTICK;
892         tcb->timer.func = tcptimeout;
893         tcb->timer.arg = s;
894         tcb->rtt_timer.start = MAX_TIME;
895         tcb->acktimer.start = TCP_ACK / MSPTICK;
896         tcb->acktimer.func = tcpacktimer;
897         tcb->acktimer.arg = s;
898         tcb->katimer.start = DEF_KAT / MSPTICK;
899         tcb->katimer.func = tcpkeepalive;
900         tcb->katimer.arg = s;
901
902         mss = DEF_MSS;
903
904         /* create a prototype(pseudo) header */
905         if (mode != TCP_LISTEN) {
906                 if (ipcmp(s->laddr, IPnoaddr) == 0)
907                         findlocalip(s->p->f, s->laddr, s->raddr);
908
909                 switch (s->ipversion) {
910                         case V4:
911                                 h4 = &tcb->protohdr.tcp4hdr;
912                                 memset(h4, 0, sizeof(*h4));
913                                 h4->proto = IP_TCPPROTO;
914                                 hnputs(h4->tcpsport, s->lport);
915                                 hnputs(h4->tcpdport, s->rport);
916                                 v6tov4(h4->tcpsrc, s->laddr);
917                                 v6tov4(h4->tcpdst, s->raddr);
918                                 break;
919                         case V6:
920                                 h6 = &tcb->protohdr.tcp6hdr;
921                                 memset(h6, 0, sizeof(*h6));
922                                 h6->proto = IP_TCPPROTO;
923                                 hnputs(h6->tcpsport, s->lport);
924                                 hnputs(h6->tcpdport, s->rport);
925                                 ipmove(h6->tcpsrc, s->laddr);
926                                 ipmove(h6->tcpdst, s->raddr);
927                                 mss = DEF_MSS6;
928                                 break;
929                         default:
930                                 panic("inittcpctl: version %d", s->ipversion);
931                 }
932         }
933
934         tcb->mss = mss;
935         tcb->typical_mss = mss;
936         tcb->cwind = tcb->typical_mss * CWIND_SCALE;
937
938         /* default is no window scaling */
939         tcb->window = QMAX;
940         tcb->rcv.wnd = QMAX;
941         tcb->rcv.scale = 0;
942         tcb->snd.scale = 0;
943         qsetlimit(s->rq, QMAX);
944 }
945
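/*
 * Worked example of the timer initialization above (illustrative): timer
 * counts are in ticks of MSPTICK (50) ms, decremented by tcpackproc() each
 * time it wakes.  With the defaults in this file:
 *
 *	timer.start    = tcp_irtt / MSPTICK = 500 / 50    = 10 ticks   (500 ms)
 *	acktimer.start = TCP_ACK / MSPTICK  = 50 / 50     = 1 tick     (50 ms)
 *	katimer.start  = DEF_KAT / MSPTICK  = 120000 / 50 = 2400 ticks (2 min)
 */
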
946 /*
947  *  called with s qlocked
948  */
949 void tcpstart(struct conv *s, int mode)
950 {
951         Tcpctl *tcb;
952         struct tcppriv *tpriv;
953         char *kpname;
954
955         tpriv = s->p->priv;
956
957         if (tpriv->ackprocstarted == 0) {
958                 qlock(&tpriv->apl);
959                 if (tpriv->ackprocstarted == 0) {
960                         /* tcpackproc needs to free this if it ever exits */
961                         kpname = kmalloc(KNAMELEN, MEM_WAIT);
962                         snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
963                         ktask(kpname, tcpackproc, s->p);
964                         tpriv->ackprocstarted = 1;
965                 }
966                 qunlock(&tpriv->apl);
967         }
968
969         tcb = (Tcpctl *) s->ptcl;
970
971         inittcpctl(s, mode);
972
973         iphtadd(&tpriv->ht, s);
974         switch (mode) {
975                 case TCP_LISTEN:
976                         tpriv->stats[PassiveOpens]++;
977                         tcb->flags |= CLONE;
978                         tcpsetstate(s, Listen);
979                         break;
980
981                 case TCP_CONNECT:
982                         tpriv->stats[ActiveOpens]++;
983                         tcb->flags |= ACTIVE;
984                         tcpsndsyn(s, tcb);
985                         tcpsetstate(s, Syn_sent);
986                         tcpoutput(s);
987                         break;
988         }
989 }
990
991 static char *tcpflag(uint16_t flag)
992 {
993         static char buf[128];
994         size_t len;     /* build incrementally; snprintf'ing buf into itself is undefined */
995         len = snprintf(buf, sizeof(buf), "%d", flag >> 10);     /* Head len */
996         if (flag & URG)
997                 len += snprintf(buf + len, sizeof(buf) - len, " URG");
998         if (flag & ACK)
999                 len += snprintf(buf + len, sizeof(buf) - len, " ACK");
1000         if (flag & PSH)
1001                 len += snprintf(buf + len, sizeof(buf) - len, " PSH");
1002         if (flag & RST)
1003                 len += snprintf(buf + len, sizeof(buf) - len, " RST");
1004         if (flag & SYN)
1005                 len += snprintf(buf + len, sizeof(buf) - len, " SYN");
1006         if (flag & FIN)
1007                 len += snprintf(buf + len, sizeof(buf) - len, " FIN");
1008
1009         return buf;
1010 }
1011
1012 /* Helper, determine if we should send a TCP timestamp.  ts_val was the
1013  * timestamp from our distant end.  We'll also send a TS on SYN (no ACK). */
1014 static bool tcp_seg_has_ts(Tcp *tcph)
1015 {
1016         return tcph->ts_val || ((tcph->flags & SYN) && !(tcph->flags & ACK));
1017 }
1018
1019 /* Given a TCP header/segment and default header size (e.g. TCP4_HDRSIZE),
1020  * return the actual hdr_len and opt_pad */
1021 static void compute_hdrlen_optpad(Tcp *tcph, uint16_t default_hdrlen,
1022                                   uint16_t *ret_hdrlen, uint16_t *ret_optpad,
1023                                   Tcpctl *tcb)
1024 {
1025         uint16_t hdrlen = default_hdrlen;
1026         uint16_t optpad = 0;
1027
1028         if (tcph->flags & SYN) {
1029                 if (tcph->mss)
1030                         hdrlen += MSS_LENGTH;
1031                 if (tcph->ws)
1032                         hdrlen += WS_LENGTH;
1033                 if (tcph->sack_ok)
1034                         hdrlen += SACK_OK_LENGTH;
1035         }
1036         if (tcp_seg_has_ts(tcph)) {
1037                 hdrlen += TS_LENGTH;
1038                 /* SYNs have other opts, don't do the PREPAD NOOP optimization. */
1039                 if (!(tcph->flags & SYN))
1040                         hdrlen += TS_SEND_PREPAD;
1041         }
1042         if (tcb && tcb->rcv.nr_sacks)
1043                 hdrlen += 2 + tcb->rcv.nr_sacks * 8;
1044         optpad = hdrlen & 3;
1045         if (optpad)
1046                 optpad = 4 - optpad;
1047         hdrlen += optpad;
1048         *ret_hdrlen = hdrlen;
1049         *ret_optpad = optpad;
1050 }
1051
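/*
 * Worked examples for compute_hdrlen_optpad() above (illustrative):
 *
 * SYN carrying MSS, window scale, SACK_PERMITTED, and timestamps:
 *	20 + 4 (MSS) + 3 (WS) + 2 (SACK_OK) + 10 (TS) = 39
 *	optpad = 4 - (39 & 3) = 1, so hdrlen = 40
 *
 * Established-connection segment with timestamps and two receive-side SACK
 * blocks to report:
 *	20 + 2 (TS prepad NOPs) + 10 (TS) + 2 + 2 * 8 (SACK) = 50
 *	optpad = 4 - (50 & 3) = 2, so hdrlen = 52
 */
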
1052 /* Writes the TCP options for tcph to opt. */
1053 static void write_opts(Tcp *tcph, uint8_t *opt, uint16_t optpad, Tcpctl *tcb)
1054 {
1055         if (tcph->flags & SYN) {
1056                 if (tcph->mss != 0) {
1057                         *opt++ = MSSOPT;
1058                         *opt++ = MSS_LENGTH;
1059                         hnputs(opt, tcph->mss);
1060                         opt += 2;
1061                 }
1062                 if (tcph->ws != 0) {
1063                         *opt++ = WSOPT;
1064                         *opt++ = WS_LENGTH;
1065                         *opt++ = tcph->ws;
1066                 }
1067                 if (tcph->sack_ok) {
1068                         *opt++ = SACK_OK_OPT;
1069                         *opt++ = SACK_OK_LENGTH;
1070                 }
1071         }
1072         if (tcp_seg_has_ts(tcph)) {
1073                 if (!(tcph->flags & SYN)) {
1074                         *opt++ = NOOPOPT;
1075                         *opt++ = NOOPOPT;
1076                 }
1077                 *opt++ = TS_OPT;
1078                 *opt++ = TS_LENGTH;
1079                 /* Setting TSval, our time */
1080                 hnputl(opt, milliseconds());
1081                 opt += 4;
1082                 /* Setting TSecr, the time we last saw from them, stored in ts_val */
1083                 hnputl(opt, tcph->ts_val);
1084                 opt += 4;
1085         }
1086         if (tcb && tcb->rcv.nr_sacks) {
1087                 *opt++ = SACK_OPT;
1088                 *opt++ = 2 + tcb->rcv.nr_sacks * 8;
1089                 for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
1090                         hnputl(opt, tcb->rcv.sacks[i].left);
1091                         opt += 4;
1092                         hnputl(opt, tcb->rcv.sacks[i].right);
1093                         opt += 4;
1094                 }
1095         }
1096         while (optpad-- > 0)
1097                 *opt++ = NOOPOPT;
1098 }
1099
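/*
 * Illustrative byte layout write_opts() produces for the second example
 * above (non-SYN, timestamps, two SACK blocks, two bytes of padding):
 *
 *	01 01                            two NOPs (the TS prepad)
 *	08 0a TSval[4] TSecr[4]          timestamp, kind 8, length 10
 *	05 12 L0[4] R0[4] L1[4] R1[4]    SACK, kind 5, length 2 + 2*8 = 18
 *	01 01                            NOP padding to a 4-byte boundary
 */
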
1100 /* Given a data block (or NULL) returns a block with enough header room that we
1101  * can send out.  block->wp is set to the beginning of the payload.  Returns
1102  * NULL on some sort of error. */
1103 static struct block *alloc_or_pad_block(struct block *data,
1104                                         uint16_t total_hdr_size)
1105 {
1106         if (data) {
1107                 data = padblock(data, total_hdr_size);
1108                 if (data == NULL)
1109                         return NULL;
1110         } else {
1111                 /* the 64 pad is to meet minimum transfer unit (mintu) sizes */
1112                 data = block_alloc(total_hdr_size + 64, MEM_WAIT);
1113                 if (data == NULL)
1114                         return NULL;
1115                 data->wp += total_hdr_size;
1116         }
1117         return data;
1118 }
1119
1120 struct block *htontcp6(Tcp *tcph, struct block *data, Tcp6hdr *ph,
1121                                            Tcpctl *tcb)
1122 {
1123         int dlen = blocklen(data);
1124         Tcp6hdr *h;
1125         uint16_t csum;
1126         uint16_t hdrlen, optpad;
1127
1128         compute_hdrlen_optpad(tcph, TCP6_HDRSIZE, &hdrlen, &optpad, tcb);
1129
1130         data = alloc_or_pad_block(data, hdrlen + TCP6_PKT);
1131         if (data == NULL)
1132                 return NULL;
1133         /* relative to the block start (bp->rp) */
1134         data->transport_header_end = hdrlen + TCP6_PKT;
1135
1136         /* copy in pseudo ip header plus port numbers */
1137         h = (Tcp6hdr *) (data->rp);
1138         memmove(h, ph, TCP6_TCBPHDRSZ);
1139
1140         /* compose pseudo tcp header, do cksum calculation */
1141         hnputl(h->vcf, hdrlen + dlen);
1142         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
1143         h->ttl = ph->proto;
1144
1145         /* copy in variable bits */
1146         hnputl(h->tcpseq, tcph->seq);
1147         hnputl(h->tcpack, tcph->ack);
1148         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1149         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1150         hnputs(h->tcpurg, tcph->urg);
1151
1152         write_opts(tcph, h->tcpopt, optpad, tcb);
1153
1154         if (tcb != NULL && tcb->nochecksum) {
1155                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1156         } else {
1157                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
1158                 hnputs(h->tcpcksum, csum);
1159         }
1160
1161         /* move from pseudo header back to normal ip header */
1162         memset(h->vcf, 0, 4);
1163         h->vcf[0] = IP_VER6;
1164         hnputs(h->ploadlen, hdrlen + dlen);
1165         h->proto = ph->proto;
1166
1167         return data;
1168 }
1169
1170 struct block *htontcp4(Tcp *tcph, struct block *data, Tcp4hdr *ph,
1171                                            Tcpctl *tcb)
1172 {
1173         int dlen = blocklen(data);
1174         Tcp4hdr *h;
1175         uint16_t csum;
1176         uint16_t hdrlen, optpad;
1177
1178         compute_hdrlen_optpad(tcph, TCP4_HDRSIZE, &hdrlen, &optpad, tcb);
1179
1180         data = alloc_or_pad_block(data, hdrlen + TCP4_PKT);
1181         if (data == NULL)
1182                 return NULL;
1183         /* relative to the block start (bp->rp) */
1184         data->transport_header_end = hdrlen + TCP4_PKT;
1185
1186         /* copy in pseudo ip header plus port numbers */
1187         h = (Tcp4hdr *) (data->rp);
1188         memmove(h, ph, TCP4_TCBPHDRSZ);
1189
1190         /* copy in variable bits */
1191         hnputs(h->tcplen, hdrlen + dlen);
1192         hnputl(h->tcpseq, tcph->seq);
1193         hnputl(h->tcpack, tcph->ack);
1194         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1195         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1196         hnputs(h->tcpurg, tcph->urg);
1197
1198         write_opts(tcph, h->tcpopt, optpad, tcb);
1199
1200         if (tcb != NULL && tcb->nochecksum) {
1201                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1202         } else {
1203                 csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
1204                 hnputs(h->tcpcksum, csum);
1205                 data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
1206                 data->checksum_offset = ph->tcpcksum - ph->tcpsport;
1207                 data->flag |= Btcpck;
1208         }
1209
1210         return data;
1211 }
1212
1213 static void parse_inbound_sacks(Tcp *tcph, uint8_t *opt, uint16_t optlen)
1214 {
1215         uint8_t nr_sacks;
1216         uint32_t left, right;
1217
1218         nr_sacks = (optlen - 2) / 8;
1219         if (nr_sacks > MAX_NR_SACKS_PER_PACKET)
1220                 return;
1221         opt += 2;
1222         for (int i = 0; i < nr_sacks; i++, opt += 8) {
1223                 left = nhgetl(opt);
1224                 right = nhgetl(opt + 4);
1225                 if (seq_ge(left, right)) {
1226                         /* bad / malicious SACK.  Skip it, and adjust. */
1227                         nr_sacks--;
1228                         i--;    /* stay on this array element next loop */
1229                         continue;
1230                 }
1231                 tcph->sacks[i].left = left;
1232                 tcph->sacks[i].right = right;
1233         }
1234         tcph->nr_sacks = nr_sacks;
1235 }
1236
1237 static void parse_inbound_opts(Tcp *tcph, uint8_t *opt, uint16_t optsize)
1238 {
1239         uint16_t optlen;
1240
1241         while (optsize > 0 && *opt != EOLOPT) {
1242                 if (*opt == NOOPOPT) {
1243                         optsize--;
1244                         opt++;
1245                         continue;
1246                 }
1247                 optlen = opt[1];
1248                 if (optlen < 2 || optlen > optsize)
1249                         break;
1250                 switch (*opt) {
1251                         case MSSOPT:
1252                                 if (optlen == MSS_LENGTH)
1253                                         tcph->mss = nhgets(opt + 2);
1254                                 break;
1255                         case WSOPT:
1256                                 if (optlen == WS_LENGTH && *(opt + 2) <= MAX_WS_VALUE)
1257                                         tcph->ws = HaveWS | *(opt + 2);
1258                                 break;
1259                         case SACK_OK_OPT:
1260                                 if (optlen == SACK_OK_LENGTH)
1261                                         tcph->sack_ok = TRUE;
1262                                 break;
1263                         case SACK_OPT:
1264                                 parse_inbound_sacks(tcph, opt, optlen);
1265                                 break;
1266                         case TS_OPT:
1267                                 if (optlen == TS_LENGTH) {
1268                                         tcph->ts_val = nhgetl(opt + 2);
1269                                         tcph->ts_ecr = nhgetl(opt + 6);
1270                                 }
1271                                 break;
1272                 }
1273                 optsize -= optlen;
1274                 opt += optlen;
1275         }
1276 }
1277
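/*
 * Illustrative option block parse_inbound_opts() might see on an inbound SYN
 * (hypothetical bytes):
 *
 *	02 04 05 b4    MSS, length 4: tcph->mss = 0x05b4 = 1460
 *	01             NOP, skipped
 *	03 03 07       window scale, length 3: tcph->ws = HaveWS | 7
 *	04 02          SACK_PERMITTED, length 2: tcph->sack_ok = TRUE
 *	08 0a ...      timestamps, length 10: ts_val and ts_ecr filled in
 *
 * An option with a bogus length (< 2 or larger than what remains) ends the
 * parse.
 */
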
1278 /* Helper, clears the opts.  We'll later set them with e.g. parse_inbound_opts,
1279  * set them manually, or something else. */
1280 static void clear_tcph_opts(Tcp *tcph)
1281 {
1282         tcph->mss = 0;
1283         tcph->ws = 0;
1284         tcph->sack_ok = FALSE;
1285         tcph->nr_sacks = 0;
1286         tcph->ts_val = 0;
1287         tcph->ts_ecr = 0;
1288 }
1289
1290 int ntohtcp6(Tcp * tcph, struct block **bpp)
1291 {
1292         Tcp6hdr *h;
1293         uint16_t hdrlen;
1294
1295         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
1296         if (*bpp == NULL)
1297                 return -1;
1298
1299         h = (Tcp6hdr *) ((*bpp)->rp);
1300         tcph->source = nhgets(h->tcpsport);
1301         tcph->dest = nhgets(h->tcpdport);
1302         tcph->seq = nhgetl(h->tcpseq);
1303         tcph->ack = nhgetl(h->tcpack);
1304         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1305         if (hdrlen < TCP6_HDRSIZE) {
1306                 freeblist(*bpp);
1307                 return -1;
1308         }
1309
1310         tcph->flags = h->tcpflag[1];
1311         tcph->wnd = nhgets(h->tcpwin);
1312         tcph->urg = nhgets(h->tcpurg);
1313         clear_tcph_opts(tcph);
1314         tcph->len = nhgets(h->ploadlen) - hdrlen;
1315
1316         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
1317         if (*bpp == NULL)
1318                 return -1;
1319         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP6_HDRSIZE);
1320         return hdrlen;
1321 }
1322
1323 int ntohtcp4(Tcp * tcph, struct block **bpp)
1324 {
1325         Tcp4hdr *h;
1326         uint16_t hdrlen;
1327
1328         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
1329         if (*bpp == NULL)
1330                 return -1;
1331
1332         h = (Tcp4hdr *) ((*bpp)->rp);
1333         tcph->source = nhgets(h->tcpsport);
1334         tcph->dest = nhgets(h->tcpdport);
1335         tcph->seq = nhgetl(h->tcpseq);
1336         tcph->ack = nhgetl(h->tcpack);
1337
1338         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1339         if (hdrlen < TCP4_HDRSIZE) {
1340                 freeblist(*bpp);
1341                 return -1;
1342         }
1343
1344         tcph->flags = h->tcpflag[1];
1345         tcph->wnd = nhgets(h->tcpwin);
1346         tcph->urg = nhgets(h->tcpurg);
1347         clear_tcph_opts(tcph);
1348         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1349
1350         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1351         if (*bpp == NULL)
1352                 return -1;
1353         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP4_HDRSIZE);
1354         return hdrlen;
1355 }
1356
1357 /*
1358  *  For outgoing calls, generate an initial sequence
1359  *  number and put a SYN on the send queue
1360  */
1361 void tcpsndsyn(struct conv *s, Tcpctl * tcb)
1362 {
1363         urandom_read(&tcb->iss, sizeof(tcb->iss));
1364         tcb->rttseq = tcb->iss;
1365         tcb->snd.wl2 = tcb->iss;
1366         tcb->snd.una = tcb->iss;
1367         tcb->snd.rtx = tcb->rttseq;
1368         tcb->snd.nxt = tcb->rttseq;
1369         tcb->flgcnt++;
1370         tcb->flags |= FORCE;
1371         tcb->sndsyntime = NOW;
1372
1373         /* set desired mss and scale */
1374         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale,
1375                           &tcb->flags);
1376 }
1377
1378 void
1379 sndrst(struct Proto *tcp, uint8_t * source, uint8_t * dest,
1380            uint16_t length, Tcp * seg, uint8_t version, char *reason)
1381 {
1382         struct block *hbp;
1383         uint8_t rflags;
1384         struct tcppriv *tpriv;
1385         Tcp4hdr ph4;
1386         Tcp6hdr ph6;
1387
1388         netlog(tcp->f, Logtcpreset, "sndrst: %s\n", reason);
1389
1390         tpriv = tcp->priv;
1391
1392         if (seg->flags & RST)
1393                 return;
1394
1395         /* make pseudo header */
1396         switch (version) {
1397                 case V4:
1398                         memset(&ph4, 0, sizeof(ph4));
1399                         ph4.vihl = IP_VER4;
1400                         v6tov4(ph4.tcpsrc, dest);
1401                         v6tov4(ph4.tcpdst, source);
1402                         ph4.proto = IP_TCPPROTO;
1403                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1404                         hnputs(ph4.tcpsport, seg->dest);
1405                         hnputs(ph4.tcpdport, seg->source);
1406                         break;
1407                 case V6:
1408                         memset(&ph6, 0, sizeof(ph6));
1409                         ph6.vcf[0] = IP_VER6;
1410                         ipmove(ph6.tcpsrc, dest);
1411                         ipmove(ph6.tcpdst, source);
1412                         ph6.proto = IP_TCPPROTO;
1413                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1414                         hnputs(ph6.tcpsport, seg->dest);
1415                         hnputs(ph6.tcpdport, seg->source);
1416                         break;
1417                 default:
1418                         panic("sndrst: version %d", version);
1419         }
1420
1421         tpriv->stats[OutRsts]++;
1422         rflags = RST;
1423
1424         /* convince the other end that this reset is in band */
1425         if (seg->flags & ACK) {
1426                 seg->seq = seg->ack;
1427                 seg->ack = 0;
1428         } else {
1429                 rflags |= ACK;
1430                 seg->ack = seg->seq;
1431                 seg->seq = 0;
1432                 if (seg->flags & SYN)
1433                         seg->ack++;
1434                 seg->ack += length;
1435                 if (seg->flags & FIN)
1436                         seg->ack++;
1437         }
1438         seg->flags = rflags;
1439         seg->wnd = 0;
1440         seg->urg = 0;
1441         seg->mss = 0;
1442         seg->ws = 0;
1443         seg->sack_ok = FALSE;
1444         seg->nr_sacks = 0;
1445         /* seg->ts_val is already set with their timestamp */
1446         switch (version) {
1447                 case V4:
1448                         hbp = htontcp4(seg, NULL, &ph4, NULL);
1449                         if (hbp == NULL)
1450                                 return;
1451                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1452                         break;
1453                 case V6:
1454                         hbp = htontcp6(seg, NULL, &ph6, NULL);
1455                         if (hbp == NULL)
1456                                 return;
1457                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1458                         break;
1459                 default:
1460                         panic("sndrst2: version %d", version);
1461         }
1462 }
1463
1464 /*
1465  *  send a reset to the remote side and close the conversation
1466  *  called with s qlocked
1467  */
1468 static void tcphangup(struct conv *s)
1469 {
1470         ERRSTACK(1);
1471         Tcp seg;
1472         Tcpctl *tcb;
1473         struct block *hbp;
1474
1475         tcb = (Tcpctl *) s->ptcl;
1476         if (ipcmp(s->raddr, IPnoaddr)) {
1477                 /* discard error style, poperror regardless */
1478                 if (!waserror()) {
1479                         seg.flags = RST | ACK;
1480                         seg.ack = tcb->rcv.nxt;
1481                         tcb->last_ack_sent = seg.ack;
1482                         tcb->rcv.una = 0;
1483                         seg.seq = tcb->snd.nxt;
1484                         seg.wnd = 0;
1485                         seg.urg = 0;
1486                         seg.mss = 0;
1487                         seg.ws = 0;
1488                         seg.sack_ok = FALSE;
1489                         seg.nr_sacks = 0;
1490                         seg.ts_val = tcb->ts_recent;
1491                         switch (s->ipversion) {
1492                                 case V4:
1493                                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1494                                         hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
1495                                         ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1496                                         break;
1497                                 case V6:
1498                                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1499                                         hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
1500                                         ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1501                                         break;
1502                                 default:
1503                                         panic("tcphangup: version %d", s->ipversion);
1504                         }
1505                 }
1506                 poperror();
1507         }
1508         localclose(s, NULL);
1509 }
1510
1511 /*
1512  *  (re)send a SYN ACK
1513  */
1514 int sndsynack(struct Proto *tcp, Limbo * lp)
1515 {
1516         struct block *hbp;
1517         Tcp4hdr ph4;
1518         Tcp6hdr ph6;
1519         Tcp seg;
1520         int scale;
1521         uint8_t flag = 0;
1522
1523         /* make pseudo header */
1524         switch (lp->version) {
1525                 case V4:
1526                         memset(&ph4, 0, sizeof(ph4));
1527                         ph4.vihl = IP_VER4;
1528                         v6tov4(ph4.tcpsrc, lp->laddr);
1529                         v6tov4(ph4.tcpdst, lp->raddr);
1530                         ph4.proto = IP_TCPPROTO;
1531                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1532                         hnputs(ph4.tcpsport, lp->lport);
1533                         hnputs(ph4.tcpdport, lp->rport);
1534                         break;
1535                 case V6:
1536                         memset(&ph6, 0, sizeof(ph6));
1537                         ph6.vcf[0] = IP_VER6;
1538                         ipmove(ph6.tcpsrc, lp->laddr);
1539                         ipmove(ph6.tcpdst, lp->raddr);
1540                         ph6.proto = IP_TCPPROTO;
1541                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1542                         hnputs(ph6.tcpsport, lp->lport);
1543                         hnputs(ph6.tcpdport, lp->rport);
1544                         break;
1545                 default:
1546                         panic("sndsynack: version %d", lp->version);
1547         }
1548
1549         seg.seq = lp->iss;
1550         seg.ack = lp->irs + 1;
1551         seg.flags = SYN | ACK;
1552         seg.urg = 0;
1553         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag);
1554         seg.wnd = QMAX;
1555         seg.ts_val = lp->ts_val;
1556         seg.nr_sacks = 0;
1557
1558         /* if the other side set scale, we should too */
1559         if (lp->rcvscale) {
1560                 seg.ws = scale;
1561                 lp->sndscale = scale;
1562         } else {
1563                 seg.ws = 0;
1564                 lp->sndscale = 0;
1565         }
1566         if (SACK_SUPPORTED)
1567                 seg.sack_ok = lp->sack_ok;
1568         else
1569                 seg.sack_ok = FALSE;
1570
1571         switch (lp->version) {
1572                 case V4:
1573                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1574                         if (hbp == NULL)
1575                                 return -1;
1576                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1577                         break;
1578                 case V6:
1579                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1580                         if (hbp == NULL)
1581                                 return -1;
1582                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1583                         break;
1584                 default:
1585                         panic("sndsynack: version %d", lp->version);
1586         }
1587         lp->lastsend = NOW;
1588         return 0;
1589 }
1590
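/* Hash a remote <address, port> pair into a limbo-table bucket: sum the low
 * two address bytes and the remote port, masked by LHTMASK. */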
1591 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
1592
1593 /*
1594  *  put a call into limbo and respond with a SYN ACK
1595  *
1596  *  called with proto locked
1597  */
1598 static void
1599 limbo(struct conv *s, uint8_t * source, uint8_t * dest, Tcp * seg, int version)
1600 {
1601         Limbo *lp, **l;
1602         struct tcppriv *tpriv;
1603         int h;
1604
1605         tpriv = s->p->priv;
1606         h = hashipa(source, seg->source);
1607
1608         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1609                 lp = *l;
1610                 if (lp->lport != seg->dest || lp->rport != seg->source
1611                         || lp->version != version)
1612                         continue;
1613                 if (ipcmp(lp->raddr, source) != 0)
1614                         continue;
1615                 if (ipcmp(lp->laddr, dest) != 0)
1616                         continue;
1617
1618                 /* each new SYN restarts the retransmits */
1619                 lp->irs = seg->seq;
1620                 break;
1621         }
1622         lp = *l;
1623         if (lp == NULL) {
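                /* No matching limbo entry.  If the limbo table is full,
                 * recycle the oldest entry in this bucket (the head of its
                 * chain) instead of allocating a new one. */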
1624                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1625                         lp = tpriv->lht[h];
1626                         tpriv->lht[h] = lp->next;
1627                         lp->next = NULL;
1628                 } else {
1629                         lp = kzmalloc(sizeof(*lp), 0);
1630                         if (lp == NULL)
1631                                 return;
1632                         tpriv->nlimbo++;
1633                 }
1634                 *l = lp;
1635                 lp->version = version;
1636                 ipmove(lp->laddr, dest);
1637                 ipmove(lp->raddr, source);
1638                 lp->lport = seg->dest;
1639                 lp->rport = seg->source;
1640                 lp->mss = seg->mss;
1641                 lp->rcvscale = seg->ws;
1642                 lp->sack_ok = seg->sack_ok;
1643                 lp->irs = seg->seq;
1644                 lp->ts_val = seg->ts_val;
1645                 urandom_read(&lp->iss, sizeof(lp->iss));
1646         }
1647
1648         if (sndsynack(s->p, lp) < 0) {
1649                 *l = lp->next;
1650                 tpriv->nlimbo--;
1651                 kfree(lp);
1652         }
1653 }
1654
1655 /*
1656  *  resend SYN ACKs once every SYNACK_RXTIMER ms.
1657  */
1658 static void limborexmit(struct Proto *tcp)
1659 {
1660         struct tcppriv *tpriv;
1661         Limbo **l, *lp;
1662         int h;
1663         int seen;
1664         uint64_t now;
1665
1666         tpriv = tcp->priv;
1667
1668         if (!canqlock(&tcp->qlock))
1669                 return;
1670         seen = 0;
1671         now = NOW;
1672         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1673                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1674                         lp = *l;
1675                         seen++;
1676                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1677                                 continue;
1678
1679                         /* give up after a handful of SYN ACK retransmits */
1680                         if (++(lp->rexmits) > 5) {
1681                                 tpriv->nlimbo--;
1682                                 *l = lp->next;
1683                                 kfree(lp);
1684                                 continue;
1685                         }
1686
1687                         /* if we're being attacked, don't bother resending SYN ACKs */
1688                         if (tpriv->nlimbo > 100)
1689                                 continue;
1690
1691                         if (sndsynack(tcp, lp) < 0) {
1692                                 tpriv->nlimbo--;
1693                                 *l = lp->next;
1694                                 kfree(lp);
1695                                 continue;
1696                         }
1697
1698                         l = &lp->next;
1699                 }
1700         }
1701         qunlock(&tcp->qlock);
1702 }
1703
1704 /*
1705  *  lookup call in limbo.  if found, throw it out.
1706  *
1707  *  called with proto locked
1708  */
1709 static void
1710 limborst(struct conv *s, Tcp * segp, uint8_t * src, uint8_t * dst,
1711                  uint8_t version)
1712 {
1713         Limbo *lp, **l;
1714         int h;
1715         struct tcppriv *tpriv;
1716
1717         tpriv = s->p->priv;
1718
1719         /* find a call in limbo */
1720         h = hashipa(src, segp->source);
1721         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1722                 lp = *l;
1723                 if (lp->lport != segp->dest || lp->rport != segp->source
1724                         || lp->version != version)
1725                         continue;
1726                 if (ipcmp(lp->laddr, dst) != 0)
1727                         continue;
1728                 if (ipcmp(lp->raddr, src) != 0)
1729                         continue;
1730
1731                 /* RST can only follow the SYN */
1732                 if (segp->seq == lp->irs + 1) {
1733                         tpriv->nlimbo--;
1734                         *l = lp->next;
1735                         kfree(lp);
1736                 }
1737                 break;
1738         }
1739 }
1740
1741 /* The advertised MSS (e.g. 1460) includes any per-packet TCP options, such as
1742  * TCP timestamps.  A given packet will contain mss bytes, but only typical_mss
1743  * bytes of *data*.  If we know we'll use those options, we should adjust our
1744  * typical_mss, which will affect the cwnd. */
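/* E.g. (illustrative numbers): with an advertised MSS of 1460 and timestamps
 * in use, the timestamp option plus pre-padding (TS_LENGTH + TS_SEND_PREPAD,
 * rounded up to a multiple of 4 - typically 12 bytes) leaves 1448 bytes of
 * data per segment. */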
1745 static void adjust_typical_mss_for_opts(Tcp *tcph, Tcpctl *tcb)
1746 {
1747         uint16_t opt_size = 0;
1748
1749         if (tcph->ts_val)
1750                 opt_size += TS_LENGTH + TS_SEND_PREPAD;
1751         opt_size = ROUNDUP(opt_size, 4);
1752         tcb->typical_mss -= opt_size;
1753 }
1754
1755 /*
1756  *  come here when we finally get an ACK to our SYN-ACK.
1757  *  lookup call in limbo.  if found, create a new conversation
1758  *
1759  *  called with proto locked
1760  */
1761 static struct conv *tcpincoming(struct conv *s, Tcp * segp, uint8_t * src,
1762                                                                 uint8_t * dst, uint8_t version)
1763 {
1764         struct conv *new;
1765         Tcpctl *tcb;
1766         struct tcppriv *tpriv;
1767         Tcp4hdr *h4;
1768         Tcp6hdr *h6;
1769         Limbo *lp, **l;
1770         int h;
1771
1772         /* unless it's just an ack, it can't be someone coming out of limbo */
1773         if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
1774                 return NULL;
1775
1776         tpriv = s->p->priv;
1777
1778         /* find a call in limbo */
1779         h = hashipa(src, segp->source);
1780         for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
1781                 netlog(s->p->f, Logtcp,
1782                            "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
1783                            segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
1784                            lp->lport, version, lp->version);
1785
1786                 if (lp->lport != segp->dest || lp->rport != segp->source
1787                         || lp->version != version)
1788                         continue;
1789                 if (ipcmp(lp->laddr, dst) != 0)
1790                         continue;
1791                 if (ipcmp(lp->raddr, src) != 0)
1792                         continue;
1793
1794                 /* we're assuming no data with the initial SYN */
1795                 if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
1796                         netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
1797                                    segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
1798                         lp = NULL;
1799                 } else {
1800                         tpriv->nlimbo--;
1801                         *l = lp->next;
1802                 }
1803                 break;
1804         }
1805         if (lp == NULL)
1806                 return NULL;
1807
1808         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1809         if (new == NULL)
1810                 return NULL;
1811
1812         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1813         tcb = (Tcpctl *) new->ptcl;
1814         tcb->flags &= ~CLONE;
1815         tcb->timer.arg = new;
1816         tcb->timer.state = TcptimerOFF;
1817         tcb->acktimer.arg = new;
1818         tcb->acktimer.state = TcptimerOFF;
1819         tcb->katimer.arg = new;
1820         tcb->katimer.state = TcptimerOFF;
1821         tcb->rtt_timer.arg = new;
1822         tcb->rtt_timer.state = TcptimerOFF;
1823
1824         tcb->irs = lp->irs;
1825         tcb->rcv.nxt = tcb->irs + 1;
1826         tcb->rcv.urg = tcb->rcv.nxt;
1827
1828         tcb->iss = lp->iss;
1829         tcb->rttseq = tcb->iss;
1830         tcb->snd.wl2 = tcb->iss;
1831         tcb->snd.una = tcb->iss + 1;
1832         tcb->snd.rtx = tcb->iss + 1;
1833         tcb->snd.nxt = tcb->iss + 1;
1834         tcb->flgcnt = 0;
1835         tcb->flags |= SYNACK;
1836
1837         /* our sending max segment size cannot be bigger than what the peer asked for */
1838         if (lp->mss != 0 && lp->mss < tcb->mss) {
1839                 tcb->mss = lp->mss;
1840                 tcb->typical_mss = tcb->mss;
1841         }
1842         adjust_typical_mss_for_opts(segp, tcb);
1843
1844         /* Here's where we record the previously-decided header options.  They were
1845          * actually decided on when we agreed to them in the SYNACK we sent.  We
1846          * didn't create an actual TCB until now, so we can copy those decisions out
1847          * of the limbo tracker and into the TCB. */
1848         tcb->sack_ok = lp->sack_ok;
1849         /* window scaling */
1850         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1851
1852         tcb->snd.wnd = segp->wnd;
1853         tcb->cwind = tcb->typical_mss * CWIND_SCALE;
1854
1855         /* set initial round trip time */
1856         tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
1857         tcpsynackrtt(new);
1858
1859         kfree(lp);
1860
1861         /* set up proto header */
1862         switch (version) {
1863                 case V4:
1864                         h4 = &tcb->protohdr.tcp4hdr;
1865                         memset(h4, 0, sizeof(*h4));
1866                         h4->proto = IP_TCPPROTO;
1867                         hnputs(h4->tcpsport, new->lport);
1868                         hnputs(h4->tcpdport, new->rport);
1869                         v6tov4(h4->tcpsrc, dst);
1870                         v6tov4(h4->tcpdst, src);
1871                         break;
1872                 case V6:
1873                         h6 = &tcb->protohdr.tcp6hdr;
1874                         memset(h6, 0, sizeof(*h6));
1875                         h6->proto = IP_TCPPROTO;
1876                         hnputs(h6->tcpsport, new->lport);
1877                         hnputs(h6->tcpdport, new->rport);
1878                         ipmove(h6->tcpsrc, dst);
1879                         ipmove(h6->tcpdst, src);
1880                         break;
1881                 default:
1882                         panic("tcpincoming: version %d", new->ipversion);
1883         }
1884
1885         tcpsetstate(new, Established);
1886
1887         iphtadd(&tpriv->ht, new);
1888
1889         return new;
1890 }
1891
1892 int seq_within(uint32_t x, uint32_t low, uint32_t high)
1893 {
1894         if (low <= high) {
1895                 if (low <= x && x <= high)
1896                         return 1;
1897         } else {
1898                 if (x >= low || x <= high)
1899                         return 1;
1900         }
1901         return 0;
1902 }
1903
1904 int seq_lt(uint32_t x, uint32_t y)
1905 {
1906         return (int)(x - y) < 0;
1907 }
1908
1909 int seq_le(uint32_t x, uint32_t y)
1910 {
1911         return (int)(x - y) <= 0;
1912 }
1913
1914 int seq_gt(uint32_t x, uint32_t y)
1915 {
1916         return (int)(x - y) > 0;
1917 }
1918
1919 int seq_ge(uint32_t x, uint32_t y)
1920 {
1921         return (int)(x - y) >= 0;
1922 }
1923
1924 static uint32_t seq_max(uint32_t x, uint32_t y)
1925 {
1926         return seq_ge(x, y) ? x : y;
1927 }
1928
1929 static uint32_t seq_min(uint32_t x, uint32_t y)
1930 {
1931         return seq_le(x, y) ? x : y;
1932 }
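/* Note on the seq_* helpers above: comparisons use signed 32-bit differences,
 * so they stay correct across sequence-number wraparound as long as the two
 * values are within 2^31 of each other.  E.g. seq_lt(0xfffffff0, 0x10) is
 * true, since (int)(0xfffffff0 - 0x10) is negative, even though 0xfffffff0 is
 * the larger unsigned value. */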
1933
1934 /*
1935  *  use the time between the first SYN and its ACK as the
1936  *  initial round trip time
1937  */
1938 void tcpsynackrtt(struct conv *s)
1939 {
1940         Tcpctl *tcb;
1941         uint64_t delta;
1942         struct tcppriv *tpriv;
1943
1944         tcb = (Tcpctl *) s->ptcl;
1945         tpriv = s->p->priv;
1946
1947         delta = NOW - tcb->sndsyntime;
1948         tcb->srtt = delta << LOGAGAIN;
1949         tcb->mdev = delta << LOGDGAIN;
1950
1951         /* halt round trip timer */
1952         tcphalt(tpriv, &tcb->rtt_timer);
1953 }
1954
1955 /* For LFNs (long fat networks), our default tx queue doesn't hold enough data, and TCP
1956  * blocks on the application - even if the app already has the data ready to go.
1957  * We need to hold the sent, unacked data (1x cwnd), plus all the data we might
1958  * send next RTT (1x cwnd).  Note this is called after cwnd was expanded. */
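/* E.g. (illustrative numbers): with cwind around 100 KB, the write queue limit
 * is nudged toward roughly 200 KB, and it is only re-raised once it lags the
 * ideal by at least one typical_mss, to avoid touching qio locks on every ACK. */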
1959 static void adjust_tx_qio_limit(struct conv *s)
1960 {
1961         Tcpctl *tcb = (Tcpctl *) s->ptcl;
1962         size_t ideal_limit = tcb->cwind * 2;
1963
1964         /* This is called for every ACK, and it's not entirely free to update the
1965          * limit (locks, CVs, taps).  Updating in chunks of mss seems reasonable.
1966          * During SS, we'll update this on most ACKs (given each ACK increased the
1967          * cwind by > MSS).
1968          *
1969          * We also don't want a lot of tiny blocks from the user, but the way qio
1970          * works, you can put in as much as you want (Maxatomic) and then get
1971          * flow-controlled. */
1972         if (qgetlimit(s->wq) + tcb->typical_mss < ideal_limit)
1973                 qsetlimit(s->wq, ideal_limit);
1974         /* TODO: we could shrink the qio limit too, if we had a better idea what the
1975          * actual threshold was.  We want the limit to be the 'stable' cwnd * 2. */
1976 }
1977
1978 /* Attempts to merge later sacks into sack 'into' (index in the array) */
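/* E.g. (illustrative): with snd.sacks = [10,20) [18,30) [35,40) and into = 0,
 * the second sack overlaps the first, so the result is [10,30) [35,40) with
 * nr_sacks reduced by one; [35,40) is untouched since it starts beyond 30. */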
1979 static void merge_sacks_into(Tcpctl *tcb, int into)
1980 {
1981         struct sack_block *into_sack = &tcb->snd.sacks[into];
1982         struct sack_block *tcb_sack;
1983         int shift = 0;
1984
1985         for (int i = into + 1; i < tcb->snd.nr_sacks; i++) {
1986                 tcb_sack = &tcb->snd.sacks[i];
1987                 if (seq_lt(into_sack->right, tcb_sack->left))
1988                         break;
1989                 if (seq_gt(tcb_sack->right, into_sack->right))
1990                         into_sack->right = tcb_sack->right;
1991                 shift++;
1992         }
1993         if (shift) {
1994                 memmove(tcb->snd.sacks + into + 1,
1995                         tcb->snd.sacks + into + 1 + shift,
1996                         sizeof(struct sack_block) * (tcb->snd.nr_sacks - into - 1
1997                                                              - shift));
1998                 tcb->snd.nr_sacks -= shift;
1999         }
2000 }
2001
2002 /* If we update a sack, it means they received a packet (possibly out of order),
2003  * but they have not received earlier packets.  Otherwise, they would do a full
2004  * ACK.
2005  *
2006  * The trick is in knowing whether the reception growing this sack is due to a
2007  * retrans or due to packets from before our last loss event.  The rightmost
2008  * sack tends to grow a lot with packets we sent before the loss.  However,
2009  * intermediate sacks that grow are signs of a loss, since they only grow as a
2010  * result of retrans.
2011  *
2012  * This is only true for the first time through a retrans.  After we've gone
2013  * through a full retrans blast, the sack that hinted at the retrans loss (and
2014  * there could be multiple of them!) will continue to grow.  We could come up
2015  * with some tracking for this, but instead we'll just do a one-time deal.  You
2016  * can recover from one detected sack retrans loss.  After that, you'll have to
2017  * use the RTO.
2018  *
2019  * This won't catch some things, like a sack that grew and merged with the
2020  * rightmost sack.  This also won't work if you have a single sack.  We can't
2021  * tell where the retrans ends and the sending begins. */
2022 static bool sack_hints_at_loss(Tcpctl *tcb, struct sack_block *tcb_sack)
2023 {
2024         if (tcb->snd.recovery != SACK_RETRANS_RECOVERY)
2025                 return FALSE;
2026         return &tcb->snd.sacks[tcb->snd.nr_sacks - 1] != tcb_sack;
2027 }
2028
2029 static bool sack_contains(struct sack_block *tcb_sack, uint32_t seq)
2030 {
2031         return seq_le(tcb_sack->left, seq) && seq_lt(seq, tcb_sack->right);
2032 }
2033
2034 /* Debugging helper! */
2035 static void sack_asserter(Tcpctl *tcb, char *str)
2036 {
2037         struct sack_block *tcb_sack;
2038
2039         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
2040                 tcb_sack = &tcb->snd.sacks[i];
2041                 /* Checking invariants: snd.rtx is never inside a sack, sacks are always
2042                  * mutually exclusive. */
2043                 if (sack_contains(tcb_sack, tcb->snd.rtx) ||
2044                     ((i + 1 < tcb->snd.nr_sacks) && seq_ge(tcb_sack->right,
2045                                                                (tcb_sack + 1)->left))) {
2046                         printk("SACK ASSERT ERROR at %s\n", str);
2047                         printk("rtx %u una %u nxt %u, sack [%u, %u)\n",
2048                                tcb->snd.rtx, tcb->snd.una, tcb->snd.nxt, tcb_sack->left,
2049                                    tcb_sack->right);
2050                         for (int j = 0; j < tcb->snd.nr_sacks; j++)
2051                                 printk("\t %d: [%u, %u)\n", j, tcb->snd.sacks[j].left,
2052                                        tcb->snd.sacks[j].right);
2053                         backtrace();
2054                         panic("");
2055                 }
2056         }
2057 }
2058
2059 /* Updates bookkeeping whenever a sack is added or updated */
2060 static void sack_has_changed(struct conv *s, Tcpctl *tcb,
2061                              struct sack_block *tcb_sack)
2062 {
2063         /* Due to the change, snd.rtx might be in the middle of this sack.  Advance
2064          * it to the right edge. */
2065         if (sack_contains(tcb_sack, tcb->snd.rtx))
2066                 tcb->snd.rtx = tcb_sack->right;
2067
2068         /* This is a sack for something we retransed and we think it means there was
2069          * another loss.  Instead of waiting for the RTO, we can take action. */
2070         if (sack_hints_at_loss(tcb, tcb_sack)) {
2071                 if (++tcb->snd.sack_loss_hint == TCPREXMTTHRESH) {
2072                         netlog(s->p->f, Logtcprxmt,
2073                                "%I.%d -> %I.%d: sack rxmit loss: snd.rtx %u, sack [%u,%u), una %u, recovery_pt %u\n",
2074                                s->laddr, s->lport, s->raddr, s->rport,
2075                                tcb->snd.rtx, tcb_sack->left, tcb_sack->right, tcb->snd.una,
2076                                tcb->snd.recovery_pt);
2077                         /* Redo retrans, but keep the sacks and recovery point */
2078                         tcp_loss_event(s, tcb);
2079                         tcb->snd.rtx = tcb->snd.una;
2080                         tcb->snd.sack_loss_hint = 0;
2081                         /* Act like an RTO.  We just detected it earlier.  This prevents us
2082                          * from getting another sack hint loss this recovery period and from
2083                          * advancing the opportunistic right edge. */
2084                         tcb->snd.recovery = RTO_RETRANS_RECOVERY;
2085                         /* We didn't actually time out yet and we expect to keep getting
2086                          * sacks, so we don't want to flush or worry about in_flight.  If we
2087                          * messed something up, the RTO will still fire. */
2088                         set_in_flight(tcb);
2089                 }
2090         }
2091 }
2092
2093 /* Advances tcb_sack's right edge, if new_right is farther, and updates the
2094  * bookkeeping due to the change. */
2095 static void update_right_edge(struct conv *s, Tcpctl *tcb,
2096                               struct sack_block *tcb_sack, uint32_t new_right)
2097 {
2098         if (seq_le(new_right, tcb_sack->right))
2099                 return;
2100         tcb_sack->right = new_right;
2101         merge_sacks_into(tcb, tcb_sack - tcb->snd.sacks);
2102         sack_has_changed(s, tcb, tcb_sack);
2103 }
2104
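/* Merge one sack reported by the peer (seg_sack) into our sorted snd.sacks
 * array: grow an existing sack that it touches, insert it in sorted order when
 * there is room, and when the array is full either take over an overlapping
 * slot or replace the rightmost entry, so the highest sack is always kept. */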
2105 static void update_or_insert_sack(struct conv *s, Tcpctl *tcb,
2106                                   struct sack_block *seg_sack)
2107 {
2108         struct sack_block *tcb_sack;
2109
2110         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
2111                 tcb_sack = &tcb->snd.sacks[i];
2112                 if (seq_lt(tcb_sack->left, seg_sack->left)) {
2113                         /* This includes adjacent (which I've seen!) and overlap. */
2114                         if (seq_le(seg_sack->left, tcb_sack->right)) {
2115                                 update_right_edge(s, tcb, tcb_sack, seg_sack->right);
2116                                 return;
2117                         }
2118                         continue;
2119                 }
2120                 /* Update existing sack */
2121                 if (tcb_sack->left == seg_sack->left) {
2122                         update_right_edge(s, tcb, tcb_sack, seg_sack->right);
2123                         return;
2124                 }
2125                 /* Found our slot */
2126                 if (seq_gt(tcb_sack->left, seg_sack->left)) {
2127                         if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
2128                                 /* Out of room, but it is possible this sack overlaps later
2129                                  * sacks, including the max sack's right edge. */
2130                                 if (seq_ge(seg_sack->right, tcb_sack->left)) {
2131                                         /* Take over the sack */
2132                                         tcb_sack->left = seg_sack->left;
2133                                         update_right_edge(s, tcb, tcb_sack, seg_sack->right);
2134                                 }
2135                                 return;
2136                         }
2137                         /* O/W, it's our slot and we have room (at least one spot). */
2138                         memmove(&tcb->snd.sacks[i + 1], &tcb->snd.sacks[i],
2139                                 sizeof(struct sack_block) * (tcb->snd.nr_sacks - i));
2140                         tcb_sack->left = seg_sack->left;
2141                         tcb_sack->right = seg_sack->right;
2142                         tcb->snd.nr_sacks++;
2143                         merge_sacks_into(tcb, i);
2144                         sack_has_changed(s, tcb, tcb_sack);
2145                         return;
2146                 }
2147         }
2148         if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
2149                 /* We didn't find space in the sack array. */
2150                 tcb_sack = &tcb->snd.sacks[MAX_NR_SND_SACKS - 1];
2151                 /* Need to always maintain the rightmost sack, discarding the prev */
2152                 if (seq_gt(seg_sack->right, tcb_sack->right)) {
2153                         tcb_sack->left = seg_sack->left;
2154                         tcb_sack->right = seg_sack->right;
2155                         sack_has_changed(s, tcb, tcb_sack);
2156                 }
2157                 return;
2158         }
2159         tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks];
2160         tcb->snd.nr_sacks++;
2161         tcb_sack->left = seg_sack->left;
2162         tcb_sack->right = seg_sack->right;
2163         sack_has_changed(s, tcb, tcb_sack);
2164 }
2165
2166 /* Given the packet seg, track the sacks in TCB.  There are a few things: if seg
2167  * acks new data, some sacks might no longer be needed.  Some sacks might grow,
2168  * we might add new sacks, either of which can cause a merger.
2169  *
2170  * The important thing is that we always have the max sack entry: it must be
2171  * inserted for sure and findable.  We need that for our measurement of what
2172  * packets are in the network.
2173  *
2174  * Note that we keep sacks that are below snd.rtx (and above
2175  * seg.ack/tcb->snd.una) as best we can - we don't prune them.  We'll need those
2176  * for the in_flight estimate.
2177  *
2178  * When we run out of room, we'll have to throw away a sack.  Anything we throw
2179  * away below snd.rtx will be counted as 'in flight', even though it isn't.  If
2180  * we throw away something greater than snd.rtx, we'll also retrans it.  For
2181  * simplicity, we throw-away / replace the rightmost sack, since we're always
2182  * maintaining a highest sack. */
2183 static void update_sacks(struct conv *s, Tcpctl *tcb, Tcp *seg)
2184 {
2185         int prune = 0;
2186         struct sack_block *tcb_sack;
2187
2188         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
2189                 tcb_sack = &tcb->snd.sacks[i];
2190                 /* For the equality case, if they acked up to, but not including an old
2191                  * sack, they must have reneged it.  Otherwise they would have acked
2192                  * beyond the sack. */
2193                 if (seq_lt(seg->ack, tcb_sack->left))
2194                         break;
2195                 prune++;
2196         }
2197         if (prune) {
2198                 memmove(tcb->snd.sacks, tcb->snd.sacks + prune,
2199                         sizeof(struct sack_block) * (tcb->snd.nr_sacks - prune));
2200                 tcb->snd.nr_sacks -= prune;
2201         }
2202         for (int i = 0; i < seg->nr_sacks; i++) {
2203                 /* old sacks */
2204                 if (seq_lt(seg->sacks[i].left, seg->ack))
2205                         continue;
2206                 /* buggy sack: out of range */
2207                 if (seq_gt(seg->sacks[i].right, tcb->snd.nxt))
2208                         continue;
2209                 update_or_insert_sack(s, tcb, &seg->sacks[i]);
2210         }
2211 }
2212
2213 /* This is a slight underestimate, since we assume a packet is lost
2214  * once we have any sacks above it.  Overall, the error is at most
2215  * about 2 * MSS.
2216  *
2217  * If we have no sacks (either reneged or never used) we'll assume all packets
2218  * above snd.rtx are lost.  This will be the case for sackless fast rxmit
2219  * (Dong's stuff) or for a timeout.  In the former case, this is probably not
2220  * true, and in_flight should be higher, but we have no knowledge without the
2221  * sacks. */
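/* Illustrative example: with una = 100, rtx = 220, nxt = 300 and sacks
 * [150,200) and [250,280), in_flight = (300 - 280) unsacked bytes beyond the
 * last sack, plus (150 - 100) and (220 - 200) retransmitted-but-unsacked
 * bytes, for 90 total.  The sacked ranges and the assumed-lost gap [220,250)
 * are not counted. */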
2222 static void set_in_flight(Tcpctl *tcb)
2223 {
2224         struct sack_block *tcb_sack;
2225         uint32_t in_flight = 0;
2226         uint32_t from;
2227
2228         if (!tcb->snd.nr_sacks) {
2229                 tcb->snd.in_flight = tcb->snd.rtx - tcb->snd.una;
2230                 return;
2231         }
2232
2233         /* Everything beyond the rightmost sack */
2234         tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
2235         in_flight += tcb->snd.nxt - tcb_sack->right;
2236
2237         /* Everything retransed (from una to snd.rtx, minus sacked regions).  Note
2238          * we only retrans at most the last sack's left edge.  snd.rtx will be
2239          * advanced to the right edge of some sack (possibly the last one). */
2240         from = tcb->snd.una;
2241         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
2242                 tcb_sack = &tcb->snd.sacks[i];
2243                 if (seq_ge(tcb_sack->left, tcb->snd.rtx))
2244                         break;
2245                 assert(seq_ge(tcb->snd.rtx, tcb_sack->right));
2246                 in_flight += tcb_sack->left - from;
2247                 from = tcb_sack->right;
2248         }
2249         in_flight += tcb->snd.rtx - from;
2250
2251         tcb->snd.in_flight = in_flight;
2252 }
2253
2254 static void reset_recovery(struct conv *s, Tcpctl *tcb)
2255 {
2256         netlog(s->p->f, Logtcprxmt,
2257                "%I.%d -> %I.%d: recovery complete, una %u, rtx %u, nxt %u, recovery %u\n",
2258                s->laddr, s->lport, s->raddr, s->rport,
2259                tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, tcb->snd.recovery_pt);
2260         tcb->snd.recovery = 0;
2261         tcb->snd.recovery_pt = 0;
2262         tcb->snd.loss_hint = 0;
2263         tcb->snd.flush_sacks = FALSE;
2264         tcb->snd.sack_loss_hint = 0;
2265 }
2266
2267 static bool is_dup_ack(Tcpctl *tcb, Tcp *seg)
2268 {
2269         /* this is a pure ack w/o window update */
2270         return (seg->ack == tcb->snd.una) &&
2271                (tcb->snd.una != tcb->snd.nxt) &&
2272                (seg->len == 0) &&
2273                (seg->wnd == tcb->snd.wnd);
2274 }
2275
2276 /* If we have sacks, we'll ignore dupacks and look at the sacks ahead of una
2277  * (which are managed by the TCB).  The tcb will not have old sacks (below
2278  * ack/snd.rtx).  Receivers often send sacks below their ack point when we are
2279  * coming out of a loss, and we don't want those to count.
2280  *
2281  * Note the tcb could have sacks (in the future), but the receiver stopped using
2282  * them (reneged).  We'll catch that with the RTO.  If we try to catch it here,
2283  * we could get in a state where we never allow them to renege. */
2284 static bool is_potential_loss(Tcpctl *tcb, Tcp *seg)
2285 {
2286         if (seg->nr_sacks > 0)
2287                 return tcb->snd.nr_sacks > 0;
2288         else
2289                 return is_dup_ack(tcb, seg);
2290 }
2291
2292 void update(struct conv *s, Tcp * seg)
2293 {
2294         int rtt, delta;
2295         Tcpctl *tcb;
2296         uint32_t acked, expand;
2297         struct tcppriv *tpriv;
2298
2299         tpriv = s->p->priv;
2300         tcb = (Tcpctl *) s->ptcl;
2301
2302         /* ack is for data beyond snd.nxt (not yet sent): ignore it, but force output to re-ack */
2303         if (seq_gt(seg->ack, tcb->snd.nxt)) {
2304                 tcb->flags |= FORCE;
2305                 return;
2306         }
2307
2308         acked = seg->ack - tcb->snd.una;
2309         tcb->snd.una = seg->ack;
2310         if (seq_gt(seg->ack, tcb->snd.rtx))
2311                 tcb->snd.rtx = seg->ack;
2312
2313         update_sacks(s, tcb, seg);
2314         set_in_flight(tcb);
2315
2316         /* We treat either a dupack or forward SACKs as a hint that there is a loss.
2317          * The RFCs suggest three dupacks before treating it as a loss (alternative
2318          * is reordered packets).  We'll treat three SACKs the same way. */
2319         if (is_potential_loss(tcb, seg) && !tcb->snd.recovery) {
2320                 tcb->snd.loss_hint++;
2321                 if (tcb->snd.loss_hint == TCPREXMTTHRESH) {
2322                         netlog(s->p->f, Logtcprxmt,
2323                                "%I.%d -> %I.%d: loss hint thresh, nr sacks %u, nxt %u, una %u, cwnd %u\n",
2324                                s->laddr, s->lport, s->raddr, s->rport,
2325                                tcb->snd.nr_sacks, tcb->snd.nxt, tcb->snd.una, tcb->cwind);
2326                         tcp_loss_event(s, tcb);
2327                         tcb->snd.recovery_pt = tcb->snd.nxt;
2328                         if (tcb->snd.nr_sacks) {
2329                                 tcb->snd.recovery = SACK_RETRANS_RECOVERY;
2330                                 tcb->snd.flush_sacks = FALSE;
2331                                 tcb->snd.sack_loss_hint = 0;
2332                         } else {
2333                                 tcb->snd.recovery = FAST_RETRANS_RECOVERY;
2334                         }
2335                         tcprxmit(s);
2336                 }
2337         }
2338
2339         /*
2340          *  update window
2341          */
2342         if (seq_gt(seg->ack, tcb->snd.wl2)
2343                 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
2344                 tcb->snd.wnd = seg->wnd;
2345                 tcb->snd.wl2 = seg->ack;
2346         }
2347
2348         if (!acked) {
2349                 /*
2350                  *  don't let us hangup if sending into a closed window and
2351                  *  we're still getting acks
2352                  */
2353                 if ((tcb->flags & RETRAN) && tcb->snd.wnd == 0) {
2354                         tcb->backedoff = MAXBACKMS / 4;
2355                 }
2356                 return;
2357         }
2358         /* At this point, they have acked something new. (positive ack, ack > una).
2359          *
2360          * If we hadn't reached the threshold for recovery yet, the positive ACK
2361          * will reset our loss_hint count. */
2362         if (!tcb->snd.recovery)
2363                 tcb->snd.loss_hint = 0;
2364         else if (seq_ge(seg->ack, tcb->snd.recovery_pt))
2365                 reset_recovery(s, tcb);
2366
2367         /* avoid slow start and timers for SYN acks */
2368         if ((tcb->flags & SYNACK) == 0) {
2369                 tcb->flags |= SYNACK;
2370                 acked--;
2371                 tcb->flgcnt--;
2372                 goto done;
2373         }
2374
2375         /* slow start as long as we're not recovering from lost packets */
2376         if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
2377                 if (tcb->cwind < tcb->ssthresh) {
2378                         /* We increase the cwind by every byte we receive.  We want to
2379                          * increase the cwind by one MSS for every MSS that gets ACKed.
2380                          * Note that multiple MSSs can be ACKed in a single ACK.  If we had
2381                          * a remainder of acked / MSS, we'd add just that remainder - not 0
2382                          * or 1 MSS. */
2383                         expand = acked;
2384                 } else {
2385                         /* Every RTT, which consists of CWND bytes, we're supposed to expand
2386                          * by MSS bytes.  The classic algorithm was
2387                          *              expand = (tcb->mss * tcb->mss) / tcb->cwind;
2388                          * which assumes the ACK was for MSS bytes.  Instead, for every
2389                          * 'acked' bytes, we increase the window by acked / CWND (in units
2390                          * of MSS). */
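                        /* Illustrative numbers: with typical_mss = 1460 and
                         * cwind = 14600, a full-MSS ACK expands cwind by
                         * 1460 * 1460 / 14600 = 146 bytes, so roughly ten such
                         * ACKs (one cwnd's worth) grow cwind by one MSS per
                         * RTT. */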
2391                         expand = MAX(acked, tcb->typical_mss) * tcb->typical_mss
2392                                  / tcb->cwind;
2393                 }
2394
2395                 if (tcb->cwind + expand < tcb->cwind)
2396                         expand = tcb->snd.wnd - tcb->cwind;
2397                 if (tcb->cwind + expand > tcb->snd.wnd)
2398                         expand = tcb->snd.wnd - tcb->cwind;
2399                 tcb->cwind += expand;
2400         }
2401         adjust_tx_qio_limit(s);
2402
2403         /* Adjust the timers according to the round trip time */
2404         if (tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
2405                 tcphalt(tpriv, &tcb->rtt_timer);
2406                 if ((tcb->flags & RETRAN) == 0) {
2407                         tcb->backoff = 0;
2408                         tcb->backedoff = 0;
2409                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
2410                         if (rtt == 0)
2411                                 rtt = 1;        /* otherwise nearby (low-RTT) systems will rexmit in 0 time */
2412                         rtt *= MSPTICK;
2413                         if (tcb->srtt == 0) {
2414                                 tcb->srtt = rtt << LOGAGAIN;
2415                                 tcb->mdev = rtt << LOGDGAIN;
2416                         } else {
2417                                 delta = rtt - (tcb->srtt >> LOGAGAIN);
2418                                 tcb->srtt += delta;
2419                                 if (tcb->srtt <= 0)
2420                                         tcb->srtt = 1;
2421
2422                                 delta = abs(delta) - (tcb->mdev >> LOGDGAIN);
2423                                 tcb->mdev += delta;
2424                                 if (tcb->mdev <= 0)
2425                                         tcb->mdev = 1;
2426                         }
2427                         tcpsettimer(tcb);
2428                 }
2429         }
2430
2431 done:
2432         if (qdiscard(s->wq, acked) < acked)
2433                 tcb->flgcnt--;
2434
2435         if (seq_gt(seg->ack, tcb->snd.urg))
2436                 tcb->snd.urg = seg->ack;
2437
2438         if (tcb->snd.una != tcb->snd.nxt)
2439                 tcpgo(tpriv, &tcb->timer);
2440         else
2441                 tcphalt(tpriv, &tcb->timer);
2442
2443         tcb->flags &= ~RETRAN;
2444         tcb->backoff = 0;
2445         tcb->backedoff = 0;
2446 }
2447
2448 static void update_tcb_ts(Tcpctl *tcb, Tcp *seg)
2449 {
2450         /* Get timestamp info from the tcp header.  Even though the timestamps
2451          * aren't sequence numbers, we still need to protect for wraparound.  Though
2452          * if the values were 0, assume that means we need an update.  We could have
2453          * an initial ts_val that appears negative (signed). */
2454         if (!tcb->ts_recent || !tcb->last_ack_sent ||
2455             (seq_ge(seg->ts_val, tcb->ts_recent) &&
2456              seq_le(seg->seq, tcb->last_ack_sent)))
2457                 tcb->ts_recent = seg->ts_val;
2458 }
2459
2460 /* Overlap happens when one sack's left edge is inside another sack. */
2461 static bool sacks_overlap(struct sack_block *x, struct sack_block *y)
2462 {
2463         return (seq_le(x->left, y->left) && seq_le(y->left, x->right)) ||
2464                (seq_le(y->left, x->left) && seq_le(x->left, y->right));
2465 }
2466
2467 static void make_sack_first(Tcpctl *tcb, struct sack_block *tcb_sack)
2468 {
2469         struct sack_block temp;
2470
2471         if (tcb_sack == &tcb->rcv.sacks[0])
2472                 return;
2473         temp = tcb->rcv.sacks[0];
2474         tcb->rcv.sacks[0] = *tcb_sack;
2475         *tcb_sack = temp;
2476 }
2477
2478 /* Track sack in our tcb for a block of data we received.  This handles all the
2479  * stuff: making sure sack is first (since it's the most recent sack change),
2480  * updating or merging sacks, and dropping excess sacks (we only need to
2481  * maintain 3).  Unlike on the snd side, our tcb sacks are *not* sorted. */
2482 static void track_rcv_sack(Tcpctl *tcb, uint32_t left, uint32_t right)
2483 {
2484         struct sack_block *tcb_sack;
2485         struct sack_block sack[1];
2486
2487         if (!tcb->sack_ok)
2488                 return;
2489         assert(seq_lt(left, right));
2490         sack->left = left;
2491         sack->right = right;
2492         /* We can reuse an existing sack if we're merging or overlapping. */
2493         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2494                 tcb_sack = &tcb->rcv.sacks[i];
2495                 if (sacks_overlap(tcb_sack, sack)) {
2496                         tcb_sack->left = seq_min(tcb_sack->left, sack->left);
2497                         tcb_sack->right = seq_max(tcb_sack->right, sack->right);
2498                         make_sack_first(tcb, tcb_sack);
2499                         return;
2500                 }
2501         }
2502         /* We can discard the last sack (right shift) - we should have sent it at
2503          * least once by now.  If not, oh well. */
2504         memmove(tcb->rcv.sacks + 1, tcb->rcv.sacks, sizeof(struct sack_block) *
2505                 MIN(MAX_NR_RCV_SACKS - 1, tcb->rcv.nr_sacks));
2506         tcb->rcv.sacks[0] = *sack;
2507         if (tcb->rcv.nr_sacks < MAX_NR_RCV_SACKS)
2508                 tcb->rcv.nr_sacks++;
2509 }
2510
2511 /* Once we receive everything and move rcv.nxt past a sack, we don't need to
2512  * track it.  I've seen Linux report sacks that are below the ack point, but
2513  * we probably shouldn't. */
2514 static void drop_old_rcv_sacks(Tcpctl *tcb)
2515 {
2516         struct sack_block *tcb_sack;
2517
2518         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2519                 tcb_sack = &tcb->rcv.sacks[i];
2520                 /* Moving up to or past the left is enough to drop it. */
2521                 if (seq_ge(tcb->rcv.nxt, tcb_sack->left)) {
2522                         memmove(tcb->rcv.sacks + i, tcb->rcv.sacks + i + 1,
2523                                 sizeof(struct sack_block) * (tcb->rcv.nr_sacks - i - 1));
2524                         tcb->rcv.nr_sacks--;
2525                         i--;
2526                 }
2527         }
2528 }
2529
2530 void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
2531 {
2532         ERRSTACK(1);
2533         Tcp seg;
2534         Tcp4hdr *h4;
2535         Tcp6hdr *h6;
2536         int hdrlen;
2537         Tcpctl *tcb;
2538         uint16_t length;
2539         uint8_t source[IPaddrlen], dest[IPaddrlen];
2540         struct conv *s;
2541         struct Fs *f;
2542         struct tcppriv *tpriv;
2543         uint8_t version;
2544
2545         f = tcp->f;
2546         tpriv = tcp->priv;
2547
2548         tpriv->stats[InSegs]++;
2549
2550         h4 = (Tcp4hdr *) (bp->rp);
2551         h6 = (Tcp6hdr *) (bp->rp);
2552
2553         if ((h4->vihl & 0xF0) == IP_VER4) {
2554                 uint8_t ttl;
2555
2556                 version = V4;
2557                 length = nhgets(h4->length);
2558                 v4tov6(dest, h4->tcpdst);
2559                 v4tov6(source, h4->tcpsrc);
2560
2561                 /* ttl isn't part of the xsum pseudo header, but bypass needs it. */
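                /* The leading bytes of the v4 header double as the TCP pseudo
                 * header for the checksum: the ttl byte is cleared and the TCP
                 * length stuffed in before summing; the ttl is put back below. */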
2562                 ttl = h4->Unused;
2563                 h4->Unused = 0;
2564                 hnputs(h4->tcplen, length - TCP4_PKT);
2565                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2566                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
2567                         tpriv->stats[CsumErrs]++;
2568                         tpriv->stats[InErrs]++;
2569                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2570                         freeblist(bp);
2571                         return;
2572                 }
2573                 h4->Unused = ttl;
2574
2575                 hdrlen = ntohtcp4(&seg, &bp);
2576                 if (hdrlen < 0) {
2577                         tpriv->stats[HlenErrs]++;
2578                         tpriv->stats[InErrs]++;
2579                         netlog(f, Logtcp, "bad tcp hdr len\n");
2580                         return;
2581                 }
2582
2583                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2584                 if (s && s->state == Bypass) {
2585                         bypass_or_drop(s, bp);
2586                         return;
2587                 }
2588
2589                 /* trim the packet to the size claimed by the datagram */
2590                 length -= hdrlen + TCP4_PKT;
2591                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
2592                 if (bp == NULL) {
2593                         tpriv->stats[LenErrs]++;
2594                         tpriv->stats[InErrs]++;
2595                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2596                         return;
2597                 }
2598         } else {
2599                 int ttl = h6->ttl;
2600                 int proto = h6->proto;
2601
2602                 version = V6;
2603                 length = nhgets(h6->ploadlen);
2604                 ipmove(dest, h6->tcpdst);
2605                 ipmove(source, h6->tcpsrc);
2606
2607                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2608                 h6->ttl = proto;
2609                 hnputl(h6->vcf, length);
2610                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2611                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
2612                         tpriv->stats[CsumErrs]++;
2613                         tpriv->stats[InErrs]++;
2614                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2615                         freeblist(bp);
2616                         return;
2617                 }
2618                 h6->ttl = ttl;
2619                 h6->proto = proto;
2620                 hnputs(h6->ploadlen, length);
2621
2622                 hdrlen = ntohtcp6(&seg, &bp);
2623                 if (hdrlen < 0) {
2624                         tpriv->stats[HlenErrs]++;
2625                         tpriv->stats[InErrs]++;
2626                         netlog(f, Logtcp, "bad tcp hdr len\n");
2627                         return;
2628                 }
2629
2630                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2631                 if (s && s->state == Bypass) {
2632                         bypass_or_drop(s, bp);
2633                         return;
2634                 }
2635
2636                 /* trim the packet to the size claimed by the datagram */
2637                 length -= hdrlen;
2638                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2639                 if (bp == NULL) {
2640                         tpriv->stats[LenErrs]++;
2641                         tpriv->stats[InErrs]++;
2642                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2643                         return;
2644                 }
2645         }
2646
2647         /* s, the conv matching the n-tuple, was set above */
2648         if (s == NULL) {
2649                 netlog(f, Logtcpreset, "iphtlook failed: src %I:%u, dst %I:%u\n",
2650                        source, seg.source, dest, seg.dest);
2651 reset:
2652                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2653                 freeblist(bp);
2654                 return;
2655         }
2656
2657         /* lock protocol for unstated Plan 9 invariants.  funcs like limbo or
2658          * tcpincoming might rely on it. */
2659         qlock(&tcp->qlock);
2660
2661         /* if it's a listener, look for the right flags and get a new conv */
2662         tcb = (Tcpctl *) s->ptcl;
2663         if (tcb->state == Listen) {
2664                 if (seg.flags & RST) {
2665                         limborst(s, &seg, source, dest, version);
2666                         qunlock(&tcp->qlock);
2667                         freeblist(bp);
2668                         return;
2669                 }
2670
2671                 /* if this is a new SYN, put the call into limbo */
2672                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2673                         limbo(s, source, dest, &seg, version);
2674                         qunlock(&tcp->qlock);
2675                         freeblist(bp);
2676                         return;
2677                 }
2678
2679                 /* if there's a matching call in limbo, tcpincoming will return it */
2680                 s = tcpincoming(s, &seg, source, dest, version);
2681                 if (s == NULL) {
2682                         qunlock(&tcp->qlock);
2683                         goto reset;
2684                 }
2685         }
2686
2687         /* The rest of the input state machine is run with the control block
2688          * locked and implements the state machine directly out of the RFC.
2689          * Out-of-band data is ignored - it was always a bad idea.
2690          */
2691         tcb = (Tcpctl *) s->ptcl;
2692         if (waserror()) {
2693                 qunlock(&s->qlock);
2694                 nexterror();
2695         }
2696         qlock(&s->qlock);
2697         qunlock(&tcp->qlock);
2698
2699         update_tcb_ts(tcb, &seg);
2700         /* fix up window */
2701         seg.wnd <<= tcb->rcv.scale;
2702
2703         /* every incoming packet puts off the keepalive timeout */
2704         tcpsetkacounter(tcb);
2705
2706         switch (tcb->state) {
2707                 case Closed:
2708                         sndrst(tcp, source, dest, length, &seg, version,
2709                                    "sending to Closed");
2710                         goto raise;
2711                 case Syn_sent:
2712                         if (seg.flags & ACK) {
2713                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2714                                         sndrst(tcp, source, dest, length, &seg, version,
2715                                                    "bad seq in Syn_sent");
2716                                         goto raise;
2717                                 }
2718                         }
2719                         if (seg.flags & RST) {
2720                                 if (seg.flags & ACK)
2721                                         localclose(s, "connection refused");
2722                                 goto raise;
2723                         }
2724
2725                         if (seg.flags & SYN) {
2726                                 procsyn(s, &seg);
2727                                 if (seg.flags & ACK) {
2728                                         update(s, &seg);
2729                                         tcpsynackrtt(s);
2730                                         tcpsetstate(s, Established);
2731                                         /* Here's where we get the results of header option
2732                                          * negotiations for connections we started. (SYNACK has the
2733                                          * response) */
2734                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2735                                         tcb->sack_ok = seg.sack_ok;
2736                                 } else {
2737                                         sndrst(tcp, source, dest, length, &seg, version,
2738                                                    "Got SYN with no ACK");
2739                                         goto raise;
2740                                 }
2741
2742                                 if (length != 0 || (seg.flags & FIN))
2743                                         break;
2744
2745                                 freeblist(bp);
2746                                 goto output;
2747                         } else
2748                                 freeblist(bp);
2749
2750                         qunlock(&s->qlock);
2751                         poperror();
2752                         return;
2753         }
2754
2755         /*
2756          *  One DOS attack is to open connections to us and then forget about them,
2757          *  thereby tying up a conv at no long term cost to the attacker.
2758          *  This is an attempt to defeat these stateless DOS attacks.  See
2759          *  corresponding code in tcpsendka().
2760          */
2761         if ((seg.flags & RST) == 0) {
2762                 if (tcpporthogdefense
2763                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2764                                                   tcb->snd.una - (1 << 29))) {
2765                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2766                                    source, seg.source, dest, seg.dest, seg.flags,
2767                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2768                         localclose(s, "stateless hog");
2769                 }
2770         }
2771
2772         /* Cut the data to fit the receive window */
2773         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2774                 netlog(f, Logtcp, "%I.%d -> %I.%d: tcp len < 0, %lu %d\n",
2775                        s->raddr, s->rport, s->laddr, s->lport, seg.seq, length);
2776                 update(s, &seg);
2777                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2778                         tcphalt(tpriv, &tcb->rtt_timer);
2779                         tcphalt(tpriv, &tcb->acktimer);
2780                         tcphalt(tpriv, &tcb->katimer);
2781                         tcpsetstate(s, Time_wait);
2782                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2783                         tcpgo(tpriv, &tcb->timer);
2784                 }
2785                 if (!(seg.flags & RST)) {
2786                         tcb->flags |= FORCE;
2787                         goto output;
2788                 }
2789                 qunlock(&s->qlock);
2790                 poperror();
2791                 return;
2792         }
2793
2794         /* Cannot accept so answer with a rst */
2795         if (length && tcb->state == Closed) {
2796                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2797                 goto raise;
2798         }
2799
2800         /* The segment is beyond the current receive pointer so
2801          * queue the data in the resequence queue
2802          */
2803         if (seg.seq != tcb->rcv.nxt)
2804                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2805                         update(s, &seg);
2806                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2807                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2808                                            s->lport);
2809                         tcb->flags |= FORCE;
2810                         goto output;
2811                 }
2812
2813         /*
2814          *  keep looping till we've processed this packet plus any
2815          *  adjacent packets in the resequence queue
2816          */
2817         for (;;) {
2818                 if (seg.flags & RST) {
2819                         if (tcb->state == Established) {
2820                                 tpriv->stats[EstabResets]++;
2821                                 if (tcb->rcv.nxt != seg.seq)
2822                                         printd
2823                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2824                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2825                                                  seg.seq);
2826                         }
2827                         localclose(s, "connection refused");
2828                         goto raise;
2829                 }
2830
2831                 if ((seg.flags & ACK) == 0)
2832                         goto raise;
2833
2834                 switch (tcb->state) {
2835                         case Established:
2836                         case Close_wait:
2837                                 update(s, &seg);
2838                                 break;
2839                         case Finwait1:
2840                                 update(s, &seg);
2841                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2842                                         tcphalt(tpriv, &tcb->rtt_timer);
2843                                         tcphalt(tpriv, &tcb->acktimer);
2844                                         tcpsetkacounter(tcb);
2845                                         tcb->time = NOW;
2846                                         tcpsetstate(s, Finwait2);
2847                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2848                                         tcpgo(tpriv, &tcb->katimer);
2849                                 }
2850                                 break;
2851                         case Finwait2:
2852                                 update(s, &seg);
2853                                 break;
2854                         case Closing:
2855                                 update(s, &seg);
2856                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2857                                         tcphalt(tpriv, &tcb->rtt_timer);
2858                                         tcphalt(tpriv, &tcb->acktimer);
2859                                         tcphalt(tpriv, &tcb->katimer);
2860                                         tcpsetstate(s, Time_wait);
2861                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2862                                         tcpgo(tpriv, &tcb->timer);
2863                                 }
2864                                 break;
2865                         case Last_ack:
2866                                 update(s, &seg);
2867                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2868                                         localclose(s, NULL);
2869                                         goto raise;
2870                                 }
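                                     /* no break: falls through to the Time_wait case */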
2871                         case Time_wait:
2872                                 tcb->flags |= FORCE;
2873                                 if (tcb->timer.state != TcptimerON)
2874                                         tcpgo(tpriv, &tcb->timer);
2875                 }
2876
2877                 if ((seg.flags & URG) && seg.urg) {
2878                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2879                                 tcb->rcv.urg = seg.urg + seg.seq;
2880                                 pullblock(&bp, seg.urg);
2881                         }
2882                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2883                         tcb->rcv.urg = tcb->rcv.nxt;
2884
2885                 if (length == 0) {
2886                         if (bp != NULL)
2887                                 freeblist(bp);
2888                 } else {
2889                         switch (tcb->state) {
2890                                 default:
2891                                         /* Ignore segment text */
2892                                         if (bp != NULL)
2893                                                 freeblist(bp);
2894                                         break;
2895
2896                                 case Established:
2897                                 case Finwait1:
2898                                         /* If we still have some data, place it
2899                                          * on the receive queue
2900                                          */
2901                                         if (bp) {
2902                                                 bp = packblock(bp);
2903                                                 if (bp == NULL)
2904                                                         panic("tcp packblock");
2905                                                 qpassnolim(s->rq, bp);
2906                                                 bp = NULL;
2907
2908                                                 /*
2909                                                  *  Force an ack every 2 data messages.  This is
2910                                                  *  a hack for rob to make his home system run
2911                                                  *  faster.
2912                                                  *
2913                                                  *  this also keeps the standard TCP congestion
2914                                                  *  control working since it needs an ack every
2915                                                  *  2 max segs worth.  This is not quite that,
2916                                                  *  but under a real stream is equivalent since
2917                                                  *  every packet has a max seg in it.
2918                                                  */
2919                                                 if (++(tcb->rcv.una) >= 2)
2920                                                         tcb->flags |= FORCE;
2921                                         }
2922                                         tcb->rcv.nxt += length;
2923                                         drop_old_rcv_sacks(tcb);
2924
2925                                         /*
2926                                          *  update our rcv window
2927                                          */
2928                                         tcprcvwin(s);
2929
2930                                         /*
2931                                          *  turn on the acktimer if there's something
2932                                          *  to ack
2933                                          */
2934                                         if (tcb->acktimer.state != TcptimerON)
2935                                                 tcpgo(tpriv, &tcb->acktimer);
2936
2937                                         break;
2938                                 case Finwait2:
2939                                         /* no process to read the data, send a reset */
2940                                         if (bp != NULL)
2941                                                 freeblist(bp);
2942                                         sndrst(tcp, source, dest, length, &seg, version,
2943                                                    "send to Finwait2");
2944                                         qunlock(&s->qlock);
2945                                         poperror();
2946                                         return;
2947                         }
2948                 }
2949
2950                 if (seg.flags & FIN) {
2951                         tcb->flags |= FORCE;
2952
2953                         switch (tcb->state) {
2954                                 case Established:
2955                                         tcb->rcv.nxt++;
2956                                         tcpsetstate(s, Close_wait);
2957                                         break;
2958                                 case Finwait1:
2959                                         tcb->rcv.nxt++;
2960                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2961                                                 tcphalt(tpriv, &tcb->rtt_timer);
2962                                                 tcphalt(tpriv, &tcb->acktimer);
2963                                                 tcphalt(tpriv, &tcb->katimer);
2964                                                 tcpsetstate(s, Time_wait);
2965                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2966                                                 tcpgo(tpriv, &tcb->timer);
2967                                         } else
2968                                                 tcpsetstate(s, Closing);
2969                                         break;
2970                                 case Finwait2:
2971                                         tcb->rcv.nxt++;
2972                                         tcphalt(tpriv, &tcb->rtt_timer);
2973                                         tcphalt(tpriv, &tcb->acktimer);
2974                                         tcphalt(tpriv, &tcb->katimer);
2975                                         tcpsetstate(s, Time_wait);
2976                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2977                                         tcpgo(tpriv, &tcb->timer);
2978                                         break;
2979                                 case Close_wait:
2980                                 case Closing:
2981                                 case Last_ack:
2982                                         break;
2983                                 case Time_wait:
2984                                         tcpgo(tpriv, &tcb->timer);
2985                                         break;
2986                         }
2987                 }
2988
2989                 /*
2990                  *  get next adjacent segment from the resequence queue.
2991                  *  dump/trim any overlapping segments
2992                  */
2993                 for (;;) {
2994                         if (tcb->reseq == NULL)
2995                                 goto output;
2996
2997                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2998                                 goto output;
2999
3000                         getreseq(tcb, &seg, &bp, &length);
3001
3002                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
3003                                 break;
3004                 }
3005         }
3006 output:
3007         tcpoutput(s);
3008         qunlock(&s->qlock);
3009         poperror();
3010         return;
3011 raise:
3012         qunlock(&s->qlock);
3013         poperror();
3014         freeblist(bp);
3015         tcpkick(s);
3016 }
3017
3018 /* The advertised mss = data + TCP headers */
3019 static uint16_t derive_payload_mss(Tcpctl *tcb)
3020 {
3021         uint16_t payload_mss = tcb->mss;
3022         uint16_t opt_size = 0;
3023
3024         if (tcb->ts_recent) {
3025                 opt_size += TS_LENGTH;
3026                 /* Note that when we're sending a SYN, we overestimate slightly.
3027                  * This is safe, and not really a problem. */
3028                 opt_size += TS_SEND_PREPAD;
3029         }
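             /* SACK option: 2 bytes of kind/length plus 8 bytes (two 32-bit edges) per block */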
3030         if (tcb->rcv.nr_sacks)
3031                 opt_size += 2 + tcb->rcv.nr_sacks * 8;
3032         opt_size = ROUNDUP(opt_size, 4);
3033         payload_mss -= opt_size;
3034         return payload_mss;
3035 }
3036
3037 /* Decreases the xmit amt, given the MSS / TSO. */
3038 static uint32_t throttle_for_mss(Tcpctl *tcb, uint32_t ssize,
3039                                  uint16_t payload_mss, bool retrans)
3040 {
3041         if (ssize > payload_mss) {
3042                 if ((tcb->flags & TSO) == 0) {
3043                         ssize = payload_mss;
3044                 } else {
3045                         /* Don't send too much.  32K is arbitrary. */
3046                         if (ssize > 32 * 1024)
3047                                 ssize = 32 * 1024;
3048                         if (!retrans) {
3049                                 /* Clamp xmit to an integral MSS to avoid ragged tail segments
3050                                  * causing poor link utilization. */
3051                                 ssize = ROUNDDOWN(ssize, payload_mss);
3052                         }
3053                 }
3054         }
3055         return ssize;
3056 }
3057
3058 /* Reduces ssize for a variety of reasons.  Returns FALSE if we should abort
3059  * sending the packet.  o/w returns TRUE and modifies ssize by reference. */
3060 static bool throttle_ssize(struct conv *s, Tcpctl *tcb, uint32_t *ssize_p,
3061                            uint16_t payload_mss, bool retrans)
3062 {
3063         struct Fs *f = s->p->f;
3064         uint32_t usable;
3065         uint32_t ssize = *ssize_p;
3066
3067         /* Compute usable segment based on offered window and limit
3068          * window probes to one */
3069         if (tcb->snd.wnd == 0) {
3070                 if (tcb->snd.in_flight != 0) {
3071                         if ((tcb->flags & FORCE) == 0)
3072                                 return FALSE;
3073                 }
3074                 usable = 1;
3075         } else {
3076                 usable = tcb->cwind;
3077                 if (tcb->snd.wnd < usable)
3078                         usable = tcb->snd.wnd;
3079                 if (usable > tcb->snd.in_flight)
3080                         usable -= tcb->snd.in_flight;
3081                 else
3082                         usable = 0;
3083         }
3084         if (ssize && usable < 2)
3085                 netlog(f, Logtcpverbose,
3086                        "%I.%d -> %I.%d: throttled snd.wnd %lu cwind %lu\n",
3087                        s->laddr, s->lport, s->raddr, s->rport,
3088                        tcb->snd.wnd, tcb->cwind);
3089         if (usable < ssize)
3090                 ssize = usable;
3091
3092         ssize = throttle_for_mss(tcb, ssize, payload_mss, retrans);
3093
3094         *ssize_p = ssize;
3095         return TRUE;
3096 }
3097
3098 /* Helper, picks the next segment to send, which is possibly a retransmission.
3099  * Returns TRUE if we have a segment, FALSE o/w.  Returns ssize, from_seq, and
3100  * sent by reference.
3101  *
3102  * from_seq is the seq number we are transmitting from.
3103  *
3104  * sent includes all seq from una to from_seq *including* any previously sent
3105  * flags (part of tcb->flgcnt), for instance an unacknowledged SYN (which counts
3106  * as a seq number).  Those flags are in the e.g. snd.nxt - snd.una range, and
3107  * they get dropped after qdiscard.
3108  *
3109  * ssize is the amount of data we are sending, starting from from_seq, and it
3110  * will include any *new* flags, which haven't been accounted for yet.
3111  *
3112  * tcb->flgcnt consists of the flags both in ssize and in sent.
3113  *
3114  * Note that we could be in recovery and not sack_retrans a segment. */
3115 static bool get_xmit_segment(struct conv *s, Tcpctl *tcb, uint16_t payload_mss,
3116                              uint32_t *from_seq_p, uint32_t *sent_p,
3117                              uint32_t *ssize_p)
3118 {
3119         struct Fs *f = s->p->f;
3120         struct tcppriv *tpriv = s->p->priv;
3121         uint32_t ssize, sent, from_seq;
3122         bool sack_retrans = FALSE;
3123         struct sack_block *tcb_sack = 0;
3124
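             /* First preference: sack retransmission.  Find the first hole (data below a
              * received sack's left edge) that snd.rtx hasn't covered yet, and retransmit
              * from snd.rtx up to that left edge. */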
3125         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
3126                 tcb_sack = &tcb->snd.sacks[i];
3127                 if (seq_lt(tcb->snd.rtx, tcb_sack->left)) {
3128                         /* So ssize is supposed to include any *new* flags added to flgcnt, which
3129                          * at this point would be a FIN.
3130                          *
3131                          * It might be possible that flgcnt is incremented so we send a FIN,
3132                          * even for an intermediate sack retrans.  Perhaps the user closed
3133                          * the conv.
3134                          *
3135                          * However, the way the "flgcnt for FIN" works is that it inflates
3136                          * the desired amount we'd like to send (qlen + flgcnt).
3137                          * Eventually, we reach the end of the queue and fail to extract all
3138                          * of dsize.  At that point, we put on the FIN, and that's where the
3139                          * extra 'byte' comes from.
3140                          *
3141                          * For sack retrans, since we're extracting from parts of the qio
3142                          * that aren't the right-most edge, we don't need to consider flgcnt
3143                          * when setting ssize. */
3144                         from_seq = tcb->snd.rtx;
3145                         sent = from_seq - tcb->snd.una;
3146                         ssize = tcb_sack->left - from_seq;
3147                         sack_retrans = TRUE;
3148                         break;
3149                 }
3150         }
3151         /* SACK holes have first dibs, but we can still opportunistically send new
3152          * data.
3153          *
3154          * During other types of recovery, we'll just send from the retrans point.
3155          * If we're in an RTO while we still have sacks, we could be resending data
3156          * that wasn't lost.  Consider a sack that is still growing (usually the
3157          * right-most), but we haven't received the ACK yet.  rxt may be included in
3158          * that area.  Given we had two losses or otherwise timed out, I'm not too
3159          * concerned.
3160          *
3161          * Note that Fast and RTO can send data beyond nxt.  If we change that,
3162          * change the accounting below. */
3163         if (!sack_retrans) {
3164                 switch (tcb->snd.recovery) {
3165                 default:
3166                 case SACK_RETRANS_RECOVERY:
3167                         from_seq = tcb->snd.nxt;
3168                         break;
3169                 case FAST_RETRANS_RECOVERY:
3170                 case RTO_RETRANS_RECOVERY:
3171                         from_seq = tcb->snd.rtx;
3172                         break;
3173                 }
3174                 sent = from_seq - tcb->snd.una;
3175                 /* qlen + flgcnt is every seq we want to have sent, including unack'd
3176                  * data, unacked flags, and new flags. */
3177                 ssize = qlen(s->wq) + tcb->flgcnt - sent;
3178         }
3179
3180         if (!throttle_ssize(s, tcb, &ssize, payload_mss, sack_retrans))
3181                 return FALSE;
3182
3183         /* This counts flags, which is a little hokey, but it's okay since in_flight
3184          * gets reset on each ACK */
3185         tcb->snd.in_flight += ssize;
3186         /* Log and track rxmit.  This covers both SACK (retrans) and fast rxmit. */
3187         if (ssize && seq_lt(tcb->snd.rtx, tcb->snd.nxt)) {
3188                 netlog(f, Logtcpverbose,
3189                        "%I.%d -> %I.%d: rxmit: rtx %u amt %u, nxt %u\n",
3190                        s->laddr, s->lport, s->raddr, s->rport,
3191                        tcb->snd.rtx, MIN(tcb->snd.nxt - tcb->snd.rtx, ssize),
3192                        tcb->snd.nxt);
3193                 tpriv->stats[RetransSegs]++;
3194         }
3195         if (sack_retrans) {
3196                 /* If we'll send up to the left edge, advance snd.rtx to the right.
3197                  *
3198                  * This includes the largest sack.  It might get removed later, in which
3199                  * case we'll underestimate the amount in-flight.  The alternative is to
3200                  * not count the rightmost sack, but when it gets removed, we'll retrans
3201                  * it anyway.  No matter what, we'd count it. */
3202                 tcb->snd.rtx += ssize;
3203                 if (tcb->snd.rtx == tcb_sack->left)
3204                         tcb->snd.rtx = tcb_sack->right;
3205                 /* RFC 6675 says we MAY rearm the RTO timer on each retrans, since we
3206                  * might not be getting ACKs for a while. */
3207                 tcpsettimer(tcb);
3208         } else {
3209                 switch (tcb->snd.recovery) {
3210                 default:
3211                         /* Under normal op, we drag rtx along with nxt.  This prevents us
3212                          * from starting sack retransmissions too early (up above), since rtx
3213                          * doesn't get reset to una until we have a loss (e.g. 3 dupacks/sacks). */
3214                         tcb->snd.nxt += ssize;
3215                         tcb->snd.rtx = tcb->snd.nxt;
3216                         break;
3217                 case SACK_RETRANS_RECOVERY:
3218                         /* We explicitly do not want to increase rtx here.  We might still
3219                          * need it to fill in a sack gap below nxt if we get new, higher
3220                          * sacks. */
3221                         tcb->snd.nxt += ssize;
3222                         break;
3223                 case FAST_RETRANS_RECOVERY:
3224                 case RTO_RETRANS_RECOVERY:
3225                         tcb->snd.rtx += ssize;
3226                         /* Fast and RTO can send new data, advancing nxt. */
3227                         if (seq_gt(tcb->snd.rtx, tcb->snd.nxt))
3228                                 tcb->snd.nxt = tcb->snd.rtx;
3229                         break;
3230                 }
3231         }
3232         *from_seq_p = from_seq;
3233         *sent_p = sent;
3234         *ssize_p = ssize;
3235
3236         return TRUE;
3237 }
3238
3239 /*
3240  *  always enters and exits with s locked.  We drop
3241  *  the lock to ipoput the packet so some care has to be
3242  *  taken by callers.
3243  */
3244 void tcpoutput(struct conv *s)
3245 {
3246         Tcp seg;
3247         int msgs;
3248         int next_yield = 1;
3249         Tcpctl *tcb;
3250         struct block *hbp, *bp;
3251         uint32_t ssize, dsize, sent, from_seq;
3252         struct Fs *f;
3253         struct tcppriv *tpriv;
3254         uint8_t version;
3255         uint16_t payload_mss;
3256
3257         f = s->p->f;
3258         tpriv = s->p->priv;
3259         version = s->ipversion;
3260
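             /* Send at most ~100 segments' worth of work per call; we periodically drop
              * the qlock and yield below so inbound ACKs (e.g. "slow down") get processed. */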
3261         for (msgs = 0; msgs < 100; msgs++) {
3262                 tcb = (Tcpctl *) s->ptcl;
3263
3264                 switch (tcb->state) {
3265                         case Listen:
3266                         case Closed:
3267                         case Finwait2:
3268                                 return;
3269                 }
3270
3271                 /* force an ack when a window has opened up */
3272                 if (tcb->rcv.blocked && tcb->rcv.wnd > 0) {
3273                         tcb->rcv.blocked = 0;
3274                         tcb->flags |= FORCE;
3275                 }
3276
3277                 /* Don't send anything else until our SYN has been acked */
3278                 if (tcb->snd.nxt != tcb->iss && (tcb->flags & SYNACK) == 0)
3279                         break;
3280
3281                 /* payload_mss is the actual amount of data in the packet, which is the
3282                  * advertised (mss - header opts).  This varies from packet to packet,
3283                  * based on the options that might be present (e.g. always timestamps,
3284                  * sometimes SACKs) */
3285                 payload_mss = derive_payload_mss(tcb);
3286
3287                 if (!get_xmit_segment(s, tcb, payload_mss, &from_seq, &sent, &ssize))
3288                         break;
3289
3290                 dsize = ssize;
3291                 seg.urg = 0;
3292
3293                 if (ssize == 0)
3294                         if ((tcb->flags & FORCE) == 0)
3295                                 break;
3296
3297                 tcb->flags &= ~FORCE;
3298                 tcprcvwin(s);
3299
3300                 /* By default we will generate an ack */
3301                 tcphalt(tpriv, &tcb->acktimer);
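                     /* rcv.una is used here as a counter of data segments received since
                      * our last ack (see the force-an-ack-every-2 hack on the input side);
                      * clear it, since this packet carries an ack. */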
3302                 tcb->rcv.una = 0;
3303                 seg.source = s->lport;
3304                 seg.dest = s->rport;
3305                 seg.flags = ACK;
3306                 seg.mss = 0;
3307                 seg.ws = 0;
3308                 seg.sack_ok = FALSE;
3309                 seg.nr_sacks = 0;
3310                 /* When outputting, Syn_sent means "send the Syn", for connections we
3311                  * initiate.  SYNACKs are sent from sndsynack directly. */
3312                 if (tcb->state == Syn_sent) {
3313                         seg.flags = 0;
3314                         seg.sack_ok = SACK_SUPPORTED;   /* here's where we advertise SACK */
3315                         if (tcb->snd.nxt - ssize == tcb->iss) {
3316                                 seg.flags |= SYN;
3317                                 dsize--;
3318                                 seg.mss = tcb->mss;
3319                                 seg.ws = tcb->scale;
3320                         } else {
3321                                 /* TODO: Not sure why we'd get here. */
3322                                 warn("TCP: weird Syn_sent state, tell someone you saw this");
3323                         }
3324                 }
3325                 seg.seq = from_seq;
3326                 seg.ack = tcb->rcv.nxt;
3327                 tcb->last_ack_sent = seg.ack;
3328                 seg.wnd = tcb->rcv.wnd;
3329                 seg.ts_val = tcb->ts_recent;
3330
3331                 /* Pull out data to send */
3332                 bp = NULL;
3333                 if (dsize != 0) {
3334                         bp = qcopy(s->wq, dsize, sent);
3335                         if (BLEN(bp) != dsize) {
3336                                 /* Here's where the flgcnt kicked in.  Note dsize is
3337                                  * decremented, but ssize isn't.  Not that we use ssize for much
3338                                  * anymore.  Decrementing dsize prevents us from sending a PSH
3339                                  * with the FIN. */
3340                                 seg.flags |= FIN;
3341                                 dsize--;
3342                         }
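                             /* More than one MSS of payload: mark the block for TSO so it
                              * can be segmented into payload_mss-sized packets lower in the
                              * stack. */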
3343                         if (BLEN(bp) > payload_mss) {
3344                                 bp->flag |= Btso;
3345                                 bp->mss = payload_mss;
3346                         }
3347                 }
3348
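                     /* PSH if this segment covers everything we have queued (data plus flags) */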
3349                 if (sent + dsize == qlen(s->wq) + tcb->flgcnt)
3350                         seg.flags |= PSH;
3351
3352                 /* Build header, link data and compute cksum */
3353                 switch (version) {
3354                         case V4:
3355                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3356                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
3357                                 if (hbp == NULL) {
3358                                         freeblist(bp);
3359                                         return;
3360                                 }
3361                                 break;
3362                         case V6:
3363                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3364                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
3365                                 if (hbp == NULL) {
3366                                         freeblist(bp);
3367                                         return;
3368                                 }
3369                                 break;
3370                         default:
3371                                 hbp = NULL;     /* to suppress a warning */
3372                                 panic("tcpoutput: version %d", version);
3373                 }
3374
3375                 /* Start the transmission timers if there is new data and we
3376                  * expect acknowledgements
3377                  */
3378                 if (ssize != 0) {
3379                         if (tcb->timer.state != TcptimerON)
3380                                 tcpgo(tpriv, &tcb->timer);
3381
3382                         /* If round trip timer isn't running, start it. */
3383                         if (tcb->rtt_timer.state != TcptimerON) {
3384                                 tcpgo(tpriv, &tcb->rtt_timer);
3385                                 tcb->rttseq = from_seq + ssize;
3386                         }
3387                 }
3388
3389                 tpriv->stats[OutSegs]++;
3390
3391                 /* put off the next keep alive */
3392                 tcpgo(tpriv, &tcb->katimer);
3393
3394                 switch (version) {
3395                         case V4:
3396                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3397                                         /* a negative return means no route */
3398                                         localclose(s, "no route");
3399                                 }
3400                                 break;
3401                         case V6:
3402                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3403                                         /* a negative return means no route */
3404                                         localclose(s, "no route");
3405                                 }
3406                                 break;
3407                         default:
3408                                 panic("tcpoutput2: version %d", version);
3409                 }
3410                 if (ssize) {
3411                         /* The outer loop thinks we sent one packet.  If we used TSO, we
3412                          * might have sent several.  Minus one for the loop increment. */
3413                         msgs += DIV_ROUND_UP(ssize, payload_mss) - 1;
3414                 }
3415                 /* Old Plan 9 tidbit - yield every four messages.  We want to break out
3416                  * and unlock so we can process inbound ACKs which might do things like
3417                  * say "slow down". */
3418                 if (msgs >= next_yield) {
3419                         next_yield = msgs + 4;
3420                         qunlock(&s->qlock);
3421                         kthread_yield();
3422                         qlock(&s->qlock);
3423                 }
3424         }
3425 }
3426
3427 /*
3428  *  the BSD convention (hack?) for keep alives.  resend the last byte acked.
3429  */
3430 void tcpsendka(struct conv *s)
3431 {
3432         Tcp seg;
3433         Tcpctl *tcb;
3434         struct block *hbp, *dbp;
3435
3436         tcb = (Tcpctl *) s->ptcl;
3437
3438         dbp = NULL;
3439         seg.urg = 0;
3440         seg.source = s->lport;
3441         seg.dest = s->rport;
3442         seg.flags = ACK | PSH;
3443         seg.mss = 0;
3444         seg.ws = 0;
3445         seg.sack_ok = FALSE;
3446         seg.nr_sacks = 0;
3447         if (tcpporthogdefense)
3448                 urandom_read(&seg.seq, sizeof(seg.seq));
3449         else
3450                 seg.seq = tcb->snd.una - 1;
3451         seg.ack = tcb->rcv.nxt;
3452         tcb->last_ack_sent = seg.ack;
3453         tcb->rcv.una = 0;
3454         seg.wnd = tcb->rcv.wnd;
3455         seg.ts_val = tcb->ts_recent;
3456         if (tcb->state == Finwait2) {
3457                 seg.flags |= FIN;
3458         } else {
3459                 dbp = block_alloc(1, MEM_WAIT);
3460                 dbp->wp++;
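                     /* one byte of garbage payload, per the keepalive convention above */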
3461         }
3462
3463         if (isv4(s->raddr)) {
3464                 /* Build header, link data and compute cksum */
3465                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3466                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
3467                 if (hbp == NULL) {
3468                         freeblist(dbp);
3469                         return;
3470                 }
3471                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
3472         } else {
3473                 /* Build header, link data and compute cksum */
3474                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3475                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
3476                 if (hbp == NULL) {
3477                         freeblist(dbp);
3478                         return;
3479                 }
3480                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
3481         }
3482 }
3483
3484 /*
3485  *  set connection to time out after 12 minutes
3486  */
3487 void tcpsetkacounter(Tcpctl *tcb)
3488 {
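             /* number of keepalive intervals (katimer.start ticks of MSPTICK ms each) in 12 minutes */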
3489         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
3490         if (tcb->kacounter < 3)
3491                 tcb->kacounter = 3;
3492 }
3493
3494 /*
3495  *  if we've timed out, close the connection
3496  *  otherwise, send a keepalive and restart the timer
3497  */
3498 void tcpkeepalive(void *v)
3499 {
3500         ERRSTACK(1);
3501         Tcpctl *tcb;
3502         struct conv *s;
3503
3504         s = v;
3505         tcb = (Tcpctl *) s->ptcl;
3506         qlock(&s->qlock);
3507         if (waserror()) {
3508                 qunlock(&s->qlock);
3509                 nexterror();
3510         }
3511         if (tcb->state != Closed) {
3512                 if (--(tcb->kacounter) <= 0) {
3513                         localclose(s, "connection timed out");
3514                 } else {
3515                         tcpsendka(s);
3516                         tcpgo(s->p->priv, &tcb->katimer);
3517                 }
3518         }
3519         qunlock(&s->qlock);
3520         poperror();
3521 }
3522
3523 /*
3524  *  start keepalive timer
3525  */
3526 static void tcpstartka(struct conv *s, char **f, int n)
3527 {
3528         Tcpctl *tcb;
3529         int x;
3530
3531         tcb = (Tcpctl *) s->ptcl;
3532         if (tcb->state != Established)
3533                 error(ENOTCONN, "connection must be in Established state");
3534         if (n > 1) {
3535                 x = atoi(f[1]);
3536                 if (x >= MSPTICK)
3537                         tcb->katimer.start = x / MSPTICK;
3538         }
3539         tcpsetkacounter(tcb);
3540         tcpgo(s->p->priv, &tcb->katimer);
3541 }
3542
3543 /*
3544  *  turn checksums on/off
3545  */
3546 static void tcpsetchecksum(struct conv *s, char **f, int unused)
3547 {
3548         Tcpctl *tcb;
3549
3550         tcb = (Tcpctl *) s->ptcl;
3551         tcb->nochecksum = !atoi(f[1]);
3552 }
3553
3554 static void tcp_loss_event(struct conv *s, Tcpctl *tcb)
3555 {
3556         uint32_t old_cwnd = tcb->cwind;
3557
3558         /* Reno */
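             /* multiplicative decrease: halve the congestion window; sending resumes from
              * the reduced cwind */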
3559         tcb->ssthresh = tcb->cwind / 2;
3560         tcb->cwind = tcb->ssthresh;
3561         netlog(s->p->f, Logtcprxmt,
3562                "%I.%d -> %I.%d: loss event, cwnd was %d, now %d\n",
3563                s->laddr, s->lport, s->raddr, s->rport,
3564                old_cwnd, tcb->cwind);
3565 }
3566
3567 /* Called when we need to retrans the entire outstanding window (everything
3568  * previously sent, but unacknowledged). */
3569 void tcprxmit(struct conv *s)
3570 {
3571         Tcpctl *tcb;
3572
3573         tcb = (Tcpctl *) s->ptcl;
3574
3575         tcb->flags |= RETRAN | FORCE;
3576         tcb->snd.rtx = tcb->snd.una;
3577         set_in_flight(tcb);
3578
3579         tcpoutput(s);
3580 }
3581
3582 /* The original RFC said to drop sacks on a timeout, since the receiver could
3583  * renege.  Later RFCs say we can keep them around, so long as we are careful.
3584  *
3585  * We'll go with a "flush if we have two timeouts" plan.  This doesn't have to
3586  * be perfect - there might be cases where we accidentally flush the sacks too
3587  * often.  Perhaps we never get dup_acks to start fast/sack rxmit.  The main
3588  * thing is that after multiple timeouts we flush the sacks, since the receiver
3589  * might renege.
3590  *
3591  * We also have an Akaros-specific problem.  We use the sacks to determine
3592  * in_flight.  Specifically, the (snd.nxt - upper right edge) is tracked as in
3593  * flight.  Usually the receiver will keep sacking that right edge all the way
3594  * up to snd.nxt, but they might not, and the gap might be quite large.  After a
3595  * timeout, that data is definitely not in flight.  If that block's size is
3596  * greater than cwnd, we'll never transmit.  This should be rare, and in that
3597  * case we can just dump the sacks.  The typical_mss fudge factor is so we can
3598  * send a reasonably-sized packet. */
3599 static void timeout_handle_sacks(Tcpctl *tcb)
3600 {
3601         struct sack_block *last_sack;
3602
3603         if (tcb->snd.nr_sacks) {
3604                 last_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
3605                 if (tcb->snd.flush_sacks || (tcb->snd.nxt - last_sack->right >=
3606                                              tcb->cwind - tcb->typical_mss)) {
3607                         tcb->snd.nr_sacks = 0;
3608                         tcb->snd.flush_sacks = FALSE;
3609                 } else {
3610                         tcb->snd.flush_sacks = TRUE;
3611                 }
3612         }
3613 }
3614
3615 void tcptimeout(void *arg)
3616 {
3617         ERRSTACK(1);
3618         struct conv *s;
3619         Tcpctl *tcb;
3620         int maxback;
3621         struct tcppriv *tpriv;
3622
3623         s = (struct conv *)arg;
3624         tpriv = s->p->priv;
3625         tcb = (Tcpctl *) s->ptcl;
3626
3627         qlock(&s->qlock);
3628         if (waserror()) {
3629                 qunlock(&s->qlock);
3630                 nexterror();
3631         }
3632         switch (tcb->state) {
3633                 default:
3634                         tcb->backoff++;
3635                         if (tcb->state == Syn_sent)
3636                                 maxback = MAXBACKMS / 2;
3637                         else
3638                                 maxback = MAXBACKMS;
3639                         tcb->backedoff += tcb->timer.start * MSPTICK;
3640                         if (tcb->backedoff >= maxback) {
3641                                 localclose(s, "connection timed out");
3642                                 break;
3643                         }
3644                         netlog(s->p->f, Logtcprxmt,
3645                                "%I.%d -> %I.%d: timeout rxmit una %u, rtx %u, nxt %u, in_flight %u, timer.start %u\n",