net: tcp: Don't increment snd.nxt
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2017 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 #include <vfs.h>
44 #include <kfs.h>
45 #include <slab.h>
46 #include <kmalloc.h>
47 #include <kref.h>
48 #include <string.h>
49 #include <stdio.h>
50 #include <assert.h>
51 #include <error.h>
52 #include <cpio.h>
53 #include <pmap.h>
54 #include <smp.h>
55 #include <ip.h>
56
enum {
	/* qio and protocol identity */
	QMAX = 64 * 1024 - 1,
	IP_TCPPROTO = 6,

	/* v4 header/pseudo-header sizes (bytes) */
	TCP4_IPLEN = 8,
	TCP4_PHDRSIZE = 12,
	TCP4_HDRSIZE = 20,
	TCP4_TCBPHDRSZ = 40,
	TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE,

	/* v6 header/pseudo-header sizes (bytes) */
	TCP6_IPLEN = 0,
	TCP6_PHDRSIZE = 40,
	TCP6_HDRSIZE = 20,
	TCP6_TCBPHDRSZ = 60,
	TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE,

	/* Tcptimer states (see timerstate()) */
	TcptimerOFF = 0,
	TcptimerON = 1,
	TcptimerDONE = 2,
	MAX_TIME = (1 << 20),	/* Forever */
	TCP_ACK = 50,	/* Timed ack sequence in ms */
	MAXBACKMS = 9 * 60 * 1000,	/* longest backoff time (ms) before hangup */

	/* TCP header flag bits */
	URG = 0x20,	/* Data marked urgent */
	ACK = 0x10,	/* Acknowledge is valid */
	PSH = 0x08,	/* Whole data pipe is pushed */
	RST = 0x04,	/* Reset connection */
	SYN = 0x02,	/* Pkt. is synchronise */
	FIN = 0x01,	/* Start close down */

	/* TCP option kinds and on-the-wire lengths */
	EOLOPT = 0,
	NOOPOPT = 1,
	MSSOPT = 2,
	MSS_LENGTH = 4,	/* max segment size header option length */
	WSOPT = 3,
	WS_LENGTH = 3,	/* WS header option length */
	MAX_WS_VALUE = 14,	/* RFC specified.  Limits available window to 2^30 */
	TS_OPT = 8,
	TS_LENGTH = 10,
	TS_SEND_PREPAD = 2,	/* For non-SYNs, pre-pad 2 nops for 32 byte alignment */
	SACK_OK_OPT = 4,
	SACK_OK_LENGTH = 2,
	SACK_OPT = 5,

	/* protocol tunables and defaults */
	MSL2 = 10,
	MSPTICK = 50,	/* Milliseconds per timer tick */
	DEF_MSS = 1460,	/* Default mean segment */
	DEF_MSS6 = 1280,	/* Default mean segment (min) for v6 */
	SACK_SUPPORTED = TRUE,	/* SACK is on by default */
	MAX_NR_SACKS_PER_PACKET = 4,	/* limited by TCP's opts size */
	MAX_NR_SND_SACKS = 10,
	MAX_NR_RCV_SACKS = 3,	/* We could try for 4, but don't need to */
	DEF_RTT = 500,	/* Default round trip */
	DEF_KAT = 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN = 0,	/* Listen connection */
	TCP_CONNECT = 1,	/* Outgoing connection */
	SYNACK_RXTIMER = 250,	/* ms between SYNACK retransmits */

	/* loss recovery */
	TCPREXMTTHRESH = 3,	/* dupack threshold for recovery */
	SACK_RETRANS_RECOVERY = 1,
	FAST_RETRANS_RECOVERY = 2,
	RTO_RETRANS_RECOVERY = 3,
	CWIND_SCALE = 10,	/* initial CWIND will be MSS * this */

	/* Tcpctl->flags bits */
	FORCE			= 1 << 0,
	CLONE			= 1 << 1,
	ACTIVE			= 1 << 2,
	SYNACK			= 1 << 3,
	TSO			= 1 << 4,

	/* SRTT/MDEV smoothing shifts */
	RTTM_ALPHA_SHIFT = 3,	/* alpha = 1/8 */
	RTTM_BRAVO_SHIFT = 2,	/* bravo = 1/4 (beta) */

	Closed = 0,	/* Connection states */
	Listen,
	Syn_sent,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo = 1000,	/* maximum procs waiting for response to SYN ACK */
	NLHT = 256,	/* hash table size, must be a power of 2 */
	LHTMASK = NLHT - 1,

	HaveWS = 1 << 8,	/* flag or'd into the tcpmtu() *scale result */
};
146
/* Must correspond to the enumeration above (Closed..Time_wait) */
char *tcpstates[] = {
	"Closed", "Listen", "Syn_sent",
	"Established", "Finwait1", "Finwait2", "Close_wait",
	"Closing", "Last_ack", "Time_wait"
};
153
typedef struct Tcptimer Tcptimer;
/* A tick-based timer.  Active timers sit on a doubly linked list owned by
 * the protocol (see timerstate()) and are decremented by tcpackproc() every
 * MSPTICK ms; when count reaches 0, func(arg) is called once. */
struct Tcptimer {
	Tcptimer *next;			/* doubly linked active-timer list */
	Tcptimer *prev;
	Tcptimer *readynext;		/* singly linked list of expired timers */
	int state;			/* TcptimerOFF / TcptimerON / TcptimerDONE */
	uint64_t start;			/* reload value in ticks (0 == never arm) */
	uint64_t count;			/* ticks remaining until expiry */
	void (*func) (void *);		/* fired on expiry */
	void *arg;			/* argument for func (typically the conv) */
};
165
166 /*
167  *  v4 and v6 pseudo headers used for
168  *  checksuming tcp
169  */
typedef struct Tcp4hdr Tcp4hdr;
/* On-the-wire IPv4 + TCP header layout; the IP portion doubles as the v4
 * checksum pseudo-header (see the comment above).  All multi-byte fields
 * are raw byte arrays in network order. */
struct Tcp4hdr {
	uint8_t vihl;			/* Version and header length */
	uint8_t tos;			/* Type of service */
	uint8_t length[2];		/* packet length */
	uint8_t id[2];			/* Identification */
	uint8_t frag[2];		/* Fragment information */
	uint8_t Unused;
	uint8_t proto;			/* IP protocol (IP_TCPPROTO) */
	uint8_t tcplen[2];
	uint8_t tcpsrc[4];		/* source IPv4 address */
	uint8_t tcpdst[4];		/* destination IPv4 address */
	uint8_t tcpsport[2];		/* source port */
	uint8_t tcpdport[2];		/* destination port */
	uint8_t tcpseq[4];
	uint8_t tcpack[4];
	uint8_t tcpflag[2];		/* data offset + flag bits */
	uint8_t tcpwin[2];
	uint8_t tcpcksum[2];
	uint8_t tcpurg[2];
	/* Options segment */
	uint8_t tcpopt[1];
};
193
typedef struct Tcp6hdr Tcp6hdr;
/* On-the-wire IPv6 + TCP header layout; the IP portion doubles as the v6
 * checksum pseudo-header.  Same conventions as Tcp4hdr. */
struct Tcp6hdr {
	uint8_t vcf[4];			/* version/class/flow */
	uint8_t ploadlen[2];		/* payload length */
	uint8_t proto;			/* next header */
	uint8_t ttl;			/* hop limit */
	uint8_t tcpsrc[IPaddrlen];	/* source IPv6 address */
	uint8_t tcpdst[IPaddrlen];	/* destination IPv6 address */
	uint8_t tcpsport[2];		/* source port */
	uint8_t tcpdport[2];		/* destination port */
	uint8_t tcpseq[4];
	uint8_t tcpack[4];
	uint8_t tcpflag[2];		/* data offset + flag bits */
	uint8_t tcpwin[2];
	uint8_t tcpcksum[2];
	uint8_t tcpurg[2];
	/* Options segment */
	uint8_t tcpopt[1];
};
213
/* One SACK edge pair: left/right sequence numbers of a received range
 * (presumably half-open as on the wire — confirm against the SACK code). */
struct sack_block {
	uint32_t left;
	uint32_t right;
};
218
/*
 *  this represents the control info
 *  for a single packet.  It is derived from
 *  a packet in ntohtcp{4,6}() and stuck into
 *  a packet in htontcp{4,6}().
 */
typedef struct Tcp Tcp;
struct Tcp {
	uint16_t source;		/* source port */
	uint16_t dest;			/* destination port */
	uint32_t seq;			/* sequence number */
	uint32_t ack;			/* acknowledgment number */
	uint8_t flags;			/* URG/ACK/PSH/RST/SYN/FIN */
	uint16_t ws;			/* window scale option (if not zero) */
	uint32_t wnd;			/* advertised window */
	uint16_t urg;			/* urgent pointer */
	uint16_t mss;			/* max segment size option (if not zero) */
	uint16_t len;			/* size of data */
	uint32_t ts_val;		/* timestamp val from sender */
	uint32_t ts_ecr;		/* timestamp echo response from sender */
	bool sack_ok;			/* header had/should have SACK_PERMITTED */
	uint8_t nr_sacks;		/* valid entries in sacks[] */
	struct sack_block sacks[MAX_NR_SACKS_PER_PACKET];
};
243
/*
 *  this header is malloc'd to thread together fragments
 *  waiting to be coalesced
 */
typedef struct Reseq Reseq;
struct Reseq {
	Reseq *next;			/* singly linked; head is Tcpctl->reseq */
	Tcp seg;			/* decoded header for this segment */
	struct block *bp;		/* the segment's data */
	uint16_t length;		/* data length */
};
255
/*
 *  Per-connection TCP control block.
 *  The qlock in the Conv locks this structure.
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl {
	uint8_t state;			/* Connection state (Closed..Time_wait) */
	uint8_t type;			/* Listening or active connection */
	uint8_t code;			/* Icmp code */
	struct {			/* send-side sequence state */
		uint32_t una;		/* Left edge of unacked data region */
		uint32_t nxt;		/* Next seq to send, right edge of unacked */
		uint32_t rtx;		/* Next to send for retrans */
		uint32_t wnd;		/* Tcp send window */
		uint32_t urg;		/* Urgent data pointer */
		uint32_t wl2;		/* NOTE(review): presumably RFC 793 SND.WL2
					 * (ack of last window update) — confirm */
		int scale;		/* how much to right shift window for xmit */
		uint32_t in_flight;	/* estimate of how much is in flight */
		uint8_t loss_hint;	/* number of loss hints rcvd */
		uint8_t sack_loss_hint;	/* For detecting sack rxmit losses */
		bool flush_sacks;	/* Two timeouts in a row == dump sacks */
		uint8_t recovery;	/* loss recovery flag */
		uint32_t recovery_pt;	/* right window for recovery point */
		uint8_t nr_sacks;	/* valid entries in sacks[] */
		struct sack_block sacks[MAX_NR_SND_SACKS];
	} snd;
	struct {			/* receive-side sequence state */
		uint32_t nxt;		/* Receive pointer to next uint8_t slot */
		uint32_t wnd;		/* Receive window incoming */
		uint32_t urg;		/* Urgent pointer */
		int blocked;		/* window shrank below MSS; see tcprcvwin() */
		int una;		/* unacked data segs */
		int scale;		/* how much to left shift window for rx */
		uint8_t nr_sacks;	/* valid entries in sacks[] */
		struct sack_block sacks[MAX_NR_RCV_SACKS];
	} rcv;
	uint32_t iss;			/* Initial sequence number */
	int sawwsopt;			/* true if we saw a wsopt on the incoming SYN */
	uint32_t cwind;			/* Congestion window */
	int scale;			/* desired snd.scale */
	uint32_t ssthresh;		/* Slow start threshold */
	int irs;			/* Initial received sequence */
	uint16_t mss;			/* Max segment size */
	uint16_t typical_mss;		/* MSS for most packets (< MSS for some opts) */
	int rerecv;			/* Overlap of data rereceived */
	uint32_t window;		/* Receive window */
	uint8_t backoff;		/* Exponential backoff counter */
	int backedoff;			/* ms we've backed off for rexmits */
	uint8_t flags;			/* State flags (FORCE, CLONE, ACTIVE, ...) */
	Reseq *reseq;			/* Resequencing queue */
	Tcptimer timer;			/* Activity timer */
	Tcptimer acktimer;		/* Acknowledge timer */
	Tcptimer rtt_timer;		/* Round trip timer */
	Tcptimer katimer;		/* keep alive timer */
	uint32_t rttseq;		/* Round trip sequence */
	int srtt;			/* Shortened round trip */
	int mdev;			/* Mean deviation of round trip */
	int kacounter;			/* count down for keep alive */
	uint64_t sndsyntime;		/* time syn sent */
	uint64_t time;			/* time Finwait2 was sent */
	int nochecksum;			/* non-zero means don't send checksums */
	int flgcnt;			/* number of flags in the sequence (FIN,SYN) */
	uint32_t ts_recent;		/* timestamp received around last_ack_sent */
	uint32_t last_ack_sent;		/* to determine when to update timestamp */
	bool sack_ok;			/* Can use SACK for this connection */

	union {
		Tcp4hdr tcp4hdr;
		Tcp6hdr tcp6hdr;
	} protohdr;			/* prototype header */
};
326
/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 */
typedef struct Limbo Limbo;
struct Limbo {
	Limbo *next;			/* chain link (see tcppriv->lht) */

	uint8_t laddr[IPaddrlen];	/* local address */
	uint8_t raddr[IPaddrlen];	/* remote address */
	uint16_t lport;			/* local port */
	uint16_t rport;			/* remote port */
	uint32_t irs;			/* initial received sequence */
	uint32_t iss;			/* initial sent sequence */
	uint16_t mss;			/* mss from the other end */
	uint16_t rcvscale;		/* how much to scale rcvd windows */
	uint16_t sndscale;		/* how much to scale sent windows */
	uint64_t lastsend;		/* last time we sent a synack */
	uint8_t version;		/* v4 or v6 */
	uint8_t rexmits;		/* number of retransmissions */
	bool sack_ok;			/* other side said SACK_OK */
	uint32_t ts_val;		/* timestamp val from sender */
};
359
360 int tcp_irtt = DEF_RTT;                 /* Initial guess at round trip time */
361 uint16_t tcp_mss = DEF_MSS;             /* Maximum segment size to be sent */
362
/* Indices into tcppriv->stats[]; names are in statnames[] below and the two
 * must stay in sync. */
enum {
	/* MIB stats */
	MaxConn,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats			/* count; sizes the stats array */
};
385
/* Printable names for the stats enum above.  Uses the obsolete GNU
 * designated-initializer form "[Index] value" (no '='). */
static char *statnames[] = {
	[MaxConn] "MaxConn",
	[ActiveOpens] "ActiveOpens",
	[PassiveOpens] "PassiveOpens",
	[EstabResets] "EstabResets",
	[CurrEstab] "CurrEstab",
	[InSegs] "InSegs",
	[OutSegs] "OutSegs",
	[RetransSegs] "RetransSegs",
	[RetransTimeouts] "RetransTimeouts",
	[InErrs] "InErrs",
	[OutRsts] "OutRsts",
	[CsumErrs] "CsumErrs",
	[HlenErrs] "HlenErrs",
	[LenErrs] "LenErrs",
	[OutOfOrder] "OutOfOrder",
};
403
typedef struct Tcppriv Tcppriv;
/* Protocol-wide private state, hung off struct Proto->priv.
 * NOTE(review): the typedef above declares tag "Tcppriv" but the definition
 * uses lowercase "tcppriv"; code throughout says "struct tcppriv", so the
 * typedef'd name refers to a distinct, never-defined type.  Left as-is to
 * avoid churn, but worth unifying. */
struct tcppriv {
	/* List of active timers */
	qlock_t tl;			/* protects timers and each timer's chain */
	Tcptimer *timers;

	/* hash table for matching conversations */
	struct Ipht ht;

	/* calls in limbo waiting for an ACK to our SYN ACK */
	int nlimbo;
	Limbo *lht[NLHT];

	/* for keeping track of tcpackproc */
	qlock_t apl;
	int ackprocstarted;

	uint32_t stats[Nstats];		/* indexed by the stats enum */
};
423
424 /*
425  *  Setting tcpporthogdefense to non-zero enables Dong Lin's
426  *  solution to hijacked systems staking out port's as a form
427  *  of DoS attack.
428  *
429  *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
430  *  it that number gets acked by the other end, we shut down the connection.
431  *  Look for tcpporthogedefense in the code.
432  */
433 int tcpporthogdefense = 0;
434
435 int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, uint16_t);
436 void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
437 void localclose(struct conv *, char *unused_char_p_t);
438 void procsyn(struct conv *, Tcp *);
439 void tcpiput(struct Proto *, struct Ipifc *, struct block *);
440 void tcpoutput(struct conv *);
441 int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
442 void tcpstart(struct conv *, int);
443 void tcptimeout(void *);
444 void tcpsndsyn(struct conv *, Tcpctl *);
445 void tcprcvwin(struct conv *);
446 void tcpacktimer(void *);
447 void tcpkeepalive(void *);
448 void tcpsetkacounter(Tcpctl *);
449 void tcprxmit(struct conv *);
450 void tcpsettimer(Tcpctl *);
451 void tcpsynackrtt(struct conv *);
452 void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
453 static void tcp_loss_event(struct conv *s, Tcpctl *tcb);
454 static uint16_t derive_payload_mss(Tcpctl *tcb);
455 static int seq_within(uint32_t x, uint32_t low, uint32_t high);
456 static int seq_lt(uint32_t x, uint32_t y);
457 static int seq_le(uint32_t x, uint32_t y);
458 static int seq_gt(uint32_t x, uint32_t y);
459 static int seq_ge(uint32_t x, uint32_t y);
460 static uint32_t seq_max(uint32_t x, uint32_t y);
461 static uint32_t seq_min(uint32_t x, uint32_t y);
462 static void set_in_flight(Tcpctl *tcb);
463
464 static void limborexmit(struct Proto *);
465 static void limbo(struct conv *, uint8_t * unused_uint8_p_t, uint8_t *, Tcp *,
466                                   int);
467
468 void tcpsetstate(struct conv *s, uint8_t newstate)
469 {
470         Tcpctl *tcb;
471         uint8_t oldstate;
472         struct tcppriv *tpriv;
473
474         tpriv = s->p->priv;
475
476         tcb = (Tcpctl *) s->ptcl;
477
478         oldstate = tcb->state;
479         if (oldstate == newstate)
480                 return;
481
482         if (oldstate == Established)
483                 tpriv->stats[CurrEstab]--;
484         if (newstate == Established)
485                 tpriv->stats[CurrEstab]++;
486
487         /**
488         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
489                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
490         **/
491
492         switch (newstate) {
493                 case Closed:
494                         qclose(s->rq);
495                         qclose(s->wq);
496                         qclose(s->eq);
497                         break;
498
499                 case Close_wait:        /* Remote closes */
500                         qhangup(s->rq, NULL);
501                         break;
502         }
503
504         tcb->state = newstate;
505
506         if (oldstate == Syn_sent && newstate != Closed)
507                 Fsconnected(s, NULL);
508 }
509
/* "connect" control message handler: parse/set addresses via the generic
 * helper, then perform an active open. */
static void tcpconnect(struct conv *c, char **argv, int argc)
{
	Fsstdconnect(c, argv, argc);
	tcpstart(c, TCP_CONNECT);
}
515
516 static int tcpstate(struct conv *c, char *state, int n)
517 {
518         Tcpctl *s;
519
520         s = (Tcpctl *) (c->ptcl);
521
522         return snprintf(state, n,
523                                         "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
524                                         tcpstates[s->state],
525                                         c->rq ? qlen(c->rq) : 0,
526                                         c->wq ? qlen(c->wq) : 0,
527                                         s->srtt, s->mdev,
528                                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
529                                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
530                                         s->katimer.start, s->katimer.count);
531 }
532
533 static int tcpinuse(struct conv *c)
534 {
535         Tcpctl *s;
536
537         s = (Tcpctl *) (c->ptcl);
538         return s->state != Closed;
539 }
540
/* "announce" control message handler: set the local address/port, enter the
 * Listen state, and report the announce as complete. */
static void tcpannounce(struct conv *c, char **argv, int argc)
{
	Fsstdannounce(c, argv, argc);
	tcpstart(c, TCP_LISTEN);
	Fsconnected(c, NULL);
}
547
/* "bypass" control message handler: set up the conversation via the generic
 * helper, then register it in the demux hash table so inbound packets match. */
static void tcpbypass(struct conv *cv, char **argv, int argc)
{
	struct tcppriv *tpriv = cv->p->priv;

	Fsstdbypass(cv, argv, argc);
	iphtadd(&tpriv->ht, cv);
}
555
556 static void tcpshutdown(struct conv *c, int how)
557 {
558         Tcpctl *tcb = (Tcpctl*)c->ptcl;
559
560         /* Do nothing for the read side */
561         if (how == SHUT_RD)
562                 return;
563         /* Sends a FIN.  If we're in another state (like Listen), we'll run into
564          * issues, since we'll never send the FIN.  We'll be shutdown on our end,
565          * but we'll never tell the distant end.  Might just be an app issue. */
566         switch (tcb->state) {
567         case Established:
568                 tcb->flgcnt++;
569                 tcpsetstate(c, Finwait1);
570                 tcpoutput(c);
571                 break;
572         }
573 }
574
575 /*
576  *  tcpclose is always called with the q locked
577  */
578 static void tcpclose(struct conv *c)
579 {
580         Tcpctl *tcb;
581
582         tcb = (Tcpctl *) c->ptcl;
583
584         qhangup(c->rq, NULL);
585         qhangup(c->wq, NULL);
586         qhangup(c->eq, NULL);
587         qflush(c->rq);
588
589         switch (tcb->state) {
590                 case Listen:
591                         /*
592                          *  reset any incoming calls to this listener
593                          */
594                         Fsconnected(c, "Hangup");
595
596                         localclose(c, NULL);
597                         break;
598                 case Closed:
599                 case Syn_sent:
600                         localclose(c, NULL);
601                         break;
602                 case Established:
603                         tcb->flgcnt++;
604                         tcpsetstate(c, Finwait1);
605                         tcpoutput(c);
606                         break;
607                 case Close_wait:
608                         tcb->flgcnt++;
609                         tcpsetstate(c, Last_ack);
610                         tcpoutput(c);
611                         break;
612         }
613 }
614
/* Kick function for the conversation's write queue (registered in
 * tcpcreate()): runs when there is new data to push.  In a live state it
 * refreshes the receive window and transmits; otherwise it tears the
 * connection down. */
void tcpkick(void *x)
{
	ERRSTACK(1);
	struct conv *s = x;
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	/* unlock on any error thrown below us, then re-throw */
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}

	switch (tcb->state) {
		case Syn_sent:
		case Established:
		case Close_wait:
			/*
			 * Push data
			 */
			tcprcvwin(s);
			tcpoutput(s);
			break;
		default:
			localclose(s, "Hangup");
			break;
	}

	qunlock(&s->qlock);
	poperror();
}
647
648 void tcprcvwin(struct conv *s)
649 {
650         /* Call with tcb locked */
651         int w;
652         Tcpctl *tcb;
653
654         tcb = (Tcpctl *) s->ptcl;
655         w = tcb->window - qlen(s->rq);
656         if (w < 0)
657                 w = 0;
658
659         /* RFC 813: Avoid SWS.  We'll always reduce the window (because the qio
660          * increased - that's legit), and we'll always advertise the window
661          * increases (corresponding to qio drains) when those are greater than MSS.
662          * But we don't advertise increases less than MSS.
663          *
664          * Note we don't shrink the window at all - that'll result in tcptrim()
665          * dropping packets that were sent before the sender gets our update. */
666         if ((w < tcb->rcv.wnd) || (w >= tcb->mss))
667                 tcb->rcv.wnd = w;
668         /* We've delayed sending an update to rcv.wnd, and we might never get
669          * another ACK to drive the TCP stack after the qio is drained.  We could
670          * replace this stuff with qio kicks or callbacks, but that might be
671          * trickier with the MSS limitation.  (and 'edge' isn't empty or not). */
672         if (w < tcb->mss)
673                 tcb->rcv.blocked = 1;
674 }
675
/* Delayed-ACK timer callback (tcb->acktimer, armed in inittcpctl()).  Forces
 * an output pass so a pending ACK gets sent. */
void tcpacktimer(void *v)
{
	ERRSTACK(1);
	Tcpctl *tcb;
	struct conv *s;

	s = v;
	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	/* unlock on any error thrown below us, then re-throw */
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	if (tcb->state != Closed) {
		/* FORCE makes tcpoutput send even without fresh data */
		tcb->flags |= FORCE;
		tcprcvwin(s);
		tcpoutput(s);
	}
	qunlock(&s->qlock);
	poperror();
}
698
/* Allocate the read and write qios for a new conversation.  The write queue
 * registers tcpkick so enqueued data drives transmission. */
static void tcpcreate(struct conv *c)
{
	/* We don't use qio limits.  Instead, TCP manages flow control on its own.
	 * We only use qpassnolim().  Note for qio that 0 doesn't mean no limit. */
	c->rq = qopen(0, Qcoalesce, 0, 0);
	c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
}
706
/* Transition timer t to newstate, chaining it onto or unchaining it from the
 * doubly linked active list (priv->timers) as needed.  Callers hold priv->tl
 * (see tcpgo/tcphalt/tcpackproc). */
static void timerstate(struct tcppriv *priv, Tcptimer * t, int newstate)
{
	if (newstate != TcptimerON) {
		if (t->state == TcptimerON) {
			// unchain
			if (priv->timers == t) {
				/* t is the head: head has no prev by invariant */
				priv->timers = t->next;
				if (t->prev != NULL)
					panic("timerstate1");
			}
			if (t->next)
				t->next->prev = t->prev;
			if (t->prev)
				t->prev->next = t->next;
			t->next = t->prev = NULL;
		}
	} else {
		if (t->state != TcptimerON) {
			// chain at the head of the list
			if (t->prev != NULL || t->next != NULL)
				panic("timerstate2");
			t->prev = NULL;
			t->next = priv->timers;
			if (t->next)
				t->next->prev = t;
			priv->timers = t;
		}
	}
	t->state = newstate;
}
737
/* Protocol-wide timer kthread: every MSPTICK ms, decrement all active timers
 * under the timer-list lock, collect the expired ones on a ready list, then
 * fire their callbacks outside the lock.  Also drives limbo SYNACK
 * retransmission via limborexmit(). */
void tcpackproc(void *a)
{
	ERRSTACK(1);
	Tcptimer *t, *tp, *timeo;
	struct Proto *tcp;
	struct tcppriv *priv;
	int loop;

	tcp = a;
	priv = tcp->priv;

	for (;;) {
		kthread_usleep(MSPTICK * 1000);

		qlock(&priv->tl);
		timeo = NULL;
		loop = 0;
		for (t = priv->timers; t != NULL; t = tp) {
			/* guard against a corrupted/cyclic timer list */
			if (loop++ > 10000)
				panic("tcpackproc1");
			tp = t->next;
			if (t->state == TcptimerON) {
				t->count--;
				if (t->count == 0) {
					/* expired: unchain and queue for firing */
					timerstate(priv, t, TcptimerDONE);
					t->readynext = timeo;
					timeo = t;
				}
			}
		}
		qunlock(&priv->tl);

		/* fire expired timers without holding the list lock */
		loop = 0;
		for (t = timeo; t != NULL; t = t->readynext) {
			if (loop++ > 10000)
				panic("tcpackproc2");
			/* skip if the timer was re-armed/halted since expiry */
			if (t->state == TcptimerDONE && t->func != NULL) {
				/* discard error style */
				if (!waserror())
					(*t->func) (t->arg);
				poperror();
			}
		}

		limborexmit(tcp);
	}
}
785
786 void tcpgo(struct tcppriv *priv, Tcptimer * t)
787 {
788         if (t == NULL || t->start == 0)
789                 return;
790
791         qlock(&priv->tl);
792         t->count = t->start;
793         timerstate(priv, t, TcptimerON);
794         qunlock(&priv->tl);
795 }
796
797 void tcphalt(struct tcppriv *priv, Tcptimer * t)
798 {
799         if (t == NULL)
800                 return;
801
802         qlock(&priv->tl);
803         timerstate(priv, t, TcptimerOFF);
804         qunlock(&priv->tl);
805 }
806
/* Exponential backoff multiplier: returns 2^n.
 *
 * The shift is clamped to [0, 30] so a large backoff counter cannot invoke
 * undefined behavior (left-shifting a 32-bit int by >= 31).  In practice
 * MAXBACKMS bounds the total backoff long before n gets that large, but be
 * defensive; clamped results are identical for every previously-valid n. */
int backoff(int n)
{
	if (n < 0)
		n = 0;
	else if (n > 30)
		n = 30;
	return 1 << n;
}
811
/* Fully tear down a connection: remove it from the demux hash, stop all
 * timers, free the reassembly queue, hang up the qios, and enter Closed.
 * reason (may be NULL) is reported to waiters.  Called with tcb locked. */
void localclose(struct conv *s, char *reason)
{	/* called with tcb locked */
	Tcpctl *tcb;
	Reseq *rp, *rp1;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	/* no more inbound demux to this conversation */
	iphtrem(&tpriv->ht, s);

	tcphalt(tpriv, &tcb->timer);
	tcphalt(tpriv, &tcb->rtt_timer);
	tcphalt(tpriv, &tcb->acktimer);
	tcphalt(tpriv, &tcb->katimer);

	/* Flush reassembly queue; nothing more can arrive */
	for (rp = tcb->reseq; rp != NULL; rp = rp1) {
		rp1 = rp->next;
		freeblist(rp->bp);
		kfree(rp);
	}
	tcb->reseq = NULL;

	/* a failed active open must report why to the connector */
	if (tcb->state == Syn_sent)
		Fsconnected(s, reason);

	qhangup(s->rq, reason);
	qhangup(s->wq, reason);

	tcpsetstate(s, Closed);

	/* listener will check the rq state */
	if (s->state == Announced)
		rendez_wakeup(&s->listenr);
}
848
849 /* mtu (- TCP + IP hdr len) of 1st hop */
850 int tcpmtu(struct Proto *tcp, uint8_t * addr, int version, int *scale,
851            uint8_t *flags)
852 {
853         struct Ipifc *ifc;
854         int mtu;
855
856         ifc = findipifc(tcp->f, addr, 0);
857         switch (version) {
858                 default:
859                 case V4:
860                         mtu = DEF_MSS;
861                         if (ifc != NULL)
862                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
863                         break;
864                 case V6:
865                         mtu = DEF_MSS6;
866                         if (ifc != NULL)
867                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
868                         break;
869         }
870         *flags &= ~TSO;
871         if (ifc && (ifc->feat & NETF_TSO))
872                 *flags |= TSO;
873         *scale = HaveWS | 7;
874
875         return mtu;
876 }
877
/* Reset conversation s's TCP control block to a fresh state: congestion
 * parameters, timers, and (for non-listeners) the prototype pseudo header
 * later used when building outbound segments.  mode is TCP_LISTEN or
 * TCP_CONNECT. */
void inittcpctl(struct conv *s, int mode)
{
	Tcpctl *tcb;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int mss;

	tcb = (Tcpctl *) s->ptcl;

	memset(tcb, 0, sizeof(Tcpctl));

	/* slow-start threshold effectively unlimited until first loss */
	tcb->ssthresh = UINT32_MAX;
	tcb->srtt = tcp_irtt;
	tcb->mdev = 0;

	/* setup timers */
	tcb->timer.start = tcp_irtt / MSPTICK;
	tcb->timer.func = tcptimeout;
	tcb->timer.arg = s;
	tcb->rtt_timer.start = MAX_TIME;
	tcb->acktimer.start = TCP_ACK / MSPTICK;
	tcb->acktimer.func = tcpacktimer;
	tcb->acktimer.arg = s;
	tcb->katimer.start = DEF_KAT / MSPTICK;
	tcb->katimer.func = tcpkeepalive;
	tcb->katimer.arg = s;

	mss = DEF_MSS;

	/* create a prototype(pseudo) header */
	if (mode != TCP_LISTEN) {
		/* pick a local address if the caller didn't bind one */
		if (ipcmp(s->laddr, IPnoaddr) == 0)
			findlocalip(s->p->f, s->laddr, s->raddr);

		switch (s->ipversion) {
			case V4:
				h4 = &tcb->protohdr.tcp4hdr;
				memset(h4, 0, sizeof(*h4));
				h4->proto = IP_TCPPROTO;
				hnputs(h4->tcpsport, s->lport);
				hnputs(h4->tcpdport, s->rport);
				v6tov4(h4->tcpsrc, s->laddr);
				v6tov4(h4->tcpdst, s->raddr);
				break;
			case V6:
				h6 = &tcb->protohdr.tcp6hdr;
				memset(h6, 0, sizeof(*h6));
				h6->proto = IP_TCPPROTO;
				hnputs(h6->tcpsport, s->lport);
				hnputs(h6->tcpdport, s->rport);
				ipmove(h6->tcpsrc, s->laddr);
				ipmove(h6->tcpdst, s->raddr);
				mss = DEF_MSS6;
				break;
			default:
				panic("inittcpctl: version %d", s->ipversion);
		}
	}

	tcb->mss = mss;
	tcb->typical_mss = mss;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* default is no window scaling */
	tcb->window = QMAX;
	tcb->rcv.wnd = QMAX;
	tcb->rcv.scale = 0;
	tcb->snd.scale = 0;
}
947
948 /*
949  *  called with s qlocked
950  */
/* Start conversation s as a listener (TCP_LISTEN) or an active open
 * (TCP_CONNECT): ensure the protocol's ack kthread is running, initialize the
 * control block, register for inbound demux, and for connects queue and send
 * the SYN.  Called with s qlocked. */
void tcpstart(struct conv *s, int mode)
{
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	char *kpname;

	tpriv = s->p->priv;

	/* lazily start the per-protocol ack/timer kthread; double-checked
	 * under apl so only one is ever created */
	if (tpriv->ackprocstarted == 0) {
		qlock(&tpriv->apl);
		if (tpriv->ackprocstarted == 0) {
			/* tcpackproc needs to free this if it ever exits */
			kpname = kmalloc(KNAMELEN, MEM_WAIT);
			snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
			ktask(kpname, tcpackproc, s->p);
			tpriv->ackprocstarted = 1;
		}
		qunlock(&tpriv->apl);
	}

	tcb = (Tcpctl *) s->ptcl;

	inittcpctl(s, mode);

	/* make the conversation findable by inbound segments */
	iphtadd(&tpriv->ht, s);
	switch (mode) {
		case TCP_LISTEN:
			tpriv->stats[PassiveOpens]++;
			tcb->flags |= CLONE;
			tcpsetstate(s, Listen);
			break;

		case TCP_CONNECT:
			tpriv->stats[ActiveOpens]++;
			tcb->flags |= ACTIVE;
			tcpsndsyn(s, tcb);
			tcpsetstate(s, Syn_sent);
			tcpoutput(s);
			break;
	}
}
992
993 static char *tcpflag(uint16_t flag)
994 {
995         static char buf[128];
996
997         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
998         if (flag & URG)
999                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
1000         if (flag & ACK)
1001                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
1002         if (flag & PSH)
1003                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
1004         if (flag & RST)
1005                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
1006         if (flag & SYN)
1007                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
1008         if (flag & FIN)
1009                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
1010
1011         return buf;
1012 }
1013
1014 /* Helper, determine if we should send a TCP timestamp.  ts_val was the
1015  * timestamp from our distant end.  We'll also send a TS on SYN (no ACK). */
1016 static bool tcp_seg_has_ts(Tcp *tcph)
1017 {
1018         return tcph->ts_val || ((tcph->flags & SYN) && !(tcph->flags & ACK));
1019 }
1020
1021 /* Given a TCP header/segment and default header size (e.g. TCP4_HDRSIZE),
1022  * return the actual hdr_len and opt_pad */
1023 static void compute_hdrlen_optpad(Tcp *tcph, uint16_t default_hdrlen,
1024                                   uint16_t *ret_hdrlen, uint16_t *ret_optpad,
1025                                   Tcpctl *tcb)
1026 {
1027         uint16_t hdrlen = default_hdrlen;
1028         uint16_t optpad = 0;
1029
1030         if (tcph->flags & SYN) {
1031                 if (tcph->mss)
1032                         hdrlen += MSS_LENGTH;
1033                 if (tcph->ws)
1034                         hdrlen += WS_LENGTH;
1035                 if (tcph->sack_ok)
1036                         hdrlen += SACK_OK_LENGTH;
1037         }
1038         if (tcp_seg_has_ts(tcph)) {
1039                 hdrlen += TS_LENGTH;
1040                 /* SYNs have other opts, don't do the PREPAD NOOP optimization. */
1041                 if (!(tcph->flags & SYN))
1042                         hdrlen += TS_SEND_PREPAD;
1043         }
1044         if (tcb && tcb->rcv.nr_sacks)
1045                 hdrlen += 2 + tcb->rcv.nr_sacks * 8;
1046         optpad = hdrlen & 3;
1047         if (optpad)
1048                 optpad = 4 - optpad;
1049         hdrlen += optpad;
1050         *ret_hdrlen = hdrlen;
1051         *ret_optpad = optpad;
1052 }
1053
/* Writes the TCP options for tcph to opt.  Emits exactly the layout sized by
 * compute_hdrlen_optpad(): SYN-only options (MSS, window scale,
 * SACK-permitted), then the timestamp, then any SACK blocks, then optpad
 * NOOP bytes of padding. */
static void write_opts(Tcp *tcph, uint8_t *opt, uint16_t optpad, Tcpctl *tcb)
{
	if (tcph->flags & SYN) {
		if (tcph->mss != 0) {
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		if (tcph->ws != 0) {
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		if (tcph->sack_ok) {
			*opt++ = SACK_OK_OPT;
			*opt++ = SACK_OK_LENGTH;
		}
	}
	if (tcp_seg_has_ts(tcph)) {
		/* two NOOPs in front of the 10-byte TS option (non-SYN only,
		 * matching TS_SEND_PREPAD in compute_hdrlen_optpad) */
		if (!(tcph->flags & SYN)) {
			*opt++ = NOOPOPT;
			*opt++ = NOOPOPT;
		}
		*opt++ = TS_OPT;
		*opt++ = TS_LENGTH;
		/* Setting TSval, our time */
		hnputl(opt, milliseconds());
		opt += 4;
		/* Setting TSecr, the time we last saw from them, stored in ts_val */
		hnputl(opt, tcph->ts_val);
		opt += 4;
	}
	if (tcb && tcb->rcv.nr_sacks) {
		*opt++ = SACK_OPT;
		/* option length: kind + len bytes plus 8 bytes per SACK block */
		*opt++ = 2 + tcb->rcv.nr_sacks * 8;
		for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
			hnputl(opt, tcb->rcv.sacks[i].left);
			opt += 4;
			hnputl(opt, tcb->rcv.sacks[i].right);
			opt += 4;
		}
	}
	while (optpad-- > 0)
		*opt++ = NOOPOPT;
}
1101
1102 /* Given a data block (or NULL) returns a block with enough header room that we
1103  * can send out.  block->wp is set to the beginning of the payload.  Returns
1104  * NULL on some sort of error. */
1105 static struct block *alloc_or_pad_block(struct block *data,
1106                                         uint16_t total_hdr_size)
1107 {
1108         if (data) {
1109                 data = padblock(data, total_hdr_size);
1110                 if (data == NULL)
1111                         return NULL;
1112         } else {
1113                 /* the 64 pad is to meet mintu's */
1114                 data = block_alloc(total_hdr_size + 64, MEM_WAIT);
1115                 if (data == NULL)
1116                         return NULL;
1117                 data->wp += total_hdr_size;
1118         }
1119         return data;
1120 }
1121
/* Build an outbound IPv6 TCP segment: prepend the TCP header described by
 * tcph (with options) and the pseudo header ph to data (may be NULL for a
 * dataless segment), compute the checksum, and return a block ready for
 * ipoput6.  Returns NULL on allocation failure.  tcb may be NULL (e.g. for
 * RSTs); when present it supplies the send window scale, receive-side SACK
 * blocks, and the no-checksum flag. */
struct block *htontcp6(Tcp *tcph, struct block *data, Tcp6hdr *ph,
					   Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp6hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP6_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP6_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp) */
	data->transport_header_end = hdrlen + TCP6_PKT;

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp6hdr *) (data->rp);
	memmove(h, ph, TCP6_TCBPHDRSZ);

	/* compose pseudo tcp header, do cksum calculation: the TCP length
	 * goes where vcf normally lives, and the proto byte moves to ttl */
	hnputl(h->vcf, hdrlen + dlen);
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
	h->ttl = ph->proto;

	/* copy in variable bits */
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	/* move from pseudo header back to normal ip header */
	memset(h->vcf, 0, 4);
	h->vcf[0] = IP_VER6;
	hnputs(h->ploadlen, hdrlen + dlen);
	h->proto = ph->proto;

	return data;
}
1171
/* Build an outbound IPv4 TCP segment: prepend the TCP header described by
 * tcph (with options) and the pseudo header ph to data (may be NULL for a
 * dataless segment), compute the checksum (or mark the block for hardware
 * checksum offload), and return a block ready for ipoput4.  Returns NULL on
 * allocation failure.  tcb may be NULL (e.g. for RSTs); when present it
 * supplies the send window scale, receive-side SACK blocks, and the
 * no-checksum flag. */
struct block *htontcp4(Tcp *tcph, struct block *data, Tcp4hdr *ph,
					   Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp4hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP4_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP4_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp) */
	data->transport_header_end = hdrlen + TCP4_PKT;

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp4hdr *) (data->rp);
	memmove(h, ph, TCP4_TCBPHDRSZ);

	/* copy in variable bits */
	hnputs(h->tcplen, hdrlen + dlen);
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		/* partial (pseudo-header) checksum; the device finishes it,
		 * as flagged by Btcpck below */
		csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
		data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
		data->checksum_offset = ph->tcpcksum - ph->tcpsport;
		data->flag |= Btcpck;
	}

	return data;
}
1214
1215 static void parse_inbound_sacks(Tcp *tcph, uint8_t *opt, uint16_t optlen)
1216 {
1217         uint8_t nr_sacks;
1218         uint32_t left, right;
1219
1220         nr_sacks = (optlen - 2) / 8;
1221         if (nr_sacks > MAX_NR_SACKS_PER_PACKET)
1222                 return;
1223         opt += 2;
1224         for (int i = 0; i < nr_sacks; i++, opt += 8) {
1225                 left = nhgetl(opt);
1226                 right = nhgetl(opt + 4);
1227                 if (seq_ge(left, right)) {
1228                         /* bad / malicious SACK.  Skip it, and adjust. */
1229                         nr_sacks--;
1230                         i--;    /* stay on this array element next loop */
1231                         continue;
1232                 }
1233                 tcph->sacks[i].left = left;
1234                 tcph->sacks[i].right = right;
1235         }
1236         tcph->nr_sacks = nr_sacks;
1237 }
1238
/* Parse the TCP option area at opt (optsize bytes) into tcph.  Unknown
 * options are skipped by their advertised length; a malformed length (< 2,
 * or running past the option area) stops parsing. */
static void parse_inbound_opts(Tcp *tcph, uint8_t *opt, uint16_t optsize)
{
	uint16_t optlen;

	while (optsize > 0 && *opt != EOLOPT) {
		/* NOOP is a lone padding byte with no length field */
		if (*opt == NOOPOPT) {
			optsize--;
			opt++;
			continue;
		}
		optlen = opt[1];
		if (optlen < 2 || optlen > optsize)
			break;
		switch (*opt) {
			case MSSOPT:
				if (optlen == MSS_LENGTH)
					tcph->mss = nhgets(opt + 2);
				break;
			case WSOPT:
				if (optlen == WS_LENGTH && *(opt + 2) <= MAX_WS_VALUE)
					tcph->ws = HaveWS | *(opt + 2);
				break;
			case SACK_OK_OPT:
				if (optlen == SACK_OK_LENGTH)
					tcph->sack_ok = TRUE;
				break;
			case SACK_OPT:
				parse_inbound_sacks(tcph, opt, optlen);
				break;
			case TS_OPT:
				if (optlen == TS_LENGTH) {
					tcph->ts_val = nhgetl(opt + 2);
					tcph->ts_ecr = nhgetl(opt + 6);
				}
				break;
		}
		optsize -= optlen;
		opt += optlen;
	}
}
1279
1280 /* Helper, clears the opts.  We'll later set them with e.g. parse_inbound_opts,
1281  * set them manually, or something else. */
1282 static void clear_tcph_opts(Tcp *tcph)
1283 {
1284         tcph->mss = 0;
1285         tcph->ws = 0;
1286         tcph->sack_ok = FALSE;
1287         tcph->nr_sacks = 0;
1288         tcph->ts_val = 0;
1289         tcph->ts_ecr = 0;
1290 }
1291
1292 int ntohtcp6(Tcp * tcph, struct block **bpp)
1293 {
1294         Tcp6hdr *h;
1295         uint16_t hdrlen;
1296
1297         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
1298         if (*bpp == NULL)
1299                 return -1;
1300
1301         h = (Tcp6hdr *) ((*bpp)->rp);
1302         tcph->source = nhgets(h->tcpsport);
1303         tcph->dest = nhgets(h->tcpdport);
1304         tcph->seq = nhgetl(h->tcpseq);
1305         tcph->ack = nhgetl(h->tcpack);
1306         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1307         if (hdrlen < TCP6_HDRSIZE) {
1308                 freeblist(*bpp);
1309                 return -1;
1310         }
1311
1312         tcph->flags = h->tcpflag[1];
1313         tcph->wnd = nhgets(h->tcpwin);
1314         tcph->urg = nhgets(h->tcpurg);
1315         clear_tcph_opts(tcph);
1316         tcph->len = nhgets(h->ploadlen) - hdrlen;
1317
1318         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
1319         if (*bpp == NULL)
1320                 return -1;
1321         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP6_HDRSIZE);
1322         return hdrlen;
1323 }
1324
1325 int ntohtcp4(Tcp * tcph, struct block **bpp)
1326 {
1327         Tcp4hdr *h;
1328         uint16_t hdrlen;
1329
1330         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
1331         if (*bpp == NULL)
1332                 return -1;
1333
1334         h = (Tcp4hdr *) ((*bpp)->rp);
1335         tcph->source = nhgets(h->tcpsport);
1336         tcph->dest = nhgets(h->tcpdport);
1337         tcph->seq = nhgetl(h->tcpseq);
1338         tcph->ack = nhgetl(h->tcpack);
1339
1340         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1341         if (hdrlen < TCP4_HDRSIZE) {
1342                 freeblist(*bpp);
1343                 return -1;
1344         }
1345
1346         tcph->flags = h->tcpflag[1];
1347         tcph->wnd = nhgets(h->tcpwin);
1348         tcph->urg = nhgets(h->tcpurg);
1349         clear_tcph_opts(tcph);
1350         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1351
1352         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1353         if (*bpp == NULL)
1354                 return -1;
1355         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP4_HDRSIZE);
1356         return hdrlen;
1357 }
1358
1359 /*
1360  *  For outgoing calls, generate an initial sequence
1361  *  number and put a SYN on the send queue
1362  */
void tcpsndsyn(struct conv *s, Tcpctl * tcb)
{
	/* random ISS to resist sequence-number prediction */
	urandom_read(&tcb->iss, sizeof(tcb->iss));
	tcb->rttseq = tcb->iss;
	tcb->snd.wl2 = tcb->iss;
	tcb->snd.una = tcb->iss;
	tcb->snd.rtx = tcb->rttseq;
	tcb->snd.nxt = tcb->rttseq;
	/* the SYN occupies one unit of sequence space, tracked via flgcnt;
	 * snd.nxt itself is advanced by the output path, not here */
	tcb->flgcnt++;
	tcb->flags |= FORCE;
	tcb->sndsyntime = NOW;

	/* set desired mss and scale */
	tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale,
			  &tcb->flags);
}
1379
/* Send a RST in response to the inbound segment seg (which arrived from
 * source to dest and carried length bytes of payload).  The fields of seg
 * are rewritten in place to build the reply.  Never answers a RST with a
 * RST.  reason is only logged. */
void
sndrst(struct Proto *tcp, uint8_t * source, uint8_t * dest,
	   uint16_t length, Tcp * seg, uint8_t version, char *reason)
{
	struct block *hbp;
	uint8_t rflags;
	struct tcppriv *tpriv;
	Tcp4hdr ph4;
	Tcp6hdr ph6;

	netlog(tcp->f, Logtcpreset, "sndrst: %s\n", reason);

	tpriv = tcp->priv;

	/* never reset a reset */
	if (seg->flags & RST)
		return;

	/* make pseudo header (note src/dst swapped: we reply to the sender) */
	switch (version) {
		case V4:
			memset(&ph4, 0, sizeof(ph4));
			ph4.vihl = IP_VER4;
			v6tov4(ph4.tcpsrc, dest);
			v6tov4(ph4.tcpdst, source);
			ph4.proto = IP_TCPPROTO;
			hnputs(ph4.tcplen, TCP4_HDRSIZE);
			hnputs(ph4.tcpsport, seg->dest);
			hnputs(ph4.tcpdport, seg->source);
			break;
		case V6:
			memset(&ph6, 0, sizeof(ph6));
			ph6.vcf[0] = IP_VER6;
			ipmove(ph6.tcpsrc, dest);
			ipmove(ph6.tcpdst, source);
			ph6.proto = IP_TCPPROTO;
			hnputs(ph6.ploadlen, TCP6_HDRSIZE);
			hnputs(ph6.tcpsport, seg->dest);
			hnputs(ph6.tcpdport, seg->source);
			break;
		default:
			panic("sndrst: version %d", version);
	}

	tpriv->stats[OutRsts]++;
	rflags = RST;

	/* convince the other end that this reset is in band */
	if (seg->flags & ACK) {
		seg->seq = seg->ack;
		seg->ack = 0;
	} else {
		rflags |= ACK;
		seg->ack = seg->seq;
		seg->seq = 0;
		/* SYN and FIN each consume one unit of sequence space */
		if (seg->flags & SYN)
			seg->ack++;
		seg->ack += length;
		if (seg->flags & FIN)
			seg->ack++;
	}
	seg->flags = rflags;
	seg->wnd = 0;
	seg->urg = 0;
	seg->mss = 0;
	seg->ws = 0;
	seg->sack_ok = FALSE;
	seg->nr_sacks = 0;
	/* seg->ts_val is already set with their timestamp */
	switch (version) {
		case V4:
			hbp = htontcp4(seg, NULL, &ph4, NULL);
			if (hbp == NULL)
				return;
			ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		case V6:
			hbp = htontcp6(seg, NULL, &ph6, NULL);
			if (hbp == NULL)
				return;
			ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		default:
			panic("sndrst2: version %d", version);
	}
}
1465
1466 /*
1467  *  send a reset to the remote side and close the conversation
1468  *  called with s qlocked
1469  */
/* Send a RST|ACK to the remote side (if we ever had one) and close the
 * conversation locally.  Called with s qlocked.  Errors from building or
 * sending the RST are deliberately discarded: the local close proceeds
 * regardless. */
static void tcphangup(struct conv *s)
{
	ERRSTACK(1);
	Tcp seg;
	Tcpctl *tcb;
	struct block *hbp;

	tcb = (Tcpctl *) s->ptcl;
	/* only bother with the RST if we're connected to someone */
	if (ipcmp(s->raddr, IPnoaddr)) {
		/* discard error style, poperror regardless */
		if (!waserror()) {
			seg.flags = RST | ACK;
			seg.ack = tcb->rcv.nxt;
			tcb->last_ack_sent = seg.ack;
			tcb->rcv.una = 0;
			seg.seq = tcb->snd.nxt;
			seg.wnd = 0;
			seg.urg = 0;
			seg.mss = 0;
			seg.ws = 0;
			seg.sack_ok = FALSE;
			seg.nr_sacks = 0;
			seg.ts_val = tcb->ts_recent;
			switch (s->ipversion) {
				case V4:
					tcb->protohdr.tcp4hdr.vihl = IP_VER4;
					hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
					ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
					break;
				case V6:
					tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
					hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
					ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
					break;
				default:
					panic("tcphangup: version %d", s->ipversion);
			}
		}
		poperror();
	}
	localclose(s, NULL);
}
1512
1513 /*
1514  *  (re)send a SYN ACK
1515  */
1516 int sndsynack(struct Proto *tcp, Limbo * lp)
1517 {
1518         struct block *hbp;
1519         Tcp4hdr ph4;
1520         Tcp6hdr ph6;
1521         Tcp seg;
1522         int scale;
1523         uint8_t flag = 0;
1524
1525         /* make pseudo header */
1526         switch (lp->version) {
1527                 case V4:
1528                         memset(&ph4, 0, sizeof(ph4));
1529                         ph4.vihl = IP_VER4;
1530                         v6tov4(ph4.tcpsrc, lp->laddr);
1531                         v6tov4(ph4.tcpdst, lp->raddr);
1532                         ph4.proto = IP_TCPPROTO;
1533                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1534                         hnputs(ph4.tcpsport, lp->lport);
1535                         hnputs(ph4.tcpdport, lp->rport);
1536                         break;
1537                 case V6:
1538                         memset(&ph6, 0, sizeof(ph6));
1539                         ph6.vcf[0] = IP_VER6;
1540                         ipmove(ph6.tcpsrc, lp->laddr);
1541                         ipmove(ph6.tcpdst, lp->raddr);
1542                         ph6.proto = IP_TCPPROTO;
1543                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1544                         hnputs(ph6.tcpsport, lp->lport);
1545                         hnputs(ph6.tcpdport, lp->rport);
1546                         break;
1547                 default:
1548                         panic("sndrst: version %d", lp->version);
1549         }
1550
1551         seg.seq = lp->iss;
1552         seg.ack = lp->irs + 1;
1553         seg.flags = SYN | ACK;
1554         seg.urg = 0;
1555         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag);
1556         seg.wnd = QMAX;
1557         seg.ts_val = lp->ts_val;
1558         seg.nr_sacks = 0;
1559
1560         /* if the other side set scale, we should too */
1561         if (lp->rcvscale) {
1562                 seg.ws = scale;
1563                 lp->sndscale = scale;
1564         } else {
1565                 seg.ws = 0;
1566                 lp->sndscale = 0;
1567         }
1568         if (SACK_SUPPORTED)
1569                 seg.sack_ok = lp->sack_ok;
1570         else
1571                 seg.sack_ok = FALSE;
1572
1573         switch (lp->version) {
1574                 case V4:
1575                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1576                         if (hbp == NULL)
1577                                 return -1;
1578                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1579                         break;
1580                 case V6:
1581                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1582                         if (hbp == NULL)
1583                                 return -1;
1584                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1585                         break;
1586                 default:
1587                         panic("sndsnack: version %d", lp->version);
1588         }
1589         lp->lastsend = NOW;
1590         return 0;
1591 }
1592
/* Hash an IP address + port into a limbo-hash-table bucket: sum of the two
 * low-order address bytes and the port, masked to the table size. */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
1594
1595 /*
1596  *  put a call into limbo and respond with a SYN ACK
1597  *
1598  *  called with proto locked
1599  */
/* Put an embryonic connection (from an inbound SYN) into the limbo hash and
 * respond with a SYN ACK.  A repeated SYN for an existing limbo entry just
 * refreshes its irs and resends.  Called with proto locked. */
static void
limbo(struct conv *s, uint8_t * source, uint8_t * dest, Tcp * seg, int version)
{
	Limbo *lp, **l;
	struct tcppriv *tpriv;
	int h;

	tpriv = s->p->priv;
	h = hashipa(source, seg->source);

	/* look for an existing limbo entry for this 4-tuple */
	for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
		lp = *l;
		if (lp->lport != seg->dest || lp->rport != seg->source
			|| lp->version != version)
			continue;
		if (ipcmp(lp->raddr, source) != 0)
			continue;
		if (ipcmp(lp->laddr, dest) != 0)
			continue;

		/* each new SYN restarts the retransmits */
		lp->irs = seg->seq;
		break;
	}
	lp = *l;
	if (lp == NULL) {
		/* under limbo pressure, recycle the oldest entry in this
		 * bucket rather than allocating */
		if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
			lp = tpriv->lht[h];
			tpriv->lht[h] = lp->next;
			lp->next = NULL;
		} else {
			lp = kzmalloc(sizeof(*lp), 0);
			if (lp == NULL)
				return;
			tpriv->nlimbo++;
		}
		*l = lp;
		lp->version = version;
		ipmove(lp->laddr, dest);
		ipmove(lp->raddr, source);
		lp->lport = seg->dest;
		lp->rport = seg->source;
		lp->mss = seg->mss;
		lp->rcvscale = seg->ws;
		lp->sack_ok = seg->sack_ok;
		lp->irs = seg->seq;
		lp->ts_val = seg->ts_val;
		/* random ISS to resist sequence-number prediction */
		urandom_read(&lp->iss, sizeof(lp->iss));
	}

	/* couldn't send the SYN ACK: drop the entry entirely */
	if (sndsynack(s->p, lp) < 0) {
		*l = lp->next;
		tpriv->nlimbo--;
		kfree(lp);
	}
}
1656
1657 /*
1658  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1659  */
/* Walk the limbo hash and resend SYN ACKs once every SYNACK_RXTIMER ms,
 * timing entries out after 5 retransmits.  Skips the whole pass if the proto
 * lock is contended.  seen bounds the walk to nlimbo entries. */
static void limborexmit(struct Proto *tcp)
{
	struct tcppriv *tpriv;
	Limbo **l, *lp;
	int h;
	int seen;
	uint64_t now;

	tpriv = tcp->priv;

	if (!canqlock(&tcp->qlock))
		return;
	seen = 0;
	now = NOW;
	for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
		for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
			lp = *l;
			seen++;
			/* NOTE(review): this continue (and the nlimbo > 100
			 * one below) does not advance l, so the same entry is
			 * revisited until seen reaches nlimbo -- the rest of
			 * the chain is skipped this pass.  Bounded, but looks
			 * unintended; confirm against upstream. */
			if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
				continue;

			/* time it out after 1 second */
			if (++(lp->rexmits) > 5) {
				tpriv->nlimbo--;
				*l = lp->next;
				kfree(lp);
				continue;
			}

			/* if we're being attacked, don't bother resending SYN ACK's */
			if (tpriv->nlimbo > 100)
				continue;

			if (sndsynack(tcp, lp) < 0) {
				tpriv->nlimbo--;
				*l = lp->next;
				kfree(lp);
				continue;
			}

			l = &lp->next;
		}
	}
	qunlock(&tcp->qlock);
}
1705
1706 /*
1707  *  lookup call in limbo.  if found, throw it out.
1708  *
1709  *  called with proto locked
1710  */
1711 static void
1712 limborst(struct conv *s, Tcp * segp, uint8_t * src, uint8_t * dst,
1713                  uint8_t version)
1714 {
1715         Limbo *lp, **l;
1716         int h;
1717         struct tcppriv *tpriv;
1718
1719         tpriv = s->p->priv;
1720
1721         /* find a call in limbo */
1722         h = hashipa(src, segp->source);
1723         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1724                 lp = *l;
1725                 if (lp->lport != segp->dest || lp->rport != segp->source
1726                         || lp->version != version)
1727                         continue;
1728                 if (ipcmp(lp->laddr, dst) != 0)
1729                         continue;
1730                 if (ipcmp(lp->raddr, src) != 0)
1731                         continue;
1732
1733                 /* RST can only follow the SYN */
1734                 if (segp->seq == lp->irs + 1) {
1735                         tpriv->nlimbo--;
1736                         *l = lp->next;
1737                         kfree(lp);
1738                 }
1739                 break;
1740         }
1741 }
1742
1743 /* The advertised MSS (e.g. 1460) includes any per-packet TCP options, such as
1744  * TCP timestamps.  A given packet will contain mss bytes, but only typical_mss
1745  * bytes of *data*.  If we know we'll use those options, we should adjust our
1746  * typical_mss, which will affect the cwnd. */
1747 static void adjust_typical_mss_for_opts(Tcp *tcph, Tcpctl *tcb)
1748 {
1749         uint16_t opt_size = 0;
1750
1751         if (tcph->ts_val)
1752                 opt_size += TS_LENGTH + TS_SEND_PREPAD;
1753         opt_size = ROUNDUP(opt_size, 4);
1754         tcb->typical_mss -= opt_size;
1755 }
1756
/*
 *  come here when we finally get an ACK to our SYN-ACK.
 *  lookup call in limbo.  if found, create a new conversation
 *
 *  called with proto locked
 */
static struct conv *tcpincoming(struct conv *s, Tcp * segp, uint8_t * src,
								uint8_t * dst, uint8_t version)
{
	struct conv *new;
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	Limbo *lp, **l;
	int h;

	/* unless it's just an ack, it can't be someone coming out of limbo */
	if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
		return NULL;

	tpriv = s->p->priv;

	/* find a call in limbo */
	h = hashipa(src, segp->source);
	for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
		netlog(s->p->f, Logtcp,
			   "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
			   segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
			   lp->lport, version, lp->version);

		if (lp->lport != segp->dest || lp->rport != segp->source
			|| lp->version != version)
			continue;
		if (ipcmp(lp->laddr, dst) != 0)
			continue;
		if (ipcmp(lp->raddr, src) != 0)
			continue;

		/* we're assuming no data with the initial SYN */
		if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
			netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
				   segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
			lp = NULL;
		} else {
			/* Match: unlink from limbo now; lp is freed after we copy its
			 * state into the new conversation below. */
			tpriv->nlimbo--;
			*l = lp->next;
		}
		break;
	}
	if (lp == NULL)
		return NULL;

	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
	if (new == NULL)
		return NULL;

	/* Start from a copy of the listener's control block, then rebind all
	 * the timers to the new conversation. */
	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
	tcb = (Tcpctl *) new->ptcl;
	tcb->flags &= ~CLONE;
	tcb->timer.arg = new;
	tcb->timer.state = TcptimerOFF;
	tcb->acktimer.arg = new;
	tcb->acktimer.state = TcptimerOFF;
	tcb->katimer.arg = new;
	tcb->katimer.state = TcptimerOFF;
	tcb->rtt_timer.arg = new;
	tcb->rtt_timer.state = TcptimerOFF;

	/* Receive side: their SYN consumed irs, so we expect irs + 1 next */
	tcb->irs = lp->irs;
	tcb->rcv.nxt = tcb->irs + 1;
	tcb->rcv.urg = tcb->rcv.nxt;

	/* Send side: our SYN-ACK (sent from limbo) consumed iss, and their ACK
	 * covered it, so una/rtx/nxt all start at iss + 1. */
	tcb->iss = lp->iss;
	tcb->rttseq = tcb->iss;
	tcb->snd.wl2 = tcb->iss;
	tcb->snd.una = tcb->iss + 1;
	tcb->snd.rtx = tcb->iss + 1;
	tcb->snd.nxt = tcb->iss + 1;
	tcb->flgcnt = 0;
	tcb->flags |= SYNACK;

	/* our sending max segment size cannot be bigger than what he asked for */
	if (lp->mss != 0 && lp->mss < tcb->mss) {
		tcb->mss = lp->mss;
		tcb->typical_mss = tcb->mss;
	}
	adjust_typical_mss_for_opts(segp, tcb);

	/* Here's where we record the previously-decided header options.  They were
	 * actually decided on when we agreed to them in the SYNACK we sent.  We
	 * didn't create an actual TCB until now, so we can copy those decisions out
	 * of the limbo tracker and into the TCB. */
	tcb->sack_ok = lp->sack_ok;
	/* window scaling */
	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);

	tcb->snd.wnd = segp->wnd;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* set initial round trip time */
	tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
	tcpsynackrtt(new);

	kfree(lp);

	/* set up proto header template used for every outgoing packet */
	switch (version) {
		case V4:
			h4 = &tcb->protohdr.tcp4hdr;
			memset(h4, 0, sizeof(*h4));
			h4->proto = IP_TCPPROTO;
			hnputs(h4->tcpsport, new->lport);
			hnputs(h4->tcpdport, new->rport);
			v6tov4(h4->tcpsrc, dst);
			v6tov4(h4->tcpdst, src);
			break;
		case V6:
			h6 = &tcb->protohdr.tcp6hdr;
			memset(h6, 0, sizeof(*h6));
			h6->proto = IP_TCPPROTO;
			hnputs(h6->tcpsport, new->lport);
			hnputs(h6->tcpdport, new->rport);
			ipmove(h6->tcpsrc, dst);
			ipmove(h6->tcpdst, src);
			break;
		default:
			panic("tcpincoming: version %d", new->ipversion);
	}

	/* Three-way handshake complete */
	tcpsetstate(new, Established);

	iphtadd(&tpriv->ht, new);

	return new;
}
1893
/* Returns 1 if sequence number x lies in the inclusive range [low, high],
 * treating the range as circular when it wraps (low > high); 0 otherwise. */
int seq_within(uint32_t x, uint32_t low, uint32_t high)
{
	if (low <= high)
		return (low <= x) && (x <= high);
	/* wrapped range: x is either above low or below high */
	return (x >= low) || (x <= high);
}
1905
/* Serial (wraparound-safe) comparison: is x strictly before y? */
int seq_lt(uint32_t x, uint32_t y)
{
	int32_t diff = (int32_t)(x - y);

	return diff < 0;
}
1910
/* Serial (wraparound-safe) comparison: is x at or before y? */
int seq_le(uint32_t x, uint32_t y)
{
	int32_t diff = (int32_t)(x - y);

	return diff <= 0;
}
1915
/* Serial (wraparound-safe) comparison: is x strictly after y? */
int seq_gt(uint32_t x, uint32_t y)
{
	int32_t diff = (int32_t)(x - y);

	return diff > 0;
}
1920
/* Serial (wraparound-safe) comparison: is x at or after y? */
int seq_ge(uint32_t x, uint32_t y)
{
	int32_t diff = (int32_t)(x - y);

	return diff >= 0;
}
1925
/* Returns whichever of x, y is later in (wraparound-safe) sequence space. */
static uint32_t seq_max(uint32_t x, uint32_t y)
{
	/* x wins ties; (int32_t)(x - y) >= 0 is serial "x >= y" */
	return ((int32_t)(x - y) >= 0) ? x : y;
}
1930
/* Returns whichever of x, y is earlier in (wraparound-safe) sequence space. */
static uint32_t seq_min(uint32_t x, uint32_t y)
{
	/* x wins ties; (int32_t)(x - y) <= 0 is serial "x <= y" */
	return ((int32_t)(x - y) <= 0) ? x : y;
}
1935
1936 /*
1937  *  use the time between the first SYN and it's ack as the
1938  *  initial round trip time
1939  */
1940 void tcpsynackrtt(struct conv *s)
1941 {
1942         Tcpctl *tcb;
1943         uint64_t delta;
1944         struct tcppriv *tpriv;
1945
1946         tcb = (Tcpctl *) s->ptcl;
1947         tpriv = s->p->priv;
1948
1949         delta = NOW - tcb->sndsyntime;
1950         tcb->srtt = delta;
1951         tcb->mdev = delta / 2;
1952
1953         /* halt round trip timer */
1954         tcphalt(tpriv, &tcb->rtt_timer);
1955 }
1956
1957 /* For LFNs (long/fat), our default tx queue doesn't hold enough data, and TCP
1958  * blocks on the application - even if the app already has the data ready to go.
1959  * We need to hold the sent, unacked data (1x cwnd), plus all the data we might
1960  * send next RTT (1x cwnd).  Note this is called after cwnd was expanded. */
1961 static void adjust_tx_qio_limit(struct conv *s)
1962 {
1963         Tcpctl *tcb = (Tcpctl *) s->ptcl;
1964         size_t ideal_limit = tcb->cwind * 2;
1965
1966         /* This is called for every ACK, and it's not entirely free to update the
1967          * limit (locks, CVs, taps).  Updating in chunks of mss seems reasonable.
1968          * During SS, we'll update this on most ACKs (given each ACK increased the
1969          * cwind by > MSS).
1970          *
1971          * We also don't want a lot of tiny blocks from the user, but the way qio
1972          * works, you can put in as much as you want (Maxatomic) and then get
1973          * flow-controlled. */
1974         if (qgetlimit(s->wq) + tcb->typical_mss < ideal_limit)
1975                 qsetlimit(s->wq, ideal_limit);
1976         /* TODO: we could shrink the qio limit too, if we had a better idea what the
1977          * actual threshold was.  We want the limit to be the 'stable' cwnd * 2. */
1978 }
1979
/* Attempts to merge later sacks into sack 'into' (index in the array).
 * Sacks are kept sorted by left edge; any sack that overlaps or abuts
 * 'into' on the right is absorbed (extending into's right edge as needed)
 * and then removed by sliding the tail of the array left. */
static void merge_sacks_into(Tcpctl *tcb, int into)
{
	struct sack_block *into_sack = &tcb->snd.sacks[into];
	struct sack_block *tcb_sack;
	int shift = 0;	/* number of absorbed sacks to remove */

	for (int i = into + 1; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		/* sorted array: the first gap means nothing further overlaps */
		if (seq_lt(into_sack->right, tcb_sack->left))
			break;
		if (seq_gt(tcb_sack->right, into_sack->right))
			into_sack->right = tcb_sack->right;
		shift++;
	}
	if (shift) {
		/* close the hole left by the absorbed sacks */
		memmove(tcb->snd.sacks + into + 1,
			tcb->snd.sacks + into + 1 + shift,
			sizeof(struct sack_block) * (tcb->snd.nr_sacks - into - 1
						     - shift));
		tcb->snd.nr_sacks -= shift;
	}
}
2003
2004 /* If we update a sack, it means they received a packet (possibly out of order),
2005  * but they have not received earlier packets.  Otherwise, they would do a full
2006  * ACK.
2007  *
2008  * The trick is in knowing whether the reception growing this sack is due to a
2009  * retrans or due to packets from before our last loss event.  The rightmost
2010  * sack tends to grow a lot with packets we sent before the loss.  However,
2011  * intermediate sacks that grow are signs of a loss, since they only grow as a
2012  * result of retrans.
2013  *
2014  * This is only true for the first time through a retrans.  After we've gone
2015  * through a full retrans blast, the sack that hinted at the retrans loss (and
2016  * there could be multiple of them!) will continue to grow.  We could come up
2017  * with some tracking for this, but instead we'll just do a one-time deal.  You
2018  * can recover from one detected sack retrans loss.  After that, you'll have to
2019  * use the RTO.
2020  *
2021  * This won't catch some things, like a sack that grew and merged with the
2022  * rightmost sack.  This also won't work if you have a single sack.  We can't
2023  * tell where the retrans ends and the sending begins. */
2024 static bool sack_hints_at_loss(Tcpctl *tcb, struct sack_block *tcb_sack)
2025 {
2026         if (tcb->snd.recovery != SACK_RETRANS_RECOVERY)
2027                 return FALSE;
2028         return &tcb->snd.sacks[tcb->snd.nr_sacks - 1] != tcb_sack;
2029 }
2030
2031 static bool sack_contains(struct sack_block *tcb_sack, uint32_t seq)
2032 {
2033         return seq_le(tcb_sack->left, seq) && seq_lt(seq, tcb_sack->right);
2034 }
2035
2036 /* Debugging helper! */
2037 static void sack_asserter(Tcpctl *tcb, char *str)
2038 {
2039         struct sack_block *tcb_sack;
2040
2041         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
2042                 tcb_sack = &tcb->snd.sacks[i];
2043                 /* Checking invariants: snd.rtx is never inside a sack, sacks are always
2044                  * mutually exclusive. */
2045                 if (sack_contains(tcb_sack, tcb->snd.rtx) ||
2046                     ((i + 1 < tcb->snd.nr_sacks) && seq_ge(tcb_sack->right,
2047                                                                (tcb_sack + 1)->left))) {
2048                         printk("SACK ASSERT ERROR at %s\n", str);
2049                         printk("rtx %u una %u nxt %u, sack [%u, %u)\n",
2050                                tcb->snd.rtx, tcb->snd.una, tcb->snd.nxt, tcb_sack->left,
2051                                    tcb_sack->right);
2052                         for (int i = 0; i < tcb->snd.nr_sacks; i++)
2053                                 printk("\t %d: [%u, %u)\n", i, tcb->snd.sacks[i].left,
2054                                        tcb->snd.sacks[i].right);
2055                         backtrace();
2056                         panic("");
2057                 }
2058         }
2059 }
2060
/* Updates bookkeeping whenever a sack is added or updated.
 *
 * Two jobs: keep snd.rtx out of sacked regions (no point retransmitting
 * data the peer already holds), and watch for intermediate-sack growth,
 * which hints that a retransmission itself was lost. */
static void sack_has_changed(struct conv *s, Tcpctl *tcb,
                             struct sack_block *tcb_sack)
{
	/* Due to the change, snd.rtx might be in the middle of this sack.  Advance
	 * it to the right edge. */
	if (sack_contains(tcb_sack, tcb->snd.rtx))
		tcb->snd.rtx = tcb_sack->right;

	/* This is a sack for something we retransed and we think it means there was
	 * another loss.  Instead of waiting for the RTO, we can take action. */
	if (sack_hints_at_loss(tcb, tcb_sack)) {
		/* Same threshold as classic dupack-based loss detection */
		if (++tcb->snd.sack_loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: sack rxmit loss: snd.rtx %u, sack [%u,%u), una %u, recovery_pt %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.rtx, tcb_sack->left, tcb_sack->right, tcb->snd.una,
				   tcb->snd.recovery_pt);
			/* Redo retrans, but keep the sacks and recovery point */
			tcp_loss_event(s, tcb);
			tcb->snd.rtx = tcb->snd.una;
			tcb->snd.sack_loss_hint = 0;
			/* Act like an RTO.  We just detected it earlier.  This prevents us
			 * from getting another sack hint loss this recovery period and from
			 * advancing the opportunistic right edge. */
			tcb->snd.recovery = RTO_RETRANS_RECOVERY;
			/* We didn't actually time out yet and we expect to keep getting
			 * sacks, so we don't want to flush or worry about in_flight.  If we
			 * messed something up, the RTO will still fire. */
			set_in_flight(tcb);
		}
	}
}
2094
2095 /* Advances tcb_sack's right edge, if new_right is farther, and updates the
2096  * bookkeeping due to the change. */
2097 static void update_right_edge(struct conv *s, Tcpctl *tcb,
2098                               struct sack_block *tcb_sack, uint32_t new_right)
2099 {
2100         if (seq_le(new_right, tcb_sack->right))
2101                 return;
2102         tcb_sack->right = new_right;
2103         merge_sacks_into(tcb, tcb_sack - tcb->snd.sacks);
2104         sack_has_changed(s, tcb, tcb_sack);
2105 }
2106
/* Folds one sack from an incoming segment into the TCB's sorted sack array:
 * extend an overlapping/adjacent sack, update one with the same left edge,
 * insert into the proper slot, or append.  When the array is full, the
 * rightmost sack is sacrificed/replaced so we always track the max sack. */
static void update_or_insert_sack(struct conv *s, Tcpctl *tcb,
                                  struct sack_block *seg_sack)
{
	struct sack_block *tcb_sack;

	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb_sack->left, seg_sack->left)) {
			/* This includes adjacent (which I've seen!) and overlap. */
			if (seq_le(seg_sack->left, tcb_sack->right)) {
				update_right_edge(s, tcb, tcb_sack, seg_sack->right);
				return;
			}
			continue;
		}
		/* Update existing sack */
		if (tcb_sack->left == seg_sack->left) {
			update_right_edge(s, tcb, tcb_sack, seg_sack->right);
			return;
		}
		/* Found our slot */
		if (seq_gt(tcb_sack->left, seg_sack->left)) {
			if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
				/* Out of room, but it is possible this sack overlaps later
				 * sacks, including the max sack's right edge. */
				if (seq_ge(seg_sack->right, tcb_sack->left)) {
					/* Take over the sack */
					tcb_sack->left = seg_sack->left;
					update_right_edge(s, tcb, tcb_sack, seg_sack->right);
				}
				return;
			}
			/* O/W, it's our slot and we have room (at least one spot). */
			memmove(&tcb->snd.sacks[i + 1], &tcb->snd.sacks[i],
				sizeof(struct sack_block) * (tcb->snd.nr_sacks - i));
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			tcb->snd.nr_sacks++;
			/* the inserted sack may overlap the ones shifted right */
			merge_sacks_into(tcb, i);
			sack_has_changed(s, tcb, tcb_sack);
			return;
		}
	}
	/* seg_sack is to the right of everything we track */
	if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
		/* We didn't find space in the sack array. */
		tcb_sack = &tcb->snd.sacks[MAX_NR_SND_SACKS - 1];
		/* Need to always maintain the rightmost sack, discarding the prev */
		if (seq_gt(seg_sack->right, tcb_sack->right)) {
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			sack_has_changed(s, tcb, tcb_sack);
		}
		return;
	}
	/* Append as the new rightmost sack */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks];
	tcb->snd.nr_sacks++;
	tcb_sack->left = seg_sack->left;
	tcb_sack->right = seg_sack->right;
	sack_has_changed(s, tcb, tcb_sack);
}
2167
/* Given the packet seg, track the sacks in TCB.  There are a few things: if seg
 * acks new data, some sacks might no longer be needed.  Some sacks might grow,
 * we might add new sacks, either of which can cause a merger.
 *
 * The important thing is that we always have the max sack entry: it must be
 * inserted for sure and findable.  We need that for our measurement of what
 * packets are in the network.
 *
 * Note that we keep sacks that are below snd.rtx (and above
 * seg.ack/tcb->snd.una) as best we can - we don't prune them.  We'll need those
 * for the in_flight estimate.
 *
 * When we run out of room, we'll have to throw away a sack.  Anything we throw
 * away below snd.rtx will be counted as 'in flight', even though it isn't.  If
 * we throw away something greater than snd.rtx, we'll also retrans it.  For
 * simplicity, we throw-away / replace the rightmost sack, since we're always
 * maintaining a highest sack. */
static void update_sacks(struct conv *s, Tcpctl *tcb, Tcp *seg)
{
	int prune = 0;
	struct sack_block *tcb_sack;

	/* First pass: drop sacks the cumulative ack has made obsolete */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		/* For the equality case, if they acked up to, but not including an old
		 * sack, they must have reneged it.  Otherwise they would have acked
		 * beyond the sack. */
		if (seq_lt(seg->ack, tcb_sack->left))
			break;
		prune++;
	}
	if (prune) {
		memmove(tcb->snd.sacks, tcb->snd.sacks + prune,
			sizeof(struct sack_block) * (tcb->snd.nr_sacks - prune));
		tcb->snd.nr_sacks -= prune;
	}
	/* Second pass: fold in the segment's sacks, skipping bogus ones */
	for (int i = 0; i < seg->nr_sacks; i++) {
		/* old sacks */
		if (seq_lt(seg->sacks[i].left, seg->ack))
			continue;
		/* buggy sack: out of range */
		if (seq_gt(seg->sacks[i].right, tcb->snd.nxt))
			continue;
		update_or_insert_sack(s, tcb, &seg->sacks[i]);
	}
}
2214
/* This is a little bit of an under estimate, since we assume a packet is lost
 * once we have any sacks above it.  Overall, it's at most 2 * MSS of an
 * overestimate.
 *
 * If we have no sacks (either reneged or never used) we'll assume all packets
 * above snd.rtx are lost.  This will be the case for sackless fast rxmit
 * (Dong's stuff) or for a timeout.  In the former case, this is probably not
 * true, and in_flight should be higher, but we have no knowledge without the
 * sacks.
 *
 * Recomputes tcb->snd.in_flight: bytes we believe are currently in the
 * network, i.e. sent but neither cumulatively acked nor sacked. */
static void set_in_flight(Tcpctl *tcb)
{
	struct sack_block *tcb_sack;
	uint32_t in_flight = 0;
	uint32_t from;

	if (!tcb->snd.nr_sacks) {
		/* sackless: only the retransmitted window [una, rtx) counts */
		tcb->snd.in_flight = tcb->snd.rtx - tcb->snd.una;
		return;
	}

	/* Everything to the right of the unsacked */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
	in_flight += tcb->snd.nxt - tcb_sack->right;

	/* Everything retransed (from una to snd.rtx, minus sacked regions.  Note
	 * we only retrans at most the last sack's left edge.  snd.rtx will be
	 * advanced to the right edge of some sack (possibly the last one). */
	from = tcb->snd.una;
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_ge(tcb_sack->left, tcb->snd.rtx))
			break;
		assert(seq_ge(tcb->snd.rtx, tcb_sack->right));
		/* count the unsacked gap before this sack; the sacked bytes
		 * themselves are not in flight */
		in_flight += tcb_sack->left - from;
		from = tcb_sack->right;
	}
	in_flight += tcb->snd.rtx - from;

	tcb->snd.in_flight = in_flight;
}
2255
2256 static void reset_recovery(struct conv *s, Tcpctl *tcb)
2257 {
2258         netlog(s->p->f, Logtcprxmt,
2259                "%I.%d -> %I.%d: recovery complete, una %u, rtx %u, nxt %u, recovery %u\n",
2260                s->laddr, s->lport, s->raddr, s->rport,
2261                tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, tcb->snd.recovery_pt);
2262         tcb->snd.recovery = 0;
2263         tcb->snd.recovery_pt = 0;
2264         tcb->snd.loss_hint = 0;
2265         tcb->snd.flush_sacks = FALSE;
2266         tcb->snd.sack_loss_hint = 0;
2267 }
2268
2269 static bool is_dup_ack(Tcpctl *tcb, Tcp *seg)
2270 {
2271         /* this is a pure ack w/o window update */
2272         return (seg->ack == tcb->snd.una) &&
2273                (tcb->snd.una != tcb->snd.nxt) &&
2274                (seg->len == 0) &&
2275                (seg->wnd == tcb->snd.wnd);
2276 }
2277
2278 /* If we have sacks, we'll ignore dupacks and look at the sacks ahead of una
2279  * (which are managed by the TCB).  The tcb will not have old sacks (below
2280  * ack/snd.rtx).  Receivers often send sacks below their ack point when we are
2281  * coming out of a loss, and we don't want those to count.
2282  *
2283  * Note the tcb could have sacks (in the future), but the receiver stopped using
2284  * them (reneged).  We'll catch that with the RTO.  If we try to catch it here,
2285  * we could get in a state where we never allow them to renege. */
2286 static bool is_potential_loss(Tcpctl *tcb, Tcp *seg)
2287 {
2288         if (seg->nr_sacks > 0)
2289                 return tcb->snd.nr_sacks > 0;
2290         else
2291                 return is_dup_ack(tcb, seg);
2292 }
2293
2294 /* When we use timestamps for RTTM, RFC 7323 suggests scaling by
2295  * expected_samples (per cwnd).  They say:
2296  *
2297  * ExpectedSamples = ceiling(FlightSize / (SMSS * 2))
2298  *
2299  * However, SMMS * 2 is really "number of bytes expected to be acked in a
2300  * packet.".  We'll use 'acked' to approximate that.  When the receiver uses
2301  * LRO, they'll send back large ACKs, which decreases the number of samples.
2302  *
2303  * If it turns out that all the divides are bad, we can just go back to not
2304  * using expected_samples at all. */
2305 static int expected_samples_ts(Tcpctl *tcb, uint32_t acked)
2306 {
2307         assert(acked);
2308         return MAX(DIV_ROUND_UP(tcb->snd.nxt - tcb->snd.una, acked), 1);
2309 }
2310
/* Updates the RTT, given the currently sampled RTT and the number samples per
 * cwnd.  For non-TS RTTM, that'll be 1.
 *
 * EWMA update of srtt and mdev (Jacobson-style, with the per-cwnd sample
 * scaling of RFC 7323), then re-arms the retransmit timer.  Also clears any
 * exponential backoff, since a fresh sample means the path is alive. */
static void update_rtt(Tcpctl *tcb, int rtt_sample, int expected_samples)
{
	int delta;

	tcb->backoff = 0;
	tcb->backedoff = 0;
	if (tcb->srtt == 0) {
		/* first sample seeds both estimators directly */
		tcb->srtt = rtt_sample;
		tcb->mdev = rtt_sample / 2;
	} else {
		delta = rtt_sample - tcb->srtt;
		/* gain is (1 >> ALPHA_SHIFT) / expected_samples per sample */
		tcb->srtt += (delta >> RTTM_ALPHA_SHIFT) / expected_samples;
		/* clamp: srtt must stay positive (0 means "unmeasured") */
		if (tcb->srtt <= 0)
			tcb->srtt = 1;
		tcb->mdev += ((abs(delta) - tcb->mdev) >> RTTM_BRAVO_SHIFT) /
			     expected_samples;
		if (tcb->mdev <= 0)
			tcb->mdev = 1;
	}
	tcpsettimer(tcb);
}
2334
2335 void update(struct conv *s, Tcp * seg)
2336 {
2337         int rtt;
2338         Tcpctl *tcb;
2339         uint32_t acked, expand;
2340         struct tcppriv *tpriv;
2341
2342         tpriv = s->p->priv;
2343         tcb = (Tcpctl *) s->ptcl;
2344
2345         if (!seq_within(seg->ack, tcb->snd.una, tcb->snd.nxt))
2346                 return;
2347
2348         acked = seg->ack - tcb->snd.una;
2349         tcb->snd.una = seg->ack;
2350         if (seq_gt(seg->ack, tcb->snd.rtx))
2351                 tcb->snd.rtx = seg->ack;
2352
2353         update_sacks(s, tcb, seg);
2354         set_in_flight(tcb);
2355
2356         /* We treat either a dupack or forward SACKs as a hint that there is a loss.
2357          * The RFCs suggest three dupacks before treating it as a loss (alternative
2358          * is reordered packets).  We'll treat three SACKs the same way. */
2359         if (is_potential_loss(tcb, seg) && !tcb->snd.recovery) {
2360                 tcb->snd.loss_hint++;
2361                 if (tcb->snd.loss_hint == TCPREXMTTHRESH) {
2362                         netlog(s->p->f, Logtcprxmt,
2363                                "%I.%d -> %I.%d: loss hint thresh, nr sacks %u, nxt %u, una %u, cwnd %u\n",
2364                                s->laddr, s->lport, s->raddr, s->rport,
2365                                tcb->snd.nr_sacks, tcb->snd.nxt, tcb->snd.una, tcb->cwind);
2366                         tcp_loss_event(s, tcb);
2367                         tcb->snd.recovery_pt = tcb->snd.nxt;
2368                         if (tcb->snd.nr_sacks) {
2369                                 tcb->snd.recovery = SACK_RETRANS_RECOVERY;
2370                                 tcb->snd.flush_sacks = FALSE;
2371                                 tcb->snd.sack_loss_hint = 0;
2372                         } else {
2373                                 tcb->snd.recovery = FAST_RETRANS_RECOVERY;
2374                         }
2375                         tcprxmit(s);
2376                 }
2377         }
2378
2379         /*
2380          *  update window
2381          */
2382         if (seq_gt(seg->ack, tcb->snd.wl2)
2383                 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
2384                 tcb->snd.wnd = seg->wnd;
2385                 tcb->snd.wl2 = seg->ack;
2386         }
2387
2388         if (!acked) {
2389                 /*
2390                  *  don't let us hangup if sending into a closed window and
2391                  *  we're still getting acks
2392                  */
2393                 if (tcb->snd.recovery && (tcb->snd.wnd == 0))
2394                         tcb->backedoff = MAXBACKMS / 4;
2395                 return;
2396         }
2397         /* At this point, they have acked something new. (positive ack, ack > una).
2398          *
2399          * If we hadn't reached the threshold for recovery yet, the positive ACK
2400          * will reset our loss_hint count. */
2401         if (!tcb->snd.recovery)
2402                 tcb->snd.loss_hint = 0;
2403         else if (seq_ge(seg->ack, tcb->snd.recovery_pt))
2404                 reset_recovery(s, tcb);
2405
2406         /* avoid slow start and timers for SYN acks */
2407         if ((tcb->flags & SYNACK) == 0) {
2408                 tcb->flags |= SYNACK;
2409                 acked--;
2410                 tcb->flgcnt--;
2411                 goto done;
2412         }
2413
2414         /* slow start as long as we're not recovering from lost packets */
2415         if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
2416                 if (tcb->cwind < tcb->ssthresh) {
2417                         /* We increase the cwind by every byte we receive.  We want to
2418                          * increase the cwind by one MSS for every MSS that gets ACKed.
2419                          * Note that multiple MSSs can be ACKed in a single ACK.  If we had
2420                          * a remainder of acked / MSS, we'd add just that remainder - not 0
2421                          * or 1 MSS. */
2422                         expand = acked;
2423                 } else {
2424                         /* Every RTT, which consists of CWND bytes, we're supposed to expand
2425                          * by MSS bytes.  The classic algorithm was
2426                          *              expand = (tcb->mss * tcb->mss) / tcb->cwind;
2427                          * which assumes the ACK was for MSS bytes.  Instead, for every
2428                          * 'acked' bytes, we increase the window by acked / CWND (in units
2429                          * of MSS). */
2430                         expand = MAX(acked, tcb->typical_mss) * tcb->typical_mss
2431                                  / tcb->cwind;
2432                 }
2433
2434                 if (tcb->cwind + expand < tcb->cwind)
2435                         expand = tcb->snd.wnd - tcb->cwind;
2436                 if (tcb->cwind + expand > tcb->snd.wnd)
2437                         expand = tcb->snd.wnd - tcb->cwind;
2438                 tcb->cwind += expand;
2439         }
2440         adjust_tx_qio_limit(s);
2441
2442         if (tcb->ts_recent) {
2443                 update_rtt(tcb, abs(milliseconds() - seg->ts_ecr),
2444                            expected_samples_ts(tcb, acked));
2445         } else if (tcb->rtt_timer.state == TcptimerON &&
2446                    seq_ge(seg->ack, tcb->rttseq)) {
2447                 /* Adjust the timers according to the round trip time */
2448                 tcphalt(tpriv, &tcb->rtt_timer);
2449                 if (!tcb->snd.recovery) {
2450                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
2451                         if (rtt == 0)
2452                                 rtt = 1;        /* o/w all close systems will rexmit in 0 time */
2453                         rtt *= MSPTICK;
2454                         update_rtt(tcb, rtt, 1);
2455                 }
2456         }
2457
2458 done:
2459         if (qdiscard(s->wq, acked) < acked) {
2460                 tcb->flgcnt--;
2461                 /* This happened due to another bug where acked was very large
2462                  * (negative), which was interpreted as "hey, one less flag, since they
2463                  * acked one of our flags (like a SYN).  If flgcnt goes negative,
2464                  * get_xmit_segment() will attempt to send out large packets. */
2465                 assert(tcb->flgcnt >= 0);
2466         }
2467
2468         if (seq_gt(seg->ack, tcb->snd.urg))
2469                 tcb->snd.urg = seg->ack;
2470
2471         if (tcb->snd.una != tcb->snd.nxt)
2472                 tcpgo(tpriv, &tcb->timer);
2473         else
2474                 tcphalt(tpriv, &tcb->timer);
2475
2476         tcb->backoff = 0;
2477         tcb->backedoff = 0;
2478 }
2479
2480 static void update_tcb_ts(Tcpctl *tcb, Tcp *seg)
2481 {
2482         /* Get timestamp info from the tcp header.  Even though the timestamps
2483          * aren't sequence numbers, we still need to protect for wraparound.  Though
2484          * if the values were 0, assume that means we need an update.  We could have
2485          * an initial ts_val that appears negative (signed). */
2486         if (!tcb->ts_recent || !tcb->last_ack_sent ||
2487             (seq_ge(seg->ts_val, tcb->ts_recent) &&
2488              seq_le(seg->seq, tcb->last_ack_sent)))
2489                 tcb->ts_recent = seg->ts_val;
2490 }
2491
2492 /* Overlap happens when one sack's left edge is inside another sack. */
2493 static bool sacks_overlap(struct sack_block *x, struct sack_block *y)
2494 {
2495         return (seq_le(x->left, y->left) && seq_le(y->left, x->right)) ||
2496                (seq_le(y->left, x->left) && seq_le(x->left, y->right));
2497 }
2498
2499 static void make_sack_first(Tcpctl *tcb, struct sack_block *tcb_sack)
2500 {
2501         struct sack_block temp;
2502
2503         if (tcb_sack == &tcb->rcv.sacks[0])
2504                 return;
2505         temp = tcb->rcv.sacks[0];
2506         tcb->rcv.sacks[0] = *tcb_sack;
2507         *tcb_sack = temp;
2508 }
2509
2510 /* Track sack in our tcb for a block of data we received.  This handles all the
2511  * stuff: making sure sack is first (since it's the most recent sack change),
2512  * updating or merging sacks, and dropping excess sacks (we only need to
2513  * maintain 3).  Unlike on the snd side, our tcb sacks are *not* sorted. */
2514 static void track_rcv_sack(Tcpctl *tcb, uint32_t left, uint32_t right)
2515 {
2516         struct sack_block *tcb_sack;
2517         struct sack_block sack[1];
2518
2519         if (!tcb->sack_ok)
2520                 return;
2521         assert(seq_lt(left, right));
2522         sack->left = left;
2523         sack->right = right;
2524         /* We can reuse an existing sack if we're merging or overlapping. */
2525         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2526                 tcb_sack = &tcb->rcv.sacks[i];
2527                 if (sacks_overlap(tcb_sack, sack)) {
2528                         tcb_sack->left = seq_min(tcb_sack->left, sack->left);
2529                         tcb_sack->right = seq_max(tcb_sack->right, sack->right);
2530                         make_sack_first(tcb, tcb_sack);
2531                         return;
2532                 }
2533         }
2534         /* We can discard the last sack (right shift) - we should have sent it at
2535          * least once by now.  If not, oh well. */
2536         memmove(tcb->rcv.sacks + 1, tcb->rcv.sacks, sizeof(struct sack_block) *
2537                 MIN(MAX_NR_RCV_SACKS - 1, tcb->rcv.nr_sacks));
2538         tcb->rcv.sacks[0] = *sack;
2539         if (tcb->rcv.nr_sacks < MAX_NR_RCV_SACKS)
2540                 tcb->rcv.nr_sacks++;
2541 }
2542
2543 /* Once we receive everything and move rcv.nxt past a sack, we don't need to
2544  * track it.  I've seen Linux report sacks in the past, but we probably
2545  * shouldn't. */
2546 static void drop_old_rcv_sacks(Tcpctl *tcb)
2547 {
2548         struct sack_block *tcb_sack;
2549
2550         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2551                 tcb_sack = &tcb->rcv.sacks[i];
2552                 /* Moving up to or past the left is enough to drop it. */
2553                 if (seq_ge(tcb->rcv.nxt, tcb_sack->left)) {
2554                         memmove(tcb->rcv.sacks + i, tcb->rcv.sacks + i + 1,
2555                                 sizeof(struct sack_block) * (tcb->rcv.nr_sacks - i - 1));
2556                         tcb->rcv.nr_sacks--;
2557                         i--;
2558                 }
2559         }
2560 }
2561
/* Main TCP input routine: one inbound segment (v4 or v6) arrives in bp.
 * Verifies the checksum, parses the header into 'seg', finds the matching
 * conversation, and runs the segment through the RFC 793 state machine.
 *
 * Locking: the proto-wide tcp->qlock is taken for the listen/limbo lookup,
 * then dropped once the per-conv s->qlock is held.  All error exits after
 * that point must qunlock(&s->qlock) and poperror() (the 'raise' label and
 * the explicit returns below do this).  Consumes bp on every path. */
void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
{
	ERRSTACK(1);
	Tcp seg;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int hdrlen;
	Tcpctl *tcb;
	uint16_t length;
	uint8_t source[IPaddrlen], dest[IPaddrlen];
	struct conv *s;
	struct Fs *f;
	struct tcppriv *tpriv;
	uint8_t version;

	f = tcp->f;
	tpriv = tcp->priv;

	tpriv->stats[InSegs]++;

	/* Both header views alias the same bytes; the version nibble below
	 * decides which one is valid. */
	h4 = (Tcp4hdr *) (bp->rp);
	h6 = (Tcp6hdr *) (bp->rp);

	if ((h4->vihl & 0xF0) == IP_VER4) {
		uint8_t ttl;

		version = V4;
		length = nhgets(h4->length);
		v4tov6(dest, h4->tcpdst);
		v4tov6(source, h4->tcpsrc);

		/* ttl isn't part of the xsum pseudo header, but bypass needs it. */
		ttl = h4->Unused;
		h4->Unused = 0;
		hnputs(h4->tcplen, length - TCP4_PKT);
		/* Skip the software checksum if the NIC already verified it
		 * (Btcpck) or if the sender sent an all-zero checksum. */
		if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
			ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp proto cksum\n");
			freeblist(bp);
			return;
		}
		/* Restore the ttl we stashed around the checksum. */
		h4->Unused = ttl;

		hdrlen = ntohtcp4(&seg, &bp);
		if (hdrlen < 0) {
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp hdr len\n");
			return;
		}

		s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
		if (s && s->state == Bypass) {
			/* Bypass convs hand the raw packet to userspace. */
			bypass_or_drop(s, bp);
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen + TCP4_PKT;
		bp = trimblock(bp, hdrlen + TCP4_PKT, length);
		if (bp == NULL) {
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcp len < 0 after trim\n");
			return;
		}
	} else {
		/* v6: stash ttl/proto, zero them for the pseudo-header
		 * checksum, then restore - same trick as the v4 ttl above. */
		int ttl = h6->ttl;
		int proto = h6->proto;

		version = V6;
		length = nhgets(h6->ploadlen);
		ipmove(dest, h6->tcpdst);
		ipmove(source, h6->tcpsrc);

		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
		h6->ttl = proto;
		hnputl(h6->vcf, length);
		if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
			ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp proto cksum\n");
			freeblist(bp);
			return;
		}
		h6->ttl = ttl;
		h6->proto = proto;
		hnputs(h6->ploadlen, length);

		hdrlen = ntohtcp6(&seg, &bp);
		if (hdrlen < 0) {
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp hdr len\n");
			return;
		}

		s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
		if (s && s->state == Bypass) {
			bypass_or_drop(s, bp);
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen;
		bp = trimblock(bp, hdrlen + TCP6_PKT, length);
		if (bp == NULL) {
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcp len < 0 after trim\n");
			return;
		}
	}

	/* s, the conv matching the n-tuple, was set above */
	if (s == NULL) {
		netlog(f, Logtcpreset, "iphtlook failed: src %I:%u, dst %I:%u\n",
		       source, seg.source, dest, seg.dest);
reset:
		/* Shared reset exit: no (or no longer any) conversation for
		 * this segment, so answer with a RST per RFC 793. */
		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
		freeblist(bp);
		return;
	}

	/* lock protocol for unstate Plan 9 invariants.  funcs like limbo or
	 * incoming might rely on it. */
	qlock(&tcp->qlock);

	/* if it's a listener, look for the right flags and get a new conv */
	tcb = (Tcpctl *) s->ptcl;
	if (tcb->state == Listen) {
		if (seg.flags & RST) {
			/* RST may kill a half-open connection sitting in limbo. */
			limborst(s, &seg, source, dest, version);
			qunlock(&tcp->qlock);
			freeblist(bp);
			return;
		}

		/* if this is a new SYN, put the call into limbo */
		if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
			limbo(s, source, dest, &seg, version);
			qunlock(&tcp->qlock);
			freeblist(bp);
			return;
		}

		/* if there's a matching call in limbo, tcpincoming will return it */
		s = tcpincoming(s, &seg, source, dest, version);
		if (s == NULL) {
			qunlock(&tcp->qlock);
			goto reset;
		}
	}

	/* The rest of the input state machine is run with the control block
	 * locked and implements the state machine directly out of the RFC.
	 * Out-of-band data is ignored - it was always a bad idea.
	 */
	tcb = (Tcpctl *) s->ptcl;
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	qlock(&s->qlock);
	/* Lock handoff: once we hold the conv lock, the proto lock can go. */
	qunlock(&tcp->qlock);

	update_tcb_ts(tcb, &seg);
	/* fix up window */
	seg.wnd <<= tcb->rcv.scale;

	/* every input packet in puts off the keep alive time out */
	tcpsetkacounter(tcb);

	switch (tcb->state) {
		case Closed:
			sndrst(tcp, source, dest, length, &seg, version,
				   "sending to Closed");
			goto raise;
		case Syn_sent:
			if (seg.flags & ACK) {
				/* ACK must cover our SYN: (iss, snd.nxt]. */
				if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
					sndrst(tcp, source, dest, length, &seg, version,
						   "bad seq in Syn_sent");
					goto raise;
				}
			}
			if (seg.flags & RST) {
				/* Only an acceptable (ACKed) RST refuses the
				 * connection; otherwise drop it. */
				if (seg.flags & ACK)
					localclose(s, "connection refused");
				goto raise;
			}

			if (seg.flags & SYN) {
				procsyn(s, &seg);
				if (seg.flags & ACK) {
					update(s, &seg);
					tcpsynackrtt(s);
					tcpsetstate(s, Established);
					/* Here's where we get the results of header option
					 * negotiations for connections we started. (SYNACK has the
					 * response) */
					tcpsetscale(s, tcb, seg.ws, tcb->scale);
					tcb->sack_ok = seg.sack_ok;
				} else {
					/* Simultaneous-open is not supported here;
					 * a bare SYN in Syn_sent gets a RST. */
					sndrst(tcp, source, dest, length, &seg, version,
						   "Got SYN with no ACK");
					goto raise;
				}

				/* SYNACK carrying data or FIN: fall into the
				 * main processing loop below. */
				if (length != 0 || (seg.flags & FIN))
					break;

				freeblist(bp);
				goto output;
			} else
				freeblist(bp);

			qunlock(&s->qlock);
			poperror();
			return;
	}

	/*
	 *  One DOS attack is to open connections to us and then forget about them,
	 *  thereby tying up a conv at no long term cost to the attacker.
	 *  This is an attempt to defeat these stateless DOS attacks.  See
	 *  corresponding code in tcpsendka().
	 */
	if ((seg.flags & RST) == 0) {
		if (tcpporthogdefense
			&& seq_within(seg.ack, tcb->snd.una - (1 << 31),
						  tcb->snd.una - (1 << 29))) {
			printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
				   source, seg.source, dest, seg.dest, seg.flags,
				   tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
			localclose(s, "stateless hog");
		}
	}

	/* Cut the data to fit the receive window */
	if (tcptrim(tcb, &seg, &bp, &length) == -1) {
		/* Nothing of the segment fits in our window; still process the
		 * ACK info and, unless it's a RST, force an ACK so the peer
		 * learns our current rcv.nxt/window. */
		netlog(f, Logtcp, "%I.%d -> %I.%d: tcp len < 0, %lu %d\n",
		       s->raddr, s->rport, s->laddr, s->lport, seg.seq, length);
		update(s, &seg);
		if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
			tcphalt(tpriv, &tcb->rtt_timer);
			tcphalt(tpriv, &tcb->acktimer);
			tcphalt(tpriv, &tcb->katimer);
			tcpsetstate(s, Time_wait);
			tcb->timer.start = MSL2 * (1000 / MSPTICK);
			tcpgo(tpriv, &tcb->timer);
		}
		if (!(seg.flags & RST)) {
			tcb->flags |= FORCE;
			goto output;
		}
		qunlock(&s->qlock);
		poperror();
		return;
	}

	/* Cannot accept so answer with a rst */
	if (length && tcb->state == Closed) {
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
		goto raise;
	}

	/* The segment is beyond the current receive pointer so
	 * queue the data in the resequence queue
	 */
	if (seg.seq != tcb->rcv.nxt)
		if (length != 0 || (seg.flags & (SYN | FIN))) {
			update(s, &seg);
			if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
				printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
					   s->lport);
			/* Out-of-order data: ACK immediately (dup-ack /
			 * SACK info for the peer's loss detection). */
			tcb->flags |= FORCE;
			goto output;
		}

	/*
	 *  keep looping till we've processed this packet plus any
	 *  adjacent packets in the resequence queue
	 */
	for (;;) {
		if (seg.flags & RST) {
			if (tcb->state == Established) {
				tpriv->stats[EstabResets]++;
				if (tcb->rcv.nxt != seg.seq)
					printd
						("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
						 s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
						 seg.seq);
			}
			localclose(s, "connection refused");
			goto raise;
		}

		/* Past Syn_sent, every valid segment carries an ACK. */
		if ((seg.flags & ACK) == 0)
			goto raise;

		/* Per-state ACK processing (RFC 793 SEGMENT ARRIVES). */
		switch (tcb->state) {
			case Established:
			case Close_wait:
				update(s, &seg);
				break;
			case Finwait1:
				update(s, &seg);
				/* Our FIN is fully ACKed once nothing is left
				 * unsent/unacked (no queued data or flags). */
				if (qlen(s->wq) + tcb->flgcnt == 0) {
					tcphalt(tpriv, &tcb->rtt_timer);
					tcphalt(tpriv, &tcb->acktimer);
					tcpsetkacounter(tcb);
					tcb->time = NOW;
					tcpsetstate(s, Finwait2);
					tcb->katimer.start = MSL2 * (1000 / MSPTICK);
					tcpgo(tpriv, &tcb->katimer);
				}
				break;
			case Finwait2:
				update(s, &seg);
				break;
			case Closing:
				update(s, &seg);
				if (qlen(s->wq) + tcb->flgcnt == 0) {
					tcphalt(tpriv, &tcb->rtt_timer);
					tcphalt(tpriv, &tcb->acktimer);
					tcphalt(tpriv, &tcb->katimer);
					tcpsetstate(s, Time_wait);
					tcb->timer.start = MSL2 * (1000 / MSPTICK);
					tcpgo(tpriv, &tcb->timer);
				}
				break;
			case Last_ack:
				update(s, &seg);
				if (qlen(s->wq) + tcb->flgcnt == 0) {
					localclose(s, NULL);
					goto raise;
				}
				/* fallthrough */
			case Time_wait:
				/* Re-ACK anything that pokes us in Time_wait. */
				tcb->flags |= FORCE;
				if (tcb->timer.state != TcptimerON)
					tcpgo(tpriv, &tcb->timer);
		}

		/* Urgent data: track the urgent pointer but strip the urgent
		 * bytes from the stream (out-of-band data is ignored). */
		if ((seg.flags & URG) && seg.urg) {
			if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
				tcb->rcv.urg = seg.urg + seg.seq;
				pullblock(&bp, seg.urg);
			}
		} else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
			tcb->rcv.urg = tcb->rcv.nxt;

		if (length == 0) {
			if (bp != NULL)
				freeblist(bp);
		} else {
			switch (tcb->state) {
				default:
					/* Ignore segment text */
					if (bp != NULL)
						freeblist(bp);
					break;

				case Established:
				case Finwait1:
					/* If we still have some data place on
					 * receive queue
					 */
					if (bp) {
						bp = packblock(bp);
						if (bp == NULL)
							panic("tcp packblock");
						qpassnolim(s->rq, bp);
						bp = NULL;

						/*
						 *  Force an ack every 2 data messages.  This is
						 *  a hack for rob to make his home system run
						 *  faster.
						 *
						 *  this also keeps the standard TCP congestion
						 *  control working since it needs an ack every
						 *  2 max segs worth.  This is not quite that,
						 *  but under a real stream is equivalent since
						 *  every packet has a max seg in it.
						 *
						 *  NOTE(review): rcv.una is used here as a
						 *  count of un-acked data messages, not a
						 *  sequence number - confirm against the
						 *  Tcpctl declaration.
						 */
						if (++(tcb->rcv.una) >= 2)
							tcb->flags |= FORCE;
					}
					tcb->rcv.nxt += length;
					/* rcv.nxt moved; retire any sacks it
					 * now covers. */
					drop_old_rcv_sacks(tcb);

					/*
					 *  update our rcv window
					 */
					tcprcvwin(s);

					/*
					 *  turn on the acktimer if there's something
					 *  to ack
					 */
					if (tcb->acktimer.state != TcptimerON)
						tcpgo(tpriv, &tcb->acktimer);

					break;
				case Finwait2:
					/* no process to read the data, send a reset */
					if (bp != NULL)
						freeblist(bp);
					sndrst(tcp, source, dest, length, &seg, version,
						   "send to Finwait2");
					qunlock(&s->qlock);
					poperror();
					return;
			}
		}

		/* FIN consumes one sequence number and moves the state
		 * machine toward the close states. */
		if (seg.flags & FIN) {
			tcb->flags |= FORCE;

			switch (tcb->state) {
				case Established:
					tcb->rcv.nxt++;
					tcpsetstate(s, Close_wait);
					break;
				case Finwait1:
					tcb->rcv.nxt++;
					if (qlen(s->wq) + tcb->flgcnt == 0) {
						/* Our FIN was ACKed and theirs
						 * arrived: straight to Time_wait. */
						tcphalt(tpriv, &tcb->rtt_timer);
						tcphalt(tpriv, &tcb->acktimer);
						tcphalt(tpriv, &tcb->katimer);
						tcpsetstate(s, Time_wait);
						tcb->timer.start = MSL2 * (1000 / MSPTICK);
						tcpgo(tpriv, &tcb->timer);
					} else
						tcpsetstate(s, Closing);
					break;
				case Finwait2:
					tcb->rcv.nxt++;
					tcphalt(tpriv, &tcb->rtt_timer);
					tcphalt(tpriv, &tcb->acktimer);
					tcphalt(tpriv, &tcb->katimer);
					tcpsetstate(s, Time_wait);
					tcb->timer.start = MSL2 * (1000 / MSPTICK);
					tcpgo(tpriv, &tcb->timer);
					break;
				case Close_wait:
				case Closing:
				case Last_ack:
					/* Duplicate FIN - already counted. */
					break;
				case Time_wait:
					/* Peer retransmitted its FIN: restart
					 * the 2MSL timer. */
					tcpgo(tpriv, &tcb->timer);
					break;
			}
		}

		/*
		 *  get next adjacent segment from the resequence queue.
		 *  dump/trim any overlapping segments
		 */
		for (;;) {
			if (tcb->reseq == NULL)
				goto output;

			if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
				goto output;

			getreseq(tcb, &seg, &bp, &length);

			if (tcptrim(tcb, &seg, &bp, &length) == 0)
				break;
		}
	}
output:
	/* Send whatever the state machine queued (ACKs, FINs, data). */
	tcpoutput(s);
	qunlock(&s->qlock);
	poperror();
	return;
raise:
	/* Error exit: drop the segment and kick the conv's output side. */
	qunlock(&s->qlock);
	poperror();
	freeblist(bp);
	tcpkick(s);
}
3049
3050 /* The advertised mss = data + TCP headers */
3051 static uint16_t derive_payload_mss(Tcpctl *tcb)
3052 {
3053         uint16_t payload_mss = tcb->mss;
3054         uint16_t opt_size = 0;
3055
3056         if (tcb->ts_recent) {
3057                 opt_size += TS_LENGTH;
3058                 /* Note that when we're a SYN, we overestimate slightly.  This is safe,
3059                  * and not really a problem. */
3060                 opt_size += TS_SEND_PREPAD;
3061         }
3062         if (tcb->rcv.nr_sacks)
3063                 opt_size += 2 + tcb->rcv.nr_sacks * 8;
3064         opt_size = ROUNDUP(opt_size, 4);
3065         payload_mss -= opt_size;
3066         return payload_mss;
3067 }
3068
3069 /* Decreases the xmit amt, given the MSS / TSO. */
3070 static uint32_t throttle_for_mss(Tcpctl *tcb, uint32_t ssize,
3071                                  uint16_t payload_mss, bool retrans)
3072 {
3073         if (ssize > payload_mss) {
3074                 if ((tcb->flags & TSO) == 0) {
3075                         ssize = payload_mss;
3076                 } else {
3077                         /* Don't send too much.  32K is arbitrary.. */
3078                         if (ssize > 32 * 1024)
3079                                 ssize = 32 * 1024;
3080                         if (!retrans) {
3081                                 /* Clamp xmit to an integral MSS to avoid ragged tail segments
3082                                  * causing poor link utilization. */
3083                                 ssize = ROUNDDOWN(ssize, payload_mss);
3084                         }
3085                 }
3086         }
3087         return ssize;
3088 }
3089
3090 /* Reduces ssize for a variety of reasons.  Returns FALSE if we should abort
3091  * sending the packet.  o/w returns TRUE and modifies ssize by reference. */
3092 static bool throttle_ssize(struct conv *s, Tcpctl *tcb, uint32_t *ssize_p,
3093                            uint16_t payload_mss, bool retrans)
3094 {
3095         struct Fs *f = s->p->f;
3096         uint32_t usable;
3097         uint32_t ssize = *ssize_p;
3098
3099         /* Compute usable segment based on offered window and limit
3100          * window probes to one */
3101         if (tcb->snd.wnd == 0) {
3102                 if (tcb->snd.in_flight != 0) {
3103                         if ((tcb->flags & FORCE) == 0)
3104                                 return FALSE;
3105                 }
3106                 usable = 1;
3107         } else {
3108                 usable = tcb->cwind;
3109                 if (tcb->snd.wnd < usable)
3110                         usable = tcb->snd.wnd;
3111                 if (usable > tcb->snd.in_flight)
3112                         usable -= tcb->snd.in_flight;
3113                 else
3114                         usable = 0;
3115                 /* Avoid Silly Window Syndrome.  This is a little different thant RFC
3116                  * 813.  I took their additional enhancement of "< MSS" as an AND, not
3117                  * an OR.  25% of a large snd.wnd is pretty large, and our main goal is
3118                  * to avoid packets smaller than MSS.  I still use the 25% threshold,
3119                  * because it is important that there is *some* data in_flight.  If
3120                  * usable < MSS because snd.wnd is very small (but not 0), we might
3121                  * never get an ACK and would need to set up a timer.
3122                  *
3123                  * Also, I'm using 'ssize' as a proxy for a PSH point.  If there's just
3124                  * a small blob in the qio (or retrans!), then we might as well just
3125                  * send it. */
3126                 if ((usable < tcb->typical_mss) && (usable < tcb->snd.wnd >> 2)
3127                     && (usable < ssize)) {
3128                         return FALSE;
3129                 }
3130         }
3131         if (ssize && usable < 2)
3132                 netlog(s->p->f, Logtcpverbose,
3133                        "%I.%d -> %I.%d: throttled snd.wnd %lu cwind %lu\n",
3134                        s->laddr, s->lport, s->raddr, s->rport,
3135                        tcb->snd.wnd, tcb->cwind);
3136         if (usable < ssize)
3137                 ssize = usable;
3138
3139         ssize = throttle_for_mss(tcb, ssize, payload_mss, retrans);
3140
3141         *ssize_p = ssize;
3142         return TRUE;
3143 }
3144
/* Helper, picks the next segment to send, which is possibly a retransmission.
 * Returns TRUE if we have a segment, FALSE o/w.  Returns ssize, from_seq, and
 * sent by reference.
 *
 * from_seq is the seq number we are transmitting from.
 *
 * sent includes all seq from una to from_seq *including* any previously sent
 * flags (part of tcb->flgcnt), for instance an unacknowledged SYN (which counts
 * as a seq number).  Those flags are in the e.g. snd.nxt - snd.una range, and
 * they get dropped after qdiscard.
 *
 * ssize is the amount of data we are sending, starting from from_seq, and it
 * will include any *new* flags, which haven't been accounted for yet.
 *
 * tcb->flgcnt consists of the flags both in ssize and in sent.
 *
 * Note that we could be in recovery and not sack_retrans a segment. */
static bool get_xmit_segment(struct conv *s, Tcpctl *tcb, uint16_t payload_mss,
                             uint32_t *from_seq_p, uint32_t *sent_p,
                             uint32_t *ssize_p)
{
	struct Fs *f = s->p->f;
	struct tcppriv *tpriv = s->p->priv;
	uint32_t ssize, sent, from_seq;
	bool sack_retrans = FALSE;
	struct sack_block *tcb_sack = 0;

	/* First preference: fill the lowest sack hole.  snd.rtx sitting below a
	 * sack's left edge means the receiver is missing [rtx, left). */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb->snd.rtx, tcb_sack->left)) {
			/* So ssize is supposed to include any *new* flags to flgcnt, which
			 * at this point would be a FIN.
			 *
			 * It might be possible that flgcnt is incremented so we send a FIN,
			 * even for an intermediate sack retrans.  Perhaps the user closed
			 * the conv.
			 *
			 * However, the way the "flgcnt for FIN" works is that it inflates
			 * the desired amount we'd like to send (qlen + flgcnt).
			 * Eventually, we reach the end of the queue and fail to extract all
			 * of dsize.  At that point, we put on the FIN, and that's where the
			 * extra 'byte' comes from.
			 *
			 * For sack retrans, since we're extracting from parts of the qio
			 * that aren't the right-most edge, we don't need to consider flgcnt
			 * when setting ssize. */
			from_seq = tcb->snd.rtx;
			sent = from_seq - tcb->snd.una;
			ssize = tcb_sack->left - from_seq;
			sack_retrans = TRUE;
			break;
		}
	}
	/* SACK holes have first dibs, but we can still opportunistically send new
	 * data.
	 *
	 * During other types of recovery, we'll just send from the retrans point.
	 * If we're in an RTO while we still have sacks, we could be resending data
	 * that wasn't lost.  Consider a sack that is still growing (usually the
	 * right-most), but we haven't received the ACK yet.  rxt may be included in
	 * that area.  Given we had two losses or otherwise timed out, I'm not too
	 * concerned.
	 *
	 * Note that Fast and RTO can send data beyond nxt.  If we change that,
	 * change the accounting below. */
	if (!sack_retrans) {
		switch (tcb->snd.recovery) {
		default:
		case SACK_RETRANS_RECOVERY:
			from_seq = tcb->snd.nxt;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			from_seq = tcb->snd.rtx;
			break;
		}
		sent = from_seq - tcb->snd.una;
		/* qlen + flgcnt is every seq we want to have sent, including unack'd
		 * data, unacked flags, and new flags. */
		ssize = qlen(s->wq) + tcb->flgcnt - sent;
	}

	/* May shrink ssize (window, cwnd, SWS avoidance, MSS/TSO). */
	if (!throttle_ssize(s, tcb, &ssize, payload_mss, sack_retrans))
		return FALSE;

	/* This counts flags, which is a little hokey, but it's okay since in_flight
	 * gets reset on each ACK */
	tcb->snd.in_flight += ssize;
	/* Log and track rxmit.  This covers both SACK (retrans) and fast rxmit. */
	if (ssize && seq_lt(tcb->snd.rtx, tcb->snd.nxt)) {
		netlog(f, Logtcpverbose,
		       "%I.%d -> %I.%d: rxmit: rtx %u amt %u, nxt %u\n",
		       s->laddr, s->lport, s->raddr, s->rport,
		       tcb->snd.rtx, MIN(tcb->snd.nxt - tcb->snd.rtx, ssize),
		       tcb->snd.nxt);
		tpriv->stats[RetransSegs]++;
	}
	if (sack_retrans) {
		/* If we'll send up to the left edge, advance snd.rtx to the right.
		 *
		 * This includes the largest sack.  It might get removed later, in which
		 * case we'll underestimate the amount in-flight.  The alternative is to
		 * not count the rightmost sack, but when it gets removed, we'll retrans
		 * it anyway.  No matter what, we'd count it. */
		tcb->snd.rtx += ssize;
		if (tcb->snd.rtx == tcb_sack->left)
			tcb->snd.rtx = tcb_sack->right;
		/* RFC 6675 says we MAY rearm the RTO timer on each retrans, since we
		 * might not be getting ACKs for a while. */
		tcpsettimer(tcb);
	} else {
		switch (tcb->snd.recovery) {
		default:
			/* under normal op, we drag rtx along with nxt.  this prevents us
			 * from sending sacks too early (up above), since rtx doesn't get
			 * reset to una until we have a loss (e.g. 3 dupacks/sacks). */
			tcb->snd.nxt += ssize;
			tcb->snd.rtx = tcb->snd.nxt;
			break;
		case SACK_RETRANS_RECOVERY:
			/* We explicitly do not want to increase rtx here.  We might still
			 * need it to fill in a sack gap below nxt if we get new, higher
			 * sacks. */
			tcb->snd.nxt += ssize;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			tcb->snd.rtx += ssize;
			/* Fast and RTO can send new data, advancing nxt. */
			if (seq_gt(tcb->snd.rtx, tcb->snd.nxt))
				tcb->snd.nxt = tcb->snd.rtx;
			break;
		}
	}
	*from_seq_p = from_seq;
	*sent_p = sent;
	*ssize_p = ssize;

	return TRUE;
}
3285
3286 /*
3287  *  always enters and exits with the s locked.  We drop
3288  *  the lock to ipoput the packet so some care has to be
3289  *  taken by callers.
3290  */
3291 void tcpoutput(struct conv *s)
3292 {
3293         Tcp seg;
3294         int msgs;
3295         int next_yield = 1;
3296         Tcpctl *tcb;
3297         struct block *hbp, *bp;
3298         uint32_t ssize, dsize, sent, from_seq;
3299         struct Fs *f;
3300         struct tcppriv *tpriv;
3301         uint8_t version;
3302         uint16_t payload_mss;
3303
3304         f = s->p->f;
3305         tpriv = s->p->priv;
3306         version = s->ipversion;
3307
3308         for (msgs = 0; msgs < 100; msgs++) {
3309                 tcb = (Tcpctl *) s->ptcl;
3310
3311                 switch (tcb->state) {
3312                         case Listen:
3313                         case Closed:
3314                         case Finwait2:
3315                                 return;
3316                 }
3317
3318                 /* force an ack when a window has opened up */
3319                 if (tcb->rcv.blocked && tcb->rcv.wnd >= tcb->mss) {
3320                         tcb->rcv.blocked = 0;
3321                         tcb->flags |= FORCE;
3322                 }
3323
3324                 /* Don't send anything else until our SYN has been acked */
3325                 if (tcb->snd.nxt != tcb->iss && (tcb->flags & SYNACK) == 0)
3326                         break;
3327
3328                 /* payload_mss is the actual amount of data in the packet, which is the
3329                  * advertised (mss - header opts).  This varies from packet to packet,
3330                  * based on the options that might be present (e.g. always timestamps,
3331                  * sometimes SACKs) */
3332                 payload_mss = derive_payload_mss(tcb);
3333
3334                 if (!get_xmit_segment(s, tcb, payload_mss, &from_seq, &sent, &ssize))
3335                         break;
3336
3337                 dsize = ssize;
3338                 seg.urg = 0;
3339
3340                 if (ssize == 0)
3341                         if ((tcb->flags & FORCE) == 0)
3342                                 break;
3343
3344                 tcb->flags &= ~FORCE;
3345                 tcprcvwin(s);
3346
3347                 /* By default we will generate an ack, so we can normally turn off the
3348                  * timer.  If we're blocked, we'll want the timer so we can send a
3349                  * window update. */
3350                 if (!tcb->rcv.blocked)
3351                         tcphalt(tpriv, &tcb->acktimer);
3352                 tcb->rcv.una = 0;
3353                 seg.source = s->lport;
3354                 seg.dest = s->rport;
3355                 seg.flags = ACK;
3356                 seg.mss = 0;
3357                 seg.ws = 0;
3358                 seg.sack_ok = FALSE;
3359                 seg.nr_sacks = 0;
3360                 /* When outputting, Syn_sent means "send the Syn", for connections we
3361                  * initiate.  SYNACKs are sent from sndsynack directly. */
3362                 if (tcb->state == Syn_sent) {
3363                         seg.flags = 0;
3364                         seg.sack_ok = SACK_SUPPORTED;   /* here's where we advertise SACK */
3365                         if (tcb->snd.nxt - ssize == tcb->iss) {
3366                                 seg.flags |= SYN;
3367                                 dsize--;
3368                                 seg.mss = tcb->mss;
3369                                 seg.ws = tcb->scale;
3370                         } else {
3371                                 /* TODO: Not sure why we'd get here. */
3372                                 warn("TCP: weird Syn_sent state, tell someone you saw this");
3373                         }
3374                 }
3375                 seg.seq = from_seq;
3376                 seg.ack = tcb->rcv.nxt;
3377                 tcb->last_ack_sent = seg.ack;
3378                 seg.wnd = tcb->rcv.wnd;
3379                 seg.ts_val = tcb->ts_recent;
3380
3381                 /* Pull out data to send */
3382                 bp = NULL;
3383                 if (dsize != 0) {
3384                         bp = qcopy(s->wq, dsize, sent);
3385                         if (BLEN(bp) != dsize) {
3386                                 /* Here's where the flgcnt kicked in.  Note dsize is
3387                                  * decremented, but ssize isn't.  Not that we use ssize for much
3388                                  * anymore.  Decrementing dsize prevents us from sending a PSH
3389                                  * with the FIN. */
3390                                 seg.flags |= FIN;
3391                                 dsize--;
3392                         }
3393                         if (BLEN(bp) > payload_mss) {
3394                                 bp->flag |= Btso;
3395                                 bp->mss = payload_mss;
3396                         }
3397                 }
3398
3399                 if (sent + dsize == qlen(s->wq) + tcb->flgcnt)
3400                         seg.flags |= PSH;
3401
3402                 /* Build header, link data and compute cksum */
3403                 switch (version) {
3404                         case V4:
3405                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3406                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
3407                                 if (hbp == NULL) {
3408                                         freeblist(bp);
3409                                         return;
3410                                 }
3411                                 break;
3412                         case V6:
3413                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3414                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
3415                                 if (hbp == NULL) {
3416                                         freeblist(bp);
3417                                         return;
3418                                 }
3419                                 break;
3420                         default:
3421                                 hbp = NULL;     /* to suppress a warning */
3422                                 panic("tcpoutput: version %d", version);
3423                 }
3424
3425                 /* Start the transmission timers if there is new data and we
3426                  * expect acknowledges
3427                  */
3428                 if (ssize != 0) {
3429                         if (tcb->timer.state != TcptimerON)
3430                                 tcpgo(tpriv, &tcb->timer);
3431
3432                         if (!tcb->ts_recent && (tcb->rtt_timer.state != TcptimerON)) {
3433                                 /* If round trip timer isn't running, start it. */
3434                                 tcpgo(tpriv, &tcb->rtt_timer);
3435                                 tcb->rttseq = from_seq + ssize;
3436                         }
3437                 }
3438
3439                 tpriv->stats[OutSegs]++;
3440
3441                 /* put off the next keep alive */
3442                 tcpgo(tpriv, &tcb->katimer);
3443
3444                 switch (version) {
3445                         case V4:
3446                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3447                                         /* a negative return means no route */
3448                                         localclose(s, "no route");
3449                                 }
3450                                 break;
3451                         case V6:
3452                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3453                                         /* a negative return means no route */
3454                                         localclose(s, "no route");
3455                                 }
3456                                 break;
3457                         default:
3458                                 panic("tcpoutput2: version %d", version);
3459                 }
3460                 if (ssize) {
3461                         /* The outer loop thinks we sent one packet.  If we used TSO, we
3462                          * might have sent several.  Minus one for the loop increment. */
3463                         msgs += DIV_ROUND_UP(ssize, payload_mss) - 1;
3464                 }
3465                 /* Old Plan 9 tidbit - yield every four messages.  We want to break out
3466                  * and unlock so we can process inbound ACKs which might do things like
3467                  * say "slow down". */
3468                 if (msgs >= next_yield) {
3469                         next_yield = msgs + 4;
3470                         qunlock(&s->qlock);
3471                         kthread_yield();
3472                         qlock(&s->qlock);
3473                 }
3474         }
3475 }
3476
3477 /*
3478  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
3479  */
3480 void tcpsendka(struct conv *s)
3481 {
3482         Tcp seg;
3483         Tcpctl *tcb;
3484         struct block *hbp, *dbp;
3485
3486         tcb = (Tcpctl *) s->ptcl;
3487
3488         dbp = NULL;
3489         seg.urg = 0;
3490         seg.source = s->lport;
3491         seg.dest = s->rport;
3492         seg.flags = ACK | PSH;
3493         seg.mss = 0;
3494         seg.ws = 0;
3495         seg.sack_ok = FALSE;
3496         seg.nr_sacks = 0;
3497         if (tcpporthogdefense)
3498                 urandom_read(&seg.seq, sizeof(seg.seq));
3499         else
3500                 seg.seq = tcb->snd.una - 1;
3501         seg.ack = tcb->rcv.nxt;
3502         tcb->last_ack_sent = seg.ack;
3503         tcb->rcv.una = 0;
3504         seg.wnd = tcb->rcv.wnd;
3505         seg.ts_val = tcb->ts_recent;
3506         if (tcb->state == Finwait2) {
3507                 seg.flags |= FIN;
3508         } else {
3509                 dbp = block_alloc(1, MEM_WAIT);
3510                 dbp->wp++;
3511         }
3512
3513         if (isv4(s->raddr)) {
3514                 /* Build header, link data and compute cksum */
3515                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3516                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
3517                 if (hbp == NULL) {
3518                         freeblist(dbp);
3519                         return;
3520                 }
3521                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
3522         } else {
3523                 /* Build header, link data and compute cksum */
3524                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3525                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
3526                 if (hbp == NULL) {
3527                         freeblist(dbp);
3528                         return;
3529                 }
3530                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
3531         }
3532 }
3533
3534 /*
3535  *  set connection to time out after 12 minutes
3536  */
3537 void tcpsetkacounter(Tcpctl * tcb)
3538 {
3539         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
3540         if (tcb->kacounter < 3)
3541                 tcb->kacounter = 3;
3542 }
3543
3544 /*
3545  *  if we've timed out, close the connection
3546  *  otherwise, send a keepalive and restart the timer
3547  */
3548 void tcpkeepalive(void *v)
3549 {
3550         ERRSTACK(1);
3551         Tcpctl *tcb;
3552         struct conv *s;
3553
3554         s = v;
3555         tcb = (Tcpctl *) s->ptcl;
3556         qlock(&s->qlock);
3557         if (waserror()) {
3558                 qunlock(&s->qlock);
3559                 nexterror();
3560         }
3561         if (tcb->state != Closed) {
3562                 if (--(tcb->kacounter) <= 0) {
3563                         localclose(s, "connection timed out");
3564                 } else {
3565                         tcpsendka(s);
3566                         tcpgo(s->p->priv, &tcb->katimer);
3567                 }
3568         }
3569         qunlock(&s->qlock);
3570         poperror();
3571 }
3572
3573 /*
3574  *  start keepalive timer
3575  */
3576 static void tcpstartka(struct conv *s, char **f, int n)
3577 {
3578         Tcpctl *tcb;
3579         int x;
3580
3581         tcb = (Tcpctl *) s->ptcl;
3582         if (tcb->state != Established)
3583                 error(ENOTCONN, "connection must be in Establised state");
3584         if (n > 1) {
3585                 x = atoi(f[1]);
3586                 if (x >= MSPTICK)
3587                         tcb->katimer.start = x / MSPTICK;
3588         }
3589         tcpsetkacounter(tcb);
3590         tcpgo(s->p->priv, &tcb->katimer);
3591 }
3592
3593 /*
3594  *  turn checksums on/off
3595  */
3596 static void tcpsetchecksum(struct conv *s, char **f, int unused)
3597 {
3598         Tcpctl *tcb;
3599
3600         tcb = (Tcpctl *) s->ptcl;
3601         tcb->nochecksum = !atoi(f[1]);
3602 }
3603
3604 static void tcp_loss_event(struct conv *s, Tcpctl *tcb)
3605 {
3606         uint32_t old_cwnd = tcb->cwind;
3607
3608         /* Reno */
3609         tcb->ssthresh = tcb->cwind / 2;
3610         tcb->cwind = tcb->ssthresh;
3611         netlog(s->p->f, Logtcprxmt,
3612                "%I.%d -> %I.%d: loss event, cwnd was %d, now %d\n",
3613                s->laddr, s->lport, s->raddr, s->rport,
3614                old_cwnd, tcb->cwind);
3615 }
3616
3617 /* Called when we need to retrans the entire outstanding window (everything
3618  * previously sent, but unacknowledged). */
3619 void tcprxmit(struct conv *s)
3620 {
3621         Tcpctl *tcb;
3622
3623         tcb = (Tcpctl *) s->ptcl;
3624
3625         tcb->flags |= FORCE;
3626         tcb->snd.rtx = tcb->snd.una;
3627         set_in_flight(tcb);
3628
3629         tcpoutput(s);
3630 }
3631
3632 /* The original RFC said to drop sacks on a timeout, since the receiver could
3633  * renege.  Later RFCs say we can keep them around, so long as we are careful.
3634  *
3635  * We'll go with a "flush if we have two timeouts" plan.  This doesn't have to
3636  * be perfect - there might be cases where we accidentally flush the sacks too
3637  * often.  Perhaps we never get dup_acks to start fast/sack rxmit.  The main
3638  * thing is that after multiple timeouts we flush the sacks, since the receiver
3639  * might renege.
3640  *
3641  * We also have an Akaros-specific problem.  We use the sacks to determine
3642  * in_flight.  Specifically, the (snd.nxt - upper right edge) is tracked as in
3643  * flight.  Usually the receiver will keep sac