net: tcp: Remove the RETRAN flag
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 #include <vfs.h>
44 #include <kfs.h>
45 #include <slab.h>
46 #include <kmalloc.h>
47 #include <kref.h>
48 #include <string.h>
49 #include <stdio.h>
50 #include <assert.h>
51 #include <error.h>
52 #include <cpio.h>
53 #include <pmap.h>
54 #include <smp.h>
55 #include <ip.h>
56
/* Protocol constants, header option identifiers, per-connection flags, and
 * the TCP state machine, all in one anonymous enum (Plan 9 style). */
enum {
	QMAX = 64 * 1024 - 1,	/* qio queue length limit */
	IP_TCPPROTO = 6,	/* TCP's IP protocol number */

	/* v4 sizes: IP options area, pseudo-header, and TCP header */
	TCP4_IPLEN = 8,
	TCP4_PHDRSIZE = 12,
	TCP4_HDRSIZE = 20,
	TCP4_TCBPHDRSZ = 40,
	TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE,

	/* v6 equivalents */
	TCP6_IPLEN = 0,
	TCP6_PHDRSIZE = 40,
	TCP6_HDRSIZE = 20,
	TCP6_TCBPHDRSZ = 60,
	TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE,

	/* Tcptimer.state values */
	TcptimerOFF = 0,
	TcptimerON = 1,
	TcptimerDONE = 2,
	MAX_TIME = (1 << 20),	/* Forever */
	TCP_ACK = 50,	/* Timed ack sequence in ms */
	MAXBACKMS = 9 * 60 * 1000,	/* longest backoff time (ms) before hangup */

	/* TCP header flag bits */
	URG = 0x20,	/* Data marked urgent */
	ACK = 0x10,	/* Acknowledge is valid */
	PSH = 0x08,	/* Whole data pipe is pushed */
	RST = 0x04,	/* Reset connection */
	SYN = 0x02,	/* Pkt. is synchronise */
	FIN = 0x01,	/* Start close down */

	/* TCP option kinds and their on-the-wire lengths */
	EOLOPT = 0,
	NOOPOPT = 1,
	MSSOPT = 2,
	MSS_LENGTH = 4,	/* max segment size header option length */
	WSOPT = 3,
	WS_LENGTH = 3,	/* WS header option length */
	MAX_WS_VALUE = 14,	/* RFC specified.  Limits available window to 2^30 */
	TS_OPT = 8,
	TS_LENGTH = 10,
	TS_SEND_PREPAD = 2,	/* For non-SYNs, pre-pad 2 nops for 32 byte alignment */
	SACK_OK_OPT = 4,
	SACK_OK_LENGTH = 2,
	SACK_OPT = 5,
	MSL2 = 10,
	MSPTICK = 50,	/* Milliseconds per timer tick */
	DEF_MSS = 1460,	/* Default mean segment */
	DEF_MSS6 = 1280,	/* Default mean segment (min) for v6 */
	SACK_SUPPORTED = TRUE,	/* SACK is on by default */
	MAX_NR_SACKS_PER_PACKET = 4,	/* limited by TCP's opts size */
	MAX_NR_SND_SACKS = 10,
	MAX_NR_RCV_SACKS = 3,	/* We could try for 4, but don't need to */
	DEF_RTT = 500,	/* Default round trip */
	DEF_KAT = 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN = 0,	/* Listen connection */
	TCP_CONNECT = 1,	/* Outgoing connection */
	SYNACK_RXTIMER = 250,	/* ms between SYNACK retransmits */

	TCPREXMTTHRESH = 3,	/* dupack threshold for recovery */
	SACK_RETRANS_RECOVERY = 1,
	FAST_RETRANS_RECOVERY = 2,
	RTO_RETRANS_RECOVERY = 3,
	CWIND_SCALE = 10,	/* initial CWIND will be MSS * this */

	/* Tcpctl.flags bits */
	FORCE			= 1 << 0,
	CLONE			= 1 << 1,
	ACTIVE			= 1 << 2,
	SYNACK			= 1 << 3,
	TSO				= 1 << 4,

	/* smoothing gains (log2) for srtt and mdev */
	LOGAGAIN = 3,
	LOGDGAIN = 2,

	Closed = 0,	/* Connection states */
	Listen,
	Syn_sent,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo = 1000,	/* maximum procs waiting for response to SYN ACK */
	NLHT = 256,	/* hash table size, must be a power of 2 */
	LHTMASK = NLHT - 1,

	HaveWS = 1 << 8,	/* flag bit OR'd into tcpmtu()'s *scale result */
};
146
/* Printable state names; must correspond, in order, to the connection-state
 * enumeration above (Closed..Time_wait). */
char *tcpstates[] = {
	"Closed", "Listen", "Syn_sent",
	"Established", "Finwait1", "Finwait2", "Close_wait",
	"Closing", "Last_ack", "Time_wait"
};
153
/* A countdown timer, ticked by tcpackproc() every MSPTICK ms.  Timers live
 * on the doubly-linked list in struct tcppriv while in state TcptimerON. */
typedef struct Tcptimer Tcptimer;
struct Tcptimer {
	Tcptimer *next;		/* doubly linked active-timer list */
	Tcptimer *prev;
	Tcptimer *readynext;	/* singly linked list of expired timers */
	int state;		/* TcptimerOFF/ON/DONE */
	uint64_t start;		/* reload value, in MSPTICK ticks */
	uint64_t count;		/* ticks remaining until expiry */
	void (*func) (void *);	/* called (with arg) on expiry */
	void *arg;
};
165
/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr {
	uint8_t vihl;			/* Version and header length */
	uint8_t tos;			/* Type of service */
	uint8_t length[2];		/* packet length */
	uint8_t id[2];			/* Identification */
	uint8_t frag[2];		/* Fragment information */
	uint8_t Unused;
	uint8_t proto;
	uint8_t tcplen[2];
	uint8_t tcpsrc[4];
	uint8_t tcpdst[4];
	/* TCP header proper starts here */
	uint8_t tcpsport[2];
	uint8_t tcpdport[2];
	uint8_t tcpseq[4];
	uint8_t tcpack[4];
	uint8_t tcpflag[2];
	uint8_t tcpwin[2];
	uint8_t tcpcksum[2];
	uint8_t tcpurg[2];
	/* Options segment */
	uint8_t tcpopt[1];
};
193
/* v6 pseudo header + TCP header; byte arrays keep it endian-neutral. */
typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr {
	uint8_t vcf[4];			/* version/class/flow */
	uint8_t ploadlen[2];		/* payload length */
	uint8_t proto;
	uint8_t ttl;
	uint8_t tcpsrc[IPaddrlen];
	uint8_t tcpdst[IPaddrlen];
	/* TCP header proper starts here */
	uint8_t tcpsport[2];
	uint8_t tcpdport[2];
	uint8_t tcpseq[4];
	uint8_t tcpack[4];
	uint8_t tcpflag[2];
	uint8_t tcpwin[2];
	uint8_t tcpcksum[2];
	uint8_t tcpurg[2];
	/* Options segment */
	uint8_t tcpopt[1];
};
213
/* One SACK range: sequence numbers in [left, right) (RFC 2018). */
struct sack_block {
	uint32_t left;
	uint32_t right;
};
218
/*
 *  this represents the control info
 *  for a single packet.  It is derived from
 *  a packet in ntohtcp{4,6}() and stuck into
 *  a packet in htontcp{4,6}().
 */
typedef struct Tcp Tcp;
struct Tcp {
	uint16_t source;		/* source port */
	uint16_t dest;			/* destination port */
	uint32_t seq;
	uint32_t ack;
	uint8_t flags;			/* URG/ACK/PSH/RST/SYN/FIN */
	uint16_t ws;			/* window scale option (if not zero) */
	uint32_t wnd;
	uint16_t urg;
	uint16_t mss;			/* max segment size option (if not zero) */
	uint16_t len;			/* size of data */
	uint32_t ts_val;		/* timestamp val from sender */
	uint32_t ts_ecr;		/* timestamp echo response from sender */
	bool sack_ok;			/* header had/should have SACK_PERMITTED */
	uint8_t nr_sacks;		/* valid entries in sacks[] */
	struct sack_block sacks[MAX_NR_SACKS_PER_PACKET];
};
243
/*
 *  this header is malloc'd to thread together fragments
 *  waiting to be coalesced (the out-of-order resequencing queue)
 */
typedef struct Reseq Reseq;
struct Reseq {
	Reseq *next;		/* singly linked, in sequence order */
	Tcp seg;		/* parsed header for this segment */
	struct block *bp;	/* the segment's payload */
	uint16_t length;
};
255
/*
 *  Per-connection TCP control block.
 *  The qlock in the Conv locks this structure.
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl {
	uint8_t state;			/* Connection state */
	uint8_t type;			/* Listening or active connection */
	uint8_t code;			/* Icmp code */
	struct {
		uint32_t una;		/* Left edge of unacked data region */
		uint32_t nxt;		/* Next seq to send, right edge of unacked */
		uint32_t rtx;		/* Next to send for retrans */
		uint32_t wnd;		/* Tcp send window */
		uint32_t urg;		/* Urgent data pointer */
		uint32_t wl2;		/* seq of last incoming window update */
		int scale;		/* how much to right shift window for xmit */
		uint32_t in_flight;	/* estimate of how much is in flight */
		uint8_t loss_hint;	/* number of loss hints rcvd */
		uint8_t sack_loss_hint;	/* For detecting sack rxmit losses */
		bool flush_sacks;	/* Two timeouts in a row == dump sacks */
		uint8_t recovery;	/* loss recovery flag */
		uint32_t recovery_pt;	/* right window for recovery point */
		uint8_t nr_sacks;
		struct sack_block sacks[MAX_NR_SND_SACKS];
	} snd;
	struct {
		uint32_t nxt;		/* Receive pointer to next uint8_t slot */
		uint32_t wnd;		/* Receive window incoming */
		uint32_t urg;		/* Urgent pointer */
		int blocked;		/* set when advertised window is < mss */
		int una;		/* unacked data segs */
		int scale;		/* how much to left shift window for rx */
		uint8_t nr_sacks;
		struct sack_block sacks[MAX_NR_RCV_SACKS];
	} rcv;
	uint32_t iss;			/* Initial sequence number */
	int sawwsopt;			/* true if we saw a wsopt on the incoming SYN */
	uint32_t cwind;			/* Congestion window */
	int scale;			/* desired snd.scale */
	uint32_t ssthresh;		/* Slow start threshold */
	int irs;			/* Initial received sequence */
	uint16_t mss;			/* Max segment size */
	uint16_t typical_mss;		/* MSS for most packets (< MSS for some opts) */
	int rerecv;			/* Overlap of data re-received */
	uint32_t window;		/* Receive window */
	uint8_t backoff;		/* Exponential backoff counter */
	int backedoff;			/* ms we've backed off for rexmits */
	uint8_t flags;			/* State flags (FORCE, CLONE, ...) */
	Reseq *reseq;			/* Resequencing queue */
	Tcptimer timer;			/* Activity timer */
	Tcptimer acktimer;		/* Acknowledge timer */
	Tcptimer rtt_timer;		/* Round trip timer */
	Tcptimer katimer;		/* keep alive timer */
	uint32_t rttseq;		/* Round trip sequence */
	int srtt;			/* Shortened round trip */
	int mdev;			/* Mean deviation of round trip */
	int kacounter;			/* count down for keep alive */
	uint64_t sndsyntime;		/* time syn sent */
	uint64_t time;			/* time Finwait2 was sent */
	int nochecksum;			/* non-zero means don't send checksums */
	int flgcnt;			/* number of flags in the sequence (FIN,SYN) */
	uint32_t ts_recent;		/* timestamp received around last_ack_sent */
	uint32_t last_ack_sent;		/* to determine when to update timestamp */
	bool sack_ok;			/* Can use SACK for this connection */

	union {
		Tcp4hdr tcp4hdr;
		Tcp6hdr tcp6hdr;
	} protohdr;			/* prototype header */
};
326
/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after
 *  1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 */
typedef struct Limbo Limbo;
struct Limbo {
	Limbo *next;			/* hash-bucket chain in tcppriv.lht */

	uint8_t laddr[IPaddrlen];
	uint8_t raddr[IPaddrlen];
	uint16_t lport;
	uint16_t rport;
	uint32_t irs;			/* initial received sequence */
	uint32_t iss;			/* initial sent sequence */
	uint16_t mss;			/* mss from the other end */
	uint16_t rcvscale;		/* how much to scale rcvd windows */
	uint16_t sndscale;		/* how much to scale sent windows */
	uint64_t lastsend;		/* last time we sent a synack */
	uint8_t version;		/* v4 or v6 */
	uint8_t rexmits;		/* number of retransmissions */
	bool sack_ok;			/* other side said SACK_OK */
	uint32_t ts_val;		/* timestamp val from sender */
};
359
360 int tcp_irtt = DEF_RTT;                 /* Initial guess at round trip time */
361 uint16_t tcp_mss = DEF_MSS;             /* Maximum segment size to be sent */
362
/* Indices into tcppriv.stats[]; names must match statnames[] below. */
enum {
	/* MIB stats */
	MaxConn,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats
};
385
/* Printable names for the stats enum above.  Note: the `[Idx] "name"`
 * form (designated initializer without '=') is an old GNU C extension
 * used throughout this codebase. */
static char *statnames[] = {
	[MaxConn] "MaxConn",
	[ActiveOpens] "ActiveOpens",
	[PassiveOpens] "PassiveOpens",
	[EstabResets] "EstabResets",
	[CurrEstab] "CurrEstab",
	[InSegs] "InSegs",
	[OutSegs] "OutSegs",
	[RetransSegs] "RetransSegs",
	[RetransTimeouts] "RetransTimeouts",
	[InErrs] "InErrs",
	[OutRsts] "OutRsts",
	[CsumErrs] "CsumErrs",
	[HlenErrs] "HlenErrs",
	[LenErrs] "LenErrs",
	[OutOfOrder] "OutOfOrder",
};
403
/* Per-protocol (not per-connection) TCP state, hung off Proto.priv.
 * NOTE(review): the typedef declares tag `Tcppriv` but the definition uses
 * lowercase `tcppriv`; C tags are case-sensitive, so the typedef refers to
 * a distinct, never-defined struct.  Code below uses `struct tcppriv`
 * directly, so this is harmless but inconsistent — confirm before unifying. */
typedef struct Tcppriv Tcppriv;
struct tcppriv {
	/* List of active timers */
	qlock_t tl;
	Tcptimer *timers;

	/* hash table for matching conversations */
	struct Ipht ht;

	/* calls in limbo waiting for an ACK to our SYN ACK */
	int nlimbo;
	Limbo *lht[NLHT];

	/* for keeping track of tcpackproc */
	qlock_t apl;
	int ackprocstarted;

	uint32_t stats[Nstats];
};
423
424 /*
425  *  Setting tcpporthogdefense to non-zero enables Dong Lin's
426  *  solution to hijacked systems staking out port's as a form
427  *  of DoS attack.
428  *
429  *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
430  *  it that number gets acked by the other end, we shut down the connection.
431  *  Look for tcpporthogedefense in the code.
432  */
433 int tcpporthogdefense = 0;
434
435 int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, uint16_t);
436 void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
437 void localclose(struct conv *, char *unused_char_p_t);
438 void procsyn(struct conv *, Tcp *);
439 void tcpiput(struct Proto *, struct Ipifc *, struct block *);
440 void tcpoutput(struct conv *);
441 int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
442 void tcpstart(struct conv *, int);
443 void tcptimeout(void *);
444 void tcpsndsyn(struct conv *, Tcpctl *);
445 void tcprcvwin(struct conv *);
446 void tcpacktimer(void *);
447 void tcpkeepalive(void *);
448 void tcpsetkacounter(Tcpctl *);
449 void tcprxmit(struct conv *);
450 void tcpsettimer(Tcpctl *);
451 void tcpsynackrtt(struct conv *);
452 void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
453 static void tcp_loss_event(struct conv *s, Tcpctl *tcb);
454 static uint16_t derive_payload_mss(Tcpctl *tcb);
455 static int seq_within(uint32_t x, uint32_t low, uint32_t high);
456 static int seq_lt(uint32_t x, uint32_t y);
457 static int seq_le(uint32_t x, uint32_t y);
458 static int seq_gt(uint32_t x, uint32_t y);
459 static int seq_ge(uint32_t x, uint32_t y);
460 static uint32_t seq_max(uint32_t x, uint32_t y);
461 static uint32_t seq_min(uint32_t x, uint32_t y);
462 static void set_in_flight(Tcpctl *tcb);
463
464 static void limborexmit(struct Proto *);
465 static void limbo(struct conv *, uint8_t * unused_uint8_p_t, uint8_t *, Tcp *,
466                                   int);
467
468 void tcpsetstate(struct conv *s, uint8_t newstate)
469 {
470         Tcpctl *tcb;
471         uint8_t oldstate;
472         struct tcppriv *tpriv;
473
474         tpriv = s->p->priv;
475
476         tcb = (Tcpctl *) s->ptcl;
477
478         oldstate = tcb->state;
479         if (oldstate == newstate)
480                 return;
481
482         if (oldstate == Established)
483                 tpriv->stats[CurrEstab]--;
484         if (newstate == Established)
485                 tpriv->stats[CurrEstab]++;
486
487         /**
488         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
489                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
490         **/
491
492         switch (newstate) {
493                 case Closed:
494                         qclose(s->rq);
495                         qclose(s->wq);
496                         qclose(s->eq);
497                         break;
498
499                 case Close_wait:        /* Remote closes */
500                         qhangup(s->rq, NULL);
501                         break;
502         }
503
504         tcb->state = newstate;
505
506         if (oldstate == Syn_sent && newstate != Closed)
507                 Fsconnected(s, NULL);
508 }
509
/* "connect" control request: parse/record the addresses via the generic
 * helper, then kick off an active open. */
static void tcpconnect(struct conv *c, char **argv, int argc)
{
	Fsstdconnect(c, argv, argc);
	tcpstart(c, TCP_CONNECT);
}
515
516 static int tcpstate(struct conv *c, char *state, int n)
517 {
518         Tcpctl *s;
519
520         s = (Tcpctl *) (c->ptcl);
521
522         return snprintf(state, n,
523                                         "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
524                                         tcpstates[s->state],
525                                         c->rq ? qlen(c->rq) : 0,
526                                         c->wq ? qlen(c->wq) : 0,
527                                         s->srtt, s->mdev,
528                                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
529                                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
530                                         s->katimer.start, s->katimer.count);
531 }
532
533 static int tcpinuse(struct conv *c)
534 {
535         Tcpctl *s;
536
537         s = (Tcpctl *) (c->ptcl);
538         return s->state != Closed;
539 }
540
/* "announce" control request: record the local address/port, enter the
 * Listen state, and immediately report the conversation as connected
 * (listeners don't wait for a peer). */
static void tcpannounce(struct conv *c, char **argv, int argc)
{
	Fsstdannounce(c, argv, argc);
	tcpstart(c, TCP_LISTEN);
	Fsconnected(c, NULL);
}
547
/* "bypass" control request: set up a raw bypass conversation and add it to
 * the demux hash table so inbound segments can find it. */
static void tcpbypass(struct conv *cv, char **argv, int argc)
{
	struct tcppriv *tpriv = cv->p->priv;

	Fsstdbypass(cv, argv, argc);
	iphtadd(&tpriv->ht, cv);
}
555
556 static void tcpshutdown(struct conv *c, int how)
557 {
558         Tcpctl *tcb = (Tcpctl*)c->ptcl;
559
560         /* Do nothing for the read side */
561         if (how == SHUT_RD)
562                 return;
563         /* Sends a FIN.  If we're in another state (like Listen), we'll run into
564          * issues, since we'll never send the FIN.  We'll be shutdown on our end,
565          * but we'll never tell the distant end.  Might just be an app issue. */
566         switch (tcb->state) {
567         case Established:
568                 tcb->flgcnt++;
569                 tcb->snd.nxt++;
570                 tcpsetstate(c, Finwait1);
571                 tcpoutput(c);
572                 break;
573         }
574 }
575
/*
 *  tcpclose is always called with the q locked
 *
 *  Hangs up / flushes the qio queues, then performs the state-appropriate
 *  teardown: listeners and half-open calls are closed locally, while
 *  Established / Close_wait send a FIN and follow the normal close path.
 */
static void tcpclose(struct conv *c)
{
	Tcpctl *tcb;

	tcb = (Tcpctl *) c->ptcl;

	qhangup(c->rq, NULL);
	qhangup(c->wq, NULL);
	qhangup(c->eq, NULL);
	qflush(c->rq);

	switch (tcb->state) {
		case Listen:
			/*
			 *  reset any incoming calls to this listener
			 */
			Fsconnected(c, "Hangup");

			localclose(c, NULL);
			break;
		case Closed:
		case Syn_sent:
			localclose(c, NULL);
			break;
		case Established:
			/* flgcnt/snd.nxt account for the FIN we're about to send */
			tcb->flgcnt++;
			tcb->snd.nxt++;
			tcpsetstate(c, Finwait1);
			tcpoutput(c);
			break;
		case Close_wait:
			tcb->flgcnt++;
			tcb->snd.nxt++;
			tcpsetstate(c, Last_ack);
			tcpoutput(c);
			break;
	}
}
617
/* qio kick callback: the application queued data on s->wq (or the queue
 * was otherwise poked); push it out if the connection can carry data,
 * otherwise tear the conversation down.  Runs under s->qlock with the
 * waserror/nexterror discipline to release it on error. */
void tcpkick(void *x)
{
	ERRSTACK(1);
	struct conv *s = x;
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}

	switch (tcb->state) {
		case Syn_sent:
		case Established:
		case Close_wait:
			/*
			 * Push data
			 */
			tcprcvwin(s);
			tcpoutput(s);
			break;
		default:
			/* any other state can't carry new data; hang up */
			localclose(s, "Hangup");
			break;
	}

	qunlock(&s->qlock);
	poperror();
}
650
/* Recompute the receive window we advertise, based on how much of our qio
 * receive buffer is free.  Call with tcb locked. */
void tcprcvwin(struct conv *s)
{
	/* Call with tcb locked */
	int w;
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;
	w = tcb->window - qlen(s->rq);
	if (w < 0)
		w = 0;

	/* RFC 813: Avoid SWS.  We'll always reduce the window (because the qio
	 * increased - that's legit), and we'll always advertise the window
	 * increases (corresponding to qio drains) when those are greater than MSS.
	 * But we don't advertise increases less than MSS.
	 *
	 * Note we don't shrink the window at all - that'll result in tcptrim()
	 * dropping packets that were sent before the sender gets our update. */
	if ((w < tcb->rcv.wnd) || (w >= tcb->mss))
		tcb->rcv.wnd = w;
	/* We've delayed sending an update to rcv.wnd, and we might never get
	 * another ACK to drive the TCP stack after the qio is drained.  We could
	 * replace this stuff with qio kicks or callbacks, but that might be
	 * trickier with the MSS limitation.  (and 'edge' isn't empty or not). */
	if (w < tcb->mss)
		tcb->rcv.blocked = 1;
}
678
/* Delayed-ACK timer expiry: force out an ACK (FORCE flag) for a live
 * connection.  Runs from tcpackproc, so it takes s->qlock itself. */
void tcpacktimer(void *v)
{
	ERRSTACK(1);
	Tcpctl *tcb;
	struct conv *s;

	s = v;
	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	if (tcb->state != Closed) {
		/* FORCE makes tcpoutput send even with nothing to transmit */
		tcb->flags |= FORCE;
		tcprcvwin(s);
		tcpoutput(s);
	}
	qunlock(&s->qlock);
	poperror();
}
701
/* Allocate the read/write qio queues for a new conversation. */
static void tcpcreate(struct conv *c)
{
	/* We don't use qio limits.  Instead, TCP manages flow control on its own.
	 * We only use qpassnolim().  Note for qio that 0 doesn't mean no limit. */
	c->rq = qopen(0, Qcoalesce, 0, 0);
	c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
}
709
/* Transition a timer between OFF/ON/DONE, chaining it onto (or unchaining
 * it from) priv's doubly linked active-timer list as needed.
 * Caller must hold priv->tl. */
static void timerstate(struct tcppriv *priv, Tcptimer * t, int newstate)
{
	if (newstate != TcptimerON) {
		if (t->state == TcptimerON) {
			// unchain
			if (priv->timers == t) {
				priv->timers = t->next;
				/* list head must have no predecessor */
				if (t->prev != NULL)
					panic("timerstate1");
			}
			if (t->next)
				t->next->prev = t->prev;
			if (t->prev)
				t->prev->next = t->next;
			t->next = t->prev = NULL;
		}
	} else {
		if (t->state != TcptimerON) {
			// chain at the head of the list
			if (t->prev != NULL || t->next != NULL)
				panic("timerstate2");
			t->prev = NULL;
			t->next = priv->timers;
			if (t->next)
				t->next->prev = t;
			priv->timers = t;
		}
	}
	t->state = newstate;
}
740
/* Per-protocol timer kproc: every MSPTICK ms, decrement all active timers,
 * collect the expired ones onto a ready list (under priv->tl), then run
 * their callbacks outside the lock.  Also drives limbo SYN-ACK rexmits. */
void tcpackproc(void *a)
{
	ERRSTACK(1);
	Tcptimer *t, *tp, *timeo;
	struct Proto *tcp;
	struct tcppriv *priv;
	int loop;

	tcp = a;
	priv = tcp->priv;

	for (;;) {
		kthread_usleep(MSPTICK * 1000);

		qlock(&priv->tl);
		timeo = NULL;
		loop = 0;
		for (t = priv->timers; t != NULL; t = tp) {
			/* guard against a corrupted/looping timer list */
			if (loop++ > 10000)
				panic("tcpackproc1");
			tp = t->next;
			if (t->state == TcptimerON) {
				t->count--;
				if (t->count == 0) {
					timerstate(priv, t, TcptimerDONE);
					t->readynext = timeo;
					timeo = t;
				}
			}
		}
		qunlock(&priv->tl);

		/* Run expired timers without holding priv->tl */
		loop = 0;
		for (t = timeo; t != NULL; t = t->readynext) {
			if (loop++ > 10000)
				panic("tcpackproc2");
			if (t->state == TcptimerDONE && t->func != NULL) {
				/* discard error style */
				if (!waserror())
					(*t->func) (t->arg);
				poperror();
			}
		}

		limborexmit(tcp);
	}
}
788
789 void tcpgo(struct tcppriv *priv, Tcptimer * t)
790 {
791         if (t == NULL || t->start == 0)
792                 return;
793
794         qlock(&priv->tl);
795         t->count = t->start;
796         timerstate(priv, t, TcptimerON);
797         qunlock(&priv->tl);
798 }
799
800 void tcphalt(struct tcppriv *priv, Tcptimer * t)
801 {
802         if (t == NULL)
803                 return;
804
805         qlock(&priv->tl);
806         timerstate(priv, t, TcptimerOFF);
807         qunlock(&priv->tl);
808 }
809
/* Exponential backoff multiplier: returns 2^n.
 *
 * Fix: clamp the shift count.  `1 << n` is undefined behavior for n >= 31
 * (shifting into/past the sign bit of a 32-bit int) and for negative n.
 * Callers feed this an 8-bit backoff counter that can grow with repeated
 * timeouts, so saturate at 2^30 rather than invoke UB. */
int backoff(int n)
{
	if (n < 0)
		n = 0;
	if (n > 30)
		n = 30;
	return 1 << n;
}
814
/* Tear down the local side of a conversation: remove it from the demux
 * hash, stop all timers, free the resequencing queue, hang up the qio
 * queues with 'reason', and enter Closed.  Called with tcb locked.
 * 'reason' may be NULL for a normal close. */
void localclose(struct conv *s, char *reason)
{	/* called with tcb locked */
	Tcpctl *tcb;
	Reseq *rp, *rp1;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	iphtrem(&tpriv->ht, s);

	tcphalt(tpriv, &tcb->timer);
	tcphalt(tpriv, &tcb->rtt_timer);
	tcphalt(tpriv, &tcb->acktimer);
	tcphalt(tpriv, &tcb->katimer);

	/* Flush reassembly queue; nothing more can arrive */
	for (rp = tcb->reseq; rp != NULL; rp = rp1) {
		rp1 = rp->next;
		freeblist(rp->bp);
		kfree(rp);
	}
	tcb->reseq = NULL;

	/* a failed active open must be reported to the fs layer */
	if (tcb->state == Syn_sent)
		Fsconnected(s, reason);

	qhangup(s->rq, reason);
	qhangup(s->wq, reason);

	tcpsetstate(s, Closed);

	/* listener will check the rq state */
	if (s->state == Announced)
		rendez_wakeup(&s->listenr);
}
851
/* mtu (- TCP + IP hdr len) of 1st hop.
 *
 * Looks up the interface for 'addr' and derives the usable TCP payload MTU
 * (interface MTU minus medium and IP+TCP header sizes), falling back to the
 * per-version default MSS when no interface is found.  Also reports, via the
 * out-parameters, whether TSO is available on the interface and the window
 * scale we want to advertise. */
int tcpmtu(struct Proto *tcp, uint8_t * addr, int version, int *scale,
           uint8_t *flags)
{
	struct Ipifc *ifc;
	int mtu;

	ifc = findipifc(tcp->f, addr, 0);
	switch (version) {
		default:	/* unknown versions are treated as V4 (fallthrough) */
		case V4:
			mtu = DEF_MSS;
			if (ifc != NULL)
				mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
			break;
		case V6:
			mtu = DEF_MSS6;
			if (ifc != NULL)
				mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
			break;
	}
	/* report TSO capability of the interface to the caller */
	*flags &= ~TSO;
	if (ifc && (ifc->feat & NETF_TSO))
		*flags |= TSO;
	/* we always ask for window scaling with a shift of 7 */
	*scale = HaveWS | 7;

	return mtu;
}
880
/* Initialize the TCP control block for conversation s: zero it, set the
 * RTT estimate and timer defaults, and - unless we are a listener - build
 * the prototype pseudo-header used for every outgoing segment.  mode is
 * TCP_LISTEN or TCP_CONNECT. */
void inittcpctl(struct conv *s, int mode)
{
	Tcpctl *tcb;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int mss;

	tcb = (Tcpctl *) s->ptcl;

	memset(tcb, 0, sizeof(Tcpctl));

	/* slow-start threshold starts effectively unbounded */
	tcb->ssthresh = UINT32_MAX;
	/* srtt is kept scaled by LOGAGAIN; seed from the initial RTT */
	tcb->srtt = tcp_irtt << LOGAGAIN;
	tcb->mdev = 0;

	/* setup timers */
	tcb->timer.start = tcp_irtt / MSPTICK;
	tcb->timer.func = tcptimeout;
	tcb->timer.arg = s;
	tcb->rtt_timer.start = MAX_TIME;
	tcb->acktimer.start = TCP_ACK / MSPTICK;
	tcb->acktimer.func = tcpacktimer;
	tcb->acktimer.arg = s;
	tcb->katimer.start = DEF_KAT / MSPTICK;
	tcb->katimer.func = tcpkeepalive;
	tcb->katimer.arg = s;

	mss = DEF_MSS;

	/* create a prototype(pseudo) header */
	if (mode != TCP_LISTEN) {
		/* pick a local address if the caller didn't bind one */
		if (ipcmp(s->laddr, IPnoaddr) == 0)
			findlocalip(s->p->f, s->laddr, s->raddr);

		switch (s->ipversion) {
			case V4:
				h4 = &tcb->protohdr.tcp4hdr;
				memset(h4, 0, sizeof(*h4));
				h4->proto = IP_TCPPROTO;
				hnputs(h4->tcpsport, s->lport);
				hnputs(h4->tcpdport, s->rport);
				v6tov4(h4->tcpsrc, s->laddr);
				v6tov4(h4->tcpdst, s->raddr);
				break;
			case V6:
				h6 = &tcb->protohdr.tcp6hdr;
				memset(h6, 0, sizeof(*h6));
				h6->proto = IP_TCPPROTO;
				hnputs(h6->tcpsport, s->lport);
				hnputs(h6->tcpdport, s->rport);
				ipmove(h6->tcpsrc, s->laddr);
				ipmove(h6->tcpdst, s->raddr);
				mss = DEF_MSS6;
				break;
			default:
				panic("inittcpctl: version %d", s->ipversion);
		}
	}

	tcb->mss = mss;
	tcb->typical_mss = mss;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* default is no window scaling */
	tcb->window = QMAX;
	tcb->rcv.wnd = QMAX;
	tcb->rcv.scale = 0;
	tcb->snd.scale = 0;
}
950
/*
 *  called with s qlocked
 *
 *  Start a conversation in the given mode: lazily spawn the per-protocol
 *  ack/timer kernel task (once, guarded by apl), initialize the control
 *  block, add the conversation to the demux hash, and either enter Listen
 *  (TCP_LISTEN) or send a SYN and enter Syn_sent (TCP_CONNECT).
 */
void tcpstart(struct conv *s, int mode)
{
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	char *kpname;

	tpriv = s->p->priv;

	/* double-checked start of the single ack process for this protocol */
	if (tpriv->ackprocstarted == 0) {
		qlock(&tpriv->apl);
		if (tpriv->ackprocstarted == 0) {
			/* tcpackproc needs to free this if it ever exits */
			kpname = kmalloc(KNAMELEN, MEM_WAIT);
			snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
			ktask(kpname, tcpackproc, s->p);
			tpriv->ackprocstarted = 1;
		}
		qunlock(&tpriv->apl);
	}

	tcb = (Tcpctl *) s->ptcl;

	inittcpctl(s, mode);

	iphtadd(&tpriv->ht, s);
	switch (mode) {
		case TCP_LISTEN:
			tpriv->stats[PassiveOpens]++;
			tcb->flags |= CLONE;
			tcpsetstate(s, Listen);
			break;

		case TCP_CONNECT:
			tpriv->stats[ActiveOpens]++;
			tcb->flags |= ACTIVE;
			tcpsndsyn(s, tcb);
			tcpsetstate(s, Syn_sent);
			tcpoutput(s);
			break;
	}
}
995
996 static char *tcpflag(uint16_t flag)
997 {
998         static char buf[128];
999
1000         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
1001         if (flag & URG)
1002                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
1003         if (flag & ACK)
1004                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
1005         if (flag & PSH)
1006                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
1007         if (flag & RST)
1008                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
1009         if (flag & SYN)
1010                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
1011         if (flag & FIN)
1012                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
1013
1014         return buf;
1015 }
1016
1017 /* Helper, determine if we should send a TCP timestamp.  ts_val was the
1018  * timestamp from our distant end.  We'll also send a TS on SYN (no ACK). */
1019 static bool tcp_seg_has_ts(Tcp *tcph)
1020 {
1021         return tcph->ts_val || ((tcph->flags & SYN) && !(tcph->flags & ACK));
1022 }
1023
/* Given a TCP header/segment and default header size (e.g. TCP4_HDRSIZE),
 * return the actual hdr_len and opt_pad.  Accounts for the options that
 * write_opts() will emit: MSS/WS/SACK-permitted on SYNs, timestamps, and
 * any SACK blocks pending in tcb, plus NOOP padding to a 4-byte multiple.
 * Must be kept in sync with write_opts(). */
static void compute_hdrlen_optpad(Tcp *tcph, uint16_t default_hdrlen,
                                  uint16_t *ret_hdrlen, uint16_t *ret_optpad,
                                  Tcpctl *tcb)
{
	uint16_t hdrlen = default_hdrlen;
	uint16_t optpad = 0;

	if (tcph->flags & SYN) {
		if (tcph->mss)
			hdrlen += MSS_LENGTH;
		if (tcph->ws)
			hdrlen += WS_LENGTH;
		if (tcph->sack_ok)
			hdrlen += SACK_OK_LENGTH;
	}
	if (tcp_seg_has_ts(tcph)) {
		hdrlen += TS_LENGTH;
		/* SYNs have other opts, don't do the PREPAD NOOP optimization. */
		if (!(tcph->flags & SYN))
			hdrlen += TS_SEND_PREPAD;
	}
	/* SACK option: 2-byte kind/len plus an 8-byte block per SACK */
	if (tcb && tcb->rcv.nr_sacks)
		hdrlen += 2 + tcb->rcv.nr_sacks * 8;
	/* pad the options out to a 4-byte boundary */
	optpad = hdrlen & 3;
	if (optpad)
		optpad = 4 - optpad;
	hdrlen += optpad;
	*ret_hdrlen = hdrlen;
	*ret_optpad = optpad;
}
1056
/* Writes the TCP options for tcph to opt.  The caller must have sized the
 * option area with compute_hdrlen_optpad(); the layout emitted here must
 * match the lengths that function accounts for.  Trailing space (optpad
 * bytes) is filled with NOOPs. */
static void write_opts(Tcp *tcph, uint8_t *opt, uint16_t optpad, Tcpctl *tcb)
{
	/* SYN-only options: MSS, window scale, SACK-permitted */
	if (tcph->flags & SYN) {
		if (tcph->mss != 0) {
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		if (tcph->ws != 0) {
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		if (tcph->sack_ok) {
			*opt++ = SACK_OK_OPT;
			*opt++ = SACK_OK_LENGTH;
		}
	}
	if (tcp_seg_has_ts(tcph)) {
		/* two leading NOOPs 4-byte-align the timestamp on non-SYNs
		 * (the TS_SEND_PREPAD accounted for in compute_hdrlen_optpad) */
		if (!(tcph->flags & SYN)) {
			*opt++ = NOOPOPT;
			*opt++ = NOOPOPT;
		}
		*opt++ = TS_OPT;
		*opt++ = TS_LENGTH;
		/* Setting TSval, our time */
		hnputl(opt, milliseconds());
		opt += 4;
		/* Setting TSecr, the time we last saw from them, stored in ts_val */
		hnputl(opt, tcph->ts_val);
		opt += 4;
	}
	/* report our received-out-of-order ranges back to the sender */
	if (tcb && tcb->rcv.nr_sacks) {
		*opt++ = SACK_OPT;
		*opt++ = 2 + tcb->rcv.nr_sacks * 8;
		for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
			hnputl(opt, tcb->rcv.sacks[i].left);
			opt += 4;
			hnputl(opt, tcb->rcv.sacks[i].right);
			opt += 4;
		}
	}
	/* pad to the 4-byte option boundary */
	while (optpad-- > 0)
		*opt++ = NOOPOPT;
}
1104
1105 /* Given a data block (or NULL) returns a block with enough header room that we
1106  * can send out.  block->wp is set to the beginning of the payload.  Returns
1107  * NULL on some sort of error. */
1108 static struct block *alloc_or_pad_block(struct block *data,
1109                                         uint16_t total_hdr_size)
1110 {
1111         if (data) {
1112                 data = padblock(data, total_hdr_size);
1113                 if (data == NULL)
1114                         return NULL;
1115         } else {
1116                 /* the 64 pad is to meet mintu's */
1117                 data = block_alloc(total_hdr_size + 64, MEM_WAIT);
1118                 if (data == NULL)
1119                         return NULL;
1120                 data->wp += total_hdr_size;
1121         }
1122         return data;
1123 }
1124
/* Build an outgoing v6 TCP segment: prepend the header described by tcph
 * (using the prototype pseudo-header ph) to 'data' (may be NULL), write the
 * options, and compute the checksum over a temporary pseudo-header before
 * restoring the real IPv6 header fields.  Returns the finished block or
 * NULL on allocation failure. */
struct block *htontcp6(Tcp *tcph, struct block *data, Tcp6hdr *ph,
                                           Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp6hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP6_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP6_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp) */
	data->transport_header_end = hdrlen + TCP6_PKT;

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp6hdr *) (data->rp);
	memmove(h, ph, TCP6_TCBPHDRSZ);

	/* compose pseudo tcp header, do cksum calculation */
	hnputl(h->vcf, hdrlen + dlen);
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
	h->ttl = ph->proto;

	/* copy in variable bits */
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* data offset (in 32-bit words) lives in the top 6 bits of tcpflag */
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	/* advertise the window right-shifted by our send scale */
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	/* move from pseudo header back to normal ip header */
	memset(h->vcf, 0, 4);
	h->vcf[0] = IP_VER6;
	hnputs(h->ploadlen, hdrlen + dlen);
	h->proto = ph->proto;

	return data;
}
1174
/* Build an outgoing v4 TCP segment: prepend the header described by tcph
 * (using the prototype pseudo-header ph) to 'data' (may be NULL), write the
 * options, and fill in the checksum.  When checksumming, only the partial
 * (pseudo-header) sum is stored and the block is flagged Btcpck -
 * presumably so hardware checksum offload can finish it; verify against the
 * driver path.  Returns the finished block or NULL on allocation failure. */
struct block *htontcp4(Tcp *tcph, struct block *data, Tcp4hdr *ph,
                                           Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp4hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP4_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP4_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp) */
	data->transport_header_end = hdrlen + TCP4_PKT;

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp4hdr *) (data->rp);
	memmove(h, ph, TCP4_TCBPHDRSZ);

	/* copy in variable bits */
	hnputs(h->tcplen, hdrlen + dlen);
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* data offset (in 32-bit words) lives in the top 6 bits of tcpflag */
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	/* advertise the window right-shifted by our send scale */
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		/* partial checksum over the pseudo-header only; checksum_start/
		 * checksum_offset + Btcpck describe where to finish the sum */
		csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
		data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
		data->checksum_offset = ph->tcpcksum - ph->tcpsport;
		data->flag |= Btcpck;
	}

	return data;
}
1217
/* Parse an inbound SACK option (opt points at the kind byte, optlen is the
 * option's total length) into tcph->sacks / tcph->nr_sacks.  Malformed
 * blocks (left >= right) are dropped individually; an implausible count
 * aborts the whole option. */
static void parse_inbound_sacks(Tcp *tcph, uint8_t *opt, uint16_t optlen)
{
	uint8_t nr_sacks;
	uint32_t left, right;

	/* 2 bytes of kind/len, then 8 bytes (two seq numbers) per block */
	nr_sacks = (optlen - 2) / 8;
	if (nr_sacks > MAX_NR_SACKS_PER_PACKET)
		return;
	opt += 2;
	for (int i = 0; i < nr_sacks; i++, opt += 8) {
		left = nhgetl(opt);
		right = nhgetl(opt + 4);
		if (seq_ge(left, right)) {
			/* bad / malicious SACK.  Skip it, and adjust. */
			nr_sacks--;
			i--;	/* stay on this array element next loop */
			continue;
		}
		tcph->sacks[i].left = left;
		tcph->sacks[i].right = right;
	}
	/* only the count of good blocks; bad ones were squeezed out above */
	tcph->nr_sacks = nr_sacks;
}
1241
/* Walk the option bytes of an inbound segment (opt, optsize) and record the
 * options we understand (MSS, window scale, SACK-permitted, SACK blocks,
 * timestamps) into tcph.  Unknown options are skipped by their length;
 * a malformed length (or EOL) terminates the walk. */
static void parse_inbound_opts(Tcp *tcph, uint8_t *opt, uint16_t optsize)
{
	uint16_t optlen;

	while (optsize > 0 && *opt != EOLOPT) {
		/* NOOP is the only single-byte option (no length field) */
		if (*opt == NOOPOPT) {
			optsize--;
			opt++;
			continue;
		}
		optlen = opt[1];
		/* stop on a bogus length: < 2 can't make progress, > optsize
		 * would run off the end of the header */
		if (optlen < 2 || optlen > optsize)
			break;
		switch (*opt) {
			case MSSOPT:
				if (optlen == MSS_LENGTH)
					tcph->mss = nhgets(opt + 2);
				break;
			case WSOPT:
				if (optlen == WS_LENGTH && *(opt + 2) <= MAX_WS_VALUE)
					tcph->ws = HaveWS | *(opt + 2);
				break;
			case SACK_OK_OPT:
				if (optlen == SACK_OK_LENGTH)
					tcph->sack_ok = TRUE;
				break;
			case SACK_OPT:
				parse_inbound_sacks(tcph, opt, optlen);
				break;
			case TS_OPT:
				if (optlen == TS_LENGTH) {
					tcph->ts_val = nhgetl(opt + 2);
					tcph->ts_ecr = nhgetl(opt + 6);
				}
				break;
		}
		optsize -= optlen;
		opt += optlen;
	}
}
1282
1283 /* Helper, clears the opts.  We'll later set them with e.g. parse_inbound_opts,
1284  * set them manually, or something else. */
1285 static void clear_tcph_opts(Tcp *tcph)
1286 {
1287         tcph->mss = 0;
1288         tcph->ws = 0;
1289         tcph->sack_ok = FALSE;
1290         tcph->nr_sacks = 0;
1291         tcph->ts_val = 0;
1292         tcph->ts_ecr = 0;
1293 }
1294
1295 int ntohtcp6(Tcp * tcph, struct block **bpp)
1296 {
1297         Tcp6hdr *h;
1298         uint16_t hdrlen;
1299
1300         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
1301         if (*bpp == NULL)
1302                 return -1;
1303
1304         h = (Tcp6hdr *) ((*bpp)->rp);
1305         tcph->source = nhgets(h->tcpsport);
1306         tcph->dest = nhgets(h->tcpdport);
1307         tcph->seq = nhgetl(h->tcpseq);
1308         tcph->ack = nhgetl(h->tcpack);
1309         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1310         if (hdrlen < TCP6_HDRSIZE) {
1311                 freeblist(*bpp);
1312                 return -1;
1313         }
1314
1315         tcph->flags = h->tcpflag[1];
1316         tcph->wnd = nhgets(h->tcpwin);
1317         tcph->urg = nhgets(h->tcpurg);
1318         clear_tcph_opts(tcph);
1319         tcph->len = nhgets(h->ploadlen) - hdrlen;
1320
1321         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
1322         if (*bpp == NULL)
1323                 return -1;
1324         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP6_HDRSIZE);
1325         return hdrlen;
1326 }
1327
1328 int ntohtcp4(Tcp * tcph, struct block **bpp)
1329 {
1330         Tcp4hdr *h;
1331         uint16_t hdrlen;
1332
1333         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
1334         if (*bpp == NULL)
1335                 return -1;
1336
1337         h = (Tcp4hdr *) ((*bpp)->rp);
1338         tcph->source = nhgets(h->tcpsport);
1339         tcph->dest = nhgets(h->tcpdport);
1340         tcph->seq = nhgetl(h->tcpseq);
1341         tcph->ack = nhgetl(h->tcpack);
1342
1343         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1344         if (hdrlen < TCP4_HDRSIZE) {
1345                 freeblist(*bpp);
1346                 return -1;
1347         }
1348
1349         tcph->flags = h->tcpflag[1];
1350         tcph->wnd = nhgets(h->tcpwin);
1351         tcph->urg = nhgets(h->tcpurg);
1352         clear_tcph_opts(tcph);
1353         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1354
1355         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1356         if (*bpp == NULL)
1357                 return -1;
1358         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP4_HDRSIZE);
1359         return hdrlen;
1360 }
1361
/*
 *  For outgoing calls, generate an initial sequence
 *  number and put a SYN on the send queue
 */
void tcpsndsyn(struct conv *s, Tcpctl * tcb)
{
	/* random ISS; all send-side sequence state starts from it */
	urandom_read(&tcb->iss, sizeof(tcb->iss));
	tcb->rttseq = tcb->iss;
	tcb->snd.wl2 = tcb->iss;
	tcb->snd.una = tcb->iss;
	tcb->snd.rtx = tcb->rttseq;
	tcb->snd.nxt = tcb->rttseq;
	/* the SYN consumes one sequence number; FORCE makes output send now */
	tcb->flgcnt++;
	tcb->flags |= FORCE;
	tcb->sndsyntime = NOW;

	/* set desired mss and scale */
	tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale,
	                  &tcb->flags);
}
1382
/* Send a RST in response to the offending segment 'seg' (received from
 * 'source' with payload length 'length'), without any conversation state.
 * Reuses seg itself to build the reply: source/dest are swapped in the
 * pseudo-header and seq/ack are derived per RFC 793 so the reset lands in
 * the peer's window.  Never resets a RST.  'reason' is only logged. */
void
sndrst(struct Proto *tcp, uint8_t * source, uint8_t * dest,
           uint16_t length, Tcp * seg, uint8_t version, char *reason)
{
	struct block *hbp;
	uint8_t rflags;
	struct tcppriv *tpriv;
	Tcp4hdr ph4;
	Tcp6hdr ph6;

	netlog(tcp->f, Logtcpreset, "sndrst: %s\n", reason);

	tpriv = tcp->priv;

	/* never respond to a RST with a RST */
	if (seg->flags & RST)
		return;

	/* make pseudo header */
	switch (version) {
		case V4:
			memset(&ph4, 0, sizeof(ph4));
			ph4.vihl = IP_VER4;
			v6tov4(ph4.tcpsrc, dest);
			v6tov4(ph4.tcpdst, source);
			ph4.proto = IP_TCPPROTO;
			hnputs(ph4.tcplen, TCP4_HDRSIZE);
			hnputs(ph4.tcpsport, seg->dest);
			hnputs(ph4.tcpdport, seg->source);
			break;
		case V6:
			memset(&ph6, 0, sizeof(ph6));
			ph6.vcf[0] = IP_VER6;
			ipmove(ph6.tcpsrc, dest);
			ipmove(ph6.tcpdst, source);
			ph6.proto = IP_TCPPROTO;
			hnputs(ph6.ploadlen, TCP6_HDRSIZE);
			hnputs(ph6.tcpsport, seg->dest);
			hnputs(ph6.tcpdport, seg->source);
			break;
		default:
			panic("sndrst: version %d", version);
	}

	tpriv->stats[OutRsts]++;
	rflags = RST;

	/* convince the other end that this reset is in band */
	if (seg->flags & ACK) {
		/* they told us what they expect next; use it as our seq */
		seg->seq = seg->ack;
		seg->ack = 0;
	} else {
		/* no ACK: send RST|ACK covering everything they sent
		 * (SYN and FIN each count as one sequence number) */
		rflags |= ACK;
		seg->ack = seg->seq;
		seg->seq = 0;
		if (seg->flags & SYN)
			seg->ack++;
		seg->ack += length;
		if (seg->flags & FIN)
			seg->ack++;
	}
	seg->flags = rflags;
	seg->wnd = 0;
	seg->urg = 0;
	seg->mss = 0;
	seg->ws = 0;
	seg->sack_ok = FALSE;
	seg->nr_sacks = 0;
	/* seg->ts_val is already set with their timestamp */
	switch (version) {
		case V4:
			hbp = htontcp4(seg, NULL, &ph4, NULL);
			if (hbp == NULL)
				return;
			ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		case V6:
			hbp = htontcp6(seg, NULL, &ph6, NULL);
			if (hbp == NULL)
				return;
			ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		default:
			panic("sndrst2: version %d", version);
	}
}
1468
/*
 *  send a reset to the remote side and close the conversation
 *  called with s qlocked
 *
 *  Errors while building/sending the RST are deliberately discarded
 *  (waserror/poperror with no rethrow): the local close proceeds whether
 *  or not the reset made it out.
 */
static void tcphangup(struct conv *s)
{
	ERRSTACK(1);
	Tcp seg;
	Tcpctl *tcb;
	struct block *hbp;

	tcb = (Tcpctl *) s->ptcl;
	/* only send a RST if we ever had a remote address */
	if (ipcmp(s->raddr, IPnoaddr)) {
		/* discard error style, poperror regardless */
		if (!waserror()) {
			seg.flags = RST | ACK;
			seg.ack = tcb->rcv.nxt;
			tcb->last_ack_sent = seg.ack;
			tcb->rcv.una = 0;
			seg.seq = tcb->snd.nxt;
			seg.wnd = 0;
			seg.urg = 0;
			seg.mss = 0;
			seg.ws = 0;
			seg.sack_ok = FALSE;
			seg.nr_sacks = 0;
			/* echo their last timestamp so the RST is accepted */
			seg.ts_val = tcb->ts_recent;
			switch (s->ipversion) {
				case V4:
					tcb->protohdr.tcp4hdr.vihl = IP_VER4;
					hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
					ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
					break;
				case V6:
					tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
					hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
					ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
					break;
				default:
					panic("tcphangup: version %d", s->ipversion);
			}
		}
		poperror();
	}
	localclose(s, NULL);
}
1515
1516 /*
1517  *  (re)send a SYN ACK
1518  */
1519 int sndsynack(struct Proto *tcp, Limbo * lp)
1520 {
1521         struct block *hbp;
1522         Tcp4hdr ph4;
1523         Tcp6hdr ph6;
1524         Tcp seg;
1525         int scale;
1526         uint8_t flag = 0;
1527
1528         /* make pseudo header */
1529         switch (lp->version) {
1530                 case V4:
1531                         memset(&ph4, 0, sizeof(ph4));
1532                         ph4.vihl = IP_VER4;
1533                         v6tov4(ph4.tcpsrc, lp->laddr);
1534                         v6tov4(ph4.tcpdst, lp->raddr);
1535                         ph4.proto = IP_TCPPROTO;
1536                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1537                         hnputs(ph4.tcpsport, lp->lport);
1538                         hnputs(ph4.tcpdport, lp->rport);
1539                         break;
1540                 case V6:
1541                         memset(&ph6, 0, sizeof(ph6));
1542                         ph6.vcf[0] = IP_VER6;
1543                         ipmove(ph6.tcpsrc, lp->laddr);
1544                         ipmove(ph6.tcpdst, lp->raddr);
1545                         ph6.proto = IP_TCPPROTO;
1546                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1547                         hnputs(ph6.tcpsport, lp->lport);
1548                         hnputs(ph6.tcpdport, lp->rport);
1549                         break;
1550                 default:
1551                         panic("sndrst: version %d", lp->version);
1552         }
1553
1554         seg.seq = lp->iss;
1555         seg.ack = lp->irs + 1;
1556         seg.flags = SYN | ACK;
1557         seg.urg = 0;
1558         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag);
1559         seg.wnd = QMAX;
1560         seg.ts_val = lp->ts_val;
1561         seg.nr_sacks = 0;
1562
1563         /* if the other side set scale, we should too */
1564         if (lp->rcvscale) {
1565                 seg.ws = scale;
1566                 lp->sndscale = scale;
1567         } else {
1568                 seg.ws = 0;
1569                 lp->sndscale = 0;
1570         }
1571         if (SACK_SUPPORTED)
1572                 seg.sack_ok = lp->sack_ok;
1573         else
1574                 seg.sack_ok = FALSE;
1575
1576         switch (lp->version) {
1577                 case V4:
1578                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1579                         if (hbp == NULL)
1580                                 return -1;
1581                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1582                         break;
1583                 case V6:
1584                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1585                         if (hbp == NULL)
1586                                 return -1;
1587                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1588                         break;
1589                 default:
1590                         panic("sndsnack: version %d", lp->version);
1591         }
1592         lp->lastsend = NOW;
1593         return 0;
1594 }
1595
/* Hash a remote IP address and port into the limbo hash table: sum the low
 * two address bytes and the port, masked to the table size (LHTMASK). */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
1597
/*
 *  put a call into limbo and respond with a SYN ACK
 *
 *  called with proto locked
 *
 *  A matching half-open entry (same 4-tuple and version) is reused; a
 *  repeated SYN just refreshes its irs.  Otherwise a new Limbo entry is
 *  created, recycling the oldest entry in this hash bucket when the global
 *  Maxlimbo cap is hit.  If the SYN ACK cannot be sent, the entry is
 *  dropped again.
 */
static void
limbo(struct conv *s, uint8_t * source, uint8_t * dest, Tcp * seg, int version)
{
	Limbo *lp, **l;
	struct tcppriv *tpriv;
	int h;

	tpriv = s->p->priv;
	h = hashipa(source, seg->source);

	/* look for an existing half-open entry for this 4-tuple; note the
	 * advance expression uses lp, which the body set on the previous
	 * iteration */
	for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
		lp = *l;
		if (lp->lport != seg->dest || lp->rport != seg->source
			|| lp->version != version)
			continue;
		if (ipcmp(lp->raddr, source) != 0)
			continue;
		if (ipcmp(lp->laddr, dest) != 0)
			continue;

		/* each new SYN restarts the retransmits */
		lp->irs = seg->seq;
		break;
	}
	lp = *l;
	if (lp == NULL) {
		/* at the cap: recycle the head of this bucket rather than
		 * allocating */
		if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
			lp = tpriv->lht[h];
			tpriv->lht[h] = lp->next;
			lp->next = NULL;
		} else {
			lp = kzmalloc(sizeof(*lp), 0);
			if (lp == NULL)
				return;
			tpriv->nlimbo++;
		}
		/* record the peer's SYN parameters for the eventual accept */
		*l = lp;
		lp->version = version;
		ipmove(lp->laddr, dest);
		ipmove(lp->raddr, source);
		lp->lport = seg->dest;
		lp->rport = seg->source;
		lp->mss = seg->mss;
		lp->rcvscale = seg->ws;
		lp->sack_ok = seg->sack_ok;
		lp->irs = seg->seq;
		lp->ts_val = seg->ts_val;
		urandom_read(&lp->iss, sizeof(lp->iss));
	}

	/* couldn't send the SYN ACK: unlink and drop the entry */
	if (sndsynack(s->p, lp) < 0) {
		*l = lp->next;
		tpriv->nlimbo--;
		kfree(lp);
	}
}
1659
/*
 *  resend SYN ACK's once every SYNACK_RXTIMER ms.
 *
 *  Best-effort scan: bails out entirely if the proto lock is contended.
 *  Entries are timed out after 5 retransmits, and retransmission is
 *  suppressed while the limbo population looks like a SYN flood.
 */
static void limborexmit(struct Proto *tcp)
{
	struct tcppriv *tpriv;
	Limbo **l, *lp;
	int h;
	int seen;
	uint64_t now;

	tpriv = tcp->priv;

	if (!canqlock(&tcp->qlock))
		return;
	seen = 0;
	now = NOW;
	/* 'seen' bounds the whole scan to nlimbo entries.  NOTE(review): the
	 * continue paths below do not advance l, so the same entry is
	 * re-examined (and re-counted) until the seen bound stops the chain -
	 * this matches the historical Plan 9 code; later entries in the
	 * bucket may be skipped that round. */
	for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
		for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
			lp = *l;
			seen++;
			/* backoff: wait (rexmits+1) intervals between sends */
			if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
				continue;

			/* time it out after 1 second */
			if (++(lp->rexmits) > 5) {
				tpriv->nlimbo--;
				*l = lp->next;
				kfree(lp);
				continue;
			}

			/* if we're being attacked, don't bother resending SYN ACK's */
			if (tpriv->nlimbo > 100)
				continue;

			if (sndsynack(tcp, lp) < 0) {
				tpriv->nlimbo--;
				*l = lp->next;
				kfree(lp);
				continue;
			}

			l = &lp->next;
		}
	}
	qunlock(&tcp->qlock);
}
1708
1709 /*
1710  *  lookup call in limbo.  if found, throw it out.
1711  *
1712  *  called with proto locked
1713  */
1714 static void
1715 limborst(struct conv *s, Tcp * segp, uint8_t * src, uint8_t * dst,
1716                  uint8_t version)
1717 {
1718         Limbo *lp, **l;
1719         int h;
1720         struct tcppriv *tpriv;
1721
1722         tpriv = s->p->priv;
1723
1724         /* find a call in limbo */
1725         h = hashipa(src, segp->source);
1726         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1727                 lp = *l;
1728                 if (lp->lport != segp->dest || lp->rport != segp->source
1729                         || lp->version != version)
1730                         continue;
1731                 if (ipcmp(lp->laddr, dst) != 0)
1732                         continue;
1733                 if (ipcmp(lp->raddr, src) != 0)
1734                         continue;
1735
1736                 /* RST can only follow the SYN */
1737                 if (segp->seq == lp->irs + 1) {
1738                         tpriv->nlimbo--;
1739                         *l = lp->next;
1740                         kfree(lp);
1741                 }
1742                 break;
1743         }
1744 }
1745
1746 /* The advertised MSS (e.g. 1460) includes any per-packet TCP options, such as
1747  * TCP timestamps.  A given packet will contain mss bytes, but only typical_mss
1748  * bytes of *data*.  If we know we'll use those options, we should adjust our
1749  * typical_mss, which will affect the cwnd. */
static void adjust_typical_mss_for_opts(Tcp *tcph, Tcpctl *tcb)
{
	uint16_t opt_size = 0;

	/* Nonzero ts_val is treated as "timestamps are in use", so every data
	 * segment will carry the TS option (plus its pre-padding). */
	if (tcph->ts_val)
		opt_size += TS_LENGTH + TS_SEND_PREPAD;
	/* Options are padded out to a 4-byte boundary on the wire. */
	opt_size = ROUNDUP(opt_size, 4);
	tcb->typical_mss -= opt_size;
}
1759
1760 /*
1761  *  come here when we finally get an ACK to our SYN-ACK.
1762  *  lookup call in limbo.  if found, create a new conversation
1763  *
1764  *  called with proto locked
1765  */
static struct conv *tcpincoming(struct conv *s, Tcp * segp, uint8_t * src,
								uint8_t * dst, uint8_t version)
{
	struct conv *new;
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	Limbo *lp, **l;
	int h;

	/* unless it's just an ack, it can't be someone coming out of limbo */
	if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
		return NULL;

	tpriv = s->p->priv;

	/* find a call in limbo */
	h = hashipa(src, segp->source);
	for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
		netlog(s->p->f, Logtcp,
			   "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
			   segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
			   lp->lport, version, lp->version);

		if (lp->lport != segp->dest || lp->rport != segp->source
			|| lp->version != version)
			continue;
		if (ipcmp(lp->laddr, dst) != 0)
			continue;
		if (ipcmp(lp->raddr, src) != 0)
			continue;

		/* we're assuming no data with the initial SYN */
		if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
			netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
				   segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
			lp = NULL;
		} else {
			/* Matched and the ack is right: unlink from limbo; we own lp
			 * (and must kfree it) from here on. */
			tpriv->nlimbo--;
			*l = lp->next;
		}
		break;
	}
	/* Either no limbo entry matched or the seq/ack didn't line up. */
	if (lp == NULL)
		return NULL;

	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
	if (new == NULL)
		return NULL;

	/* The new conv's TCB starts as a copy of the listener's; clear the
	 * CLONE flag and re-point every timer at the new conv, all off. */
	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
	tcb = (Tcpctl *) new->ptcl;
	tcb->flags &= ~CLONE;
	tcb->timer.arg = new;
	tcb->timer.state = TcptimerOFF;
	tcb->acktimer.arg = new;
	tcb->acktimer.state = TcptimerOFF;
	tcb->katimer.arg = new;
	tcb->katimer.state = TcptimerOFF;
	tcb->rtt_timer.arg = new;
	tcb->rtt_timer.state = TcptimerOFF;

	/* Receive side picks up just past the peer's SYN. */
	tcb->irs = lp->irs;
	tcb->rcv.nxt = tcb->irs + 1;
	tcb->rcv.urg = tcb->rcv.nxt;

	/* Send side picks up just past our SYN-ACK (iss + 1). */
	tcb->iss = lp->iss;
	tcb->rttseq = tcb->iss;
	tcb->snd.wl2 = tcb->iss;
	tcb->snd.una = tcb->iss + 1;
	tcb->snd.rtx = tcb->iss + 1;
	tcb->snd.nxt = tcb->iss + 1;
	tcb->flgcnt = 0;
	tcb->flags |= SYNACK;

	/* our sending max segment size cannot be bigger than what he asked for */
	if (lp->mss != 0 && lp->mss < tcb->mss) {
		tcb->mss = lp->mss;
		tcb->typical_mss = tcb->mss;
	}
	adjust_typical_mss_for_opts(segp, tcb);

	/* Here's where we record the previously-decided header options.  They were
	 * actually decided on when we agreed to them in the SYNACK we sent.  We
	 * didn't create an actual TCB until now, so we can copy those decisions out
	 * of the limbo tracker and into the TCB. */
	tcb->sack_ok = lp->sack_ok;
	/* window scaling */
	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);

	tcb->snd.wnd = segp->wnd;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* set initial round trip time */
	tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
	tcpsynackrtt(new);

	kfree(lp);

	/* set up proto header */
	switch (version) {
		case V4:
			h4 = &tcb->protohdr.tcp4hdr;
			memset(h4, 0, sizeof(*h4));
			h4->proto = IP_TCPPROTO;
			hnputs(h4->tcpsport, new->lport);
			hnputs(h4->tcpdport, new->rport);
			v6tov4(h4->tcpsrc, dst);
			v6tov4(h4->tcpdst, src);
			break;
		case V6:
			h6 = &tcb->protohdr.tcp6hdr;
			memset(h6, 0, sizeof(*h6));
			h6->proto = IP_TCPPROTO;
			hnputs(h6->tcpsport, new->lport);
			hnputs(h6->tcpdport, new->rport);
			ipmove(h6->tcpsrc, dst);
			ipmove(h6->tcpdst, src);
			break;
		default:
			panic("tcpincoming: version %d", new->ipversion);
	}

	tcpsetstate(new, Established);

	/* Make the new conv findable by incoming packets. */
	iphtadd(&tpriv->ht, new);

	return new;
}
1896
/* Is x inside the closed sequence-number window [low, high]?  The window may
 * wrap around 2^32, in which case x can sit on either side of the wrap. */
int seq_within(uint32_t x, uint32_t low, uint32_t high)
{
	if (low <= high)
		return (low <= x && x <= high) ? 1 : 0;
	return (x >= low || x <= high) ? 1 : 0;
}
1908
/* Sequence-space comparison: x strictly before y, mod 2^32.  The signed view
 * of the 32-bit difference handles wraparound. */
int seq_lt(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff < 0;
}
1913
/* Sequence-space comparison: x at or before y, mod 2^32. */
int seq_le(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff <= 0;
}
1918
/* Sequence-space comparison: x strictly after y, mod 2^32. */
int seq_gt(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff > 0;
}
1923
/* Sequence-space comparison: x at or after y, mod 2^32. */
int seq_ge(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff >= 0;
}
1928
/* Later of two sequence numbers, in wraparound (mod 2^32) order.  The signed
 * difference is the same test seq_ge() performs, inlined here. */
static uint32_t seq_max(uint32_t x, uint32_t y)
{
	if ((int)(x - y) >= 0)
		return x;
	return y;
}
1933
/* Earlier of two sequence numbers, in wraparound (mod 2^32) order.  The
 * signed difference is the same test seq_le() performs, inlined here. */
static uint32_t seq_min(uint32_t x, uint32_t y)
{
	if ((int)(x - y) <= 0)
		return x;
	return y;
}
1938
1939 /*
 *  use the time between the first SYN and its ack as the
1941  *  initial round trip time
1942  */
void tcpsynackrtt(struct conv *s)
{
	Tcpctl *tcb;
	uint64_t delta;
	struct tcppriv *tpriv;

	tcb = (Tcpctl *) s->ptcl;
	tpriv = s->p->priv;

	/* Seed the smoothed RTT and mean deviation from the SYN/SYN-ACK
	 * exchange time.  Both are stored pre-scaled (LOGAGAIN/LOGDGAIN
	 * shifts) so the per-ACK update in update() can use cheap shifts. */
	delta = NOW - tcb->sndsyntime;
	tcb->srtt = delta << LOGAGAIN;
	tcb->mdev = delta << LOGDGAIN;

	/* halt round trip timer */
	tcphalt(tpriv, &tcb->rtt_timer);
}
1959
1960 /* For LFNs (long/fat), our default tx queue doesn't hold enough data, and TCP
1961  * blocks on the application - even if the app already has the data ready to go.
1962  * We need to hold the sent, unacked data (1x cwnd), plus all the data we might
1963  * send next RTT (1x cwnd).  Note this is called after cwnd was expanded. */
1964 static void adjust_tx_qio_limit(struct conv *s)
1965 {
1966         Tcpctl *tcb = (Tcpctl *) s->ptcl;
1967         size_t ideal_limit = tcb->cwind * 2;
1968
1969         /* This is called for every ACK, and it's not entirely free to update the
1970          * limit (locks, CVs, taps).  Updating in chunks of mss seems reasonable.
1971          * During SS, we'll update this on most ACKs (given each ACK increased the
1972          * cwind by > MSS).
1973          *
1974          * We also don't want a lot of tiny blocks from the user, but the way qio
1975          * works, you can put in as much as you want (Maxatomic) and then get
1976          * flow-controlled. */
1977         if (qgetlimit(s->wq) + tcb->typical_mss < ideal_limit)
1978                 qsetlimit(s->wq, ideal_limit);
1979         /* TODO: we could shrink the qio limit too, if we had a better idea what the
1980          * actual threshold was.  We want the limit to be the 'stable' cwnd * 2. */
1981 }
1982
1983 /* Attempts to merge later sacks into sack 'into' (index in the array) */
static void merge_sacks_into(Tcpctl *tcb, int into)
{
	struct sack_block *into_sack = &tcb->snd.sacks[into];
	struct sack_block *tcb_sack;
	int shift = 0;

	/* Sacks are kept sorted by left edge.  Absorb every later sack that
	 * overlaps or abuts into_sack (break at the first strictly-disjoint
	 * one), extending into_sack's right edge as needed. */
	for (int i = into + 1; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(into_sack->right, tcb_sack->left))
			break;
		if (seq_gt(tcb_sack->right, into_sack->right))
			into_sack->right = tcb_sack->right;
		shift++;
	}
	if (shift) {
		/* Close the hole left by the 'shift' absorbed entries; the
		 * count is the number of sacks remaining past the absorbed
		 * run (nr_sacks - (into + 1 + shift)). */
		memmove(tcb->snd.sacks + into + 1,
			tcb->snd.sacks + into + 1 + shift,
			sizeof(struct sack_block) * (tcb->snd.nr_sacks - into - 1
							     - shift));
		tcb->snd.nr_sacks -= shift;
	}
}
2006
2007 /* If we update a sack, it means they received a packet (possibly out of order),
2008  * but they have not received earlier packets.  Otherwise, they would do a full
2009  * ACK.
2010  *
2011  * The trick is in knowing whether the reception growing this sack is due to a
2012  * retrans or due to packets from before our last loss event.  The rightmost
2013  * sack tends to grow a lot with packets we sent before the loss.  However,
2014  * intermediate sacks that grow are signs of a loss, since they only grow as a
2015  * result of retrans.
2016  *
2017  * This is only true for the first time through a retrans.  After we've gone
2018  * through a full retrans blast, the sack that hinted at the retrans loss (and
2019  * there could be multiple of them!) will continue to grow.  We could come up
2020  * with some tracking for this, but instead we'll just do a one-time deal.  You
2021  * can recover from one detected sack retrans loss.  After that, you'll have to
2022  * use the RTO.
2023  *
2024  * This won't catch some things, like a sack that grew and merged with the
2025  * rightmost sack.  This also won't work if you have a single sack.  We can't
2026  * tell where the retrans ends and the sending begins. */
2027 static bool sack_hints_at_loss(Tcpctl *tcb, struct sack_block *tcb_sack)
2028 {
2029         if (tcb->snd.recovery != SACK_RETRANS_RECOVERY)
2030                 return FALSE;
2031         return &tcb->snd.sacks[tcb->snd.nr_sacks - 1] != tcb_sack;
2032 }
2033
2034 static bool sack_contains(struct sack_block *tcb_sack, uint32_t seq)
2035 {
2036         return seq_le(tcb_sack->left, seq) && seq_lt(seq, tcb_sack->right);
2037 }
2038
2039 /* Debugging helper! */
2040 static void sack_asserter(Tcpctl *tcb, char *str)
2041 {
2042         struct sack_block *tcb_sack;
2043
2044         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
2045                 tcb_sack = &tcb->snd.sacks[i];
2046                 /* Checking invariants: snd.rtx is never inside a sack, sacks are always
2047                  * mutually exclusive. */
2048                 if (sack_contains(tcb_sack, tcb->snd.rtx) ||
2049                     ((i + 1 < tcb->snd.nr_sacks) && seq_ge(tcb_sack->right,
2050                                                                (tcb_sack + 1)->left))) {
2051                         printk("SACK ASSERT ERROR at %s\n", str);
2052                         printk("rtx %u una %u nxt %u, sack [%u, %u)\n",
2053                                tcb->snd.rtx, tcb->snd.una, tcb->snd.nxt, tcb_sack->left,
2054                                    tcb_sack->right);
2055                         for (int i = 0; i < tcb->snd.nr_sacks; i++)
2056                                 printk("\t %d: [%u, %u)\n", i, tcb->snd.sacks[i].left,
2057                                        tcb->snd.sacks[i].right);
2058                         backtrace();
2059                         panic("");
2060                 }
2061         }
2062 }
2063
2064 /* Updates bookkeeping whenever a sack is added or updated */
static void sack_has_changed(struct conv *s, Tcpctl *tcb,
                             struct sack_block *tcb_sack)
{
	/* Due to the change, snd.rtx might be in the middle of this sack.  Advance
	 * it to the right edge. */
	if (sack_contains(tcb_sack, tcb->snd.rtx))
		tcb->snd.rtx = tcb_sack->right;

	/* This is a sack for something we retransed and we think it means there was
	 * another loss.  Instead of waiting for the RTO, we can take action. */
	if (sack_hints_at_loss(tcb, tcb_sack)) {
		/* Same threshold as classic dupacks: several hints before we
		 * declare a loss. */
		if (++tcb->snd.sack_loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: sack rxmit loss: snd.rtx %u, sack [%u,%u), una %u, recovery_pt %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.rtx, tcb_sack->left, tcb_sack->right, tcb->snd.una,
			       tcb->snd.recovery_pt);
			/* Redo retrans, but keep the sacks and recovery point */
			tcp_loss_event(s, tcb);
			tcb->snd.rtx = tcb->snd.una;
			tcb->snd.sack_loss_hint = 0;
			/* Act like an RTO.  We just detected it earlier.  This prevents us
			 * from getting another sack hint loss this recovery period and from
			 * advancing the opportunistic right edge. */
			tcb->snd.recovery = RTO_RETRANS_RECOVERY;
			/* We didn't actually time out yet and we expect to keep getting
			 * sacks, so we don't want to flush or worry about in_flight.  If we
			 * messed something up, the RTO will still fire. */
			set_in_flight(tcb);
		}
	}
}
2097
2098 /* Advances tcb_sack's right edge, if new_right is farther, and updates the
2099  * bookkeeping due to the change. */
2100 static void update_right_edge(struct conv *s, Tcpctl *tcb,
2101                               struct sack_block *tcb_sack, uint32_t new_right)
2102 {
2103         if (seq_le(new_right, tcb_sack->right))
2104                 return;
2105         tcb_sack->right = new_right;
2106         merge_sacks_into(tcb, tcb_sack - tcb->snd.sacks);
2107         sack_has_changed(s, tcb, tcb_sack);
2108 }
2109
/* Folds one sack from a segment into the TCB's sorted, mutually-exclusive
 * sack array: grow an existing sack, insert in place, or append. */
static void update_or_insert_sack(struct conv *s, Tcpctl *tcb,
                                  struct sack_block *seg_sack)
{
	struct sack_block *tcb_sack;

	/* Scan (sorted by left edge) for the first sack at or past
	 * seg_sack's left edge. */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb_sack->left, seg_sack->left)) {
			/* This includes adjacent (which I've seen!) and overlap. */
			if (seq_le(seg_sack->left, tcb_sack->right)) {
				update_right_edge(s, tcb, tcb_sack, seg_sack->right);
				return;
			}
			continue;
		}
		/* Update existing sack */
		if (tcb_sack->left == seg_sack->left) {
			update_right_edge(s, tcb, tcb_sack, seg_sack->right);
			return;
		}
		/* Found our slot */
		if (seq_gt(tcb_sack->left, seg_sack->left)) {
			if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
				/* Out of room, but it is possible this sack overlaps later
				 * sacks, including the max sack's right edge. */
				if (seq_ge(seg_sack->right, tcb_sack->left)) {
					/* Take over the sack */
					tcb_sack->left = seg_sack->left;
					update_right_edge(s, tcb, tcb_sack, seg_sack->right);
				}
				return;
			}
			/* O/W, it's our slot and we have room (at least one spot). */
			memmove(&tcb->snd.sacks[i + 1], &tcb->snd.sacks[i],
				sizeof(struct sack_block) * (tcb->snd.nr_sacks - i));
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			tcb->snd.nr_sacks++;
			merge_sacks_into(tcb, i);
			sack_has_changed(s, tcb, tcb_sack);
			return;
		}
	}
	/* seg_sack is to the right of every existing sack. */
	if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
		/* We didn't find space in the sack array. */
		tcb_sack = &tcb->snd.sacks[MAX_NR_SND_SACKS - 1];
		/* Need to always maintain the rightmost sack, discarding the prev */
		if (seq_gt(seg_sack->right, tcb_sack->right)) {
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			sack_has_changed(s, tcb, tcb_sack);
		}
		return;
	}
	/* Append as the new rightmost sack. */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks];
	tcb->snd.nr_sacks++;
	tcb_sack->left = seg_sack->left;
	tcb_sack->right = seg_sack->right;
	sack_has_changed(s, tcb, tcb_sack);
}
2170
2171 /* Given the packet seg, track the sacks in TCB.  There are a few things: if seg
2172  * acks new data, some sacks might no longer be needed.  Some sacks might grow,
2173  * we might add new sacks, either of which can cause a merger.
2174  *
2175  * The important thing is that we always have the max sack entry: it must be
2176  * inserted for sure and findable.  We need that for our measurement of what
2177  * packets are in the network.
2178  *
2179  * Note that we keep sacks that are below snd.rtx (and above
2180  * seg.ack/tcb->snd.una) as best we can - we don't prune them.  We'll need those
2181  * for the in_flight estimate.
2182  *
2183  * When we run out of room, we'll have to throw away a sack.  Anything we throw
2184  * away below snd.rtx will be counted as 'in flight', even though it isn't.  If
2185  * we throw away something greater than snd.rtx, we'll also retrans it.  For
2186  * simplicity, we throw-away / replace the rightmost sack, since we're always
2187  * maintaining a highest sack. */
static void update_sacks(struct conv *s, Tcpctl *tcb, Tcp *seg)
{
	int prune = 0;
	struct sack_block *tcb_sack;

	/* First, drop any of our sacks that this segment's cumulative ACK now
	 * covers (they're sorted, so count from the left). */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		/* For the equality case, if they acked up to, but not including an old
		 * sack, they must have reneged it.  Otherwise they would have acked
		 * beyond the sack. */
		if (seq_lt(seg->ack, tcb_sack->left))
			break;
		prune++;
	}
	if (prune) {
		memmove(tcb->snd.sacks, tcb->snd.sacks + prune,
			sizeof(struct sack_block) * (tcb->snd.nr_sacks - prune));
		tcb->snd.nr_sacks -= prune;
	}
	/* Then fold in the sacks reported in this segment, skipping ones that
	 * are stale or out of range. */
	for (int i = 0; i < seg->nr_sacks; i++) {
		/* old sacks */
		if (seq_lt(seg->sacks[i].left, seg->ack))
			continue;
		/* buggy sack: out of range */
		if (seq_gt(seg->sacks[i].right, tcb->snd.nxt))
			continue;
		update_or_insert_sack(s, tcb, &seg->sacks[i]);
	}
}
2217
2218 /* This is a little bit of an under estimate, since we assume a packet is lost
2219  * once we have any sacks above it.  Overall, it's at most 2 * MSS of an
2220  * overestimate.
2221  *
2222  * If we have no sacks (either reneged or never used) we'll assume all packets
2223  * above snd.rtx are lost.  This will be the case for sackless fast rxmit
2224  * (Dong's stuff) or for a timeout.  In the former case, this is probably not
2225  * true, and in_flight should be higher, but we have no knowledge without the
2226  * sacks. */
static void set_in_flight(Tcpctl *tcb)
{
	struct sack_block *tcb_sack;
	uint32_t in_flight = 0;
	uint32_t from;

	/* No sacks: treat everything from una to the retrans point as in
	 * flight (see the comment above about over/under-estimates). */
	if (!tcb->snd.nr_sacks) {
		tcb->snd.in_flight = tcb->snd.rtx - tcb->snd.una;
		return;
	}

	/* Everything to the right of the unsacked */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
	in_flight += tcb->snd.nxt - tcb_sack->right;

	/* Everything retransed (from una to snd.rtx, minus sacked regions.  Note
	 * we only retrans at most the last sack's left edge.  snd.rtx will be
	 * advanced to the right edge of some sack (possibly the last one). */
	from = tcb->snd.una;
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_ge(tcb_sack->left, tcb->snd.rtx))
			break;
		/* Invariant: snd.rtx is never inside a sack, so it's at or past
		 * this sack's right edge. */
		assert(seq_ge(tcb->snd.rtx, tcb_sack->right));
		in_flight += tcb_sack->left - from;
		from = tcb_sack->right;
	}
	in_flight += tcb->snd.rtx - from;

	tcb->snd.in_flight = in_flight;
}
2258
/* Leaves recovery mode: logs the transition and clears all recovery-tracking
 * state and loss hints. */
static void reset_recovery(struct conv *s, Tcpctl *tcb)
{
	netlog(s->p->f, Logtcprxmt,
	       "%I.%d -> %I.%d: recovery complete, una %u, rtx %u, nxt %u, recovery %u\n",
	       s->laddr, s->lport, s->raddr, s->rport,
	       tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, tcb->snd.recovery_pt);
	tcb->snd.recovery = 0;
	tcb->snd.recovery_pt = 0;
	tcb->snd.loss_hint = 0;
	tcb->snd.flush_sacks = FALSE;
	tcb->snd.sack_loss_hint = 0;
}
2271
2272 static bool is_dup_ack(Tcpctl *tcb, Tcp *seg)
2273 {
2274         /* this is a pure ack w/o window update */
2275         return (seg->ack == tcb->snd.una) &&
2276                (tcb->snd.una != tcb->snd.nxt) &&
2277                (seg->len == 0) &&
2278                (seg->wnd == tcb->snd.wnd);
2279 }
2280
2281 /* If we have sacks, we'll ignore dupacks and look at the sacks ahead of una
2282  * (which are managed by the TCB).  The tcb will not have old sacks (below
2283  * ack/snd.rtx).  Receivers often send sacks below their ack point when we are
2284  * coming out of a loss, and we don't want those to count.
2285  *
2286  * Note the tcb could have sacks (in the future), but the receiver stopped using
2287  * them (reneged).  We'll catch that with the RTO.  If we try to catch it here,
2288  * we could get in a state where we never allow them to renege. */
2289 static bool is_potential_loss(Tcpctl *tcb, Tcp *seg)
2290 {
2291         if (seg->nr_sacks > 0)
2292                 return tcb->snd.nr_sacks > 0;
2293         else
2294                 return is_dup_ack(tcb, seg);
2295 }
2296
2297 void update(struct conv *s, Tcp * seg)
2298 {
2299         int rtt, delta;
2300         Tcpctl *tcb;
2301         uint32_t acked, expand;
2302         struct tcppriv *tpriv;
2303
2304         tpriv = s->p->priv;
2305         tcb = (Tcpctl *) s->ptcl;
2306
2307         /* if everything has been acked, force output(?) */
2308         if (seq_gt(seg->ack, tcb->snd.nxt)) {
2309                 tcb->flags |= FORCE;
2310                 return;
2311         }
2312
2313         acked = seg->ack - tcb->snd.una;
2314         tcb->snd.una = seg->ack;
2315         if (seq_gt(seg->ack, tcb->snd.rtx))
2316                 tcb->snd.rtx = seg->ack;
2317
2318         update_sacks(s, tcb, seg);
2319         set_in_flight(tcb);
2320
2321         /* We treat either a dupack or forward SACKs as a hint that there is a loss.
2322          * The RFCs suggest three dupacks before treating it as a loss (alternative
2323          * is reordered packets).  We'll treat three SACKs the same way. */
2324         if (is_potential_loss(tcb, seg) && !tcb->snd.recovery) {
2325                 tcb->snd.loss_hint++;
2326                 if (tcb->snd.loss_hint == TCPREXMTTHRESH) {
2327                         netlog(s->p->f, Logtcprxmt,
2328                                "%I.%d -> %I.%d: loss hint thresh, nr sacks %u, nxt %u, una %u, cwnd %u\n",
2329                                s->laddr, s->lport, s->raddr, s->rport,
2330                                tcb->snd.nr_sacks, tcb->snd.nxt, tcb->snd.una, tcb->cwind);
2331                         tcp_loss_event(s, tcb);
2332                         tcb->snd.recovery_pt = tcb->snd.nxt;
2333                         if (tcb->snd.nr_sacks) {
2334                                 tcb->snd.recovery = SACK_RETRANS_RECOVERY;
2335                                 tcb->snd.flush_sacks = FALSE;
2336                                 tcb->snd.sack_loss_hint = 0;
2337                         } else {
2338                                 tcb->snd.recovery = FAST_RETRANS_RECOVERY;
2339                         }
2340                         tcprxmit(s);
2341                 }
2342         }
2343
2344         /*
2345          *  update window
2346          */
2347         if (seq_gt(seg->ack, tcb->snd.wl2)
2348                 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
2349                 tcb->snd.wnd = seg->wnd;
2350                 tcb->snd.wl2 = seg->ack;
2351         }
2352
2353         if (!acked) {
2354                 /*
2355                  *  don't let us hangup if sending into a closed window and
2356                  *  we're still getting acks
2357                  */
2358                 if (tcb->snd.recovery && (tcb->snd.wnd == 0))
2359                         tcb->backedoff = MAXBACKMS / 4;
2360                 return;
2361         }
2362         /* At this point, they have acked something new. (positive ack, ack > una).
2363          *
2364          * If we hadn't reached the threshold for recovery yet, the positive ACK
2365          * will reset our loss_hint count. */
2366         if (!tcb->snd.recovery)
2367                 tcb->snd.loss_hint = 0;
2368         else if (seq_ge(seg->ack, tcb->snd.recovery_pt))
2369                 reset_recovery(s, tcb);
2370
2371         /* avoid slow start and timers for SYN acks */
2372         if ((tcb->flags & SYNACK) == 0) {
2373                 tcb->flags |= SYNACK;
2374                 acked--;
2375                 tcb->flgcnt--;
2376                 goto done;
2377         }
2378
2379         /* slow start as long as we're not recovering from lost packets */
2380         if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
2381                 if (tcb->cwind < tcb->ssthresh) {
2382                         /* We increase the cwind by every byte we receive.  We want to
2383                          * increase the cwind by one MSS for every MSS that gets ACKed.
2384                          * Note that multiple MSSs can be ACKed in a single ACK.  If we had
2385                          * a remainder of acked / MSS, we'd add just that remainder - not 0
2386                          * or 1 MSS. */
2387                         expand = acked;
2388                 } else {
2389                         /* Every RTT, which consists of CWND bytes, we're supposed to expand
2390                          * by MSS bytes.  The classic algorithm was
2391                          *              expand = (tcb->mss * tcb->mss) / tcb->cwind;
2392                          * which assumes the ACK was for MSS bytes.  Instead, for every
2393                          * 'acked' bytes, we increase the window by acked / CWND (in units
2394                          * of MSS). */
2395                         expand = MAX(acked, tcb->typical_mss) * tcb->typical_mss
2396                                  / tcb->cwind;
2397                 }
2398
2399                 if (tcb->cwind + expand < tcb->cwind)
2400                         expand = tcb->snd.wnd - tcb->cwind;
2401                 if (tcb->cwind + expand > tcb->snd.wnd)
2402                         expand = tcb->snd.wnd - tcb->cwind;
2403                 tcb->cwind += expand;
2404         }
2405         adjust_tx_qio_limit(s);
2406
2407         /* Adjust the timers according to the round trip time */
2408         if (tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
2409                 tcphalt(tpriv, &tcb->rtt_timer);
2410                 if (!tcb->snd.recovery) {
2411                         tcb->backoff = 0;
2412                         tcb->backedoff = 0;
2413                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
2414                         if (rtt == 0)
2415                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
2416                         rtt *= MSPTICK;
2417                         if (tcb->srtt == 0) {
2418                                 tcb->srtt = rtt << LOGAGAIN;
2419                                 tcb->mdev = rtt << LOGDGAIN;
2420                         } else {
2421                                 delta = rtt - (tcb->srtt >> LOGAGAIN);
2422                                 tcb->srtt += delta;
2423                                 if (tcb->srtt <= 0)
2424                                         tcb->srtt = 1;
2425
2426                                 delta = abs(delta) - (tcb->mdev >> LOGDGAIN);
2427                                 tcb->mdev += delta;
2428                                 if (tcb->mdev <= 0)
2429                                         tcb->mdev = 1;
2430                         }
2431                         tcpsettimer(tcb);
2432                 }
2433         }
2434
2435 done:
2436         if (qdiscard(s->wq, acked) < acked)
2437                 tcb->flgcnt--;
2438
2439         if (seq_gt(seg->ack, tcb->snd.urg))
2440                 tcb->snd.urg = seg->ack;
2441
2442         if (tcb->snd.una != tcb->snd.nxt)
2443                 tcpgo(tpriv, &tcb->timer);
2444         else
2445                 tcphalt(tpriv, &tcb->timer);
2446
2447         tcb->backoff = 0;
2448         tcb->backedoff = 0;
2449 }
2450
2451 static void update_tcb_ts(Tcpctl *tcb, Tcp *seg)
2452 {
2453         /* Get timestamp info from the tcp header.  Even though the timestamps
2454          * aren't sequence numbers, we still need to protect for wraparound.  Though
2455          * if the values were 0, assume that means we need an update.  We could have
2456          * an initial ts_val that appears negative (signed). */
2457         if (!tcb->ts_recent || !tcb->last_ack_sent ||
2458             (seq_ge(seg->ts_val, tcb->ts_recent) &&
2459              seq_le(seg->seq, tcb->last_ack_sent)))
2460                 tcb->ts_recent = seg->ts_val;
2461 }
2462
2463 /* Overlap happens when one sack's left edge is inside another sack. */
2464 static bool sacks_overlap(struct sack_block *x, struct sack_block *y)
2465 {
2466         return (seq_le(x->left, y->left) && seq_le(y->left, x->right)) ||
2467                (seq_le(y->left, x->left) && seq_le(x->left, y->right));
2468 }
2469
2470 static void make_sack_first(Tcpctl *tcb, struct sack_block *tcb_sack)
2471 {
2472         struct sack_block temp;
2473
2474         if (tcb_sack == &tcb->rcv.sacks[0])
2475                 return;
2476         temp = tcb->rcv.sacks[0];
2477         tcb->rcv.sacks[0] = *tcb_sack;
2478         *tcb_sack = temp;
2479 }
2480
2481 /* Track sack in our tcb for a block of data we received.  This handles all the
2482  * stuff: making sure sack is first (since it's the most recent sack change),
2483  * updating or merging sacks, and dropping excess sacks (we only need to
2484  * maintain 3).  Unlike on the snd side, our tcb sacks are *not* sorted. */
2485 static void track_rcv_sack(Tcpctl *tcb, uint32_t left, uint32_t right)
2486 {
2487         struct sack_block *tcb_sack;
2488         struct sack_block sack[1];
2489
2490         if (!tcb->sack_ok)
2491                 return;
2492         assert(seq_lt(left, right));
2493         sack->left = left;
2494         sack->right = right;
2495         /* We can reuse an existing sack if we're merging or overlapping. */
2496         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2497                 tcb_sack = &tcb->rcv.sacks[i];
2498                 if (sacks_overlap(tcb_sack, sack)) {
2499                         tcb_sack->left = seq_min(tcb_sack->left, sack->left);
2500                         tcb_sack->right = seq_max(tcb_sack->right, sack->right);
2501                         make_sack_first(tcb, tcb_sack);
2502                         return;
2503                 }
2504         }
2505         /* We can discard the last sack (right shift) - we should have sent it at
2506          * least once by now.  If not, oh well. */
2507         memmove(tcb->rcv.sacks + 1, tcb->rcv.sacks, sizeof(struct sack_block) *
2508                 MIN(MAX_NR_RCV_SACKS - 1, tcb->rcv.nr_sacks));
2509         tcb->rcv.sacks[0] = *sack;
2510         if (tcb->rcv.nr_sacks < MAX_NR_RCV_SACKS)
2511                 tcb->rcv.nr_sacks++;
2512 }
2513
2514 /* Once we receive everything and move rcv.nxt past a sack, we don't need to
2515  * track it.  I've seen Linux report sacks in the past, but we probably
2516  * shouldn't. */
2517 static void drop_old_rcv_sacks(Tcpctl *tcb)
2518 {
2519         struct sack_block *tcb_sack;
2520
2521         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2522                 tcb_sack = &tcb->rcv.sacks[i];
2523                 /* Moving up to or past the left is enough to drop it. */
2524                 if (seq_ge(tcb->rcv.nxt, tcb_sack->left)) {
2525                         memmove(tcb->rcv.sacks + i, tcb->rcv.sacks + i + 1,
2526                                 sizeof(struct sack_block) * (tcb->rcv.nr_sacks - i - 1));
2527                         tcb->rcv.nr_sacks--;
2528                         i--;
2529                 }
2530         }
2531 }
2532
/* Protocol input: a TCP segment arrives from the IP layer in 'bp'.
 *
 * Verifies the checksum and parses the header (v4 or v6), trims the block to
 * the datagram's claimed length, looks up the owning conversation by n-tuple,
 * dispatches Bypass conversations, handles listeners (limbo/incoming), and
 * then runs the segment-arrival state machine with the conv's qlock held.
 *
 * Locking: takes tcp->qlock for the listener/limbo work, then hands off to
 * s->qlock (qlock(&s->qlock) before qunlock(&tcp->qlock)) for the per-conv
 * state machine.  All exits below that point go through the waserror() frame:
 * output/raise unlock s->qlock and poperror().  'bp' is consumed on every
 * path (freed, queued, or resequenced). */
void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
{
	ERRSTACK(1);
	Tcp seg;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int hdrlen;
	Tcpctl *tcb;
	uint16_t length;
	uint8_t source[IPaddrlen], dest[IPaddrlen];
	struct conv *s;
	struct Fs *f;
	struct tcppriv *tpriv;
	uint8_t version;

	f = tcp->f;
	tpriv = tcp->priv;

	tpriv->stats[InSegs]++;

	/* Both header views alias bp->rp; the vihl nibble picks which is real. */
	h4 = (Tcp4hdr *) (bp->rp);
	h6 = (Tcp6hdr *) (bp->rp);

	if ((h4->vihl & 0xF0) == IP_VER4) {
		uint8_t ttl;

		version = V4;
		length = nhgets(h4->length);
		v4tov6(dest, h4->tcpdst);
		v4tov6(source, h4->tcpsrc);

		/* ttl isn't part of the xsum pseudo header, but bypass needs it. */
		ttl = h4->Unused;
		h4->Unused = 0;
		hnputs(h4->tcplen, length - TCP4_PKT);
		/* Skip the software checksum if the NIC already verified it
		 * (Btcpck) or the sender sent a zero checksum. */
		if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
			ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp proto cksum\n");
			freeblist(bp);
			return;
		}
		h4->Unused = ttl;

		/* ntohtcp4 consumes bp on error, so no freeblist here. */
		hdrlen = ntohtcp4(&seg, &bp);
		if (hdrlen < 0) {
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp hdr len\n");
			return;
		}

		s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
		if (s && s->state == Bypass) {
			bypass_or_drop(s, bp);
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen + TCP4_PKT;
		bp = trimblock(bp, hdrlen + TCP4_PKT, length);
		if (bp == NULL) {
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcp len < 0 after trim\n");
			return;
		}
	} else {
		int ttl = h6->ttl;
		int proto = h6->proto;

		version = V6;
		length = nhgets(h6->ploadlen);
		ipmove(dest, h6->tcpdst);
		ipmove(source, h6->tcpsrc);

		/* Build the v6 pseudo-header in place for the checksum: zero
		 * ploadlen/proto, stash proto in the ttl slot, length in vcf.
		 * The real values are restored right after. */
		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
		h6->ttl = proto;
		hnputl(h6->vcf, length);
		if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
			ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp proto cksum\n");
			freeblist(bp);
			return;
		}
		h6->ttl = ttl;
		h6->proto = proto;
		hnputs(h6->ploadlen, length);

		/* ntohtcp6 consumes bp on error, so no freeblist here. */
		hdrlen = ntohtcp6(&seg, &bp);
		if (hdrlen < 0) {
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp hdr len\n");
			return;
		}

		s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
		if (s && s->state == Bypass) {
			bypass_or_drop(s, bp);
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen;
		bp = trimblock(bp, hdrlen + TCP6_PKT, length);
		if (bp == NULL) {
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcp len < 0 after trim\n");
			return;
		}
	}

	/* s, the conv matching the n-tuple, was set above */
	if (s == NULL) {
		netlog(f, Logtcpreset, "iphtlook failed: src %I:%u, dst %I:%u\n",
		       source, seg.source, dest, seg.dest);
reset:
		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
		freeblist(bp);
		return;
	}

	/* lock protocol for unstate Plan 9 invariants.  funcs like limbo or
	 * incoming might rely on it. */
	qlock(&tcp->qlock);

	/* if it's a listener, look for the right flags and get a new conv */
	tcb = (Tcpctl *) s->ptcl;
	if (tcb->state == Listen) {
		if (seg.flags & RST) {
			limborst(s, &seg, source, dest, version);
			qunlock(&tcp->qlock);
			freeblist(bp);
			return;
		}

		/* if this is a new SYN, put the call into limbo */
		if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
			limbo(s, source, dest, &seg, version);
			qunlock(&tcp->qlock);
			freeblist(bp);
			return;
		}

		/* if there's a matching call in limbo, tcpincoming will return it */
		s = tcpincoming(s, &seg, source, dest, version);
		if (s == NULL) {
			qunlock(&tcp->qlock);
			goto reset;
		}
	}

	/* The rest of the input state machine is run with the control block
	 * locked and implements the state machine directly out of the RFC.
	 * Out-of-band data is ignored - it was always a bad idea.
	 */
	tcb = (Tcpctl *) s->ptcl;
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	/* Lock hand-off: grab the conv lock before releasing the proto lock. */
	qlock(&s->qlock);
	qunlock(&tcp->qlock);

	update_tcb_ts(tcb, &seg);
	/* fix up window */
	seg.wnd <<= tcb->rcv.scale;

	/* every input packet in puts off the keep alive time out */
	tcpsetkacounter(tcb);

	/* Pre-established states are handled fully here; established states
	 * fall out of the switch into the common path below. */
	switch (tcb->state) {
		case Closed:
			sndrst(tcp, source, dest, length, &seg, version,
				   "sending to Closed");
			goto raise;
		case Syn_sent:
			if (seg.flags & ACK) {
				if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
					sndrst(tcp, source, dest, length, &seg, version,
						   "bad seq in Syn_sent");
					goto raise;
				}
			}
			if (seg.flags & RST) {
				/* Only an acceptable (ACKed) RST refuses the connection. */
				if (seg.flags & ACK)
					localclose(s, "connection refused");
				goto raise;
			}

			if (seg.flags & SYN) {
				procsyn(s, &seg);
				if (seg.flags & ACK) {
					update(s, &seg);
					tcpsynackrtt(s);
					tcpsetstate(s, Established);
					/* Here's where we get the results of header option
					 * negotiations for connections we started. (SYNACK has the
					 * response) */
					tcpsetscale(s, tcb, seg.ws, tcb->scale);
					tcb->sack_ok = seg.sack_ok;
				} else {
					sndrst(tcp, source, dest, length, &seg, version,
						   "Got SYN with no ACK");
					goto raise;
				}

				/* Data or FIN piggybacked on the SYNACK: keep
				 * processing in the common path below. */
				if (length != 0 || (seg.flags & FIN))
					break;

				freeblist(bp);
				goto output;
			} else
				freeblist(bp);

			qunlock(&s->qlock);
			poperror();
			return;
	}

	/*
	 *  One DOS attack is to open connections to us and then forget about them,
	 *  thereby tying up a conv at no long term cost to the attacker.
	 *  This is an attempt to defeat these stateless DOS attacks.  See
	 *  corresponding code in tcpsendka().
	 */
	if ((seg.flags & RST) == 0) {
		if (tcpporthogdefense
			&& seq_within(seg.ack, tcb->snd.una - (1 << 31),
						  tcb->snd.una - (1 << 29))) {
			printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
				   source, seg.source, dest, seg.dest, seg.flags,
				   tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
			localclose(s, "stateless hog");
		}
	}

	/* Cut the data to fit the receive window */
	if (tcptrim(tcb, &seg, &bp, &length) == -1) {
		/* Nothing of the segment fits in the window; tcptrim freed bp. */
		netlog(f, Logtcp, "%I.%d -> %I.%d: tcp len < 0, %lu %d\n",
		       s->raddr, s->rport, s->laddr, s->lport, seg.seq, length);
		update(s, &seg);
		if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
			tcphalt(tpriv, &tcb->rtt_timer);
			tcphalt(tpriv, &tcb->acktimer);
			tcphalt(tpriv, &tcb->katimer);
			tcpsetstate(s, Time_wait);
			tcb->timer.start = MSL2 * (1000 / MSPTICK);
			tcpgo(tpriv, &tcb->timer);
		}
		/* Re-ACK out-of-window segments (unless they carried a RST). */
		if (!(seg.flags & RST)) {
			tcb->flags |= FORCE;
			goto output;
		}
		qunlock(&s->qlock);
		poperror();
		return;
	}

	/* Cannot accept so answer with a rst */
	if (length && tcb->state == Closed) {
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
		goto raise;
	}

	/* The segment is beyond the current receive pointer so
	 * queue the data in the resequence queue
	 */
	if (seg.seq != tcb->rcv.nxt)
		if (length != 0 || (seg.flags & (SYN | FIN))) {
			update(s, &seg);
			if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
				printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
					   s->lport);
			/* Force an immediate (dup) ACK so the sender learns
			 * we're missing data. */
			tcb->flags |= FORCE;
			goto output;
		}

	/*
	 *  keep looping till we've processed this packet plus any
	 *  adjacent packets in the resequence queue
	 */
	for (;;) {
		if (seg.flags & RST) {
			if (tcb->state == Established) {
				tpriv->stats[EstabResets]++;
				if (tcb->rcv.nxt != seg.seq)
					printd
						("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
						 s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
						 seg.seq);
			}
			localclose(s, "connection refused");
			goto raise;
		}

		/* Segments without ACK are dropped (RFC: fifth check). */
		if ((seg.flags & ACK) == 0)
			goto raise;

		/* ACK processing per state. */
		switch (tcb->state) {
			case Established:
			case Close_wait:
				update(s, &seg);
				break;
			case Finwait1:
				update(s, &seg);
				/* Our FIN (and all data) has been ACKed. */
				if (qlen(s->wq) + tcb->flgcnt == 0) {
					tcphalt(tpriv, &tcb->rtt_timer);
					tcphalt(tpriv, &tcb->acktimer);
					tcpsetkacounter(tcb);
					tcb->time = NOW;
					tcpsetstate(s, Finwait2);
					tcb->katimer.start = MSL2 * (1000 / MSPTICK);
					tcpgo(tpriv, &tcb->katimer);
				}
				break;
			case Finwait2:
				update(s, &seg);
				break;
			case Closing:
				update(s, &seg);
				if (qlen(s->wq) + tcb->flgcnt == 0) {
					tcphalt(tpriv, &tcb->rtt_timer);
					tcphalt(tpriv, &tcb->acktimer);
					tcphalt(tpriv, &tcb->katimer);
					tcpsetstate(s, Time_wait);
					tcb->timer.start = MSL2 * (1000 / MSPTICK);
					tcpgo(tpriv, &tcb->timer);
				}
				break;
			case Last_ack:
				update(s, &seg);
				if (qlen(s->wq) + tcb->flgcnt == 0) {
					localclose(s, NULL);
					goto raise;
				}
				/* fallthrough - NOTE(review): no break here; an
				 * un-ACKed Last_ack falls into the Time_wait timer
				 * restart.  Looks intentional (Plan 9 heritage) -
				 * confirm. */
			case Time_wait:
				/* Re-ACK the peer's retransmitted FIN. */
				tcb->flags |= FORCE;
				if (tcb->timer.state != TcptimerON)
					tcpgo(tpriv, &tcb->timer);
		}

		if ((seg.flags & URG) && seg.urg) {
			if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
				tcb->rcv.urg = seg.urg + seg.seq;
				/* Out-of-band data is discarded, not delivered. */
				pullblock(&bp, seg.urg);
			}
		} else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
			tcb->rcv.urg = tcb->rcv.nxt;

		if (length == 0) {
			if (bp != NULL)
				freeblist(bp);
		} else {
			switch (tcb->state) {
				default:
					/* Ignore segment text */
					if (bp != NULL)
						freeblist(bp);
					break;

				case Established:
				case Finwait1:
					/* If we still have some data place on
					 * receive queue
					 */
					if (bp) {
						bp = packblock(bp);
						if (bp == NULL)
							panic("tcp packblock");
						qpassnolim(s->rq, bp);
						bp = NULL;

						/*
						 *  Force an ack every 2 data messages.  This is
						 *  a hack for rob to make his home system run
						 *  faster.
						 *
						 *  this also keeps the standard TCP congestion
						 *  control working since it needs an ack every
						 *  2 max segs worth.  This is not quite that,
						 *  but under a real stream is equivalent since
						 *  every packet has a max seg in it.
						 */
						if (++(tcb->rcv.una) >= 2)
							tcb->flags |= FORCE;
					}
					tcb->rcv.nxt += length;
					drop_old_rcv_sacks(tcb);

					/*
					 *  update our rcv window
					 */
					tcprcvwin(s);

					/*
					 *  turn on the acktimer if there's something
					 *  to ack
					 */
					if (tcb->acktimer.state != TcptimerON)
						tcpgo(tpriv, &tcb->acktimer);

					break;
				case Finwait2:
					/* no process to read the data, send a reset */
					if (bp != NULL)
						freeblist(bp);
					sndrst(tcp, source, dest, length, &seg, version,
						   "send to Finwait2");
					qunlock(&s->qlock);
					poperror();
					return;
			}
		}

		if (seg.flags & FIN) {
			tcb->flags |= FORCE;

			switch (tcb->state) {
				case Established:
					tcb->rcv.nxt++;
					tcpsetstate(s, Close_wait);
					break;
				case Finwait1:
					tcb->rcv.nxt++;
					/* Simultaneous close: Closing if our FIN is
					 * still unacked, else straight to Time_wait. */
					if (qlen(s->wq) + tcb->flgcnt == 0) {
						tcphalt(tpriv, &tcb->rtt_timer);
						tcphalt(tpriv, &tcb->acktimer);
						tcphalt(tpriv, &tcb->katimer);
						tcpsetstate(s, Time_wait);
						tcb->timer.start = MSL2 * (1000 / MSPTICK);
						tcpgo(tpriv, &tcb->timer);
					} else
						tcpsetstate(s, Closing);
					break;
				case Finwait2:
					tcb->rcv.nxt++;
					tcphalt(tpriv, &tcb->rtt_timer);
					tcphalt(tpriv, &tcb->acktimer);
					tcphalt(tpriv, &tcb->katimer);
					tcpsetstate(s, Time_wait);
					tcb->timer.start = MSL2 * (1000 / MSPTICK);
					tcpgo(tpriv, &tcb->timer);
					break;
				case Close_wait:
				case Closing:
				case Last_ack:
					break;
				case Time_wait:
					/* Restart the 2*MSL timer on a re-FIN. */
					tcpgo(tpriv, &tcb->timer);
					break;
			}
		}

		/*
		 *  get next adjacent segment from the resequence queue.
		 *  dump/trim any overlapping segments
		 */
		for (;;) {
			if (tcb->reseq == NULL)
				goto output;

			if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
				goto output;

			getreseq(tcb, &seg, &bp, &length);

			if (tcptrim(tcb, &seg, &bp, &length) == 0)
				break;
		}
	}
output:
	tcpoutput(s);
	qunlock(&s->qlock);
	poperror();
	return;
raise:
	qunlock(&s->qlock);
	poperror();
	freeblist(bp);
	tcpkick(s);
}
3020
3021 /* The advertised mss = data + TCP headers */
3022 static uint16_t derive_payload_mss(Tcpctl *tcb)
3023 {
3024         uint16_t payload_mss = tcb->mss;
3025         uint16_t opt_size = 0;
3026
3027         if (tcb->ts_recent) {
3028                 opt_size += TS_LENGTH;
3029                 /* Note that when we're a SYN, we overestimate slightly.  This is safe,
3030                  * and not really a problem. */
3031                 opt_size += TS_SEND_PREPAD;
3032         }
3033         if (tcb->rcv.nr_sacks)
3034                 opt_size += 2 + tcb->rcv.nr_sacks * 8;
3035         opt_size = ROUNDUP(opt_size, 4);
3036         payload_mss -= opt_size;
3037         return payload_mss;
3038 }
3039
3040 /* Decreases the xmit amt, given the MSS / TSO. */
3041 static uint32_t throttle_for_mss(Tcpctl *tcb, uint32_t ssize,
3042                                  uint16_t payload_mss, bool retrans)
3043 {
3044         if (ssize > payload_mss) {
3045                 if ((tcb->flags & TSO) == 0) {
3046                         ssize = payload_mss;
3047                 } else {
3048                         /* Don't send too much.  32K is arbitrary.. */
3049                         if (ssize > 32 * 1024)
3050                                 ssize = 32 * 1024;
3051                         if (!retrans) {
3052                                 /* Clamp xmit to an integral MSS to avoid ragged tail segments
3053                                  * causing poor link utilization. */
3054                                 ssize = ROUNDDOWN(ssize, payload_mss);
3055                         }
3056                 }
3057         }
3058         return ssize;
3059 }
3060
3061 /* Reduces ssize for a variety of reasons.  Returns FALSE if we should abort
3062  * sending the packet.  o/w returns TRUE and modifies ssize by reference. */
3063 static bool throttle_ssize(struct conv *s, Tcpctl *tcb, uint32_t *ssize_p,
3064                            uint16_t payload_mss, bool retrans)
3065 {
3066         struct Fs *f = s->p->f;
3067         uint32_t usable;
3068         uint32_t ssize = *ssize_p;
3069
3070         /* Compute usable segment based on offered window and limit
3071          * window probes to one */
3072         if (tcb->snd.wnd == 0) {
3073                 if (tcb->snd.in_flight != 0) {
3074                         if ((tcb->flags & FORCE) == 0)
3075                                 return FALSE;
3076                 }
3077                 usable = 1;
3078         } else {
3079                 usable = tcb->cwind;
3080                 if (tcb->snd.wnd < usable)
3081                         usable = tcb->snd.wnd;
3082                 if (usable > tcb->snd.in_flight)
3083                         usable -= tcb->snd.in_flight;
3084                 else
3085                         usable = 0;
3086                 /* Avoid Silly Window Syndrome.  This is a little different thant RFC
3087                  * 813.  I took their additional enhancement of "< MSS" as an AND, not
3088                  * an OR.  25% of a large snd.wnd is pretty large, and our main goal is
3089                  * to avoid packets smaller than MSS.  I still use the 25% threshold,
3090                  * because it is important that there is *some* data in_flight.  If
3091                  * usable < MSS because snd.wnd is very small (but not 0), we might
3092                  * never get an ACK and would need to set up a timer.
3093                  *
3094                  * Also, I'm using 'ssize' as a proxy for a PSH point.  If there's just
3095                  * a small blob in the qio (or retrans!), then we might as well just
3096                  * send it. */
3097                 if ((usable < tcb->typical_mss) && (usable < tcb->snd.wnd >> 2)
3098                     && (usable < ssize)) {
3099                         return FALSE;
3100                 }
3101         }
3102         if (ssize && usable < 2)
3103                 netlog(s->p->f, Logtcpverbose,
3104                        "%I.%d -> %I.%d: throttled snd.wnd %lu cwind %lu\n",
3105                        s->laddr, s->lport, s->raddr, s->rport,
3106                        tcb->snd.wnd, tcb->cwind);
3107         if (usable < ssize)
3108                 ssize = usable;
3109
3110         ssize = throttle_for_mss(tcb, ssize, payload_mss, retrans);
3111
3112         *ssize_p = ssize;
3113         return TRUE;
3114 }
3115
3116 /* Helper, picks the next segment to send, which is possibly a retransmission.
3117  * Returns TRUE if we have a segment, FALSE o/w.  Returns ssize, from_seq, and
3118  * sent by reference.
3119  *
3120  * from_seq is the seq number we are transmitting from.
3121  *
3122  * sent includes all seq from una to from_seq *including* any previously sent
3123  * flags (part of tcb->flgcnt), for instance an unacknowledged SYN (which counts
3124  * as a seq number).  Those flags are in the e.g. snd.nxt - snd.una range, and
3125  * they get dropped after qdiscard.
3126  *
3127  * ssize is the amount of data we are sending, starting from from_seq, and it
3128  * will include any *new* flags, which haven't been accounted for yet.
3129  *
3130  * tcb->flgcnt consists of the flags both in ssize and in sent.
3131  *
3132  * Note that we could be in recovery and not sack_retrans a segment. */
3133 static bool get_xmit_segment(struct conv *s, Tcpctl *tcb, uint16_t payload_mss,
3134                              uint32_t *from_seq_p, uint32_t *sent_p,
3135                              uint32_t *ssize_p)
3136 {
3137         struct Fs *f = s->p->f;
3138         struct tcppriv *tpriv = s->p->priv;
3139         uint32_t ssize, sent, from_seq;
3140         bool sack_retrans = FALSE;
3141         struct sack_block *tcb_sack = 0;
3142
3143         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
3144                 tcb_sack = &tcb->snd.sacks[i];
3145                 if (seq_lt(tcb->snd.rtx, tcb_sack->left)) {
3146                         /* So ssize is supposed to include any *new* flags to flgcnt, which
3147                          * at this point would be a FIN.
3148                          *
3149                          * It might be possible that flgcnt is incremented so we send a FIN,
3150                          * even for an intermediate sack retrans.  Perhaps the user closed
3151                          * the conv.
3152                          *
3153                          * However, the way the "flgcnt for FIN" works is that it inflates
3154                          * the desired amount we'd like to send (qlen + flgcnt).
3155                          * Eventually, we reach the end of the queue and fail to extract all
3156                          * of dsize.  At that point, we put on the FIN, and that's where the
3157                          * extra 'byte' comes from.
3158                          *
3159                          * For sack retrans, since we're extracting from parts of the qio
3160                          * that aren't the right-most edge, we don't need to consider flgcnt
3161                          * when setting ssize. */
3162                         from_seq = tcb->snd.rtx;
3163                         sent = from_seq - tcb->snd.una;
3164                         ssize = tcb_sack->left - from_seq;
3165                         sack_retrans = TRUE;
3166                         break;
3167                 }
3168         }
3169         /* SACK holes have first dibs, but we can still opportunisitically send new
3170