/* kern/src/net/tcp.c (akaros.git) */
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2017 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <slab.h>
30 #include <kmalloc.h>
31 #include <kref.h>
32 #include <string.h>
33 #include <stdio.h>
34 #include <assert.h>
35 #include <error.h>
36 #include <cpio.h>
37 #include <pmap.h>
38 #include <smp.h>
39 #include <net/ip.h>
40 #include <net/tcp.h>
41
/* Printable names for the TCP connection states, indexed by the Tcpctl state
 * value.  Must correspond to the enumeration in tcp.h */
static char *tcpstates[] = {
	"Closed", "Listen", "Syn_sent",
	"Established", "Finwait1", "Finwait2", "Close_wait",
	"Closing", "Last_ack", "Time_wait"
};
48
/* Protocol-wide tunables shared by all conversations. */
static int tcp_irtt = DEF_RTT;		/* Initial guess at round trip time */
static uint16_t tcp_mss = DEF_MSS;	/* Maximum segment size to be sent */
51
52 /* Must correspond to the enumeration in tcp.h */
53 static char *statnames[] = {
54         [MaxConn] "MaxConn",
55         [ActiveOpens] "ActiveOpens",
56         [PassiveOpens] "PassiveOpens",
57         [EstabResets] "EstabResets",
58         [CurrEstab] "CurrEstab",
59         [InSegs] "InSegs",
60         [OutSegs] "OutSegs",
61         [RetransSegs] "RetransSegs",
62         [RetransTimeouts] "RetransTimeouts",
63         [InErrs] "InErrs",
64         [OutRsts] "OutRsts",
65         [CsumErrs] "CsumErrs",
66         [HlenErrs] "HlenErrs",
67         [LenErrs] "LenErrs",
68         [OutOfOrder] "OutOfOrder",
69 };
70
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
static int tcpporthogdefense = 0;
81
82 static int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *,
83                     uint16_t);
84 static void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
85 static void localclose(struct conv *, char *unused_char_p_t);
86 static void procsyn(struct conv *, Tcp *);
87 static void tcpiput(struct Proto *, struct Ipifc *, struct block *);
88 static void tcpoutput(struct conv *);
89 static int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
90 static void tcpstart(struct conv *, int);
91 static void tcptimeout(void *);
92 static void tcpsndsyn(struct conv *, Tcpctl *);
93 static void tcprcvwin(struct conv *);
94 static void tcpacktimer(void *);
95 static void tcpkeepalive(void *);
96 static void tcpsetkacounter(Tcpctl *);
97 static void tcprxmit(struct conv *);
98 static void tcpsettimer(Tcpctl *);
99 static void tcpsynackrtt(struct conv *);
100 static void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
101 static void tcp_loss_event(struct conv *s, Tcpctl *tcb);
102 static uint16_t derive_payload_mss(Tcpctl *tcb);
103 static void set_in_flight(Tcpctl *tcb);
104
105 static void limborexmit(struct Proto *);
106 static void limbo(struct conv *, uint8_t *unused_uint8_p_t, uint8_t *, Tcp *,
107                                   int);
108
109 static void tcpsetstate(struct conv *s, uint8_t newstate)
110 {
111         Tcpctl *tcb;
112         uint8_t oldstate;
113         struct tcppriv *tpriv;
114
115         tpriv = s->p->priv;
116
117         tcb = (Tcpctl *) s->ptcl;
118
119         oldstate = tcb->state;
120         if (oldstate == newstate)
121                 return;
122
123         if (oldstate == Established)
124                 tpriv->stats[CurrEstab]--;
125         if (newstate == Established)
126                 tpriv->stats[CurrEstab]++;
127
128         /**
129         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
130                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
131         **/
132
133         switch (newstate) {
134                 case Closed:
135                         qclose(s->rq);
136                         qclose(s->wq);
137                         qclose(s->eq);
138                         break;
139
140                 case Close_wait:        /* Remote closes */
141                         qhangup(s->rq, NULL);
142                         break;
143         }
144
145         tcb->state = newstate;
146
147         if (oldstate == Syn_sent && newstate != Closed)
148                 Fsconnected(s, NULL);
149 }
150
/* "connect" control message handler: parses the address arguments and starts
 * an active open (client side) on conversation c. */
static void tcpconnect(struct conv *c, char **argv, int argc)
{
	Fsstdconnect(c, argv, argc);
	tcpstart(c, TCP_CONNECT);
}
156
/* Status-file read: formats conversation c's TCP state and key counters into
 * 'state' (at most n bytes); returns the snprintf result.
 *
 * NOTE(review): timer.start/timer.count are printed with %llu while
 * katimer.start/count use %d - confirm the Tcptimer field widths match the
 * format specifiers. */
static int tcpstate(struct conv *c, char *state, int n)
{
	Tcpctl *s;

	s = (Tcpctl *) (c->ptcl);

	return snprintf(state, n,
					"%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
					tcpstates[s->state],
					c->rq ? qlen(c->rq) : 0,
					c->wq ? qlen(c->wq) : 0,
					s->srtt, s->mdev,
					s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
					s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
					s->katimer.start, s->katimer.count);
}
173
174 static int tcpinuse(struct conv *c)
175 {
176         Tcpctl *s;
177
178         s = (Tcpctl *) (c->ptcl);
179         return s->state != Closed;
180 }
181
/* "announce" control message handler: binds the local address/port, enters
 * the Listen state (passive open), and reports success to the announcer. */
static void tcpannounce(struct conv *c, char **argv, int argc)
{
	Fsstdannounce(c, argv, argc);
	tcpstart(c, TCP_LISTEN);
	Fsconnected(c, NULL);
}
188
/* "bypass" control message handler: sets the conversation up for the bypass
 * path and adds it to the protocol's hash table so inbound segments can be
 * demultiplexed to it. */
static void tcpbypass(struct conv *cv, char **argv, int argc)
{
	struct tcppriv *tpriv = cv->p->priv;

	Fsstdbypass(cv, argv, argc);
	iphtadd(&tpriv->ht, cv);
}
196
197 static void tcpshutdown(struct conv *c, int how)
198 {
199         Tcpctl *tcb = (Tcpctl*)c->ptcl;
200
201         /* Do nothing for the read side */
202         if (how == SHUT_RD)
203                 return;
204         /* Sends a FIN.  If we're in another state (like Listen), we'll run into
205          * issues, since we'll never send the FIN.  We'll be shutdown on our end,
206          * but we'll never tell the distant end.  Might just be an app issue. */
207         switch (tcb->state) {
208         case Established:
209                 tcb->flgcnt++;
210                 tcpsetstate(c, Finwait1);
211                 tcpoutput(c);
212                 break;
213         }
214 }
215
/*
 *  Close the conversation: hang up and flush the user queues, then either
 *  tear down locally (nothing on the wire worth finishing) or start the
 *  FIN handshake.  tcpclose is always called with the q locked.
 */
static void tcpclose(struct conv *c)
{
	Tcpctl *tcb;

	tcb = (Tcpctl *) c->ptcl;

	/* No more user reads or writes; discard buffered input. */
	qhangup(c->rq, NULL);
	qhangup(c->wq, NULL);
	qhangup(c->eq, NULL);
	qflush(c->rq);

	switch (tcb->state) {
		case Listen:
			/*
			 *  reset any incoming calls to this listener
			 */
			Fsconnected(c, "Hangup");

			localclose(c, NULL);
			break;
		case Closed:
		case Syn_sent:
			/* Nothing established; just tear down locally. */
			localclose(c, NULL);
			break;
		case Established:
			/* Send our FIN and wait for the distant end's. */
			tcb->flgcnt++;
			tcpsetstate(c, Finwait1);
			tcpoutput(c);
			break;
		case Close_wait:
			/* Distant end already closed; send our FIN and wait
			 * for the final ack. */
			tcb->flgcnt++;
			tcpsetstate(c, Last_ack);
			tcpoutput(c);
			break;
	}
}
255
/* Write-queue kick routine (see tcpcreate): runs when the user writes data,
 * to push pending output.  Takes s->qlock itself; errors thrown under the
 * lock are caught so the lock is always released before rethrowing. */
static void tcpkick(void *x)
{
	ERRSTACK(1);
	struct conv *s = x;
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}

	switch (tcb->state) {
		case Syn_sent:
		case Established:
		case Close_wait:
			/*
			 * Push data
			 */
			tcprcvwin(s);
			tcpoutput(s);
			break;
		default:
			/* Kicked in a state with no useful work: tear down. */
			localclose(s, "Hangup");
			break;
	}

	qunlock(&s->qlock);
	poperror();
}
288
/* Recomputes the receive window we advertise: the configured window minus
 * whatever is still buffered in the read queue, with SWS avoidance. */
static void tcprcvwin(struct conv *s)
{
	/* Call with tcb locked */
	int w;
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;
	w = tcb->window - qlen(s->rq);
	if (w < 0)
		w = 0;

	/* RFC 813: Avoid SWS.  We'll always reduce the window (because the qio
	 * increased - that's legit), and we'll always advertise the window
	 * increases (corresponding to qio drains) when those are greater than MSS.
	 * But we don't advertise increases less than MSS.
	 *
	 * Note we don't shrink the window at all - that'll result in tcptrim()
	 * dropping packets that were sent before the sender gets our update. */
	if ((w < tcb->rcv.wnd) || (w >= tcb->mss))
		tcb->rcv.wnd = w;
	/* We've delayed sending an update to rcv.wnd, and we might never get
	 * another ACK to drive the TCP stack after the qio is drained.  We could
	 * replace this stuff with qio kicks or callbacks, but that might be
	 * trickier with the MSS limitation.  (and 'edge' isn't empty or not). */
	if (w < tcb->mss)
		tcb->rcv.blocked = 1;
}
316
/* Delayed-ACK timer handler: when it fires on an open connection, push out a
 * segment (ACK and/or window update).  FORCE presumably makes tcpoutput emit
 * a segment even with no new data - tcpoutput is not shown here; confirm. */
static void tcpacktimer(void *v)
{
	ERRSTACK(1);
	Tcpctl *tcb;
	struct conv *s;

	s = v;
	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	if (tcb->state != Closed) {
		tcb->flags |= FORCE;
		tcprcvwin(s);
		tcpoutput(s);
	}
	qunlock(&s->qlock);
	poperror();
}
339
/* Conversation queue setup: open the read queue (kickless, coalescing) and
 * the write queue with tcpkick as its kick routine. */
static void tcpcreate(struct conv *c)
{
	/* We don't use qio limits.  Instead, TCP manages flow control on its own.
	 * We only use qpassnolim().  Note for qio that 0 doesn't mean no limit. */
	c->rq = qopen(0, Qcoalesce, 0, 0);
	c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
}
347
/* Moves timer t to newstate, chaining it onto or unchaining it from the
 * priv->timers doubly-linked list.  Timers are on the list exactly while
 * TcptimerON; the panics catch list corruption.  Callers hold priv->tl
 * (see tcpgo, tcphalt, tcpackproc). */
static void timerstate(struct tcppriv *priv, Tcptimer *t, int newstate)
{
	if (newstate != TcptimerON) {
		if (t->state == TcptimerON) {
			/* unchain: a list head must have no predecessor */
			if (priv->timers == t) {
				priv->timers = t->next;
				if (t->prev != NULL)
					panic("timerstate1");
			}
			if (t->next)
				t->next->prev = t->prev;
			if (t->prev)
				t->prev->next = t->next;
			t->next = t->prev = NULL;
		}
	} else {
		if (t->state != TcptimerON) {
			/* chain at the head; t must not already be linked */
			if (t->prev != NULL || t->next != NULL)
				panic("timerstate2");
			t->prev = NULL;
			t->next = priv->timers;
			if (t->next)
				t->next->prev = t;
			priv->timers = t;
		}
	}
	t->state = newstate;
}
378
/* Per-protocol timer kproc (started by tcpstart): ticks every MSPTICK ms,
 * counts down all running timers, and fires the handlers of those that
 * expire.  Two passes: expired timers are collected onto a private ready
 * list (readynext) under the timer lock, then their handlers run with the
 * lock released.  Handler errors are discarded.  Never returns. */
static void tcpackproc(void *a)
{
	ERRSTACK(1);
	Tcptimer *t, *tp, *timeo;
	struct Proto *tcp;
	struct tcppriv *priv;
	int loop;

	tcp = a;
	priv = tcp->priv;

	for (;;) {
		kthread_usleep(MSPTICK * 1000);

		/* Pass 1: tick every ON timer; move expired ones to the
		 * ready list.  The loop bound guards against list cycles. */
		qlock(&priv->tl);
		timeo = NULL;
		loop = 0;
		for (t = priv->timers; t != NULL; t = tp) {
			if (loop++ > 10000)
				panic("tcpackproc1");
			tp = t->next;
			if (t->state == TcptimerON) {
				t->count--;
				if (t->count == 0) {
					timerstate(priv, t, TcptimerDONE);
					t->readynext = timeo;
					timeo = t;
				}
			}
		}
		qunlock(&priv->tl);

		/* Pass 2: run expired handlers without holding the lock. */
		loop = 0;
		for (t = timeo; t != NULL; t = t->readynext) {
			if (loop++ > 10000)
				panic("tcpackproc2");
			if (t->state == TcptimerDONE && t->func != NULL) {
				/* discard error style */
				if (!waserror())
					(*t->func) (t->arg);
				poperror();
			}
		}

		limborexmit(tcp);
	}
}
426
427 static void tcpgo(struct tcppriv *priv, Tcptimer *t)
428 {
429         if (t == NULL || t->start == 0)
430                 return;
431
432         qlock(&priv->tl);
433         t->count = t->start;
434         timerstate(priv, t, TcptimerON);
435         qunlock(&priv->tl);
436 }
437
438 static void tcphalt(struct tcppriv *priv, Tcptimer *t)
439 {
440         if (t == NULL)
441                 return;
442
443         qlock(&priv->tl);
444         timerstate(priv, t, TcptimerOFF);
445         qunlock(&priv->tl);
446 }
447
/* Exponential backoff multiplier: 2^n.
 *
 * The shift is clamped to [0, 30] so a large (or negative) retransmit count
 * cannot shift by >= the width of int, which is undefined behavior in C. */
static int backoff(int n)
{
	if (n < 0)
		n = 0;
	else if (n > 30)
		n = 30;
	return 1 << n;
}
452
/* Tears down conversation s locally (no segments are sent): removes it from
 * the demux hash, halts all timers, frees the reassembly queue, reports
 * 'reason' to a waiting active opener, hangs up the user queues with that
 * reason, and moves to Closed. */
static void localclose(struct conv *s, char *reason)
{
	/* called with tcb locked */
	Tcpctl *tcb;
	Reseq *rp, *rp1;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	iphtrem(&tpriv->ht, s);

	tcphalt(tpriv, &tcb->timer);
	tcphalt(tpriv, &tcb->rtt_timer);
	tcphalt(tpriv, &tcb->acktimer);
	tcphalt(tpriv, &tcb->katimer);

	/* Flush reassembly queue; nothing more can arrive */
	for (rp = tcb->reseq; rp != NULL; rp = rp1) {
		rp1 = rp->next;
		freeblist(rp->bp);
		kfree(rp);
	}
	tcb->reseq = NULL;

	/* A failed active open: tell the opener why. */
	if (tcb->state == Syn_sent)
		Fsconnected(s, reason);

	qhangup(s->rq, reason);
	qhangup(s->wq, reason);

	tcpsetstate(s, Closed);

	/* listener will check the rq state */
	if (s->state == Announced)
		rendez_wakeup(&s->listenr);
}
490
491 /* mtu (- TCP + IP hdr len) of 1st hop */
492 static int tcpmtu(struct Ipifc *ifc, int version, int *scale)
493 {
494         int mtu;
495
496         switch (version) {
497                 default:
498                 case V4:
499                         mtu = DEF_MSS;
500                         if (ifc != NULL)
501                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
502                         break;
503                 case V6:
504                         mtu = DEF_MSS6;
505                         if (ifc != NULL)
506                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
507                         break;
508         }
509         *scale = HaveWS | 7;
510
511         return mtu;
512 }
513
514 static void tcb_check_tso(Tcpctl *tcb)
515 {
516         /* This can happen if the netdev isn't up yet. */
517         if (!tcb->ifc)
518                 return;
519         if (tcb->ifc->feat & NETF_TSO)
520                 tcb->flags |= TSO;
521         else
522                 tcb->flags &= ~TSO;
523 }
524
/* Initializes the Tcpctl for conversation s: zeroes it, seeds the RTT
 * estimate and timer callbacks, and - for non-listeners - builds the
 * prototype pseudo-header used when transmitting.  Listeners defer header
 * setup until the remote address is known. */
static void inittcpctl(struct conv *s, int mode)
{
	Tcpctl *tcb;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int mss;

	tcb = (Tcpctl *) s->ptcl;

	memset(tcb, 0, sizeof(Tcpctl));

	/* ssthresh at the max means no slow-start threshold until we see loss */
	tcb->ssthresh = UINT32_MAX;
	tcb->srtt = tcp_irtt;
	tcb->mdev = 0;

	/* setup timers */
	tcb->timer.start = tcp_irtt / MSPTICK;
	tcb->timer.func = tcptimeout;
	tcb->timer.arg = s;
	tcb->rtt_timer.start = MAX_TIME;
	tcb->acktimer.start = TCP_ACK / MSPTICK;
	tcb->acktimer.func = tcpacktimer;
	tcb->acktimer.arg = s;
	tcb->katimer.start = DEF_KAT / MSPTICK;
	tcb->katimer.func = tcpkeepalive;
	tcb->katimer.arg = s;

	mss = DEF_MSS;

	/* create a prototype(pseudo) header */
	if (mode != TCP_LISTEN) {
		/* pick a source address if the caller didn't bind one */
		if (ipcmp(s->laddr, IPnoaddr) == 0)
			findlocalip(s->p->f, s->laddr, s->raddr);

		switch (s->ipversion) {
			case V4:
				h4 = &tcb->protohdr.tcp4hdr;
				memset(h4, 0, sizeof(*h4));
				h4->proto = IP_TCPPROTO;
				hnputs(h4->tcpsport, s->lport);
				hnputs(h4->tcpdport, s->rport);
				v6tov4(h4->tcpsrc, s->laddr);
				v6tov4(h4->tcpdst, s->raddr);
				break;
			case V6:
				h6 = &tcb->protohdr.tcp6hdr;
				memset(h6, 0, sizeof(*h6));
				h6->proto = IP_TCPPROTO;
				hnputs(h6->tcpsport, s->lport);
				hnputs(h6->tcpdport, s->rport);
				ipmove(h6->tcpsrc, s->laddr);
				ipmove(h6->tcpdst, s->raddr);
				mss = DEF_MSS6;
				break;
			default:
				panic("inittcpctl: version %d", s->ipversion);
		}
	}

	tcb->ifc = findipifc(s->p->f, s->laddr, 0);
	tcb->mss = mss;
	tcb->typical_mss = mss;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* default is no window scaling */
	tcb->window = QMAX;
	tcb->rcv.wnd = QMAX;
	tcb->rcv.scale = 0;
	tcb->snd.scale = 0;
	tcb_check_tso(tcb);
}
596
/*
 *  Starts conversation s in 'mode' (TCP_LISTEN or TCP_CONNECT): lazily spawns
 *  the per-protocol timer kproc, initializes the TCB, hashes the conv for
 *  inbound demux, and for connects sends the initial SYN.
 *
 *  called with s qlocked
 */
static void tcpstart(struct conv *s, int mode)
{
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	char *kpname;

	tpriv = s->p->priv;

	/* Double-checked under apl so only one ack kproc is started per Proto.
	 * NOTE(review): the unlocked first read of ackprocstarted looks like
	 * classic double-checked locking - confirm it is safe on this arch. */
	if (tpriv->ackprocstarted == 0) {
		qlock(&tpriv->apl);
		if (tpriv->ackprocstarted == 0) {
			/* tcpackproc needs to free this if it ever exits */
			kpname = kmalloc(KNAMELEN, MEM_WAIT);
			snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
			ktask(kpname, tcpackproc, s->p);
			tpriv->ackprocstarted = 1;
		}
		qunlock(&tpriv->apl);
	}

	tcb = (Tcpctl *) s->ptcl;

	inittcpctl(s, mode);

	iphtadd(&tpriv->ht, s);
	switch (mode) {
		case TCP_LISTEN:
			tpriv->stats[PassiveOpens]++;
			tcb->flags |= CLONE;
			tcpsetstate(s, Listen);
			break;

		case TCP_CONNECT:
			tpriv->stats[ActiveOpens]++;
			tcb->flags |= ACTIVE;
			tcpsndsyn(s, tcb);
			tcpsetstate(s, Syn_sent);
			tcpoutput(s);
			break;
	}
}
641
642 static char *tcpflag(uint16_t flag)
643 {
644         static char buf[128];
645
646         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
647         if (flag & URG)
648                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
649         if (flag & ACK)
650                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
651         if (flag & PSH)
652                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
653         if (flag & RST)
654                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
655         if (flag & SYN)
656                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
657         if (flag & FIN)
658                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
659
660         return buf;
661 }
662
663 /* Helper, determine if we should send a TCP timestamp.  ts_val was the
664  * timestamp from our distant end.  We'll also send a TS on SYN (no ACK). */
665 static bool tcp_seg_has_ts(Tcp *tcph)
666 {
667         return tcph->ts_val || ((tcph->flags & SYN) && !(tcph->flags & ACK));
668 }
669
670 /* Given a TCP header/segment and default header size (e.g. TCP4_HDRSIZE),
671  * return the actual hdr_len and opt_pad */
672 static void compute_hdrlen_optpad(Tcp *tcph, uint16_t default_hdrlen,
673                                   uint16_t *ret_hdrlen, uint16_t *ret_optpad,
674                                   Tcpctl *tcb)
675 {
676         uint16_t hdrlen = default_hdrlen;
677         uint16_t optpad = 0;
678
679         if (tcph->flags & SYN) {
680                 if (tcph->mss)
681                         hdrlen += MSS_LENGTH;
682                 if (tcph->ws)
683                         hdrlen += WS_LENGTH;
684                 if (tcph->sack_ok)
685                         hdrlen += SACK_OK_LENGTH;
686         }
687         if (tcp_seg_has_ts(tcph)) {
688                 hdrlen += TS_LENGTH;
689                 /* SYNs have other opts, don't do the PREPAD NOOP optimization. */
690                 if (!(tcph->flags & SYN))
691                         hdrlen += TS_SEND_PREPAD;
692         }
693         if (tcb && tcb->rcv.nr_sacks)
694                 hdrlen += 2 + tcb->rcv.nr_sacks * 8;
695         optpad = hdrlen & 3;
696         if (optpad)
697                 optpad = 4 - optpad;
698         hdrlen += optpad;
699         *ret_hdrlen = hdrlen;
700         *ret_optpad = optpad;
701 }
702
/* Writes the TCP options for tcph to opt.  The order and sizes must match
 * what compute_hdrlen_optpad accounted for; any remaining bytes are padded
 * with NOOPs to keep the header 4-byte aligned. */
static void write_opts(Tcp *tcph, uint8_t *opt, uint16_t optpad, Tcpctl *tcb)
{
	/* SYN-only options: MSS, window scale, SACK-permitted */
	if (tcph->flags & SYN) {
		if (tcph->mss != 0) {
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		if (tcph->ws != 0) {
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		if (tcph->sack_ok) {
			*opt++ = SACK_OK_OPT;
			*opt++ = SACK_OK_LENGTH;
		}
	}
	if (tcp_seg_has_ts(tcph)) {
		/* Non-SYN segments prepad two NOOPs, matching TS_SEND_PREPAD
		 * in compute_hdrlen_optpad. */
		if (!(tcph->flags & SYN)) {
			*opt++ = NOOPOPT;
			*opt++ = NOOPOPT;
		}
		*opt++ = TS_OPT;
		*opt++ = TS_LENGTH;
		/* Setting TSval, our time */
		hnputl(opt, milliseconds());
		opt += 4;
		/* Setting TSecr, the time we last saw from them, stored in ts_val */
		hnputl(opt, tcph->ts_val);
		opt += 4;
	}
	if (tcb && tcb->rcv.nr_sacks) {
		*opt++ = SACK_OPT;
		/* option length: kind + len bytes plus 8 per SACK block */
		*opt++ = 2 + tcb->rcv.nr_sacks * 8;
		for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
			hnputl(opt, tcb->rcv.sacks[i].left);
			opt += 4;
			hnputl(opt, tcb->rcv.sacks[i].right);
			opt += 4;
		}
	}
	while (optpad-- > 0)
		*opt++ = NOOPOPT;
}
750
751 /* Given a data block (or NULL) returns a block with enough header room that we
752  * can send out.  block->wp is set to the beginning of the payload.  Returns
753  * NULL on some sort of error. */
754 static struct block *alloc_or_pad_block(struct block *data,
755                                         uint16_t total_hdr_size)
756 {
757         if (data) {
758                 data = padblock(data, total_hdr_size);
759                 if (data == NULL)
760                         return NULL;
761         } else {
762                 /* the 64 pad is to meet mintu's */
763                 data = block_alloc(total_hdr_size + 64, MEM_WAIT);
764                 if (data == NULL)
765                         return NULL;
766                 data->wp += total_hdr_size;
767         }
768         return data;
769 }
770
/* Builds an outgoing v6 TCP segment: prepends header room to 'data' (which
 * may be NULL for a dataless segment), fills in the TCP fields and options
 * from tcph, and computes the checksum over a temporary pseudo-header before
 * restoring the real IPv6 fields.  Returns the block, or NULL on allocation
 * failure. */
static struct block *htontcp6(Tcp *tcph, struct block *data, Tcp6hdr *ph,
                              Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp6hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP6_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP6_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp).  Note TCP structs include IP. */
	data->network_offset = 0;
	data->transport_offset = offsetof(Tcp6hdr, tcpsport);

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp6hdr *) (data->rp);
	memmove(h, ph, TCP6_TCBPHDRSZ);

	/* compose pseudo tcp header, do cksum calculation: vcf temporarily
	 * holds the TCP length and ttl the protocol, as the v6 pseudo-header
	 * requires */
	hnputl(h->vcf, hdrlen + dlen);
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
	h->ttl = ph->proto;

	/* copy in variable bits */
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* header length (in 32-bit words) lives in the top bits of tcpflag */
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	/* move from pseudo header back to normal ip header */
	memset(h->vcf, 0, 4);
	h->vcf[0] = IP_VER6;
	hnputs(h->ploadlen, hdrlen + dlen);
	h->proto = ph->proto;

	return data;
}
821
/* Builds an outgoing v4 TCP segment: prepends header room to 'data' (which
 * may be NULL for a dataless segment) and fills in the TCP fields and options
 * from tcph.  Unless checksums are disabled, only the pseudo-header partial
 * sum is computed here and the block is marked (Btcpck, tx_csum_offset) for
 * the checksum to be finished later - presumably by the NIC or a lower
 * layer; confirm.  Returns the block, or NULL on allocation failure. */
static struct block *htontcp4(Tcp *tcph, struct block *data, Tcp4hdr *ph,
                              Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp4hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP4_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP4_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp).  Note TCP structs include IP. */
	data->network_offset = 0;
	data->transport_offset = offsetof(Tcp4hdr, tcpsport);

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp4hdr *) (data->rp);
	memmove(h, ph, TCP4_TCBPHDRSZ);

	/* copy in variable bits */
	hnputs(h->tcplen, hdrlen + dlen);
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* header length (in 32-bit words) lives in the top bits of tcpflag */
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		assert(data->transport_offset == TCP4_IPLEN + TCP4_PHDRSIZE);
		/* seed with the pseudo-header sum only; see function comment */
		csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
		data->tx_csum_offset = ph->tcpcksum - ph->tcpsport;
		data->flag |= Btcpck;
	}

	return data;
}
865
/* Parses an inbound SACK option (opt points at the option's kind byte,
 * optlen is its total length) into tcph->sacks / tcph->nr_sacks.  Options
 * advertising more than MAX_NR_SACKS_PER_PACKET blocks are ignored
 * entirely. */
static void parse_inbound_sacks(Tcp *tcph, uint8_t *opt, uint16_t optlen)
{
	uint8_t nr_sacks;
	uint32_t left, right;

	nr_sacks = (optlen - 2) / 8;
	if (nr_sacks > MAX_NR_SACKS_PER_PACKET)
		return;
	/* step over the kind and length bytes */
	opt += 2;
	for (int i = 0; i < nr_sacks; i++, opt += 8) {
		left = nhgetl(opt);
		right = nhgetl(opt + 4);
		if (seq_ge(left, right)) {
			/* bad / malicious SACK.  Skip it, and adjust: opt
			 * still advances to the next block, but the next valid
			 * block lands in this array slot. */
			nr_sacks--;
			i--;	/* stay on this array element next loop */
			continue;
		}
		tcph->sacks[i].left = left;
		tcph->sacks[i].right = right;
	}
	tcph->nr_sacks = nr_sacks;
}
889
890 static void parse_inbound_opts(Tcp *tcph, uint8_t *opt, uint16_t optsize)
891 {
892         uint16_t optlen;
893
894         while (optsize > 0 && *opt != EOLOPT) {
895                 if (*opt == NOOPOPT) {
896                         optsize--;
897                         opt++;
898                         continue;
899                 }
900                 optlen = opt[1];
901                 if (optlen < 2 || optlen > optsize)
902                         break;
903                 switch (*opt) {
904                         case MSSOPT:
905                                 if (optlen == MSS_LENGTH)
906                                         tcph->mss = nhgets(opt + 2);
907                                 break;
908                         case WSOPT:
909                                 if (optlen == WS_LENGTH && *(opt + 2) <= MAX_WS_VALUE)
910                                         tcph->ws = HaveWS | *(opt + 2);
911                                 break;
912                         case SACK_OK_OPT:
913                                 if (optlen == SACK_OK_LENGTH)
914                                         tcph->sack_ok = TRUE;
915                                 break;
916                         case SACK_OPT:
917                                 parse_inbound_sacks(tcph, opt, optlen);
918                                 break;
919                         case TS_OPT:
920                                 if (optlen == TS_LENGTH) {
921                                         tcph->ts_val = nhgetl(opt + 2);
922                                         tcph->ts_ecr = nhgetl(opt + 6);
923                                 }
924                                 break;
925                 }
926                 optsize -= optlen;
927                 opt += optlen;
928         }
929 }
930
931 /* Helper, clears the opts.  We'll later set them with e.g. parse_inbound_opts,
932  * set them manually, or something else. */
933 static void clear_tcph_opts(Tcp *tcph)
934 {
935         tcph->mss = 0;
936         tcph->ws = 0;
937         tcph->sack_ok = FALSE;
938         tcph->nr_sacks = 0;
939         tcph->ts_val = 0;
940         tcph->ts_ecr = 0;
941 }
942
943 static int ntohtcp6(Tcp *tcph, struct block **bpp)
944 {
945         Tcp6hdr *h;
946         uint16_t hdrlen;
947
948         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
949         if (*bpp == NULL)
950                 return -1;
951
952         h = (Tcp6hdr *) ((*bpp)->rp);
953         tcph->source = nhgets(h->tcpsport);
954         tcph->dest = nhgets(h->tcpdport);
955         tcph->seq = nhgetl(h->tcpseq);
956         tcph->ack = nhgetl(h->tcpack);
957         hdrlen = (h->tcpflag[0] >> 2) & ~3;
958         if (hdrlen < TCP6_HDRSIZE) {
959                 freeblist(*bpp);
960                 return -1;
961         }
962
963         tcph->flags = h->tcpflag[1];
964         tcph->wnd = nhgets(h->tcpwin);
965         tcph->urg = nhgets(h->tcpurg);
966         clear_tcph_opts(tcph);
967         tcph->len = nhgets(h->ploadlen) - hdrlen;
968
969         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
970         if (*bpp == NULL)
971                 return -1;
972         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP6_HDRSIZE);
973         return hdrlen;
974 }
975
976 static int ntohtcp4(Tcp *tcph, struct block **bpp)
977 {
978         Tcp4hdr *h;
979         uint16_t hdrlen;
980
981         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
982         if (*bpp == NULL)
983                 return -1;
984
985         h = (Tcp4hdr *) ((*bpp)->rp);
986         tcph->source = nhgets(h->tcpsport);
987         tcph->dest = nhgets(h->tcpdport);
988         tcph->seq = nhgetl(h->tcpseq);
989         tcph->ack = nhgetl(h->tcpack);
990
991         hdrlen = (h->tcpflag[0] >> 2) & ~3;
992         if (hdrlen < TCP4_HDRSIZE) {
993                 freeblist(*bpp);
994                 return -1;
995         }
996
997         tcph->flags = h->tcpflag[1];
998         tcph->wnd = nhgets(h->tcpwin);
999         tcph->urg = nhgets(h->tcpurg);
1000         clear_tcph_opts(tcph);
1001         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1002
1003         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1004         if (*bpp == NULL)
1005                 return -1;
1006         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP4_HDRSIZE);
1007         return hdrlen;
1008 }
1009
1010 /*
1011  *  For outgoing calls, generate an initial sequence
1012  *  number and put a SYN on the send queue
1013  */
1014 static void tcpsndsyn(struct conv *s, Tcpctl *tcb)
1015 {
1016         urandom_read(&tcb->iss, sizeof(tcb->iss));
1017         tcb->rttseq = tcb->iss;
1018         tcb->snd.wl2 = tcb->iss;
1019         tcb->snd.una = tcb->iss;
1020         tcb->snd.rtx = tcb->rttseq;
1021         tcb->snd.nxt = tcb->rttseq;
1022         tcb->flgcnt++;
1023         tcb->flags |= FORCE;
1024         tcb->sndsyntime = NOW;
1025
1026         /* set desired mss and scale */
1027         tcb->mss = tcpmtu(tcb->ifc, s->ipversion, &tcb->scale);
1028 }
1029
/* Send a RST in response to a segment we have no (usable) connection for.
 * source/dest/length describe the offending segment; 'seg' holds its parsed
 * header and is clobbered to build the outgoing RST. */
static void sndrst(struct Proto *tcp, uint8_t *source, uint8_t *dest,
                   uint16_t length, Tcp *seg, uint8_t version, char *reason)
{
        struct block *hbp;
        uint8_t rflags;
        struct tcppriv *tpriv;
        Tcp4hdr ph4;
        Tcp6hdr ph6;

        netlog(tcp->f, Logtcpreset, "sndrst: %s\n", reason);

        tpriv = tcp->priv;

        /* never answer a RST with a RST */
        if (seg->flags & RST)
                return;

        /* make pseudo header */
        switch (version) {
                case V4:
                        memset(&ph4, 0, sizeof(ph4));
                        ph4.vihl = IP_VER4;
                        /* swap addresses/ports: we reply to the sender */
                        v6tov4(ph4.tcpsrc, dest);
                        v6tov4(ph4.tcpdst, source);
                        ph4.proto = IP_TCPPROTO;
                        hnputs(ph4.tcplen, TCP4_HDRSIZE);
                        hnputs(ph4.tcpsport, seg->dest);
                        hnputs(ph4.tcpdport, seg->source);
                        break;
                case V6:
                        memset(&ph6, 0, sizeof(ph6));
                        ph6.vcf[0] = IP_VER6;
                        ipmove(ph6.tcpsrc, dest);
                        ipmove(ph6.tcpdst, source);
                        ph6.proto = IP_TCPPROTO;
                        hnputs(ph6.ploadlen, TCP6_HDRSIZE);
                        hnputs(ph6.tcpsport, seg->dest);
                        hnputs(ph6.tcpdport, seg->source);
                        break;
                default:
                        panic("sndrst: version %d", version);
        }

        tpriv->stats[OutRsts]++;
        rflags = RST;

        /* convince the other end that this reset is in band */
        if (seg->flags & ACK) {
                seg->seq = seg->ack;
                seg->ack = 0;
        } else {
                rflags |= ACK;
                seg->ack = seg->seq;
                seg->seq = 0;
                /* SYN and FIN each consume one sequence number */
                if (seg->flags & SYN)
                        seg->ack++;
                seg->ack += length;
                if (seg->flags & FIN)
                        seg->ack++;
        }
        seg->flags = rflags;
        seg->wnd = 0;
        seg->urg = 0;
        seg->mss = 0;
        seg->ws = 0;
        seg->sack_ok = FALSE;
        seg->nr_sacks = 0;
        /* seg->ts_val is already set with their timestamp */
        switch (version) {
                case V4:
                        /* NULL tcb: no scaling / offload state for a raw RST */
                        hbp = htontcp4(seg, NULL, &ph4, NULL);
                        if (hbp == NULL)
                                return;
                        ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
                        break;
                case V6:
                        hbp = htontcp6(seg, NULL, &ph6, NULL);
                        if (hbp == NULL)
                                return;
                        ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
                        break;
                default:
                        panic("sndrst2: version %d", version);
        }
}
1114
1115 /*
1116  *  send a reset to the remote side and close the conversation
1117  *  called with s qlocked
1118  */
1119 static void tcphangup(struct conv *s)
1120 {
1121         ERRSTACK(1);
1122         Tcp seg;
1123         Tcpctl *tcb;
1124         struct block *hbp;
1125
1126         tcb = (Tcpctl *) s->ptcl;
1127         if (ipcmp(s->raddr, IPnoaddr)) {
1128                 /* discard error style, poperror regardless */
1129                 if (!waserror()) {
1130                         seg.flags = RST | ACK;
1131                         seg.ack = tcb->rcv.nxt;
1132                         tcb->last_ack_sent = seg.ack;
1133                         tcb->rcv.una = 0;
1134                         seg.seq = tcb->snd.nxt;
1135                         seg.wnd = 0;
1136                         seg.urg = 0;
1137                         seg.mss = 0;
1138                         seg.ws = 0;
1139                         seg.sack_ok = FALSE;
1140                         seg.nr_sacks = 0;
1141                         seg.ts_val = tcb->ts_recent;
1142                         switch (s->ipversion) {
1143                                 case V4:
1144                                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1145                                         hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
1146                                         ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1147                                         break;
1148                                 case V6:
1149                                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1150                                         hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
1151                                         ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1152                                         break;
1153                                 default:
1154                                         panic("tcphangup: version %d", s->ipversion);
1155                         }
1156                 }
1157                 poperror();
1158         }
1159         localclose(s, NULL);
1160 }
1161
1162 /*
1163  *  (re)send a SYN ACK
1164  */
1165 static int sndsynack(struct Proto *tcp, Limbo *lp)
1166 {
1167         struct block *hbp;
1168         Tcp4hdr ph4;
1169         Tcp6hdr ph6;
1170         Tcp seg;
1171         int scale;
1172         uint8_t flag = 0;
1173
1174         /* make pseudo header */
1175         switch (lp->version) {
1176                 case V4:
1177                         memset(&ph4, 0, sizeof(ph4));
1178                         ph4.vihl = IP_VER4;
1179                         v6tov4(ph4.tcpsrc, lp->laddr);
1180                         v6tov4(ph4.tcpdst, lp->raddr);
1181                         ph4.proto = IP_TCPPROTO;
1182                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1183                         hnputs(ph4.tcpsport, lp->lport);
1184                         hnputs(ph4.tcpdport, lp->rport);
1185                         break;
1186                 case V6:
1187                         memset(&ph6, 0, sizeof(ph6));
1188                         ph6.vcf[0] = IP_VER6;
1189                         ipmove(ph6.tcpsrc, lp->laddr);
1190                         ipmove(ph6.tcpdst, lp->raddr);
1191                         ph6.proto = IP_TCPPROTO;
1192                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1193                         hnputs(ph6.tcpsport, lp->lport);
1194                         hnputs(ph6.tcpdport, lp->rport);
1195                         break;
1196                 default:
1197                         panic("sndrst: version %d", lp->version);
1198         }
1199         lp->ifc = findipifc(tcp->f, lp->laddr, 0);
1200
1201         seg.seq = lp->iss;
1202         seg.ack = lp->irs + 1;
1203         seg.flags = SYN | ACK;
1204         seg.urg = 0;
1205         seg.mss = tcpmtu(lp->ifc, lp->version, &scale);
1206         seg.wnd = QMAX;
1207         seg.ts_val = lp->ts_val;
1208         seg.nr_sacks = 0;
1209
1210         /* if the other side set scale, we should too */
1211         if (lp->rcvscale) {
1212                 seg.ws = scale;
1213                 lp->sndscale = scale;
1214         } else {
1215                 seg.ws = 0;
1216                 lp->sndscale = 0;
1217         }
1218         if (SACK_SUPPORTED)
1219                 seg.sack_ok = lp->sack_ok;
1220         else
1221                 seg.sack_ok = FALSE;
1222
1223         switch (lp->version) {
1224                 case V4:
1225                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1226                         if (hbp == NULL)
1227                                 return -1;
1228                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1229                         break;
1230                 case V6:
1231                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1232                         if (hbp == NULL)
1233                                 return -1;
1234                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1235                         break;
1236                 default:
1237                         panic("sndsnack: version %d", lp->version);
1238         }
1239         lp->lastsend = NOW;
1240         return 0;
1241 }
1242
/* Hash the low two bytes of IP address 'a' plus port 'p' into a limbo
 * hash-table bucket index (LHTMASK buckets) */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
1244
1245 /*
1246  *  put a call into limbo and respond with a SYN ACK
1247  *
1248  *  called with proto locked
1249  */
1250 static void limbo(struct conv *s, uint8_t *source, uint8_t *dest, Tcp *seg,
1251                   int version)
1252 {
1253         Limbo *lp, **l;
1254         struct tcppriv *tpriv;
1255         int h;
1256
1257         tpriv = s->p->priv;
1258         h = hashipa(source, seg->source);
1259
1260         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1261                 lp = *l;
1262                 if (lp->lport != seg->dest || lp->rport != seg->source
1263                         || lp->version != version)
1264                         continue;
1265                 if (ipcmp(lp->raddr, source) != 0)
1266                         continue;
1267                 if (ipcmp(lp->laddr, dest) != 0)
1268                         continue;
1269
1270                 /* each new SYN restarts the retransmits */
1271                 lp->irs = seg->seq;
1272                 break;
1273         }
1274         lp = *l;
1275         if (lp == NULL) {
1276                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1277                         lp = tpriv->lht[h];
1278                         tpriv->lht[h] = lp->next;
1279                         lp->next = NULL;
1280                 } else {
1281                         lp = kzmalloc(sizeof(*lp), 0);
1282                         if (lp == NULL)
1283                                 return;
1284                         tpriv->nlimbo++;
1285                 }
1286                 *l = lp;
1287                 lp->version = version;
1288                 ipmove(lp->laddr, dest);
1289                 ipmove(lp->raddr, source);
1290                 lp->lport = seg->dest;
1291                 lp->rport = seg->source;
1292                 lp->mss = seg->mss;
1293                 lp->rcvscale = seg->ws;
1294                 lp->sack_ok = seg->sack_ok;
1295                 lp->irs = seg->seq;
1296                 lp->ts_val = seg->ts_val;
1297                 urandom_read(&lp->iss, sizeof(lp->iss));
1298         }
1299
1300         if (sndsynack(s->p, lp) < 0) {
1301                 *l = lp->next;
1302                 tpriv->nlimbo--;
1303                 kfree(lp);
1304         }
1305 }
1306
1307 /*
1308  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1309  */
1310 static void limborexmit(struct Proto *tcp)
1311 {
1312         struct tcppriv *tpriv;
1313         Limbo **l, *lp;
1314         int h;
1315         int seen;
1316         uint64_t now;
1317
1318         tpriv = tcp->priv;
1319
1320         if (!canqlock(&tcp->qlock))
1321                 return;
1322         seen = 0;
1323         now = NOW;
1324         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1325                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1326                         lp = *l;
1327                         seen++;
1328                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1329                                 continue;
1330
1331                         /* time it out after 1 second */
1332                         if (++(lp->rexmits) > 5) {
1333                                 tpriv->nlimbo--;
1334                                 *l = lp->next;
1335                                 kfree(lp);
1336                                 continue;
1337                         }
1338
1339                         /* if we're being attacked, don't bother resending SYN ACK's */
1340                         if (tpriv->nlimbo > 100)
1341                                 continue;
1342
1343                         if (sndsynack(tcp, lp) < 0) {
1344                                 tpriv->nlimbo--;
1345                                 *l = lp->next;
1346                                 kfree(lp);
1347                                 continue;
1348                         }
1349
1350                         l = &lp->next;
1351                 }
1352         }
1353         qunlock(&tcp->qlock);
1354 }
1355
1356 /*
1357  *  lookup call in limbo.  if found, throw it out.
1358  *
1359  *  called with proto locked
1360  */
1361 static void limborst(struct conv *s, Tcp *segp, uint8_t *src, uint8_t *dst,
1362                      uint8_t version)
1363 {
1364         Limbo *lp, **l;
1365         int h;
1366         struct tcppriv *tpriv;
1367
1368         tpriv = s->p->priv;
1369
1370         /* find a call in limbo */
1371         h = hashipa(src, segp->source);
1372         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1373                 lp = *l;
1374                 if (lp->lport != segp->dest || lp->rport != segp->source
1375                         || lp->version != version)
1376                         continue;
1377                 if (ipcmp(lp->laddr, dst) != 0)
1378                         continue;
1379                 if (ipcmp(lp->raddr, src) != 0)
1380                         continue;
1381
1382                 /* RST can only follow the SYN */
1383                 if (segp->seq == lp->irs + 1) {
1384                         tpriv->nlimbo--;
1385                         *l = lp->next;
1386                         kfree(lp);
1387                 }
1388                 break;
1389         }
1390 }
1391
1392 /* The advertised MSS (e.g. 1460) includes any per-packet TCP options, such as
1393  * TCP timestamps.  A given packet will contain mss bytes, but only typical_mss
1394  * bytes of *data*.  If we know we'll use those options, we should adjust our
1395  * typical_mss, which will affect the cwnd. */
1396 static void adjust_typical_mss_for_opts(Tcp *tcph, Tcpctl *tcb)
1397 {
1398         uint16_t opt_size = 0;
1399
1400         if (tcph->ts_val)
1401                 opt_size += TS_LENGTH + TS_SEND_PREPAD;
1402         opt_size = ROUNDUP(opt_size, 4);
1403         tcb->typical_mss -= opt_size;
1404 }
1405
1406 /*
1407  *  come here when we finally get an ACK to our SYN-ACK.
1408  *  lookup call in limbo.  if found, create a new conversation
1409  *
1410  *  called with proto locked
1411  */
1412 static struct conv *tcpincoming(struct conv *s, Tcp *segp, uint8_t *src,
1413                                                                 uint8_t *dst, uint8_t version)
1414 {
1415         struct conv *new;
1416         Tcpctl *tcb;
1417         struct tcppriv *tpriv;
1418         Tcp4hdr *h4;
1419         Tcp6hdr *h6;
1420         Limbo *lp, **l;
1421         int h;
1422
1423         /* unless it's just an ack, it can't be someone coming out of limbo */
1424         if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
1425                 return NULL;
1426
1427         tpriv = s->p->priv;
1428
1429         /* find a call in limbo */
1430         h = hashipa(src, segp->source);
1431         for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
1432                 netlog(s->p->f, Logtcp,
1433                            "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
1434                            segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
1435                            lp->lport, version, lp->version);
1436
1437                 if (lp->lport != segp->dest || lp->rport != segp->source
1438                         || lp->version != version)
1439                         continue;
1440                 if (ipcmp(lp->laddr, dst) != 0)
1441                         continue;
1442                 if (ipcmp(lp->raddr, src) != 0)
1443                         continue;
1444
1445                 /* we're assuming no data with the initial SYN */
1446                 if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
1447                         netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
1448                                    segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
1449                         lp = NULL;
1450                 } else {
1451                         tpriv->nlimbo--;
1452                         *l = lp->next;
1453                 }
1454                 break;
1455         }
1456         if (lp == NULL)
1457                 return NULL;
1458
1459         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1460         if (new == NULL)
1461                 return NULL;
1462
1463         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1464         tcb = (Tcpctl *) new->ptcl;
1465         tcb->flags &= ~CLONE;
1466         tcb->timer.arg = new;
1467         tcb->timer.state = TcptimerOFF;
1468         tcb->acktimer.arg = new;
1469         tcb->acktimer.state = TcptimerOFF;
1470         tcb->katimer.arg = new;
1471         tcb->katimer.state = TcptimerOFF;
1472         tcb->rtt_timer.arg = new;
1473         tcb->rtt_timer.state = TcptimerOFF;
1474
1475         tcb->irs = lp->irs;
1476         tcb->rcv.nxt = tcb->irs + 1;
1477         tcb->rcv.urg = tcb->rcv.nxt;
1478
1479         tcb->iss = lp->iss;
1480         tcb->rttseq = tcb->iss;
1481         tcb->snd.wl2 = tcb->iss;
1482         tcb->snd.una = tcb->iss + 1;
1483         tcb->snd.rtx = tcb->iss + 1;
1484         tcb->snd.nxt = tcb->iss + 1;
1485         tcb->flgcnt = 0;
1486         tcb->flags |= SYNACK;
1487
1488         /* our sending max segment size cannot be bigger than what he asked for */
1489         if (lp->mss != 0 && lp->mss < tcb->mss) {
1490                 tcb->mss = lp->mss;
1491                 tcb->typical_mss = tcb->mss;
1492         }
1493         adjust_typical_mss_for_opts(segp, tcb);
1494
1495         /* Here's where we record the previously-decided header options.  They were
1496          * actually decided on when we agreed to them in the SYNACK we sent.  We
1497          * didn't create an actual TCB until now, so we can copy those decisions out
1498          * of the limbo tracker and into the TCB. */
1499         tcb->ifc = lp->ifc;
1500         tcb->sack_ok = lp->sack_ok;
1501         /* window scaling */
1502         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1503         tcb_check_tso(tcb);
1504
1505         tcb->snd.wnd = segp->wnd;
1506         tcb->cwind = tcb->typical_mss * CWIND_SCALE;
1507
1508         /* set initial round trip time */
1509         tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
1510         tcpsynackrtt(new);
1511
1512         kfree(lp);
1513
1514         /* set up proto header */
1515         switch (version) {
1516                 case V4:
1517                         h4 = &tcb->protohdr.tcp4hdr;
1518                         memset(h4, 0, sizeof(*h4));
1519                         h4->proto = IP_TCPPROTO;
1520                         hnputs(h4->tcpsport, new->lport);
1521                         hnputs(h4->tcpdport, new->rport);
1522                         v6tov4(h4->tcpsrc, dst);
1523                         v6tov4(h4->tcpdst, src);
1524                         break;
1525                 case V6:
1526                         h6 = &tcb->protohdr.tcp6hdr;
1527                         memset(h6, 0, sizeof(*h6));
1528                         h6->proto = IP_TCPPROTO;
1529                         hnputs(h6->tcpsport, new->lport);
1530                         hnputs(h6->tcpdport, new->rport);
1531                         ipmove(h6->tcpsrc, dst);
1532                         ipmove(h6->tcpdst, src);
1533                         break;
1534                 default:
1535                         panic("tcpincoming: version %d", new->ipversion);
1536         }
1537
1538         tcpsetstate(new, Established);
1539
1540         iphtadd(&tpriv->ht, new);
1541
1542         return new;
1543 }
1544
1545 /*
1546  *  use the time between the first SYN and it's ack as the
1547  *  initial round trip time
1548  */
1549 static void tcpsynackrtt(struct conv *s)
1550 {
1551         Tcpctl *tcb;
1552         uint64_t delta;
1553         struct tcppriv *tpriv;
1554
1555         tcb = (Tcpctl *) s->ptcl;
1556         tpriv = s->p->priv;
1557
1558         delta = NOW - tcb->sndsyntime;
1559         tcb->srtt = delta;
1560         tcb->mdev = delta / 2;
1561
1562         /* halt round trip timer */
1563         tcphalt(tpriv, &tcb->rtt_timer);
1564 }
1565
1566 /* For LFNs (long/fat), our default tx queue doesn't hold enough data, and TCP
1567  * blocks on the application - even if the app already has the data ready to go.
1568  * We need to hold the sent, unacked data (1x cwnd), plus all the data we might
1569  * send next RTT (1x cwnd).  Note this is called after cwnd was expanded. */
1570 static void adjust_tx_qio_limit(struct conv *s)
1571 {
1572         Tcpctl *tcb = (Tcpctl *) s->ptcl;
1573         size_t ideal_limit = tcb->cwind * 2;
1574
1575         /* This is called for every ACK, and it's not entirely free to update the
1576          * limit (locks, CVs, taps).  Updating in chunks of mss seems reasonable.
1577          * During SS, we'll update this on most ACKs (given each ACK increased the
1578          * cwind by > MSS).
1579          *
1580          * We also don't want a lot of tiny blocks from the user, but the way qio
1581          * works, you can put in as much as you want (Maxatomic) and then get
1582          * flow-controlled. */
1583         if (qgetlimit(s->wq) + tcb->typical_mss < ideal_limit)
1584                 qsetlimit(s->wq, ideal_limit);
1585         /* TODO: we could shrink the qio limit too, if we had a better idea what the
1586          * actual threshold was.  We want the limit to be the 'stable' cwnd * 2. */
1587 }
1588
/* Attempts to merge later sacks into sack 'into' (index in the array) */
static void merge_sacks_into(Tcpctl *tcb, int into)
{
        struct sack_block *into_sack = &tcb->snd.sacks[into];
        struct sack_block *tcb_sack;
        int shift = 0;

        /* Sacks are kept sorted by left edge; absorb each following sack that
         * overlaps or abuts 'into', widening its right edge as we go */
        for (int i = into + 1; i < tcb->snd.nr_sacks; i++) {
                tcb_sack = &tcb->snd.sacks[i];
                if (seq_lt(into_sack->right, tcb_sack->left))
                        break;
                if (seq_gt(tcb_sack->right, into_sack->right))
                        into_sack->right = tcb_sack->right;
                shift++;
        }
        /* close the gap left by the 'shift' absorbed sacks */
        if (shift) {
                memmove(tcb->snd.sacks + into + 1,
                        tcb->snd.sacks + into + 1 + shift,
                        sizeof(struct sack_block) * (tcb->snd.nr_sacks - into - 1
                                                             - shift));
                tcb->snd.nr_sacks -= shift;
        }
}
1612
1613 /* If we update a sack, it means they received a packet (possibly out of order),
1614  * but they have not received earlier packets.  Otherwise, they would do a full
1615  * ACK.
1616  *
1617  * The trick is in knowing whether the reception growing this sack is due to a
1618  * retrans or due to packets from before our last loss event.  The rightmost
1619  * sack tends to grow a lot with packets we sent before the loss.  However,
1620  * intermediate sacks that grow are signs of a loss, since they only grow as a
1621  * result of retrans.
1622  *
1623  * This is only true for the first time through a retrans.  After we've gone
1624  * through a full retrans blast, the sack that hinted at the retrans loss (and
1625  * there could be multiple of them!) will continue to grow.  We could come up
1626  * with some tracking for this, but instead we'll just do a one-time deal.  You
1627  * can recover from one detected sack retrans loss.  After that, you'll have to
1628  * use the RTO.
1629  *
1630  * This won't catch some things, like a sack that grew and merged with the
1631  * rightmost sack.  This also won't work if you have a single sack.  We can't
1632  * tell where the retrans ends and the sending begins. */
1633 static bool sack_hints_at_loss(Tcpctl *tcb, struct sack_block *tcb_sack)
1634 {
1635         if (tcb->snd.recovery != SACK_RETRANS_RECOVERY)
1636                 return FALSE;
1637         return &tcb->snd.sacks[tcb->snd.nr_sacks - 1] != tcb_sack;
1638 }
1639
1640 static bool sack_contains(struct sack_block *tcb_sack, uint32_t seq)
1641 {
1642         return seq_le(tcb_sack->left, seq) && seq_lt(seq, tcb_sack->right);
1643 }
1644
1645 /* Debugging helper! */
1646 static void sack_asserter(Tcpctl *tcb, char *str)
1647 {
1648         struct sack_block *tcb_sack;
1649
1650         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
1651                 tcb_sack = &tcb->snd.sacks[i];
1652                 /* Checking invariants: snd.rtx is never inside a sack, sacks are always
1653                  * mutually exclusive. */
1654                 if (sack_contains(tcb_sack, tcb->snd.rtx) ||
1655                     ((i + 1 < tcb->snd.nr_sacks) && seq_ge(tcb_sack->right,
1656                                                                (tcb_sack + 1)->left))) {
1657                         printk("SACK ASSERT ERROR at %s\n", str);
1658                         printk("rtx %u una %u nxt %u, sack [%u, %u)\n",
1659                                tcb->snd.rtx, tcb->snd.una, tcb->snd.nxt, tcb_sack->left,
1660                                    tcb_sack->right);
1661                         for (int i = 0; i < tcb->snd.nr_sacks; i++)
1662                                 printk("\t %d: [%u, %u)\n", i, tcb->snd.sacks[i].left,
1663                                        tcb->snd.sacks[i].right);
1664                         backtrace();
1665                         panic("");
1666                 }
1667         }
1668 }
1669
/* Updates bookkeeping whenever a sack is added or updated */
static void sack_has_changed(struct conv *s, Tcpctl *tcb,
                             struct sack_block *tcb_sack)
{
	/* Due to the change, snd.rtx might be in the middle of this sack.  Advance
	 * it to the right edge. */
	if (sack_contains(tcb_sack, tcb->snd.rtx))
		tcb->snd.rtx = tcb_sack->right;

	/* This is a sack for something we retransed and we think it means there was
	 * another loss.  Instead of waiting for the RTO, we can take action. */
	if (sack_hints_at_loss(tcb, tcb_sack)) {
		/* As with dupacks, wait for TCPREXMTTHRESH hints before declaring
		 * a loss, so reordering doesn't trigger a spurious recovery. */
		if (++tcb->snd.sack_loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: sack rxmit loss: snd.rtx %u, sack [%u,%u), una %u, recovery_pt %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.rtx, tcb_sack->left, tcb_sack->right, tcb->snd.una,
			       tcb->snd.recovery_pt);
			/* Redo retrans, but keep the sacks and recovery point */
			tcp_loss_event(s, tcb);
			tcb->snd.rtx = tcb->snd.una;
			tcb->snd.sack_loss_hint = 0;
			/* Act like an RTO.  We just detected it earlier.  This prevents us
			 * from getting another sack hint loss this recovery period and from
			 * advancing the opportunistic right edge. */
			tcb->snd.recovery = RTO_RETRANS_RECOVERY;
			/* We didn't actually time out yet and we expect to keep getting
			 * sacks, so we don't want to flush or worry about in_flight.  If we
			 * messed something up, the RTO will still fire. */
			set_in_flight(tcb);
		}
	}
}
1703
1704 /* Advances tcb_sack's right edge, if new_right is farther, and updates the
1705  * bookkeeping due to the change. */
1706 static void update_right_edge(struct conv *s, Tcpctl *tcb,
1707                               struct sack_block *tcb_sack, uint32_t new_right)
1708 {
1709         if (seq_le(new_right, tcb_sack->right))
1710                 return;
1711         tcb_sack->right = new_right;
1712         merge_sacks_into(tcb, tcb_sack - tcb->snd.sacks);
1713         sack_has_changed(s, tcb, tcb_sack);
1714 }
1715
/* Folds seg_sack into the TCB's sorted array of snd sacks: grows an existing
 * sack when possible, otherwise inserts a new one in sorted position.  Keeps
 * the array sorted by left edge and mutually exclusive (via merge_sacks_into)
 * and runs sack_has_changed() whenever a sack is added or altered.
 *
 * When the array is full, we only take over a slot if doing so maintains the
 * rightmost sack - see the comment above update_sacks() for why. */
static void update_or_insert_sack(struct conv *s, Tcpctl *tcb,
                                  struct sack_block *seg_sack)
{
	struct sack_block *tcb_sack;

	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb_sack->left, seg_sack->left)) {
			/* This includes adjacent (which I've seen!) and overlap. */
			if (seq_le(seg_sack->left, tcb_sack->right)) {
				update_right_edge(s, tcb, tcb_sack, seg_sack->right);
				return;
			}
			continue;
		}
		/* Update existing sack */
		if (tcb_sack->left == seg_sack->left) {
			update_right_edge(s, tcb, tcb_sack, seg_sack->right);
			return;
		}
		/* Found our slot */
		if (seq_gt(tcb_sack->left, seg_sack->left)) {
			if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
				/* Out of room, but it is possible this sack overlaps later
				 * sacks, including the max sack's right edge. */
				if (seq_ge(seg_sack->right, tcb_sack->left)) {
					/* Take over the sack */
					tcb_sack->left = seg_sack->left;
					update_right_edge(s, tcb, tcb_sack, seg_sack->right);
				}
				return;
			}
			/* O/W, it's our slot and we have room (at least one spot). */
			memmove(&tcb->snd.sacks[i + 1], &tcb->snd.sacks[i],
				sizeof(struct sack_block) * (tcb->snd.nr_sacks - i));
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			tcb->snd.nr_sacks++;
			merge_sacks_into(tcb, i);
			sack_has_changed(s, tcb, tcb_sack);
			return;
		}
	}
	/* Fell off the loop: seg_sack is to the right of every existing sack. */
	if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
		/* We didn't find space in the sack array. */
		tcb_sack = &tcb->snd.sacks[MAX_NR_SND_SACKS - 1];
		/* Need to always maintain the rightmost sack, discarding the prev */
		if (seq_gt(seg_sack->right, tcb_sack->right)) {
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			sack_has_changed(s, tcb, tcb_sack);
		}
		return;
	}
	/* Append a fresh sack as the new rightmost entry. */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks];
	tcb->snd.nr_sacks++;
	tcb_sack->left = seg_sack->left;
	tcb_sack->right = seg_sack->right;
	sack_has_changed(s, tcb, tcb_sack);
}
1776
1777 /* Given the packet seg, track the sacks in TCB.  There are a few things: if seg
1778  * acks new data, some sacks might no longer be needed.  Some sacks might grow,
1779  * we might add new sacks, either of which can cause a merger.
1780  *
1781  * The important thing is that we always have the max sack entry: it must be
1782  * inserted for sure and findable.  We need that for our measurement of what
1783  * packets are in the network.
1784  *
1785  * Note that we keep sacks that are below snd.rtx (and above
1786  * seg.ack/tcb->snd.una) as best we can - we don't prune them.  We'll need those
1787  * for the in_flight estimate.
1788  *
1789  * When we run out of room, we'll have to throw away a sack.  Anything we throw
1790  * away below snd.rtx will be counted as 'in flight', even though it isn't.  If
1791  * we throw away something greater than snd.rtx, we'll also retrans it.  For
1792  * simplicity, we throw-away / replace the rightmost sack, since we're always
1793  * maintaining a highest sack. */
static void update_sacks(struct conv *s, Tcpctl *tcb, Tcp *seg)
{
	int prune = 0;
	struct sack_block *tcb_sack;

	/* Phase 1: prune sacks the cumulative ack has passed.  The array is
	 * sorted by left edge, so the prunable sacks form a prefix. */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		/* For the equality case, if they acked up to, but not including an old
		 * sack, they must have reneged it.  Otherwise they would have acked
		 * beyond the sack. */
		if (seq_lt(seg->ack, tcb_sack->left))
			break;
		prune++;
	}
	if (prune) {
		memmove(tcb->snd.sacks, tcb->snd.sacks + prune,
			sizeof(struct sack_block) * (tcb->snd.nr_sacks - prune));
		tcb->snd.nr_sacks -= prune;
	}
	/* Phase 2: fold in each sack from the segment, skipping garbage. */
	for (int i = 0; i < seg->nr_sacks; i++) {
		/* old sacks */
		if (seq_lt(seg->sacks[i].left, seg->ack))
			continue;
		/* buggy sack: out of range */
		if (seq_gt(seg->sacks[i].right, tcb->snd.nxt))
			continue;
		update_or_insert_sack(s, tcb, &seg->sacks[i]);
	}
}
1823
1824 /* This is a little bit of an under estimate, since we assume a packet is lost
1825  * once we have any sacks above it.  Overall, it's at most 2 * MSS of an
1826  * overestimate.
1827  *
1828  * If we have no sacks (either reneged or never used) we'll assume all packets
1829  * above snd.rtx are lost.  This will be the case for sackless fast rxmit
1830  * (Dong's stuff) or for a timeout.  In the former case, this is probably not
1831  * true, and in_flight should be higher, but we have no knowledge without the
1832  * sacks. */
static void set_in_flight(Tcpctl *tcb)
{
	struct sack_block *tcb_sack;
	uint32_t in_flight = 0;
	uint32_t from;

	/* No sacks: everything from una up to the retrans point counts. */
	if (!tcb->snd.nr_sacks) {
		tcb->snd.in_flight = tcb->snd.rtx - tcb->snd.una;
		return;
	}

	/* Everything to the right of the unsacked */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
	in_flight += tcb->snd.nxt - tcb_sack->right;

	/* Everything retransed (from una to snd.rtx, minus sacked regions.  Note
	 * we only retrans at most the last sack's left edge.  snd.rtx will be
	 * advanced to the right edge of some sack (possibly the last one). */
	from = tcb->snd.una;
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_ge(tcb_sack->left, tcb->snd.rtx))
			break;
		assert(seq_ge(tcb->snd.rtx, tcb_sack->right));
		/* Count the unsacked gap below this sack, then skip the sacked
		 * bytes themselves (they arrived; they aren't in flight). */
		in_flight += tcb_sack->left - from;
		from = tcb_sack->right;
	}
	in_flight += tcb->snd.rtx - from;

	tcb->snd.in_flight = in_flight;
}
1864
1865 static void reset_recovery(struct conv *s, Tcpctl *tcb)
1866 {
1867         netlog(s->p->f, Logtcprxmt,
1868                "%I.%d -> %I.%d: recovery complete, una %u, rtx %u, nxt %u, recovery %u\n",
1869                s->laddr, s->lport, s->raddr, s->rport,
1870                tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, tcb->snd.recovery_pt);
1871         tcb->snd.recovery = 0;
1872         tcb->snd.recovery_pt = 0;
1873         tcb->snd.loss_hint = 0;
1874         tcb->snd.flush_sacks = FALSE;
1875         tcb->snd.sack_loss_hint = 0;
1876 }
1877
1878 static bool is_dup_ack(Tcpctl *tcb, Tcp *seg)
1879 {
1880         /* this is a pure ack w/o window update */
1881         return (seg->ack == tcb->snd.una) &&
1882                (tcb->snd.una != tcb->snd.nxt) &&
1883                (seg->len == 0) &&
1884                (seg->wnd == tcb->snd.wnd);
1885 }
1886
1887 /* If we have sacks, we'll ignore dupacks and look at the sacks ahead of una
1888  * (which are managed by the TCB).  The tcb will not have old sacks (below
1889  * ack/snd.rtx).  Receivers often send sacks below their ack point when we are
1890  * coming out of a loss, and we don't want those to count.
1891  *
1892  * Note the tcb could have sacks (in the future), but the receiver stopped using
1893  * them (reneged).  We'll catch that with the RTO.  If we try to catch it here,
1894  * we could get in a state where we never allow them to renege. */
1895 static bool is_potential_loss(Tcpctl *tcb, Tcp *seg)
1896 {
1897         if (seg->nr_sacks > 0)
1898                 return tcb->snd.nr_sacks > 0;
1899         else
1900                 return is_dup_ack(tcb, seg);
1901 }
1902
1903 /* When we use timestamps for RTTM, RFC 7323 suggests scaling by
1904  * expected_samples (per cwnd).  They say:
1905  *
1906  * ExpectedSamples = ceiling(FlightSize / (SMSS * 2))
1907  *
1908  * However, SMMS * 2 is really "number of bytes expected to be acked in a
1909  * packet.".  We'll use 'acked' to approximate that.  When the receiver uses
1910  * LRO, they'll send back large ACKs, which decreases the number of samples.
1911  *
1912  * If it turns out that all the divides are bad, we can just go back to not
1913  * using expected_samples at all. */
1914 static int expected_samples_ts(Tcpctl *tcb, uint32_t acked)
1915 {
1916         assert(acked);
1917         return MAX(DIV_ROUND_UP(tcb->snd.nxt - tcb->snd.una, acked), 1);
1918 }
1919
1920 /* Updates the RTT, given the currently sampled RTT and the number samples per
1921  * cwnd.  For non-TS RTTM, that'll be 1. */
static void update_rtt(Tcpctl *tcb, int rtt_sample, int expected_samples)
{
	int delta;

	/* A valid RTT sample means the network responded; clear any backoff. */
	tcb->backoff = 0;
	tcb->backedoff = 0;
	if (tcb->srtt == 0) {
		/* First sample: seed srtt with it and mdev at half the sample. */
		tcb->srtt = rtt_sample;
		tcb->mdev = rtt_sample / 2;
	} else {
		/* Exponentially-weighted updates of srtt and mdev, scaled down
		 * by expected_samples so many samples per cwnd don't over-weight
		 * recent data (see the comment above expected_samples_ts()). */
		delta = rtt_sample - tcb->srtt;
		tcb->srtt += (delta >> RTTM_ALPHA_SHIFT) / expected_samples;
		if (tcb->srtt <= 0)
			tcb->srtt = 1;	/* 0 means "unset"; keep srtt positive */
		tcb->mdev += ((abs(delta) - tcb->mdev) >> RTTM_BRAVO_SHIFT) /
			     expected_samples;
		if (tcb->mdev <= 0)
			tcb->mdev = 1;
	}
	/* Recompute the retransmit timer from the new srtt/mdev. */
	tcpsettimer(tcb);
}
1943
/* Processes the ACK-related parts of an incoming segment: advances snd.una,
 * maintains the sack list and in-flight estimate, detects likely losses,
 * updates the send and congestion windows, samples the RTT, and discards
 * newly-acked bytes from the write queue. */
static void update(struct conv *s, Tcp *seg)
{
	int rtt;
	Tcpctl *tcb;
	uint32_t acked, expand;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	/* Ignore acks outside [snd.una, snd.nxt]: old duplicates or acks for
	 * data we never sent. */
	if (!seq_within(seg->ack, tcb->snd.una, tcb->snd.nxt))
		return;

	acked = seg->ack - tcb->snd.una;
	tcb->snd.una = seg->ack;
	if (seq_gt(seg->ack, tcb->snd.rtx))
		tcb->snd.rtx = seg->ack;

	update_sacks(s, tcb, seg);
	set_in_flight(tcb);

	/* We treat either a dupack or forward SACKs as a hint that there is a loss.
	 * The RFCs suggest three dupacks before treating it as a loss (alternative
	 * is reordered packets).  We'll treat three SACKs the same way. */
	if (is_potential_loss(tcb, seg) && !tcb->snd.recovery) {
		tcb->snd.loss_hint++;
		if (tcb->snd.loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: loss hint thresh, nr sacks %u, nxt %u, una %u, cwnd %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.nr_sacks, tcb->snd.nxt, tcb->snd.una, tcb->cwind);
			tcp_loss_event(s, tcb);
			tcb->snd.recovery_pt = tcb->snd.nxt;
			if (tcb->snd.nr_sacks) {
				tcb->snd.recovery = SACK_RETRANS_RECOVERY;
				tcb->snd.flush_sacks = FALSE;
				tcb->snd.sack_loss_hint = 0;
			} else {
				tcb->snd.recovery = FAST_RETRANS_RECOVERY;
			}
			tcprxmit(s);
		}
	}

	/*
	 *  update window
	 */
	if (seq_gt(seg->ack, tcb->snd.wl2)
		|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
		tcb->snd.wnd = seg->wnd;
		tcb->snd.wl2 = seg->ack;
	}

	if (!acked) {
		/*
		 *  don't let us hangup if sending into a closed window and
		 *  we're still getting acks
		 */
		if (tcb->snd.recovery && (tcb->snd.wnd == 0))
			tcb->backedoff = MAXBACKMS / 4;
		return;
	}
	/* At this point, they have acked something new. (positive ack, ack > una).
	 *
	 * If we hadn't reached the threshold for recovery yet, the positive ACK
	 * will reset our loss_hint count. */
	if (!tcb->snd.recovery)
		tcb->snd.loss_hint = 0;
	else if (seq_ge(seg->ack, tcb->snd.recovery_pt))
		reset_recovery(s, tcb);

	/* avoid slow start and timers for SYN acks */
	if ((tcb->flags & SYNACK) == 0) {
		tcb->flags |= SYNACK;
		acked--;
		tcb->flgcnt--;
		goto done;
	}

	/* slow start as long as we're not recovering from lost packets */
	if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
		if (tcb->cwind < tcb->ssthresh) {
			/* We increase the cwind by every byte we receive.  We want to
			 * increase the cwind by one MSS for every MSS that gets ACKed.
			 * Note that multiple MSSs can be ACKed in a single ACK.  If we had
			 * a remainder of acked / MSS, we'd add just that remainder - not 0
			 * or 1 MSS. */
			expand = acked;
		} else {
			/* Every RTT, which consists of CWND bytes, we're supposed to expand
			 * by MSS bytes.  The classic algorithm was
			 *              expand = (tcb->mss * tcb->mss) / tcb->cwind;
			 * which assumes the ACK was for MSS bytes.  Instead, for every
			 * 'acked' bytes, we increase the window by acked / CWND (in units
			 * of MSS). */
			expand = MAX(acked, tcb->typical_mss) * tcb->typical_mss
				 / tcb->cwind;
		}

		/* Clamp expand so cwind neither wraps nor exceeds snd.wnd. */
		if (tcb->cwind + expand < tcb->cwind)
			expand = tcb->snd.wnd - tcb->cwind;
		if (tcb->cwind + expand > tcb->snd.wnd)
			expand = tcb->snd.wnd - tcb->cwind;
		tcb->cwind += expand;
	}
	adjust_tx_qio_limit(s);

	if (tcb->ts_recent) {
		/* Timestamp-based RTT measurement, using the echoed ts_ecr. */
		update_rtt(tcb, abs(milliseconds() - seg->ts_ecr),
		           expected_samples_ts(tcb, acked));
	} else if (tcb->rtt_timer.state == TcptimerON &&
		   seq_ge(seg->ack, tcb->rttseq)) {
		/* Adjust the timers according to the round trip time */
		tcphalt(tpriv, &tcb->rtt_timer);
		if (!tcb->snd.recovery) {
			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
			if (rtt == 0)
				rtt = 1;	/* o/w all close systems will rexmit in 0 time */
			rtt *= MSPTICK;
			update_rtt(tcb, rtt, 1);
		}
	}

done:
	if (qdiscard(s->wq, acked) < acked) {
		tcb->flgcnt--;
		/* This happened due to another bug where acked was very large
		 * (negative), which was interpreted as "hey, one less flag, since they
		 * acked one of our flags (like a SYN).  If flgcnt goes negative,
		 * get_xmit_segment() will attempt to send out large packets. */
		assert(tcb->flgcnt >= 0);
	}

	if (seq_gt(seg->ack, tcb->snd.urg))
		tcb->snd.urg = seg->ack;

	/* Keep the rexmit timer running iff data is still unacked. */
	if (tcb->snd.una != tcb->snd.nxt)
		tcpgo(tpriv, &tcb->timer);
	else
		tcphalt(tpriv, &tcb->timer);

	tcb->backoff = 0;
	tcb->backedoff = 0;
}
2088
2089 static void update_tcb_ts(Tcpctl *tcb, Tcp *seg)
2090 {
2091         /* Get timestamp info from the tcp header.  Even though the timestamps
2092          * aren't sequence numbers, we still need to protect for wraparound.  Though
2093          * if the values were 0, assume that means we need an update.  We could have
2094          * an initial ts_val that appears negative (signed). */
2095         if (!tcb->ts_recent || !tcb->last_ack_sent ||
2096             (seq_ge(seg->ts_val, tcb->ts_recent) &&
2097              seq_le(seg->seq, tcb->last_ack_sent)))
2098                 tcb->ts_recent = seg->ts_val;
2099 }
2100
2101 /* Overlap happens when one sack's left edge is inside another sack. */
2102 static bool sacks_overlap(struct sack_block *x, struct sack_block *y)
2103 {
2104         return (seq_le(x->left, y->left) && seq_le(y->left, x->right)) ||
2105                (seq_le(y->left, x->left) && seq_le(x->left, y->right));
2106 }
2107
2108 static void make_sack_first(Tcpctl *tcb, struct sack_block *tcb_sack)
2109 {
2110         struct sack_block temp;
2111
2112         if (tcb_sack == &tcb->rcv.sacks[0])
2113                 return;
2114         temp = tcb->rcv.sacks[0];
2115         tcb->rcv.sacks[0] = *tcb_sack;
2116         *tcb_sack = temp;
2117 }
2118
2119 /* Track sack in our tcb for a block of data we received.  This handles all the
2120  * stuff: making sure sack is first (since it's the most recent sack change),
2121  * updating or merging sacks, and dropping excess sacks (we only need to
2122  * maintain 3).  Unlike on the snd side, our tcb sacks are *not* sorted. */
static void track_rcv_sack(Tcpctl *tcb, uint32_t left, uint32_t right)
{
	struct sack_block *tcb_sack;
	struct sack_block sack[1];

	if (!tcb->sack_ok)
		return;
	/* Empty range: nothing to track. */
	if (left == right)
		return;
	assert(seq_lt(left, right));
	sack->left = left;
	sack->right = right;
	/* We can reuse an existing sack if we're merging or overlapping. */
	for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
		tcb_sack = &tcb->rcv.sacks[i];
		if (sacks_overlap(tcb_sack, sack)) {
			tcb_sack->left = seq_min(tcb_sack->left, sack->left);
			tcb_sack->right = seq_max(tcb_sack->right, sack->right);
			/* Most recently changed sack goes first in the array. */
			make_sack_first(tcb, tcb_sack);
			return;
		}
	}
	/* We can discard the last sack (right shift) - we should have sent it at
	 * least once by now.  If not, oh well. */
	memmove(tcb->rcv.sacks + 1, tcb->rcv.sacks, sizeof(struct sack_block) *
		MIN(MAX_NR_RCV_SACKS - 1, tcb->rcv.nr_sacks));
	tcb->rcv.sacks[0] = *sack;
	if (tcb->rcv.nr_sacks < MAX_NR_RCV_SACKS)
		tcb->rcv.nr_sacks++;
}
2153
2154 /* Once we receive everything and move rcv.nxt past a sack, we don't need to
2155  * track it.  I've seen Linux report sacks in the past, but we probably
2156  * shouldn't. */
2157 static void drop_old_rcv_sacks(Tcpctl *tcb)
2158 {
2159         struct sack_block *tcb_sack;
2160
2161         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2162                 tcb_sack = &tcb->rcv.sacks[i];
2163                 /* Moving up to or past the left is enough to drop it. */
2164                 if (seq_ge(tcb->rcv.nxt, tcb_sack->left)) {
2165                         memmove(tcb->rcv.sacks + i, tcb->rcv.sacks + i + 1,
2166                                 sizeof(struct sack_block) * (tcb->rcv.nr_sacks - i - 1));
2167                         tcb->rcv.nr_sacks--;
2168                         i--;
2169                 }
2170         }
2171 }
2172
2173 static void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
2174 {
2175         ERRSTACK(1);
2176         Tcp seg;
2177         Tcp4hdr *h4;
2178         Tcp6hdr *h6;
2179         int hdrlen;
2180         Tcpctl *tcb;
2181         uint16_t length;
2182         uint8_t source[IPaddrlen], dest[IPaddrlen];
2183         struct conv *s;
2184         struct Fs *f;
2185         struct tcppriv *tpriv;
2186         uint8_t version;
2187
2188         f = tcp->f;
2189         tpriv = tcp->priv;
2190
2191         tpriv->stats[InSegs]++;
2192
2193         h4 = (Tcp4hdr *) (bp->rp);
2194         h6 = (Tcp6hdr *) (bp->rp);
2195
2196         if ((h4->vihl & 0xF0) == IP_VER4) {
2197                 uint8_t ttl;
2198
2199                 version = V4;
2200                 length = nhgets(h4->length);
2201                 v4tov6(dest, h4->tcpdst);
2202                 v4tov6(source, h4->tcpsrc);
2203
2204                 /* ttl isn't part of the xsum pseudo header, but bypass needs it. */
2205                 ttl = h4->Unused;
2206                 h4->Unused = 0;
2207                 hnputs(h4->tcplen, length - TCP4_PKT);
2208                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2209                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
2210                         tpriv->stats[CsumErrs]++;
2211                         tpriv->stats[InErrs]++;
2212                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2213                         freeblist(bp);
2214                         return;
2215                 }
2216                 h4->Unused = ttl;
2217
2218                 hdrlen = ntohtcp4(&seg, &bp);
2219                 if (hdrlen < 0) {
2220                         tpriv->stats[HlenErrs]++;
2221                         tpriv->stats[InErrs]++;
2222                         netlog(f, Logtcp, "bad tcp hdr len\n");
2223                         return;
2224                 }
2225
2226                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2227                 if (s && s->state == Bypass) {
2228                         bypass_or_drop(s, bp);
2229                         return;
2230                 }
2231
2232                 /* trim the packet to the size claimed by the datagram */
2233                 length -= hdrlen + TCP4_PKT;
2234                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
2235                 if (bp == NULL) {
2236                         tpriv->stats[LenErrs]++;
2237                         tpriv->stats[InErrs]++;
2238                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2239                         return;
2240                 }
2241         } else {
2242                 int ttl = h6->ttl;
2243                 int proto = h6->proto;
2244
2245                 version = V6;
2246                 length = nhgets(h6->ploadlen);
2247                 ipmove(dest, h6->tcpdst);
2248                 ipmove(source, h6->tcpsrc);
2249
2250                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2251                 h6->ttl = proto;
2252                 hnputl(h6->vcf, length);
2253                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2254                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
2255                         tpriv->stats[CsumErrs]++;
2256                         tpriv->stats[InErrs]++;
2257                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2258                         freeblist(bp);
2259                         return;
2260                 }
2261                 h6->ttl = ttl;
2262                 h6->proto = proto;
2263                 hnputs(h6->ploadlen, length);
2264
2265                 hdrlen = ntohtcp6(&seg, &bp);
2266                 if (hdrlen < 0) {
2267                         tpriv->stats[HlenErrs]++;
2268                         tpriv->stats[InErrs]++;
2269                         netlog(f, Logtcp, "bad tcp hdr len\n");
2270                         return;
2271                 }
2272
2273                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2274                 if (s && s->state == Bypass) {
2275                         bypass_or_drop(s, bp);
2276                         return;
2277                 }
2278
2279                 /* trim the packet to the size claimed by the datagram */
2280                 length -= hdrlen;
2281                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2282                 if (bp == NULL) {
2283                         tpriv->stats[LenErrs]++;
2284                         tpriv->stats[InErrs]++;
2285                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2286                         return;
2287                 }
2288         }
2289
2290         /* s, the conv matching the n-tuple, was set above */
2291         if (s == NULL) {
2292                 netlog(f, Logtcpreset, "iphtlook failed: src %I:%u, dst %I:%u\n",
2293                        source, seg.source, dest, seg.dest);
2294 reset:
2295                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2296                 freeblist(bp);
2297                 return;
2298         }
2299
2300         /* lock protocol for unstate Plan 9 invariants.  funcs like limbo or
2301          * incoming might rely on it. */
2302         qlock(&tcp->qlock);
2303
2304         /* if it's a listener, look for the right flags and get a new conv */
2305         tcb = (Tcpctl *) s->ptcl;
2306         if (tcb->state == Listen) {
2307                 if (seg.flags & RST) {
2308                         limborst(s, &seg, source, dest, version);
2309                         qunlock(&tcp->qlock);
2310                         freeblist(bp);
2311                         return;
2312                 }
2313
2314                 /* if this is a new SYN, put the call into limbo */
2315                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2316                         limbo(s, source, dest, &seg, version);
2317                         qunlock(&tcp->qlock);
2318                         freeblist(bp);
2319                         return;
2320                 }
2321
2322                 /* if there's a matching call in limbo, tcpincoming will return it */
2323                 s = tcpincoming(s, &seg, source, dest, version);
2324                 if (s == NULL) {
2325                         qunlock(&tcp->qlock);
2326                         goto reset;
2327                 }
2328         }
2329
2330         /* The rest of the input state machine is run with the control block
2331          * locked and implements the state machine directly out of the RFC.
2332          * Out-of-band data is ignored - it was always a bad idea.
2333          */
2334         tcb = (Tcpctl *) s->ptcl;
2335         if (waserror()) {
2336                 qunlock(&s->qlock);
2337                 nexterror();
2338         }
2339         qlock(&s->qlock);
2340         qunlock(&tcp->qlock);
2341
2342         update_tcb_ts(tcb, &seg);
2343         /* fix up window */
2344         seg.wnd <<= tcb->rcv.scale;
2345
2346         /* every input packet in puts off the keep alive time out */
2347         tcpsetkacounter(tcb);
2348
2349         switch (tcb->state) {
2350                 case Closed:
2351                         sndrst(tcp, source, dest, length, &seg, version,
2352                                    "sending to Closed");
2353                         goto raise;
2354                 case Syn_sent:
2355                         if (seg.flags & ACK) {
2356                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2357                                         sndrst(tcp, source, dest, length, &seg, version,
2358                                                    "bad seq in Syn_sent");
2359                                         goto raise;
2360                                 }
2361                         }
2362                         if (seg.flags & RST) {
2363                                 if (seg.flags & ACK)
2364                                         localclose(s, "connection refused");
2365                                 goto raise;
2366                         }
2367
2368                         if (seg.flags & SYN) {
2369                                 procsyn(s, &seg);
2370                                 if (seg.flags & ACK) {
2371                                         update(s, &seg);
2372                                         tcpsynackrtt(s);
2373                                         tcpsetstate(s, Established);
2374                                         /* Here's where we get the results of header option
2375                                          * negotiations for connections we started. (SYNACK has the
2376                                          * response) */
2377                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2378                                         tcb->sack_ok = seg.sack_ok;
2379                                 } else {
2380                                         sndrst(tcp, source, dest, length, &seg, version,
2381                                                    "Got SYN with no ACK");
2382                                         goto raise;
2383                                 }
2384
2385                                 if (length != 0 || (seg.flags & FIN))
2386                                         break;
2387
2388                                 freeblist(bp);
2389                                 goto output;
2390                         } else
2391                                 freeblist(bp);
2392
2393                         qunlock(&s->qlock);
2394                         poperror();
2395                         return;
2396         }
2397
2398         /*
2399          *  One DOS attack is to open connections to us and then forget about them,
2400          *  thereby tying up a conv at no long term cost to the attacker.
2401          *  This is an attempt to defeat these stateless DOS attacks.  See
2402          *  corresponding code in tcpsendka().
2403          */
2404         if ((seg.flags & RST) == 0) {
2405                 if (tcpporthogdefense
2406                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2407                                                   tcb->snd.una - (1 << 29))) {
2408                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2409                                    source, seg.source, dest, seg.dest, seg.flags,
2410                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2411                         localclose(s, "stateless hog");
2412                 }
2413         }
2414
2415         /* Cut the data to fit the receive window */
2416         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2417                 netlog(f, Logtcp, "%I.%d -> %I.%d: tcp len < 0, %lu %d\n",
2418                        s->raddr, s->rport, s->laddr, s->lport, seg.seq, length);
2419                 update(s, &seg);
2420                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2421                         tcphalt(tpriv, &tcb->rtt_timer);
2422                         tcphalt(tpriv, &tcb->acktimer);
2423                         tcphalt(tpriv, &tcb->katimer);
2424                         tcpsetstate(s, Time_wait);
2425                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2426                         tcpgo(tpriv, &tcb->timer);
2427                 }
2428                 if (!(seg.flags & RST)) {
2429                         tcb->flags |= FORCE;
2430                         goto output;
2431                 }
2432                 qunlock(&s->qlock);
2433                 poperror();
2434                 return;
2435         }
2436
2437         /* Cannot accept so answer with a rst */
2438         if (length && tcb->state == Closed) {
2439                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2440                 goto raise;
2441         }
2442
2443         /* The segment is beyond the current receive pointer so
2444          * queue the data in the resequence queue
2445          */
2446         if (seg.seq != tcb->rcv.nxt)
2447                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2448                         update(s, &seg);
2449                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2450                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2451                                            s->lport);
2452                         tcb->flags |= FORCE;
2453                         goto output;
2454                 }
2455
2456         /*
2457          *  keep looping till we've processed this packet plus any
2458          *  adjacent packets in the resequence queue
2459          */
2460         for (;;) {
2461                 if (seg.flags & RST) {
2462                         if (tcb->state == Established) {
2463                                 tpriv->stats[EstabResets]++;
2464                                 if (tcb->rcv.nxt != seg.seq)
2465                                         printd
2466                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2467                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2468                                                  seg.seq);
2469                         }
2470                         localclose(s, "connection refused");
2471                         goto raise;
2472                 }
2473
2474                 if ((seg.flags & ACK) == 0)
2475                         goto raise;
2476
2477                 switch (tcb->state) {
2478                         case Established:
2479                         case Close_wait:
2480                                 update(s, &seg);
2481                                 break;
2482                         case Finwait1:
2483                                 update(s, &seg);
2484                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2485                                         tcphalt(tpriv, &tcb->rtt_timer);
2486                                         tcphalt(tpriv, &tcb->acktimer);
2487                                         tcpsetkacounter(tcb);
2488                                         tcb->time = NOW;
2489                                         tcpsetstate(s, Finwait2);
2490                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2491                                         tcpgo(tpriv, &tcb->katimer);
2492                                 }
2493                                 break;
2494                         case Finwait2:
2495                                 update(s, &seg);
2496                                 break;
2497                         case Closing:
2498                                 update(s, &seg);
2499                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2500                                         tcphalt(tpriv, &tcb->rtt_timer);
2501                                         tcphalt(tpriv, &tcb->acktimer);
2502                                         tcphalt(tpriv, &tcb->katimer);
2503                                         tcpsetstate(s, Time_wait);
2504                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2505                                         tcpgo(tpriv, &tcb->timer);
2506                                 }
2507                                 break;
2508                         case Last_ack:
2509                                 update(s, &seg);
2510                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2511                                         localclose(s, NULL);
2512                                         goto raise;
2513                                 }
2514                         case Time_wait:
2515                                 tcb->flags |= FORCE;
2516                                 if (tcb->timer.state != TcptimerON)
2517                                         tcpgo(tpriv, &tcb->timer);
2518                 }
2519
2520                 if ((seg.flags & URG) && seg.urg) {
2521                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2522                                 tcb->rcv.urg = seg.urg + seg.seq;
2523                                 pullblock(&bp, seg.urg);
2524                         }
2525                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2526                         tcb->rcv.urg = tcb->rcv.nxt;
2527
2528                 if (length == 0) {
2529                         if (bp != NULL)
2530                                 freeblist(bp);
2531                 } else {
2532                         switch (tcb->state) {
2533                                 default:
2534                                         /* Ignore segment text */
2535                                         if (bp != NULL)
2536                                                 freeblist(bp);
2537                                         break;
2538
2539                                 case Established:
2540                                 case Finwait1:
2541                                         /* If we still have some data place on
2542                                          * receive queue
2543                                          */
2544                                         if (bp) {
2545                                                 bp = packblock(bp);
2546                                                 if (bp == NULL)
2547                                                         panic("tcp packblock");
2548                                                 qpassnolim(s->rq, bp);
2549                                                 bp = NULL;
2550
2551                                                 /*
2552                                                  *  Force an ack every 2 data messages.  This is
2553                                                  *  a hack for rob to make his home system run
2554                                                  *  faster.
2555                                                  *
2556                                                  *  this also keeps the standard TCP congestion
2557                                                  *  control working since it needs an ack every
2558                                                  *  2 max segs worth.  This is not quite that,
2559                                                  *  but under a real stream is equivalent since
2560                                                  *  every packet has a max seg in it.
2561                                                  */
2562                                                 if (++(tcb->rcv.una) >= 2)
2563                                                         tcb->flags |= FORCE;
2564                                         }
2565                                         tcb->rcv.nxt += length;
2566                                         drop_old_rcv_sacks(tcb);
2567
2568                                         /*
2569                                          *  update our rcv window
2570                                          */
2571                                         tcprcvwin(s);
2572
2573                                         /*
2574                                          *  turn on the acktimer if there's something
2575                                          *  to ack
2576                                          */
2577                                         if (tcb->acktimer.state != TcptimerON)
2578                                                 tcpgo(tpriv, &tcb->acktimer);
2579
2580                                         break;
2581                                 case Finwait2:
2582                                         /* no process to read the data, send a reset */
2583                                         if (bp != NULL)
2584                                                 freeblist(bp);
2585                                         sndrst(tcp, source, dest, length, &seg, version,
2586                                                    "send to Finwait2");
2587                                         qunlock(&s->qlock);
2588                                         poperror();
2589                                         return;
2590                         }
2591                 }
2592
2593                 if (seg.flags & FIN) {
2594                         tcb->flags |= FORCE;
2595
2596                         switch (tcb->state) {
2597                                 case Established:
2598                                         tcb->rcv.nxt++;
2599                                         tcpsetstate(s, Close_wait);
2600                                         break;
2601                                 case Finwait1:
2602                                         tcb->rcv.nxt++;
2603                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2604                                                 tcphalt(tpriv, &tcb->rtt_timer);
2605                                                 tcphalt(tpriv, &tcb->acktimer);
2606                                                 tcphalt(tpriv, &tcb->katimer);
2607                                                 tcpsetstate(s, Time_wait);
2608                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2609                                                 tcpgo(tpriv, &tcb->timer);
2610                                         } else
2611                                                 tcpsetstate(s, Closing);
2612                                         break;
2613                                 case Finwait2:
2614                                         tcb->rcv.nxt++;
2615                                         tcphalt(tpriv, &tcb->rtt_timer);
2616                                         tcphalt(tpriv, &tcb->acktimer);
2617                                         tcphalt(tpriv, &tcb->katimer);
2618                                         tcpsetstate(s, Time_wait);
2619                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2620                                         tcpgo(tpriv, &tcb->timer);
2621                                         break;
2622                                 case Close_wait:
2623                                 case Closing:
2624                                 case Last_ack:
2625                                         break;
2626                                 case Time_wait:
2627                                         tcpgo(tpriv, &tcb->timer);
2628                                         break;
2629                         }
2630                 }
2631
2632                 /*
2633                  *  get next adjacent segment from the resequence queue.
2634                  *  dump/trim any overlapping segments
2635                  */
2636                 for (;;) {
2637                         if (tcb->reseq == NULL)
2638                                 goto output;
2639
2640                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2641                                 goto output;
2642
2643                         getreseq(tcb, &seg, &bp, &length);
2644
2645                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2646                                 break;
2647                 }
2648         }
2649 output:
2650         tcpoutput(s);
2651         qunlock(&s->qlock);
2652         poperror();
2653         return;
2654 raise:
2655         qunlock(&s->qlock);
2656         poperror();
2657         freeblist(bp);
2658         tcpkick(s);
2659 }
2660
2661 /* The advertised mss = data + TCP headers */
2662 static uint16_t derive_payload_mss(Tcpctl *tcb)
2663 {
2664         uint16_t payload_mss = tcb->mss;
2665         uint16_t opt_size = 0;
2666
2667         if (tcb->ts_recent) {
2668                 opt_size += TS_LENGTH;
2669                 /* Note that when we're a SYN, we overestimate slightly.  This is safe,
2670                  * and not really a problem. */
2671                 opt_size += TS_SEND_PREPAD;
2672         }
2673         if (tcb->rcv.nr_sacks)
2674                 opt_size += 2 + tcb->rcv.nr_sacks * 8;
2675         opt_size = ROUNDUP(opt_size, 4);
2676         payload_mss -= opt_size;
2677         return payload_mss;
2678 }
2679
2680 /* Decreases the xmit amt, given the MSS / TSO. */
2681 static uint32_t throttle_for_mss(Tcpctl *tcb, uint32_t ssize,
2682                                  uint16_t payload_mss, bool retrans)
2683 {
2684         if (ssize > payload_mss) {
2685                 if ((tcb->flags & TSO) == 0) {
2686                         ssize = payload_mss;
2687                 } else {
2688                         /* Don't send too much.  32K is arbitrary.. */
2689                         if (ssize > 32 * 1024)
2690                                 ssize = 32 * 1024;
2691                         if (!retrans) {
2692                                 /* Clamp xmit to an integral MSS to avoid ragged tail segments
2693                                  * causing poor link utilization. */
2694                                 ssize = ROUNDDOWN(ssize, payload_mss);
2695                         }
2696                 }
2697         }
2698         return ssize;
2699 }
2700
2701 /* Reduces ssize for a variety of reasons.  Returns FALSE if we should abort
2702  * sending the packet.  o/w returns TRUE and modifies ssize by reference. */
2703 static bool throttle_ssize(struct conv *s, Tcpctl *tcb, uint32_t *ssize_p,
2704                            uint16_t payload_mss, bool retrans)
2705 {
2706         struct Fs *f = s->p->f;
2707         uint32_t usable;
2708         uint32_t ssize = *ssize_p;
2709
2710         /* Compute usable segment based on offered window and limit
2711          * window probes to one */
2712         if (tcb->snd.wnd == 0) {
2713                 if (tcb->snd.in_flight != 0) {
2714                         if ((tcb->flags & FORCE) == 0)
2715                                 return FALSE;
2716                 }
2717                 usable = 1;
2718         } else {
2719                 usable = tcb->cwind;
2720                 if (tcb->snd.wnd < usable)
2721                         usable = tcb->snd.wnd;
2722                 if (usable > tcb->snd.in_flight)
2723                         usable -= tcb->snd.in_flight;
2724                 else
2725                         usable = 0;
2726                 /* Avoid Silly Window Syndrome.  This is a little different thant RFC
2727                  * 813.  I took their additional enhancement of "< MSS" as an AND, not
2728                  * an OR.  25% of a large snd.wnd is pretty large, and our main goal is
2729                  * to avoid packets smaller than MSS.  I still use the 25% threshold,
2730                  * because it is important that there is *some* data in_flight.  If
2731                  * usable < MSS because snd.wnd is very small (but not 0), we might
2732                  * never get an ACK and would need to set up a timer.
2733                  *
2734                  * Also, I'm using 'ssize' as a proxy for a PSH point.  If there's just
2735                  * a small blob in the qio (or retrans!), then we might as well just
2736                  * send it. */
2737                 if ((usable < tcb->typical_mss) && (usable < tcb->snd.wnd >> 2)
2738                     && (usable < ssize)) {
2739                         return FALSE;
2740                 }
2741         }
2742         if (ssize && usable < 2)
2743                 netlog(s->p->f, Logtcpverbose,
2744                        "%I.%d -> %I.%d: throttled snd.wnd %lu cwind %lu\n",
2745                        s->laddr, s->lport, s->raddr, s->rport,
2746                        tcb->snd.wnd, tcb->cwind);
2747         if (usable < ssize)
2748                 ssize = usable;
2749
2750         ssize = throttle_for_mss(tcb, ssize, payload_mss, retrans);
2751
2752         *ssize_p = ssize;
2753         return TRUE;
2754 }
2755
/* Helper, picks the next segment to send, which is possibly a retransmission.
 * Returns TRUE if we have a segment, FALSE o/w.  Returns ssize, from_seq, and
 * sent by reference.
 *
 * from_seq is the seq number we are transmitting from.
 *
 * sent includes all seq from una to from_seq *including* any previously sent
 * flags (part of tcb->flgcnt), for instance an unacknowledged SYN (which counts
 * as a seq number).  Those flags are in the e.g. snd.nxt - snd.una range, and
 * they get dropped after qdiscard.
 *
 * ssize is the amount of data we are sending, starting from from_seq, and it
 * will include any *new* flags, which haven't been accounted for yet.
 *
 * tcb->flgcnt consists of the flags both in ssize and in sent.
 *
 * Note that we could be in recovery and not sack_retrans a segment. */
static bool get_xmit_segment(struct conv *s, Tcpctl *tcb, uint16_t payload_mss,
                             uint32_t *from_seq_p, uint32_t *sent_p,
                             uint32_t *ssize_p)
{
	struct Fs *f = s->p->f;
	struct tcppriv *tpriv = s->p->priv;
	uint32_t ssize, sent, from_seq;
	bool sack_retrans = FALSE;
	struct sack_block *tcb_sack = 0;

	/* First choice: fill the leftmost SACK hole at or above snd.rtx.
	 * The sacks array appears to be kept sorted by the rest of the stack;
	 * the first sack whose left edge is beyond rtx marks a hole. */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb->snd.rtx, tcb_sack->left)) {
			/* So ssize is supposed to include any *new* flags to flgcnt, which
			 * at this point would be a FIN.
			 *
			 * It might be possible that flgcnt is incremented so we send a FIN,
			 * even for an intermediate sack retrans.  Perhaps the user closed
			 * the conv.
			 *
			 * However, the way the "flgcnt for FIN" works is that it inflates
			 * the desired amount we'd like to send (qlen + flgcnt).
			 * Eventually, we reach the end of the queue and fail to extract all
			 * of dsize.  At that point, we put on the FIN, and that's where the
			 * extra 'byte' comes from.
			 *
			 * For sack retrans, since we're extracting from parts of the qio
			 * that aren't the right-most edge, we don't need to consider flgcnt
			 * when setting ssize. */
			from_seq = tcb->snd.rtx;
			sent = from_seq - tcb->snd.una;
			ssize = tcb_sack->left - from_seq;
			sack_retrans = TRUE;
			break;
		}
	}
	/* SACK holes have first dibs, but we can still opportunisitically send new
	 * data.
	 *
	 * During other types of recovery, we'll just send from the retrans point.
	 * If we're in an RTO while we still have sacks, we could be resending data
	 * that wasn't lost.  Consider a sack that is still growing (usually the
	 * right-most), but we haven't received the ACK yet.  rxt may be included in
	 * that area.  Given we had two losses or otherwise timed out, I'm not too
	 * concerned.
	 *
	 * Note that Fast and RTO can send data beyond nxt.  If we change that,
	 * change the accounting below. */
	if (!sack_retrans) {
		switch (tcb->snd.recovery) {
		default:
		case SACK_RETRANS_RECOVERY:
			/* Normal operation (or SACK recovery with no hole found
			 * above): send new data from nxt. */
			from_seq = tcb->snd.nxt;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			/* Fast/RTO recovery: resend from the retransmit point. */
			from_seq = tcb->snd.rtx;
			break;
		}
		sent = from_seq - tcb->snd.una;
		/* qlen + flgcnt is every seq we want to have sent, including unack'd
		 * data, unacked flags, and new flags. */
		ssize = qlen(s->wq) + tcb->flgcnt - sent;
	}

	/* Window / congestion / MSS limits; FALSE means send nothing now. */
	if (!throttle_ssize(s, tcb, &ssize, payload_mss, sack_retrans))
		return FALSE;

	/* This counts flags, which is a little hokey, but it's okay since in_flight
	 * gets reset on each ACK */
	tcb->snd.in_flight += ssize;
	/* Log and track rxmit.  This covers both SACK (retrans) and fast rxmit. */
	if (ssize && seq_lt(tcb->snd.rtx, tcb->snd.nxt)) {
		netlog(f, Logtcpverbose,
		       "%I.%d -> %I.%d: rxmit: rtx %u amt %u, nxt %u\n",
		       s->laddr, s->lport, s->raddr, s->rport,
		       tcb->snd.rtx, MIN(tcb->snd.nxt - tcb->snd.rtx, ssize),
		       tcb->snd.nxt);
		tpriv->stats[RetransSegs]++;
	}
	if (sack_retrans) {
		/* If we'll send up to the left edge, advance snd.rtx to the right.
		 *
		 * This includes the largest sack.  It might get removed later, in which
		 * case we'll underestimate the amount in-flight.  The alternative is to
		 * not count the rightmost sack, but when it gets removed, we'll retrans
		 * it anyway.  No matter what, we'd count it. */
		tcb->snd.rtx += ssize;
		if (tcb->snd.rtx == tcb_sack->left)
			tcb->snd.rtx = tcb_sack->right;
		/* RFC 6675 says we MAY rearm the RTO timer on each retrans, since we
		 * might not be getting ACKs for a while. */
		tcpsettimer(tcb);
	} else {
		switch (tcb->snd.recovery) {
		default:
			/* under normal op, we drag rtx along with nxt.  this prevents us
			 * from sending sacks too early (up above), since rtx doesn't get
			 * reset to una until we have a loss (e.g. 3 dupacks/sacks). */
			tcb->snd.nxt += ssize;
			tcb->snd.rtx = tcb->snd.nxt;
			break;
		case SACK_RETRANS_RECOVERY:
			/* We explicitly do not want to increase rtx here.  We might still
			 * need it to fill in a sack gap below nxt if we get new, higher
			 * sacks. */
			tcb->snd.nxt += ssize;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			tcb->snd.rtx += ssize;
			/* Fast and RTO can send new data, advancing nxt. */
			if (seq_gt(tcb->snd.rtx, tcb->snd.nxt))
				tcb->snd.nxt = tcb->snd.rtx;
			break;
		}
	}
	*from_seq_p = from_seq;
	*sent_p = sent;
	*ssize_p = ssize;

	return TRUE;
}
2896
2897 /*
2898  *  always enters and exits with the s locked.  We drop
2899  *  the lock to ipoput the packet so some care has to be
2900  *  taken by callers.
2901  */
2902 static void tcpoutput(struct conv *s)
2903 {
2904         Tcp seg;
2905         int msgs;
2906         int next_yield = 1;
2907         Tcpctl *tcb;
2908         struct block *hbp, *bp;
2909         uint32_t ssize, dsize, sent, from_seq;
2910         struct Fs *f;
2911         struct tcppriv *tpriv;
2912         uint8_t version;
2913         uint16_t payload_mss;
2914
2915         f = s->p->f;
2916         tpriv = s->p->priv;
2917         version = s->ipversion;
2918
2919         for (msgs = 0; msgs < 100; msgs++) {
2920                 tcb = (Tcpctl *) s->ptcl;
2921
2922                 switch (tcb->state) {
2923                         case Listen:
2924                         case Closed:
2925                         case Finwait2:
2926                                 return;
2927                 }
2928
2929                 /* force an ack when a window has opened up */
2930                 if (tcb->rcv.blocked && tcb->rcv.wnd >= tcb->mss) {
2931                         tcb->rcv.blocked = 0;
2932                         tcb->flags |= FORCE;
2933                 }
2934
2935                 /* Don't send anything else until our SYN has been acked */
2936                 if (tcb->snd.nxt != tcb->iss && (tcb->flags & SYNACK) == 0)
2937                         break;
2938
2939                 /* payload_mss is the actual amount of data in the packet, which is the
2940                  * advertised (mss - header opts).  This varies from packet to packet,
2941                  * based on the options that might be present (e.g. always timestamps,
2942                  * sometimes SACKs) */
2943                 payload_mss = derive_payload_mss(tcb);
2944
2945                 if (!get_xmit_segment(s, tcb, payload_mss, &from_seq, &sent, &ssize))
2946                         break;
2947
2948                 dsize = ssize;
2949                 seg.urg = 0;
2950
2951                 if (ssize == 0)
2952                         if ((tcb->flags & FORCE) == 0)
2953                                 break;
2954
2955                 tcb->flags &= ~FORCE;
2956                 tcprcvwin(s);
2957
2958                 /* By default we will generate an ack, so we can normally turn off the
2959                  * timer.  If we're blocked, we'll want the timer so we can send a
2960                  * window update. */
2961                 if (!tcb->rcv.blocked)
2962                         tcphalt(tpriv, &tcb->acktimer);
2963                 tcb->rcv.una = 0;
2964                 seg.source = s->lport;
2965                 seg.dest = s->rport;
2966                 seg.flags = ACK;
2967                 seg.mss = 0;
2968                 seg.ws = 0;
2969                 seg.sack_ok = FALSE;
2970                 seg.nr_sacks = 0;
2971                 /* When outputting, Syn_sent means "send the Syn", for connections we
2972                  * initiate.  SYNACKs are sent from sndsynack directly. */
2973                 if (tcb->state == Syn_sent) {
2974                         seg.flags = 0;
2975                         seg.sack_ok = SACK_SUPPORTED;   /* here's where we advertise SACK */
2976                         if (tcb->snd.nxt - ssize == tcb->iss) {
2977                                 seg.flags |= SYN;
2978                                 dsize--;
2979                                 seg.mss = tcb->mss;
2980                                 seg.ws = tcb->scale;
2981                         } else {
2982                                 /* TODO: Not sure why we'd get here. */
2983                                 warn("TCP: weird Syn_sent state, tell someone you saw this");
2984                         }
2985                 }
2986                 seg.seq = from_seq;
2987                 seg.ack = tcb->rcv.nxt;
2988                 tcb->last_ack_sent = seg.ack;
2989                 seg.wnd = tcb->rcv.wnd;
2990                 seg.ts_val = tcb->ts_recent;
2991
2992                 /* Pull out data to send */
2993                 bp = NULL;
2994                 if (dsize != 0) {
2995                         bp = qcopy(s->wq, dsize, sent);
2996                         if (BLEN(bp) != dsize) {
2997                                 /* Here's where the flgcnt kicked in.  Note dsize is
2998                                  * decremented, but ssize isn't.  Not that we use ssize for much
2999                                  * anymore.  Decrementing dsize prevents us from sending a PSH
3000                                  * with the FIN. */
3001                                 seg.flags |= FIN;
3002                                 dsize--;
3003                         }
3004                         if (BLEN(bp) > payload_mss) {
3005                                 bp->flag |= Btso;
3006                                 bp->mss = payload_mss;
3007                         }
3008                 }
3009
3010                 if (sent + dsize == qlen(s->wq) + tcb->flgcnt)
3011                         seg.flags |= PSH;
3012
3013                 /* Build header, link data and compute cksum */
3014                 switch (version) {
3015                         case V4:
3016                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3017                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
3018                                 if (hbp == NULL) {
3019                                         freeblist(bp);
3020                                         return;
3021                                 }
3022                                 break;
3023                         case V6:
3024                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3025                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
3026                                 if (hbp == NULL) {
3027                                         freeblist(bp);
3028                                         return;
3029                                 }
3030                                 break;
3031                         default:
3032                                 hbp = NULL;     /* to suppress a warning */
3033                                 panic("tcpoutput: version %d", version);
3034                 }
3035
3036                 /* Start the transmission timers if there is new data and we
3037                  * expect acknowledges
3038                  */
3039                 if (ssize != 0) {
3040                         if (tcb->timer.state != TcptimerON)
3041                                 tcpgo(tpriv, &tcb->timer);
3042
3043                         if (!tcb->ts_recent && (tcb->rtt_timer.state != TcptimerON)) {
3044                                 /* If round trip timer isn't running, start it. */
3045                                 tcpgo(tpriv, &tcb->rtt_timer);
3046                                 tcb->rttseq = from_seq + ssize;
3047                         }
3048                 }
3049
3050                 tpriv->stats[OutSegs]++;
3051
3052                 /* put off the next keep alive */
3053                 tcpgo(tpriv, &tcb->katimer);
3054
3055                 switch (version) {
3056                         case V4:
3057                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3058                                         /* a negative return means no route */
3059                                         localclose(s, "no route");
3060                                 }
3061                                 break;
3062                         case V6:
3063                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3064                                         /* a negative return means no route */
3065                                         localclose(s, "no route");
3066                                 }
3067                                 break;
3068                         default:
3069                                 panic("tcpoutput2: version %d", version);
3070                 }
3071                 if (ssize) {
3072                         /* The outer loop thinks we sent one packet.  If we used TSO, we
3073                          * might have sent several.  Minus one for the loop increment. */
3074                         msgs += DIV_ROUND_UP(ssize, payload_mss) - 1;
3075                 }
3076                 /* Old Plan 9 tidbit - yield every four messages.  We want to break out
3077                  * and unlock so we can process inbound ACKs which might do things like
3078                  * say "slow down". */
3079                 if (msgs >= next_yield) {
3080                         next_yield = msgs + 4;
3081                         qunlock(&s->qlock);
3082                         kthread_yield();
3083                         qlock(&s->qlock);
3084                 }
3085         }
3086 }
3087
3088 /*
3089  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
3090  */
3091 static void tcpsendka(struct conv *s)
3092 {
3093         Tcp seg;
3094         Tcpctl *tcb;
3095         struct block *hbp, *dbp;
3096
3097         tcb = (Tcpctl *) s->ptcl;
3098
3099         dbp = NULL;
3100         seg.urg = 0;
3101         seg.source = s->lport;
3102         seg.dest = s->rport;
3103         seg.flags = ACK | PSH;
3104         seg.mss = 0;
3105         seg.ws = 0;
3106         seg.sack_ok = FALSE;
3107         seg.nr_sacks = 0;
3108         if (tcpporthogdefense)
3109                 urandom_read(&seg.seq, sizeof(seg.seq));
3110         else
3111                 seg.seq = tcb->snd.una - 1;
3112         seg.ack = tcb->rcv.nxt;
3113         tcb->last_ack_sent = seg.ack;
3114         tcb->rcv.una = 0;
3115         seg.wnd = tcb->rcv.wnd;
3116         seg.ts_val = tcb->ts_recent;
3117         if (tcb->state == Finwait2) {
3118                 seg.flags |= FIN;
3119         } else {
3120                 dbp = block_alloc(1, MEM_WAIT);
3121                 dbp->wp++;
3122         }
3123
3124         if (isv4(s->raddr)) {
3125                 /* Build header, link data and compute cksum */
3126                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3127                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
3128                 if (hbp == NULL) {
3129                         freeblist(dbp);
3130                         return;
3131                 }
3132                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
3133         } else {
3134                 /* Build header, link data and compute cksum */
3135                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;