WIP-pop-3000
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2017 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <net/ip.h>
42 #include <net/tcp.h>
43
/* Printable names for TCP connection states.  Must correspond, in order, to
 * the state enumeration in tcp.h (Closed == 0, ..., Time_wait). */
static char *tcpstates[] = {
	"Closed", "Listen", "Syn_sent",
	"Established", "Finwait1", "Finwait2", "Close_wait",
	"Closing", "Last_ack", "Time_wait"
};
50
static int tcp_irtt = DEF_RTT;		/* Initial round-trip-time estimate, used before any RTT samples exist */
static uint16_t tcp_mss = DEF_MSS;	/* Default maximum segment size to be sent */
53
54 /* Must correspond to the enumeration in tcp.h */
55 static char *statnames[] = {
56         [MaxConn] "MaxConn",
57         [ActiveOpens] "ActiveOpens",
58         [PassiveOpens] "PassiveOpens",
59         [EstabResets] "EstabResets",
60         [CurrEstab] "CurrEstab",
61         [InSegs] "InSegs",
62         [OutSegs] "OutSegs",
63         [RetransSegs] "RetransSegs",
64         [RetransTimeouts] "RetransTimeouts",
65         [InErrs] "InErrs",
66         [OutRsts] "OutRsts",
67         [CsumErrs] "CsumErrs",
68         [HlenErrs] "HlenErrs",
69         [LenErrs] "LenErrs",
70         [OutOfOrder] "OutOfOrder",
71 };
72
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
static int tcpporthogdefense = 0;
83
/* Forward declarations for the protocol machinery defined below. */
static int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *,
                    uint16_t);
static void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
static void localclose(struct conv *, char *unused_char_p_t);
static void procsyn(struct conv *, Tcp *);
static void tcpiput(struct Proto *, struct Ipifc *, struct block *);
static void tcpoutput(struct conv *);
static int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
static void tcpstart(struct conv *, int);
static void tcptimeout(void *);
static void tcpsndsyn(struct conv *, Tcpctl *);
static void tcprcvwin(struct conv *);
static void tcpacktimer(void *);
static void tcpkeepalive(void *);
static void tcpsetkacounter(Tcpctl *);
static void tcprxmit(struct conv *);
static void tcpsettimer(Tcpctl *);
static void tcpsynackrtt(struct conv *);
static void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
static void tcp_loss_event(struct conv *s, Tcpctl *tcb);
static uint16_t derive_payload_mss(Tcpctl *tcb);
static void set_in_flight(Tcpctl *tcb);

/* Handling for half-open ("limbo") connections awaiting the final ACK. */
static void limborexmit(struct Proto *);
static void limbo(struct conv *, uint8_t *unused_uint8_p_t, uint8_t *, Tcp *,
                                  int);
110
111 static void tcpsetstate(struct conv *s, uint8_t newstate)
112 {
113         Tcpctl *tcb;
114         uint8_t oldstate;
115         struct tcppriv *tpriv;
116
117         tpriv = s->p->priv;
118
119         tcb = (Tcpctl *) s->ptcl;
120
121         oldstate = tcb->state;
122         if (oldstate == newstate)
123                 return;
124
125         if (oldstate == Established)
126                 tpriv->stats[CurrEstab]--;
127         if (newstate == Established)
128                 tpriv->stats[CurrEstab]++;
129
130         /**
131         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
132                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
133         **/
134
135         switch (newstate) {
136                 case Closed:
137                         qclose(s->rq);
138                         qclose(s->wq);
139                         qclose(s->eq);
140                         break;
141
142                 case Close_wait:        /* Remote closes */
143                         qhangup(s->rq, NULL);
144                         break;
145         }
146
147         tcb->state = newstate;
148
149         if (oldstate == Syn_sent && newstate != Closed)
150                 Fsconnected(s, NULL);
151 }
152
153 static void tcpconnect(struct conv *c, char **argv, int argc)
154 {
155         Fsstdconnect(c, argv, argc);
156         tcpstart(c, TCP_CONNECT);
157 }
158
159 static int tcpstate(struct conv *c, char *state, int n)
160 {
161         Tcpctl *s;
162
163         s = (Tcpctl *) (c->ptcl);
164
165         return snprintf(state, n,
166                                         "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
167                                         tcpstates[s->state],
168                                         c->rq ? qlen(c->rq) : 0,
169                                         c->wq ? qlen(c->wq) : 0,
170                                         s->srtt, s->mdev,
171                                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
172                                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
173                                         s->katimer.start, s->katimer.count);
174 }
175
176 static int tcpinuse(struct conv *c)
177 {
178         Tcpctl *s;
179
180         s = (Tcpctl *) (c->ptcl);
181         return s->state != Closed;
182 }
183
184 static void tcpannounce(struct conv *c, char **argv, int argc)
185 {
186         Fsstdannounce(c, argv, argc);
187         tcpstart(c, TCP_LISTEN);
188         Fsconnected(c, NULL);
189 }
190
191 static void tcpbypass(struct conv *cv, char **argv, int argc)
192 {
193         struct tcppriv *tpriv = cv->p->priv;
194
195         Fsstdbypass(cv, argv, argc);
196         iphtadd(&tpriv->ht, cv);
197 }
198
199 static void tcpshutdown(struct conv *c, int how)
200 {
201         Tcpctl *tcb = (Tcpctl*)c->ptcl;
202
203         /* Do nothing for the read side */
204         if (how == SHUT_RD)
205                 return;
206         /* Sends a FIN.  If we're in another state (like Listen), we'll run into
207          * issues, since we'll never send the FIN.  We'll be shutdown on our end,
208          * but we'll never tell the distant end.  Might just be an app issue. */
209         switch (tcb->state) {
210         case Established:
211                 tcb->flgcnt++;
212                 tcpsetstate(c, Finwait1);
213                 tcpoutput(c);
214                 break;
215         }
216 }
217
218 /*
219  *  tcpclose is always called with the q locked
220  */
221 static void tcpclose(struct conv *c)
222 {
223         Tcpctl *tcb;
224
225         tcb = (Tcpctl *) c->ptcl;
226
227         qhangup(c->rq, NULL);
228         qhangup(c->wq, NULL);
229         qhangup(c->eq, NULL);
230         qflush(c->rq);
231
232         switch (tcb->state) {
233                 case Listen:
234                         /*
235                          *  reset any incoming calls to this listener
236                          */
237                         Fsconnected(c, "Hangup");
238
239                         localclose(c, NULL);
240                         break;
241                 case Closed:
242                 case Syn_sent:
243                         localclose(c, NULL);
244                         break;
245                 case Established:
246                         tcb->flgcnt++;
247                         tcpsetstate(c, Finwait1);
248                         tcpoutput(c);
249                         break;
250                 case Close_wait:
251                         tcb->flgcnt++;
252                         tcpsetstate(c, Last_ack);
253                         tcpoutput(c);
254                         break;
255         }
256 }
257
258 static void tcpkick(void *x)
259 {
260         ERRSTACK(1);
261         struct conv *s = x;
262         Tcpctl *tcb;
263
264         tcb = (Tcpctl *) s->ptcl;
265
266         qlock(&s->qlock);
267         if (waserror()) {
268                 qunlock(&s->qlock);
269                 nexterror();
270         }
271
272         switch (tcb->state) {
273                 case Syn_sent:
274                 case Established:
275                 case Close_wait:
276                         /*
277                          * Push data
278                          */
279                         tcprcvwin(s);
280                         tcpoutput(s);
281                         break;
282                 default:
283                         localclose(s, "Hangup");
284                         break;
285         }
286
287         qunlock(&s->qlock);
288         poperror();
289 }
290
291 static void tcprcvwin(struct conv *s)
292 {
293         /* Call with tcb locked */
294         int w;
295         Tcpctl *tcb;
296
297         tcb = (Tcpctl *) s->ptcl;
298         w = tcb->window - qlen(s->rq);
299         if (w < 0)
300                 w = 0;
301
302         /* RFC 813: Avoid SWS.  We'll always reduce the window (because the qio
303          * increased - that's legit), and we'll always advertise the window
304          * increases (corresponding to qio drains) when those are greater than MSS.
305          * But we don't advertise increases less than MSS.
306          *
307          * Note we don't shrink the window at all - that'll result in tcptrim()
308          * dropping packets that were sent before the sender gets our update. */
309         if ((w < tcb->rcv.wnd) || (w >= tcb->mss))
310                 tcb->rcv.wnd = w;
311         /* We've delayed sending an update to rcv.wnd, and we might never get
312          * another ACK to drive the TCP stack after the qio is drained.  We could
313          * replace this stuff with qio kicks or callbacks, but that might be
314          * trickier with the MSS limitation.  (and 'edge' isn't empty or not). */
315         if (w < tcb->mss)
316                 tcb->rcv.blocked = 1;
317 }
318
319 static void tcpacktimer(void *v)
320 {
321         ERRSTACK(1);
322         Tcpctl *tcb;
323         struct conv *s;
324
325         s = v;
326         tcb = (Tcpctl *) s->ptcl;
327
328         qlock(&s->qlock);
329         if (waserror()) {
330                 qunlock(&s->qlock);
331                 nexterror();
332         }
333         if (tcb->state != Closed) {
334                 tcb->flags |= FORCE;
335                 tcprcvwin(s);
336                 tcpoutput(s);
337         }
338         qunlock(&s->qlock);
339         poperror();
340 }
341
342 static void tcpcreate(struct conv *c)
343 {
344         /* We don't use qio limits.  Instead, TCP manages flow control on its own.
345          * We only use qpassnolim().  Note for qio that 0 doesn't mean no limit. */
346         c->rq = qopen(0, Qcoalesce, 0, 0);
347         c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
348 }
349
/* Move timer 't' to 'newstate' while maintaining priv->timers, the
 * doubly-linked list of running (TcptimerON) timers.  Caller must hold
 * priv->tl.  Panics if the list invariants are violated. */
static void timerstate(struct tcppriv *priv, Tcptimer *t, int newstate)
{
	if (newstate != TcptimerON) {
		if (t->state == TcptimerON) {
			/* unchain from the running-timer list */
			if (priv->timers == t) {
				priv->timers = t->next;
				/* the head of the list must have no prev */
				if (t->prev != NULL)
					panic("timerstate1");
			}
			if (t->next)
				t->next->prev = t->prev;
			if (t->prev)
				t->prev->next = t->next;
			t->next = t->prev = NULL;
		}
	} else {
		if (t->state != TcptimerON) {
			/* chain onto the front of the running-timer list; a
			 * timer that isn't ON must already be unlinked */
			if (t->prev != NULL || t->next != NULL)
				panic("timerstate2");
			t->prev = NULL;
			t->next = priv->timers;
			if (t->next)
				t->next->prev = t;
			priv->timers = t;
		}
	}
	t->state = newstate;
}
380
/* Per-protocol timer kthread (one per TCP instance, started lazily by
 * tcpstart).  Wakes every MSPTICK ms, decrements every running timer, moves
 * expired ones to a local ready list, then fires their handlers without
 * holding the timer lock.  Also drives limbo (half-open) retransmits. */
static void tcpackproc(void *a)
{
	ERRSTACK(1);
	Tcptimer *t, *tp, *timeo;
	struct Proto *tcp;
	struct tcppriv *priv;
	int loop;

	tcp = a;
	priv = tcp->priv;

	for (;;) {
		kthread_usleep(MSPTICK * 1000);

		qlock(&priv->tl);
		timeo = NULL;
		loop = 0;
		for (t = priv->timers; t != NULL; t = tp) {
			if (loop++ > 10000)
				panic("tcpackproc1");
			tp = t->next;
			/* this is a little odd.  overall, we wake up once per
			 * 'tick' (50ms, whatever).  then, we decrement count.
			 * so the timer val is in units of 50 ms.  the timer
			 * list isn't sorted either.  once someone expires, we
			 * get moved to another LL, local, and we fire those
			 * alarms.
			 *
			 * the best anyone could do would be 50 ms
			 * granularity.
			 *
			 * if things are slow, you could skew later too.
			 *
			 * actually, your expected value is 25ms for the first
			 * count.  so whatever your timer.start is, your wait
			 * time is start * 50 - 25.  which is why we wait 25
			 * ms to open up our window again.
			 *
			 * might be issues with concurrency.  once the alarm
			 * is set to done and yanked off the list, what's to
			 * stop a concurrent setter from putting it back on
			 * the list and setting TcptimerON?  there's a lot of
			 * lockless peeks at the timer.state
			 *
			 * probably be better served with a kthread timer
			 * chain.  one assumption with the timerchain stuff is
			 * that the source is an IRQ, and thus IRQ context
			 * matters, etc.
			 *
			 * with a kth tchain, we're in kth context already.
			 * and you probably don't want to send another RKM for
			 * each timer, unless the locking matters.
			 *
			 * interesting - even the pcpu tchains - should those
			 * be a per-core kth?  does any alarm need to run from
			 * IRQ ctx?  maybe. */
			if (t->state == TcptimerON) {
				t->count--;
				if (t->count == 0) {
					timerstate(priv, t, TcptimerDONE);
					t->readynext = timeo;
					timeo = t;
				}
			}
		}
		qunlock(&priv->tl);

		/* Fire the expired timers, no longer holding priv->tl.  Note
		 * the lockless read of t->state (see comment above). */
		loop = 0;
		for (t = timeo; t != NULL; t = t->readynext) {
			if (loop++ > 10000)
				panic("tcpackproc2");
			if (t->state == TcptimerDONE && t->func != NULL) {
				/* discard error style */
				if (!waserror())
					(*t->func) (t->arg);
				poperror();
			}
		}

		limborexmit(tcp);
	}
}
459
460 static void tcpgo(struct tcppriv *priv, Tcptimer *t)
461 {
462         if (t == NULL || t->start == 0)
463                 return;
464
465         qlock(&priv->tl);
466         t->count = t->start;
467         timerstate(priv, t, TcptimerON);
468         qunlock(&priv->tl);
469 }
470
471 static void tcphalt(struct tcppriv *priv, Tcptimer *t)
472 {
473         if (t == NULL)
474                 return;
475
476         qlock(&priv->tl);
477         timerstate(priv, t, TcptimerOFF);
478         qunlock(&priv->tl);
479 }
480
/* Exponential backoff multiplier: 2^n.  Callers keep n small enough that the
 * shift stays within an int. */
static int backoff(int n)
{
	return 1 << n;
}
485
486 static void localclose(struct conv *s, char *reason)
487 {
488         /* called with tcb locked */
489         Tcpctl *tcb;
490         Reseq *rp, *rp1;
491         struct tcppriv *tpriv;
492
493         tpriv = s->p->priv;
494         tcb = (Tcpctl *) s->ptcl;
495
496         iphtrem(&tpriv->ht, s);
497
498         tcphalt(tpriv, &tcb->timer);
499         tcphalt(tpriv, &tcb->rtt_timer);
500         tcphalt(tpriv, &tcb->acktimer);
501         tcphalt(tpriv, &tcb->katimer);
502
503         /* Flush reassembly queue; nothing more can arrive */
504         for (rp = tcb->reseq; rp != NULL; rp = rp1) {
505                 rp1 = rp->next;
506                 freeblist(rp->bp);
507                 kfree(rp);
508         }
509         tcb->reseq = NULL;
510
511         if (tcb->state == Syn_sent)
512                 Fsconnected(s, reason);
513
514         qhangup(s->rq, reason);
515         qhangup(s->wq, reason);
516
517         tcpsetstate(s, Closed);
518
519         /* listener will check the rq state */
520         if (s->state == Announced)
521                 rendez_wakeup(&s->listenr);
522 }
523
524 /* mtu (- TCP + IP hdr len) of 1st hop */
525 static int tcpmtu(struct Ipifc *ifc, int version, int *scale)
526 {
527         int mtu;
528
529         switch (version) {
530                 default:
531                 case V4:
532                         mtu = DEF_MSS;
533                         if (ifc != NULL)
534                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
535                         break;
536                 case V6:
537                         mtu = DEF_MSS6;
538                         if (ifc != NULL)
539                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
540                         break;
541         }
542         *scale = HaveWS | 7;
543
544         return mtu;
545 }
546
547 static void tcb_check_tso(Tcpctl *tcb)
548 {
549         /* This can happen if the netdev isn't up yet. */
550         if (!tcb->ifc)
551                 return;
552         if (tcb->ifc->feat & NETF_TSO)
553                 tcb->flags |= TSO;
554         else
555                 tcb->flags &= ~TSO;
556 }
557
558 static void inittcpctl(struct conv *s, int mode)
559 {
560         Tcpctl *tcb;
561         Tcp4hdr *h4;
562         Tcp6hdr *h6;
563         int mss;
564
565         tcb = (Tcpctl *) s->ptcl;
566
567         memset(tcb, 0, sizeof(Tcpctl));
568
569         tcb->ssthresh = UINT32_MAX;
570         tcb->srtt = tcp_irtt;
571         tcb->mdev = 0;
572
573         /* setup timers */
574         tcb->timer.start = tcp_irtt / MSPTICK;
575         tcb->timer.func = tcptimeout;
576         tcb->timer.arg = s;
577         tcb->rtt_timer.start = MAX_TIME;
578         tcb->acktimer.start = TCP_ACK / MSPTICK;
579         tcb->acktimer.func = tcpacktimer;
580         tcb->acktimer.arg = s;
581         tcb->katimer.start = DEF_KAT / MSPTICK;
582         tcb->katimer.func = tcpkeepalive;
583         tcb->katimer.arg = s;
584
585         mss = DEF_MSS;
586
587         /* create a prototype(pseudo) header */
588         if (mode != TCP_LISTEN) {
589                 if (ipcmp(s->laddr, IPnoaddr) == 0)
590                         findlocalip(s->p->f, s->laddr, s->raddr);
591
592                 switch (s->ipversion) {
593                         case V4:
594                                 h4 = &tcb->protohdr.tcp4hdr;
595                                 memset(h4, 0, sizeof(*h4));
596                                 h4->proto = IP_TCPPROTO;
597                                 hnputs(h4->tcpsport, s->lport);
598                                 hnputs(h4->tcpdport, s->rport);
599                                 v6tov4(h4->tcpsrc, s->laddr);
600                                 v6tov4(h4->tcpdst, s->raddr);
601                                 break;
602                         case V6:
603                                 h6 = &tcb->protohdr.tcp6hdr;
604                                 memset(h6, 0, sizeof(*h6));
605                                 h6->proto = IP_TCPPROTO;
606                                 hnputs(h6->tcpsport, s->lport);
607                                 hnputs(h6->tcpdport, s->rport);
608                                 ipmove(h6->tcpsrc, s->laddr);
609                                 ipmove(h6->tcpdst, s->raddr);
610                                 mss = DEF_MSS6;
611                                 break;
612                         default:
613                                 panic("inittcpctl: version %d", s->ipversion);
614                 }
615         }
616
617         tcb->ifc = findipifc(s->p->f, s->laddr, 0);
618         tcb->mss = mss;
619         tcb->typical_mss = mss;
620         tcb->cwind = tcb->typical_mss * CWIND_SCALE;
621
622         /* default is no window scaling */
623         tcb->window = QMAX;
624         tcb->rcv.wnd = QMAX;
625         tcb->rcv.scale = 0;
626         tcb->snd.scale = 0;
627         tcb_check_tso(tcb);
628 }
629
/*
 *  Begin an active (TCP_CONNECT) or passive (TCP_LISTEN) open on 's'.
 *  Lazily launches the per-protocol tcpackproc timer kthread the first time
 *  any conversation starts.  Called with s qlocked.
 */
static void tcpstart(struct conv *s, int mode)
{
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	char *kpname;

	tpriv = s->p->priv;

	/* Double-checked under apl: only the first conversation starts the
	 * timer kthread.  NOTE(review): the outer unlocked read assumes a
	 * plain int store/load is safe here - confirm against the memory
	 * model this kernel relies on. */
	if (tpriv->ackprocstarted == 0) {
		qlock(&tpriv->apl);
		if (tpriv->ackprocstarted == 0) {
			/* tcpackproc needs to free this if it ever exits */
			kpname = kmalloc(KNAMELEN, MEM_WAIT);
			snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
			ktask(kpname, tcpackproc, s->p);
			tpriv->ackprocstarted = 1;
		}
		qunlock(&tpriv->apl);
	}

	tcb = (Tcpctl *) s->ptcl;

	inittcpctl(s, mode);

	/* register so incoming packets can find this conversation */
	iphtadd(&tpriv->ht, s);
	switch (mode) {
		case TCP_LISTEN:
			tpriv->stats[PassiveOpens]++;
			tcb->flags |= CLONE;
			tcpsetstate(s, Listen);
			break;

		case TCP_CONNECT:
			tpriv->stats[ActiveOpens]++;
			tcb->flags |= ACTIVE;
			tcpsndsyn(s, tcb);
			tcpsetstate(s, Syn_sent);
			tcpoutput(s);	/* send the SYN */
			break;
	}
}
674
675 static char *tcpflag(uint16_t flag)
676 {
677         static char buf[128];
678
679         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
680         if (flag & URG)
681                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
682         if (flag & ACK)
683                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
684         if (flag & PSH)
685                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
686         if (flag & RST)
687                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
688         if (flag & SYN)
689                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
690         if (flag & FIN)
691                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
692
693         return buf;
694 }
695
696 /* Helper, determine if we should send a TCP timestamp.  ts_val was the
697  * timestamp from our distant end.  We'll also send a TS on SYN (no ACK). */
698 static bool tcp_seg_has_ts(Tcp *tcph)
699 {
700         return tcph->ts_val || ((tcph->flags & SYN) && !(tcph->flags & ACK));
701 }
702
703 /* Given a TCP header/segment and default header size (e.g. TCP4_HDRSIZE),
704  * return the actual hdr_len and opt_pad */
705 static void compute_hdrlen_optpad(Tcp *tcph, uint16_t default_hdrlen,
706                                   uint16_t *ret_hdrlen, uint16_t *ret_optpad,
707                                   Tcpctl *tcb)
708 {
709         uint16_t hdrlen = default_hdrlen;
710         uint16_t optpad = 0;
711
712         if (tcph->flags & SYN) {
713                 if (tcph->mss)
714                         hdrlen += MSS_LENGTH;
715                 if (tcph->ws)
716                         hdrlen += WS_LENGTH;
717                 if (tcph->sack_ok)
718                         hdrlen += SACK_OK_LENGTH;
719         }
720         if (tcp_seg_has_ts(tcph)) {
721                 hdrlen += TS_LENGTH;
722                 /* SYNs have other opts, don't do the PREPAD NOOP optimization. */
723                 if (!(tcph->flags & SYN))
724                         hdrlen += TS_SEND_PREPAD;
725         }
726         if (tcb && tcb->rcv.nr_sacks)
727                 hdrlen += 2 + tcb->rcv.nr_sacks * 8;
728         optpad = hdrlen & 3;
729         if (optpad)
730                 optpad = 4 - optpad;
731         hdrlen += optpad;
732         *ret_hdrlen = hdrlen;
733         *ret_optpad = optpad;
734 }
735
/* Writes the TCP options for tcph to opt.  The bytes emitted here (including
 * the trailing optpad NOOPs) must total exactly what compute_hdrlen_optpad()
 * accounted for. */
static void write_opts(Tcp *tcph, uint8_t *opt, uint16_t optpad, Tcpctl *tcb)
{
	if (tcph->flags & SYN) {
		if (tcph->mss != 0) {
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		if (tcph->ws != 0) {
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		if (tcph->sack_ok) {
			*opt++ = SACK_OK_OPT;
			*opt++ = SACK_OK_LENGTH;
		}
	}
	if (tcp_seg_has_ts(tcph)) {
		if (!(tcph->flags & SYN)) {
			/* the TS_SEND_PREPAD NOOPs from
			 * compute_hdrlen_optpad() */
			*opt++ = NOOPOPT;
			*opt++ = NOOPOPT;
		}
		*opt++ = TS_OPT;
		*opt++ = TS_LENGTH;
		/* Setting TSval, our time */
		hnputl(opt, milliseconds());
		opt += 4;
		/* Setting TSecr, the time we last saw from them, stored in ts_val */
		hnputl(opt, tcph->ts_val);
		opt += 4;
	}
	if (tcb && tcb->rcv.nr_sacks) {
		*opt++ = SACK_OPT;
		/* option length: 2-byte header plus 8 bytes per SACK block */
		*opt++ = 2 + tcb->rcv.nr_sacks * 8;
		for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
			hnputl(opt, tcb->rcv.sacks[i].left);
			opt += 4;
			hnputl(opt, tcb->rcv.sacks[i].right);
			opt += 4;
		}
	}
	/* pad the options out to a 4-byte boundary */
	while (optpad-- > 0)
		*opt++ = NOOPOPT;
}
783
784 /* Given a data block (or NULL) returns a block with enough header room that we
785  * can send out.  block->wp is set to the beginning of the payload.  Returns
786  * NULL on some sort of error. */
787 static struct block *alloc_or_pad_block(struct block *data,
788                                         uint16_t total_hdr_size)
789 {
790         if (data) {
791                 data = padblock(data, total_hdr_size);
792                 if (data == NULL)
793                         return NULL;
794         } else {
795                 /* the 64 pad is to meet mintu's */
796                 data = block_alloc(total_hdr_size + 64, MEM_WAIT);
797                 if (data == NULL)
798                         return NULL;
799                 data->wp += total_hdr_size;
800         }
801         return data;
802 }
803
/* Build an outgoing v6 TCP segment: prepend header room onto 'data' (or
 * allocate a header-only block), fill the TCP header from 'tcph' and the
 * prototype pseudo-header 'ph', checksum, then restore the real IPv6 header
 * fields that the pseudo-header temporarily occupied.  Returns NULL on
 * allocation failure. */
static struct block *htontcp6(Tcp *tcph, struct block *data, Tcp6hdr *ph,
                              Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp6hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP6_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP6_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp).  Note TCP structs include IP. */
	data->network_offset = 0;
	data->transport_offset = offsetof(Tcp6hdr, tcpsport);

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp6hdr *) (data->rp);
	memmove(h, ph, TCP6_TCBPHDRSZ);

	/* compose pseudo tcp header, do cksum calculation.  The vcf/ploadlen/
	 * proto/ttl fields are abused to hold the pseudo-header values and are
	 * rewritten after the checksum below. */
	hnputl(h->vcf, hdrlen + dlen);
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
	h->ttl = ph->proto;

	/* copy in variable bits */
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* header length (in 32-bit words) lives in the top bits of tcpflag */
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	/* advertise the window scaled down by our send scale, if any */
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	/* move from pseudo header back to normal ip header */
	memset(h->vcf, 0, 4);
	h->vcf[0] = IP_VER6;
	hnputs(h->ploadlen, hdrlen + dlen);
	h->proto = ph->proto;

	return data;
}
854
/* Builds an on-the-wire IPv4 TCP segment from the host-order header 'tcph',
 * prepending it to 'data' (may be NULL for header-only segments).  'ph' is the
 * cached pseudo-header template; 'tcb' may be NULL (e.g. for RSTs).  Returns
 * the finished block or NULL on allocation failure. */
static struct block *htontcp4(Tcp *tcph, struct block *data, Tcp4hdr *ph,
                              Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp4hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP4_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP4_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp).  Note TCP structs include IP. */
	data->network_offset = 0;
	data->transport_offset = offsetof(Tcp4hdr, tcpsport);

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp4hdr *) (data->rp);
	memmove(h, ph, TCP4_TCBPHDRSZ);

	/* copy in variable bits */
	hnputs(h->tcplen, hdrlen + dlen);
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* data offset (32-bit words) in the top nibble: (hdrlen/4) << 12 */
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		assert(data->transport_offset == TCP4_IPLEN + TCP4_PHDRSIZE);
		/* Only the pseudo-header is summed here (note the ~): the stored
		 * value is a partial checksum, and Btcpck/tx_csum_offset tell
		 * the lower layers / NIC to finish the csum over the TCP header
		 * and payload — NOTE(review): relies on checksum finishing
		 * downstream; confirm against the output path. */
		csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
		data->tx_csum_offset = ph->tcpcksum - ph->tcpsport;
		data->flag |= Btcpck;
	}

	return data;
}
898
899 static void parse_inbound_sacks(Tcp *tcph, uint8_t *opt, uint16_t optlen)
900 {
901         uint8_t nr_sacks;
902         uint32_t left, right;
903
904         nr_sacks = (optlen - 2) / 8;
905         if (nr_sacks > MAX_NR_SACKS_PER_PACKET)
906                 return;
907         opt += 2;
908         for (int i = 0; i < nr_sacks; i++, opt += 8) {
909                 left = nhgetl(opt);
910                 right = nhgetl(opt + 4);
911                 if (seq_ge(left, right)) {
912                         /* bad / malicious SACK.  Skip it, and adjust. */
913                         nr_sacks--;
914                         i--;    /* stay on this array element next loop */
915                         continue;
916                 }
917                 tcph->sacks[i].left = left;
918                 tcph->sacks[i].right = right;
919         }
920         tcph->nr_sacks = nr_sacks;
921 }
922
/* Walks the TCP options area ('opt', 'optsize' bytes) of an inbound segment
 * and records the recognized options in tcph (MSS, window scale, SACK-ok,
 * SACK blocks, timestamps).  Unknown options are skipped by their length;
 * parsing stops at EOL or on a malformed length. */
static void parse_inbound_opts(Tcp *tcph, uint8_t *opt, uint16_t optsize)
{
	uint16_t optlen;

	while (optsize > 0 && *opt != EOLOPT) {
		/* NOP is the only single-byte option other than EOL */
		if (*opt == NOOPOPT) {
			optsize--;
			opt++;
			continue;
		}
		optlen = opt[1];
		/* optlen includes the kind and length bytes, so < 2 is bogus;
		 * > optsize would run off the end of the options area. */
		if (optlen < 2 || optlen > optsize)
			break;
		switch (*opt) {
			case MSSOPT:
				if (optlen == MSS_LENGTH)
					tcph->mss = nhgets(opt + 2);
				break;
			case WSOPT:
				/* HaveWS flags that a (possibly zero) scale was
				 * actually present in the segment */
				if (optlen == WS_LENGTH && *(opt + 2) <= MAX_WS_VALUE)
					tcph->ws = HaveWS | *(opt + 2);
				break;
			case SACK_OK_OPT:
				if (optlen == SACK_OK_LENGTH)
					tcph->sack_ok = TRUE;
				break;
			case SACK_OPT:
				parse_inbound_sacks(tcph, opt, optlen);
				break;
			case TS_OPT:
				if (optlen == TS_LENGTH) {
					tcph->ts_val = nhgetl(opt + 2);
					tcph->ts_ecr = nhgetl(opt + 6);
				}
				break;
		}
		optsize -= optlen;
		opt += optlen;
	}
}
963
964 /* Helper, clears the opts.  We'll later set them with e.g. parse_inbound_opts,
965  * set them manually, or something else. */
966 static void clear_tcph_opts(Tcp *tcph)
967 {
968         tcph->mss = 0;
969         tcph->ws = 0;
970         tcph->sack_ok = FALSE;
971         tcph->nr_sacks = 0;
972         tcph->ts_val = 0;
973         tcph->ts_ecr = 0;
974 }
975
976 static int ntohtcp6(Tcp *tcph, struct block **bpp)
977 {
978         Tcp6hdr *h;
979         uint16_t hdrlen;
980
981         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
982         if (*bpp == NULL)
983                 return -1;
984
985         h = (Tcp6hdr *) ((*bpp)->rp);
986         tcph->source = nhgets(h->tcpsport);
987         tcph->dest = nhgets(h->tcpdport);
988         tcph->seq = nhgetl(h->tcpseq);
989         tcph->ack = nhgetl(h->tcpack);
990         hdrlen = (h->tcpflag[0] >> 2) & ~3;
991         if (hdrlen < TCP6_HDRSIZE) {
992                 freeblist(*bpp);
993                 return -1;
994         }
995
996         tcph->flags = h->tcpflag[1];
997         tcph->wnd = nhgets(h->tcpwin);
998         tcph->urg = nhgets(h->tcpurg);
999         clear_tcph_opts(tcph);
1000         tcph->len = nhgets(h->ploadlen) - hdrlen;
1001
1002         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
1003         if (*bpp == NULL)
1004                 return -1;
1005         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP6_HDRSIZE);
1006         return hdrlen;
1007 }
1008
1009 static int ntohtcp4(Tcp *tcph, struct block **bpp)
1010 {
1011         Tcp4hdr *h;
1012         uint16_t hdrlen;
1013
1014         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
1015         if (*bpp == NULL)
1016                 return -1;
1017
1018         h = (Tcp4hdr *) ((*bpp)->rp);
1019         tcph->source = nhgets(h->tcpsport);
1020         tcph->dest = nhgets(h->tcpdport);
1021         tcph->seq = nhgetl(h->tcpseq);
1022         tcph->ack = nhgetl(h->tcpack);
1023
1024         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1025         if (hdrlen < TCP4_HDRSIZE) {
1026                 freeblist(*bpp);
1027                 return -1;
1028         }
1029
1030         tcph->flags = h->tcpflag[1];
1031         tcph->wnd = nhgets(h->tcpwin);
1032         tcph->urg = nhgets(h->tcpurg);
1033         clear_tcph_opts(tcph);
1034         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1035
1036         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1037         if (*bpp == NULL)
1038                 return -1;
1039         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP4_HDRSIZE);
1040         return hdrlen;
1041 }
1042
1043 /*
1044  *  For outgoing calls, generate an initial sequence
1045  *  number and put a SYN on the send queue
1046  */
static void tcpsndsyn(struct conv *s, Tcpctl *tcb)
{
	/* random ISS defends against sequence-number prediction */
	urandom_read(&tcb->iss, sizeof(tcb->iss));
	tcb->rttseq = tcb->iss;
	tcb->snd.wl2 = tcb->iss;
	tcb->snd.una = tcb->iss;
	tcb->snd.rtx = tcb->rttseq;
	tcb->snd.nxt = tcb->rttseq;
	/* the SYN consumes one sequence number (flgcnt); FORCE presumably
	 * makes the output path send despite an empty queue — NOTE(review):
	 * confirm against tcpoutput */
	tcb->flgcnt++;
	tcb->flags |= FORCE;
	tcb->sndsyntime = NOW;

	/* set desired mss and scale */
	tcb->mss = tcpmtu(tcb->ifc, s->ipversion, &tcb->scale);
}
1062
/* Sends a RST in response to an unacceptable inbound segment 'seg' (which
 * arrived from 'source' to 'dest', carrying 'length' bytes of data).  Note
 * that seg's fields are clobbered to build the reply.  'reason' is only for
 * the netlog. */
static void sndrst(struct Proto *tcp, uint8_t *source, uint8_t *dest,
                   uint16_t length, Tcp *seg, uint8_t version, char *reason)
{
	struct block *hbp;
	uint8_t rflags;
	struct tcppriv *tpriv;
	Tcp4hdr ph4;
	Tcp6hdr ph6;

	netlog(tcp->f, Logtcpreset, "sndrst: %s\n", reason);

	tpriv = tcp->priv;

	/* never answer a RST with a RST */
	if (seg->flags & RST)
		return;

	/* make pseudo header; note src/dst are swapped since we're replying */
	switch (version) {
		case V4:
			memset(&ph4, 0, sizeof(ph4));
			ph4.vihl = IP_VER4;
			v6tov4(ph4.tcpsrc, dest);
			v6tov4(ph4.tcpdst, source);
			ph4.proto = IP_TCPPROTO;
			hnputs(ph4.tcplen, TCP4_HDRSIZE);
			hnputs(ph4.tcpsport, seg->dest);
			hnputs(ph4.tcpdport, seg->source);
			break;
		case V6:
			memset(&ph6, 0, sizeof(ph6));
			ph6.vcf[0] = IP_VER6;
			ipmove(ph6.tcpsrc, dest);
			ipmove(ph6.tcpdst, source);
			ph6.proto = IP_TCPPROTO;
			hnputs(ph6.ploadlen, TCP6_HDRSIZE);
			hnputs(ph6.tcpsport, seg->dest);
			hnputs(ph6.tcpdport, seg->source);
			break;
		default:
			panic("sndrst: version %d", version);
	}

	tpriv->stats[OutRsts]++;
	rflags = RST;

	/* convince the other end that this reset is in band */
	if (seg->flags & ACK) {
		/* they told us what they've seen: reset from there */
		seg->seq = seg->ack;
		seg->ack = 0;
	} else {
		/* no ACK to mirror: ack everything they sent (SYN and FIN
		 * each count as one sequence number) */
		rflags |= ACK;
		seg->ack = seg->seq;
		seg->seq = 0;
		if (seg->flags & SYN)
			seg->ack++;
		seg->ack += length;
		if (seg->flags & FIN)
			seg->ack++;
	}
	seg->flags = rflags;
	seg->wnd = 0;
	seg->urg = 0;
	seg->mss = 0;
	seg->ws = 0;
	seg->sack_ok = FALSE;
	seg->nr_sacks = 0;
	/* seg->ts_val is already set with their timestamp */
	switch (version) {
		case V4:
			hbp = htontcp4(seg, NULL, &ph4, NULL);
			if (hbp == NULL)
				return;
			ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		case V6:
			hbp = htontcp6(seg, NULL, &ph6, NULL);
			if (hbp == NULL)
				return;
			ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		default:
			panic("sndrst2: version %d", version);
	}
}
1147
1148 /*
1149  *  send a reset to the remote side and close the conversation
1150  *  called with s qlocked
1151  */
1152 static void tcphangup(struct conv *s)
1153 {
1154         ERRSTACK(1);
1155         Tcp seg;
1156         Tcpctl *tcb;
1157         struct block *hbp;
1158
1159         tcb = (Tcpctl *) s->ptcl;
1160         if (ipcmp(s->raddr, IPnoaddr)) {
1161                 /* discard error style, poperror regardless */
1162                 if (!waserror()) {
1163                         seg.flags = RST | ACK;
1164                         seg.ack = tcb->rcv.nxt;
1165                         tcb->last_ack_sent = seg.ack;
1166                         tcb->rcv.una = 0;
1167                         seg.seq = tcb->snd.nxt;
1168                         seg.wnd = 0;
1169                         seg.urg = 0;
1170                         seg.mss = 0;
1171                         seg.ws = 0;
1172                         seg.sack_ok = FALSE;
1173                         seg.nr_sacks = 0;
1174                         seg.ts_val = tcb->ts_recent;
1175                         switch (s->ipversion) {
1176                                 case V4:
1177                                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1178                                         hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
1179                                         ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1180                                         break;
1181                                 case V6:
1182                                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1183                                         hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
1184                                         ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1185                                         break;
1186                                 default:
1187                                         panic("tcphangup: version %d", s->ipversion);
1188                         }
1189                 }
1190                 poperror();
1191         }
1192         localclose(s, NULL);
1193 }
1194
1195 /*
1196  *  (re)send a SYN ACK
1197  */
1198 static int sndsynack(struct Proto *tcp, Limbo *lp)
1199 {
1200         struct block *hbp;
1201         Tcp4hdr ph4;
1202         Tcp6hdr ph6;
1203         Tcp seg;
1204         int scale;
1205         uint8_t flag = 0;
1206
1207         /* make pseudo header */
1208         switch (lp->version) {
1209                 case V4:
1210                         memset(&ph4, 0, sizeof(ph4));
1211                         ph4.vihl = IP_VER4;
1212                         v6tov4(ph4.tcpsrc, lp->laddr);
1213                         v6tov4(ph4.tcpdst, lp->raddr);
1214                         ph4.proto = IP_TCPPROTO;
1215                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1216                         hnputs(ph4.tcpsport, lp->lport);
1217                         hnputs(ph4.tcpdport, lp->rport);
1218                         break;
1219                 case V6:
1220                         memset(&ph6, 0, sizeof(ph6));
1221                         ph6.vcf[0] = IP_VER6;
1222                         ipmove(ph6.tcpsrc, lp->laddr);
1223                         ipmove(ph6.tcpdst, lp->raddr);
1224                         ph6.proto = IP_TCPPROTO;
1225                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1226                         hnputs(ph6.tcpsport, lp->lport);
1227                         hnputs(ph6.tcpdport, lp->rport);
1228                         break;
1229                 default:
1230                         panic("sndrst: version %d", lp->version);
1231         }
1232         lp->ifc = findipifc(tcp->f, lp->laddr, 0);
1233
1234         seg.seq = lp->iss;
1235         seg.ack = lp->irs + 1;
1236         seg.flags = SYN | ACK;
1237         seg.urg = 0;
1238         seg.mss = tcpmtu(lp->ifc, lp->version, &scale);
1239         seg.wnd = QMAX;
1240         seg.ts_val = lp->ts_val;
1241         seg.nr_sacks = 0;
1242
1243         /* if the other side set scale, we should too */
1244         if (lp->rcvscale) {
1245                 seg.ws = scale;
1246                 lp->sndscale = scale;
1247         } else {
1248                 seg.ws = 0;
1249                 lp->sndscale = 0;
1250         }
1251         if (SACK_SUPPORTED)
1252                 seg.sack_ok = lp->sack_ok;
1253         else
1254                 seg.sack_ok = FALSE;
1255
1256         switch (lp->version) {
1257                 case V4:
1258                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1259                         if (hbp == NULL)
1260                                 return -1;
1261                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1262                         break;
1263                 case V6:
1264                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1265                         if (hbp == NULL)
1266                                 return -1;
1267                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1268                         break;
1269                 default:
1270                         panic("sndsnack: version %d", lp->version);
1271         }
1272         lp->lastsend = NOW;
1273         return 0;
1274 }
1275
/* Limbo hash: sum of the low two bytes of address 'a' and port 'p', masked
 * into the limbo hash-table range. */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
1277
1278 /*
1279  *  put a call into limbo and respond with a SYN ACK
1280  *
1281  *  called with proto locked
1282  */
1283 static void limbo(struct conv *s, uint8_t *source, uint8_t *dest, Tcp *seg,
1284                   int version)
1285 {
1286         Limbo *lp, **l;
1287         struct tcppriv *tpriv;
1288         int h;
1289
1290         tpriv = s->p->priv;
1291         h = hashipa(source, seg->source);
1292
1293         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1294                 lp = *l;
1295                 if (lp->lport != seg->dest || lp->rport != seg->source
1296                         || lp->version != version)
1297                         continue;
1298                 if (ipcmp(lp->raddr, source) != 0)
1299                         continue;
1300                 if (ipcmp(lp->laddr, dest) != 0)
1301                         continue;
1302
1303                 /* each new SYN restarts the retransmits */
1304                 lp->irs = seg->seq;
1305                 break;
1306         }
1307         lp = *l;
1308         if (lp == NULL) {
1309                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1310                         lp = tpriv->lht[h];
1311                         tpriv->lht[h] = lp->next;
1312                         lp->next = NULL;
1313                 } else {
1314                         lp = kzmalloc(sizeof(*lp), 0);
1315                         if (lp == NULL)
1316                                 return;
1317                         tpriv->nlimbo++;
1318                 }
1319                 *l = lp;
1320                 lp->version = version;
1321                 ipmove(lp->laddr, dest);
1322                 ipmove(lp->raddr, source);
1323                 lp->lport = seg->dest;
1324                 lp->rport = seg->source;
1325                 lp->mss = seg->mss;
1326                 lp->rcvscale = seg->ws;
1327                 lp->sack_ok = seg->sack_ok;
1328                 lp->irs = seg->seq;
1329                 lp->ts_val = seg->ts_val;
1330                 urandom_read(&lp->iss, sizeof(lp->iss));
1331         }
1332
1333         if (sndsynack(s->p, lp) < 0) {
1334                 *l = lp->next;
1335                 tpriv->nlimbo--;
1336                 kfree(lp);
1337         }
1338 }
1339
1340 /*
1341  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1342  */
1343 static void limborexmit(struct Proto *tcp)
1344 {
1345         struct tcppriv *tpriv;
1346         Limbo **l, *lp;
1347         int h;
1348         int seen;
1349         uint64_t now;
1350
1351         tpriv = tcp->priv;
1352
1353         if (!canqlock(&tcp->qlock))
1354                 return;
1355         seen = 0;
1356         now = NOW;
1357         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1358                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1359                         lp = *l;
1360                         seen++;
1361                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1362                                 continue;
1363
1364                         /* time it out after 1 second */
1365                         if (++(lp->rexmits) > 5) {
1366                                 tpriv->nlimbo--;
1367                                 *l = lp->next;
1368                                 kfree(lp);
1369                                 continue;
1370                         }
1371
1372                         /* if we're being attacked, don't bother resending SYN ACK's */
1373                         if (tpriv->nlimbo > 100)
1374                                 continue;
1375
1376                         if (sndsynack(tcp, lp) < 0) {
1377                                 tpriv->nlimbo--;
1378                                 *l = lp->next;
1379                                 kfree(lp);
1380                                 continue;
1381                         }
1382
1383                         l = &lp->next;
1384                 }
1385         }
1386         qunlock(&tcp->qlock);
1387 }
1388
1389 /*
1390  *  lookup call in limbo.  if found, throw it out.
1391  *
1392  *  called with proto locked
1393  */
1394 static void limborst(struct conv *s, Tcp *segp, uint8_t *src, uint8_t *dst,
1395                      uint8_t version)
1396 {
1397         Limbo *lp, **l;
1398         int h;
1399         struct tcppriv *tpriv;
1400
1401         tpriv = s->p->priv;
1402
1403         /* find a call in limbo */
1404         h = hashipa(src, segp->source);
1405         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1406                 lp = *l;
1407                 if (lp->lport != segp->dest || lp->rport != segp->source
1408                         || lp->version != version)
1409                         continue;
1410                 if (ipcmp(lp->laddr, dst) != 0)
1411                         continue;
1412                 if (ipcmp(lp->raddr, src) != 0)
1413                         continue;
1414
1415                 /* RST can only follow the SYN */
1416                 if (segp->seq == lp->irs + 1) {
1417                         tpriv->nlimbo--;
1418                         *l = lp->next;
1419                         kfree(lp);
1420                 }
1421                 break;
1422         }
1423 }
1424
1425 /* The advertised MSS (e.g. 1460) includes any per-packet TCP options, such as
1426  * TCP timestamps.  A given packet will contain mss bytes, but only typical_mss
1427  * bytes of *data*.  If we know we'll use those options, we should adjust our
1428  * typical_mss, which will affect the cwnd. */
1429 static void adjust_typical_mss_for_opts(Tcp *tcph, Tcpctl *tcb)
1430 {
1431         uint16_t opt_size = 0;
1432
1433         if (tcph->ts_val)
1434                 opt_size += TS_LENGTH + TS_SEND_PREPAD;
1435         opt_size = ROUNDUP(opt_size, 4);
1436         tcb->typical_mss -= opt_size;
1437 }
1438
1439 /*
1440  *  come here when we finally get an ACK to our SYN-ACK.
1441  *  lookup call in limbo.  if found, create a new conversation
1442  *
1443  *  called with proto locked
1444  */
static struct conv *tcpincoming(struct conv *s, Tcp *segp, uint8_t *src,
								uint8_t *dst, uint8_t version)
{
	struct conv *new;
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	Limbo *lp, **l;
	int h;

	/* unless it's just an ack, it can't be someone coming out of limbo */
	if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
		return NULL;

	tpriv = s->p->priv;

	/* find a call in limbo */
	h = hashipa(src, segp->source);
	for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
		netlog(s->p->f, Logtcp,
			   "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
			   segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
			   lp->lport, version, lp->version);

		if (lp->lport != segp->dest || lp->rport != segp->source
			|| lp->version != version)
			continue;
		if (ipcmp(lp->laddr, dst) != 0)
			continue;
		if (ipcmp(lp->raddr, src) != 0)
			continue;

		/* we're assuming no data with the initial SYN */
		if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
			netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
				   segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
			lp = NULL;
		} else {
			/* match: unlink from the limbo chain; freed below */
			tpriv->nlimbo--;
			*l = lp->next;
		}
		break;
	}
	if (lp == NULL)
		return NULL;

	/* materialize a new conversation for the established connection */
	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
	if (new == NULL)
		return NULL;

	/* clone the listener's TCB, then detach the timers from the listener.
	 * NOTE(review): this struct-copies the whole Tcpctl, including any
	 * embedded locks — presumably safe under the proto lock; confirm. */
	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
	tcb = (Tcpctl *) new->ptcl;
	tcb->flags &= ~CLONE;
	tcb->timer.arg = new;
	tcb->timer.state = TcptimerOFF;
	tcb->acktimer.arg = new;
	tcb->acktimer.state = TcptimerOFF;
	tcb->katimer.arg = new;
	tcb->katimer.state = TcptimerOFF;
	tcb->rtt_timer.arg = new;
	tcb->rtt_timer.state = TcptimerOFF;

	/* receive state: their SYN consumed one sequence number */
	tcb->irs = lp->irs;
	tcb->rcv.nxt = tcb->irs + 1;
	tcb->rcv.urg = tcb->rcv.nxt;

	/* send state: our SYN (in the SYNACK) consumed one sequence number */
	tcb->iss = lp->iss;
	tcb->rttseq = tcb->iss;
	tcb->snd.wl2 = tcb->iss;
	tcb->snd.una = tcb->iss + 1;
	tcb->snd.rtx = tcb->iss + 1;
	tcb->snd.nxt = tcb->iss + 1;
	tcb->flgcnt = 0;
	tcb->flags |= SYNACK;

	/* our sending max segment size cannot be bigger than what he asked for */
	if (lp->mss != 0 && lp->mss < tcb->mss) {
		tcb->mss = lp->mss;
		tcb->typical_mss = tcb->mss;
	}
	adjust_typical_mss_for_opts(segp, tcb);

	/* Here's where we record the previously-decided header options.  They were
	 * actually decided on when we agreed to them in the SYNACK we sent.  We
	 * didn't create an actual TCB until now, so we can copy those decisions out
	 * of the limbo tracker and into the TCB. */
	tcb->ifc = lp->ifc;
	tcb->sack_ok = lp->sack_ok;
	/* window scaling */
	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
	tcb_check_tso(tcb);

	tcb->snd.wnd = segp->wnd;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* set initial round trip time */
	tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
	tcpsynackrtt(new);

	kfree(lp);

	/* set up proto header template used for all future sends */
	switch (version) {
		case V4:
			h4 = &tcb->protohdr.tcp4hdr;
			memset(h4, 0, sizeof(*h4));
			h4->proto = IP_TCPPROTO;
			hnputs(h4->tcpsport, new->lport);
			hnputs(h4->tcpdport, new->rport);
			v6tov4(h4->tcpsrc, dst);
			v6tov4(h4->tcpdst, src);
			break;
		case V6:
			h6 = &tcb->protohdr.tcp6hdr;
			memset(h6, 0, sizeof(*h6));
			h6->proto = IP_TCPPROTO;
			hnputs(h6->tcpsport, new->lport);
			hnputs(h6->tcpdport, new->rport);
			ipmove(h6->tcpsrc, dst);
			ipmove(h6->tcpdst, src);
			break;
		default:
			panic("tcpincoming: version %d", new->ipversion);
	}

	tcpsetstate(new, Established);

	iphtadd(&tpriv->ht, new);

	return new;
}
1577
1578 /*
1579  *  use the time between the first SYN and it's ack as the
1580  *  initial round trip time
1581  */
1582 static void tcpsynackrtt(struct conv *s)
1583 {
1584         Tcpctl *tcb;
1585         uint64_t delta;
1586         struct tcppriv *tpriv;
1587
1588         tcb = (Tcpctl *) s->ptcl;
1589         tpriv = s->p->priv;
1590
1591         delta = NOW - tcb->sndsyntime;
1592         tcb->srtt = delta;
1593         tcb->mdev = delta / 2;
1594
1595         /* halt round trip timer */
1596         tcphalt(tpriv, &tcb->rtt_timer);
1597 }
1598
1599 /* For LFNs (long/fat), our default tx queue doesn't hold enough data, and TCP
1600  * blocks on the application - even if the app already has the data ready to go.
1601  * We need to hold the sent, unacked data (1x cwnd), plus all the data we might
1602  * send next RTT (1x cwnd).  Note this is called after cwnd was expanded. */
1603 static void adjust_tx_qio_limit(struct conv *s)
1604 {
1605         Tcpctl *tcb = (Tcpctl *) s->ptcl;
1606         size_t ideal_limit = tcb->cwind * 2;
1607
1608         /* This is called for every ACK, and it's not entirely free to update the
1609          * limit (locks, CVs, taps).  Updating in chunks of mss seems reasonable.
1610          * During SS, we'll update this on most ACKs (given each ACK increased the
1611          * cwind by > MSS).
1612          *
1613          * We also don't want a lot of tiny blocks from the user, but the way qio
1614          * works, you can put in as much as you want (Maxatomic) and then get
1615          * flow-controlled. */
1616         if (qgetlimit(s->wq) + tcb->typical_mss < ideal_limit)
1617                 qsetlimit(s->wq, ideal_limit);
1618         /* TODO: we could shrink the qio limit too, if we had a better idea what the
1619          * actual threshold was.  We want the limit to be the 'stable' cwnd * 2. */
1620 }
1621
/* Attempts to merge later sacks into sack 'into' (index in the array).
 * Assumes snd.sacks is sorted by left edge; any following sack whose left edge
 * falls at or before into's right edge is absorbed (extending into's right
 * edge as needed) and the remaining sacks are shifted down. */
static void merge_sacks_into(Tcpctl *tcb, int into)
{
	struct sack_block *into_sack = &tcb->snd.sacks[into];
	struct sack_block *tcb_sack;
	int shift = 0;

	for (int i = into + 1; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		/* sorted, so the first gap ends the merge */
		if (seq_lt(into_sack->right, tcb_sack->left))
			break;
		if (seq_gt(tcb_sack->right, into_sack->right))
			into_sack->right = tcb_sack->right;
		shift++;
	}
	if (shift) {
		/* close the gap left by the 'shift' absorbed sacks; the count
		 * is the number of survivors after position 'into' */
		memmove(tcb->snd.sacks + into + 1,
			tcb->snd.sacks + into + 1 + shift,
			sizeof(struct sack_block) * (tcb->snd.nr_sacks - into - 1
							     - shift));
		tcb->snd.nr_sacks -= shift;
	}
}
1645
1646 /* If we update a sack, it means they received a packet (possibly out of order),
1647  * but they have not received earlier packets.  Otherwise, they would do a full
1648  * ACK.
1649  *
1650  * The trick is in knowing whether the reception growing this sack is due to a
1651  * retrans or due to packets from before our last loss event.  The rightmost
1652  * sack tends to grow a lot with packets we sent before the loss.  However,
1653  * intermediate sacks that grow are signs of a loss, since they only grow as a
1654  * result of retrans.
1655  *
1656  * This is only true for the first time through a retrans.  After we've gone
1657  * through a full retrans blast, the sack that hinted at the retrans loss (and
1658  * there could be multiple of them!) will continue to grow.  We could come up
1659  * with some tracking for this, but instead we'll just do a one-time deal.  You
1660  * can recover from one detected sack retrans loss.  After that, you'll have to
1661  * use the RTO.
1662  *
1663  * This won't catch some things, like a sack that grew and merged with the
1664  * rightmost sack.  This also won't work if you have a single sack.  We can't
1665  * tell where the retrans ends and the sending begins. */
1666 static bool sack_hints_at_loss(Tcpctl *tcb, struct sack_block *tcb_sack)
1667 {
1668         if (tcb->snd.recovery != SACK_RETRANS_RECOVERY)
1669                 return FALSE;
1670         return &tcb->snd.sacks[tcb->snd.nr_sacks - 1] != tcb_sack;
1671 }
1672
1673 static bool sack_contains(struct sack_block *tcb_sack, uint32_t seq)
1674 {
1675         return seq_le(tcb_sack->left, seq) && seq_lt(seq, tcb_sack->right);
1676 }
1677
1678 /* Debugging helper! */
1679 static void sack_asserter(Tcpctl *tcb, char *str)
1680 {
1681         struct sack_block *tcb_sack;
1682
1683         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
1684                 tcb_sack = &tcb->snd.sacks[i];
1685                 /* Checking invariants: snd.rtx is never inside a sack, sacks are always
1686                  * mutually exclusive. */
1687                 if (sack_contains(tcb_sack, tcb->snd.rtx) ||
1688                     ((i + 1 < tcb->snd.nr_sacks) && seq_ge(tcb_sack->right,
1689                                                                (tcb_sack + 1)->left))) {
1690                         printk("SACK ASSERT ERROR at %s\n", str);
1691                         printk("rtx %u una %u nxt %u, sack [%u, %u)\n",
1692                                tcb->snd.rtx, tcb->snd.una, tcb->snd.nxt, tcb_sack->left,
1693                                    tcb_sack->right);
1694                         for (int i = 0; i < tcb->snd.nr_sacks; i++)
1695                                 printk("\t %d: [%u, %u)\n", i, tcb->snd.sacks[i].left,
1696                                        tcb->snd.sacks[i].right);
1697                         backtrace();
1698                         panic("");
1699                 }
1700         }
1701 }
1702
/* Updates bookkeeping whenever a sack is added or updated */
static void sack_has_changed(struct conv *s, Tcpctl *tcb,
                             struct sack_block *tcb_sack)
{
	/* Due to the change, snd.rtx might be in the middle of this sack.  Advance
	 * it to the right edge. */
	if (sack_contains(tcb_sack, tcb->snd.rtx))
		tcb->snd.rtx = tcb_sack->right;

	/* This is a sack for something we retransed and we think it means there was
	 * another loss.  Instead of waiting for the RTO, we can take action. */
	if (sack_hints_at_loss(tcb, tcb_sack)) {
		/* Like dupacks, require TCPREXMTTHRESH hints before reacting,
		 * to tolerate reordering. */
		if (++tcb->snd.sack_loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: sack rxmit loss: snd.rtx %u, sack [%u,%u), una %u, recovery_pt %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.rtx, tcb_sack->left, tcb_sack->right, tcb->snd.una,
			       tcb->snd.recovery_pt);
			/* Redo retrans, but keep the sacks and recovery point */
			tcp_loss_event(s, tcb);
			tcb->snd.rtx = tcb->snd.una;
			tcb->snd.sack_loss_hint = 0;
			/* Act like an RTO.  We just detected it earlier.  This prevents us
			 * from getting another sack hint loss this recovery period and from
			 * advancing the opportunistic right edge. */
			tcb->snd.recovery = RTO_RETRANS_RECOVERY;
			/* We didn't actually time out yet and we expect to keep getting
			 * sacks, so we don't want to flush or worry about in_flight.  If we
			 * messed something up, the RTO will still fire. */
			set_in_flight(tcb);
		}
	}
}
1736
1737 /* Advances tcb_sack's right edge, if new_right is farther, and updates the
1738  * bookkeeping due to the change. */
1739 static void update_right_edge(struct conv *s, Tcpctl *tcb,
1740                               struct sack_block *tcb_sack, uint32_t new_right)
1741 {
1742         if (seq_le(new_right, tcb_sack->right))
1743                 return;
1744         tcb_sack->right = new_right;
1745         merge_sacks_into(tcb, tcb_sack - tcb->snd.sacks);
1746         sack_has_changed(s, tcb, tcb_sack);
1747 }
1748
/* Folds seg_sack into the tcb's sorted list of snd sacks: grow an existing
 * sack, take over a slot, or insert/append, merging as needed. */
static void update_or_insert_sack(struct conv *s, Tcpctl *tcb,
                                  struct sack_block *seg_sack)
{
	struct sack_block *tcb_sack;

	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb_sack->left, seg_sack->left)) {
			/* This includes adjacent (which I've seen!) and overlap. */
			if (seq_le(seg_sack->left, tcb_sack->right)) {
				update_right_edge(s, tcb, tcb_sack, seg_sack->right);
				return;
			}
			continue;
		}
		/* Update existing sack */
		if (tcb_sack->left == seg_sack->left) {
			update_right_edge(s, tcb, tcb_sack, seg_sack->right);
			return;
		}
		/* Found our slot */
		if (seq_gt(tcb_sack->left, seg_sack->left)) {
			if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
				/* Out of room, but it is possible this sack overlaps later
				 * sacks, including the max sack's right edge. */
				if (seq_ge(seg_sack->right, tcb_sack->left)) {
					/* Take over the sack */
					tcb_sack->left = seg_sack->left;
					update_right_edge(s, tcb, tcb_sack, seg_sack->right);
				}
				return;
			}
			/* O/W, it's our slot and we have room (at least one spot). */
			memmove(&tcb->snd.sacks[i + 1], &tcb->snd.sacks[i],
				sizeof(struct sack_block) * (tcb->snd.nr_sacks - i));
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			tcb->snd.nr_sacks++;
			merge_sacks_into(tcb, i);
			sack_has_changed(s, tcb, tcb_sack);
			return;
		}
	}
	/* seg_sack is to the right of every existing sack. */
	if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
		/* We didn't find space in the sack array. */
		tcb_sack = &tcb->snd.sacks[MAX_NR_SND_SACKS - 1];
		/* Need to always maintain the rightmost sack, discarding the prev */
		if (seq_gt(seg_sack->right, tcb_sack->right)) {
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			sack_has_changed(s, tcb, tcb_sack);
		}
		return;
	}
	/* Append in the last slot. */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks];
	tcb->snd.nr_sacks++;
	tcb_sack->left = seg_sack->left;
	tcb_sack->right = seg_sack->right;
	sack_has_changed(s, tcb, tcb_sack);
}
1809
1810 /* Given the packet seg, track the sacks in TCB.  There are a few things: if seg
1811  * acks new data, some sacks might no longer be needed.  Some sacks might grow,
1812  * we might add new sacks, either of which can cause a merger.
1813  *
1814  * The important thing is that we always have the max sack entry: it must be
1815  * inserted for sure and findable.  We need that for our measurement of what
1816  * packets are in the network.
1817  *
1818  * Note that we keep sacks that are below snd.rtx (and above
1819  * seg.ack/tcb->snd.una) as best we can - we don't prune them.  We'll need those
1820  * for the in_flight estimate.
1821  *
1822  * When we run out of room, we'll have to throw away a sack.  Anything we throw
1823  * away below snd.rtx will be counted as 'in flight', even though it isn't.  If
1824  * we throw away something greater than snd.rtx, we'll also retrans it.  For
1825  * simplicity, we throw-away / replace the rightmost sack, since we're always
1826  * maintaining a highest sack. */
static void update_sacks(struct conv *s, Tcpctl *tcb, Tcp *seg)
{
	int prune = 0;
	struct sack_block *tcb_sack;

	/* First, drop sacks at or below seg->ack; the list is sorted, so they
	 * are all at the front. */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		/* For the equality case, if they acked up to, but not including an old
		 * sack, they must have reneged it.  Otherwise they would have acked
		 * beyond the sack. */
		if (seq_lt(seg->ack, tcb_sack->left))
			break;
		prune++;
	}
	if (prune) {
		memmove(tcb->snd.sacks, tcb->snd.sacks + prune,
			sizeof(struct sack_block) * (tcb->snd.nr_sacks - prune));
		tcb->snd.nr_sacks -= prune;
	}
	/* Then fold in the segment's sacks, skipping stale and bogus ones. */
	for (int i = 0; i < seg->nr_sacks; i++) {
		/* old sacks */
		if (seq_lt(seg->sacks[i].left, seg->ack))
			continue;
		/* buggy sack: out of range */
		if (seq_gt(seg->sacks[i].right, tcb->snd.nxt))
			continue;
		update_or_insert_sack(s, tcb, &seg->sacks[i]);
	}
}
1856
1857 /* This is a little bit of an under estimate, since we assume a packet is lost
1858  * once we have any sacks above it.  Overall, it's at most 2 * MSS of an
1859  * overestimate.
1860  *
1861  * If we have no sacks (either reneged or never used) we'll assume all packets
1862  * above snd.rtx are lost.  This will be the case for sackless fast rxmit
1863  * (Dong's stuff) or for a timeout.  In the former case, this is probably not
1864  * true, and in_flight should be higher, but we have no knowledge without the
1865  * sacks. */
static void set_in_flight(Tcpctl *tcb)
{
	struct sack_block *tcb_sack;
	uint32_t in_flight = 0;
	uint32_t from;

	if (!tcb->snd.nr_sacks) {
		/* No sacks: count everything from una to the retrans point. */
		tcb->snd.in_flight = tcb->snd.rtx - tcb->snd.una;
		return;
	}

	/* Everything to the right of the unsacked */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
	in_flight += tcb->snd.nxt - tcb_sack->right;

	/* Everything retransed (from una to snd.rtx, minus sacked regions.  Note
	 * we only retrans at most the last sack's left edge.  snd.rtx will be
	 * advanced to the right edge of some sack (possibly the last one). */
	from = tcb->snd.una;
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_ge(tcb_sack->left, tcb->snd.rtx))
			break;
		assert(seq_ge(tcb->snd.rtx, tcb_sack->right));
		/* Count the unsacked gap before this sack, then hop over the
		 * sacked region itself. */
		in_flight += tcb_sack->left - from;
		from = tcb_sack->right;
	}
	/* Trailing unsacked region up to the retrans point. */
	in_flight += tcb->snd.rtx - from;

	tcb->snd.in_flight = in_flight;
}
1897
1898 static void reset_recovery(struct conv *s, Tcpctl *tcb)
1899 {
1900         netlog(s->p->f, Logtcprxmt,
1901                "%I.%d -> %I.%d: recovery complete, una %u, rtx %u, nxt %u, recovery %u\n",
1902                s->laddr, s->lport, s->raddr, s->rport,
1903                tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, tcb->snd.recovery_pt);
1904         tcb->snd.recovery = 0;
1905         tcb->snd.recovery_pt = 0;
1906         tcb->snd.loss_hint = 0;
1907         tcb->snd.flush_sacks = FALSE;
1908         tcb->snd.sack_loss_hint = 0;
1909 }
1910
1911 static bool is_dup_ack(Tcpctl *tcb, Tcp *seg)
1912 {
1913         /* this is a pure ack w/o window update */
1914         return (seg->ack == tcb->snd.una) &&
1915                (tcb->snd.una != tcb->snd.nxt) &&
1916                (seg->len == 0) &&
1917                (seg->wnd == tcb->snd.wnd);
1918 }
1919
1920 /* If we have sacks, we'll ignore dupacks and look at the sacks ahead of una
1921  * (which are managed by the TCB).  The tcb will not have old sacks (below
1922  * ack/snd.rtx).  Receivers often send sacks below their ack point when we are
1923  * coming out of a loss, and we don't want those to count.
1924  *
1925  * Note the tcb could have sacks (in the future), but the receiver stopped using
1926  * them (reneged).  We'll catch that with the RTO.  If we try to catch it here,
1927  * we could get in a state where we never allow them to renege. */
1928 static bool is_potential_loss(Tcpctl *tcb, Tcp *seg)
1929 {
1930         if (seg->nr_sacks > 0)
1931                 return tcb->snd.nr_sacks > 0;
1932         else
1933                 return is_dup_ack(tcb, seg);
1934 }
1935
1936 /* When we use timestamps for RTTM, RFC 7323 suggests scaling by
1937  * expected_samples (per cwnd).  They say:
1938  *
1939  * ExpectedSamples = ceiling(FlightSize / (SMSS * 2))
1940  *
1941  * However, SMMS * 2 is really "number of bytes expected to be acked in a
1942  * packet.".  We'll use 'acked' to approximate that.  When the receiver uses
1943  * LRO, they'll send back large ACKs, which decreases the number of samples.
1944  *
1945  * If it turns out that all the divides are bad, we can just go back to not
1946  * using expected_samples at all. */
1947 static int expected_samples_ts(Tcpctl *tcb, uint32_t acked)
1948 {
1949         assert(acked);
1950         return MAX(DIV_ROUND_UP(tcb->snd.nxt - tcb->snd.una, acked), 1);
1951 }
1952
1953 /* Updates the RTT, given the currently sampled RTT and the number samples per
1954  * cwnd.  For non-TS RTTM, that'll be 1. */
1955 static void update_rtt(Tcpctl *tcb, int rtt_sample, int expected_samples)
1956 {
1957         int delta;
1958
1959         tcb->backoff = 0;
1960         tcb->backedoff = 0;
1961         if (tcb->srtt == 0) {
1962                 tcb->srtt = rtt_sample;
1963                 tcb->mdev = rtt_sample / 2;
1964         } else {
1965                 delta = rtt_sample - tcb->srtt;
1966                 tcb->srtt += (delta >> RTTM_ALPHA_SHIFT) / expected_samples;
1967                 if (tcb->srtt <= 0)
1968                         tcb->srtt = 1;
1969                 tcb->mdev += ((abs(delta) - tcb->mdev) >> RTTM_BRAVO_SHIFT) /
1970                              expected_samples;
1971                 if (tcb->mdev <= 0)
1972                         tcb->mdev = 1;
1973         }
1974         tcpsettimer(tcb);
1975 }
1976
/* Processes the ACK side of an incoming segment: advances snd.una, tracks
 * sacks and in-flight, detects loss, updates the send window, grows cwnd,
 * takes RTT samples, discards acked bytes from the write queue, and manages
 * the retransmit timer.  Called with the conv's qlock held. */
static void update(struct conv *s, Tcp *seg)
{
	int rtt;
	Tcpctl *tcb;
	uint32_t acked, expand;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	/* Ignore acks outside [una, nxt]: old duplicates or acks for data we
	 * have not sent. */
	if (!seq_within(seg->ack, tcb->snd.una, tcb->snd.nxt))
		return;

	acked = seg->ack - tcb->snd.una;
	tcb->snd.una = seg->ack;
	/* A cumulative ack covers anything we were retransmitting below it. */
	if (seq_gt(seg->ack, tcb->snd.rtx))
		tcb->snd.rtx = seg->ack;

	update_sacks(s, tcb, seg);
	set_in_flight(tcb);

	/* We treat either a dupack or forward SACKs as a hint that there is a loss.
	 * The RFCs suggest three dupacks before treating it as a loss (alternative
	 * is reordered packets).  We'll treat three SACKs the same way. */
	if (is_potential_loss(tcb, seg) && !tcb->snd.recovery) {
		tcb->snd.loss_hint++;
		if (tcb->snd.loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: loss hint thresh, nr sacks %u, nxt %u, una %u, cwnd %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.nr_sacks, tcb->snd.nxt, tcb->snd.una, tcb->cwind);
			tcp_loss_event(s, tcb);
			tcb->snd.recovery_pt = tcb->snd.nxt;
			if (tcb->snd.nr_sacks) {
				tcb->snd.recovery = SACK_RETRANS_RECOVERY;
				tcb->snd.flush_sacks = FALSE;
				tcb->snd.sack_loss_hint = 0;
			} else {
				tcb->snd.recovery = FAST_RETRANS_RECOVERY;
			}
			tcprxmit(s);
		}
	}

	/*
	 *  update window
	 */
	if (seq_gt(seg->ack, tcb->snd.wl2)
		|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
		tcb->snd.wnd = seg->wnd;
		tcb->snd.wl2 = seg->ack;
	}

	if (!acked) {
		/*
		 *  don't let us hangup if sending into a closed window and
		 *  we're still getting acks
		 */
		if (tcb->snd.recovery && (tcb->snd.wnd == 0))
			tcb->backedoff = MAXBACKMS / 4;
		return;
	}
	/* At this point, they have acked something new. (positive ack, ack > una).
	 *
	 * If we hadn't reached the threshold for recovery yet, the positive ACK
	 * will reset our loss_hint count. */
	if (!tcb->snd.recovery)
		tcb->snd.loss_hint = 0;
	else if (seq_ge(seg->ack, tcb->snd.recovery_pt))
		reset_recovery(s, tcb);

	/* avoid slow start and timers for SYN acks */
	if ((tcb->flags & SYNACK) == 0) {
		tcb->flags |= SYNACK;
		/* The SYN consumed one sequence number and one flag count. */
		acked--;
		tcb->flgcnt--;
		goto done;
	}

	/* slow start as long as we're not recovering from lost packets */
	if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
		if (tcb->cwind < tcb->ssthresh) {
			/* We increase the cwind by every byte we receive.  We want to
			 * increase the cwind by one MSS for every MSS that gets ACKed.
			 * Note that multiple MSSs can be ACKed in a single ACK.  If we had
			 * a remainder of acked / MSS, we'd add just that remainder - not 0
			 * or 1 MSS. */
			expand = acked;
		} else {
			/* Every RTT, which consists of CWND bytes, we're supposed to expand
			 * by MSS bytes.  The classic algorithm was
			 *              expand = (tcb->mss * tcb->mss) / tcb->cwind;
			 * which assumes the ACK was for MSS bytes.  Instead, for every
			 * 'acked' bytes, we increase the window by acked / CWND (in units
			 * of MSS). */
			expand = MAX(acked, tcb->typical_mss) * tcb->typical_mss
				 / tcb->cwind;
		}

		/* Clamp on unsigned overflow, then clamp cwnd to the offered
		 * window. */
		if (tcb->cwind + expand < tcb->cwind)
			expand = tcb->snd.wnd - tcb->cwind;
		if (tcb->cwind + expand > tcb->snd.wnd)
			expand = tcb->snd.wnd - tcb->cwind;
		tcb->cwind += expand;
	}
	adjust_tx_qio_limit(s);

	/* RTT measurement: prefer timestamps; else use the rtt timer. */
	if (tcb->ts_recent) {
		update_rtt(tcb, abs(milliseconds() - seg->ts_ecr),
			   expected_samples_ts(tcb, acked));
	} else if (tcb->rtt_timer.state == TcptimerON &&
		   seq_ge(seg->ack, tcb->rttseq)) {
		/* Adjust the timers according to the round trip time */
		tcphalt(tpriv, &tcb->rtt_timer);
		if (!tcb->snd.recovery) {
			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
			if (rtt == 0)
				rtt = 1;	/* o/w all close systems will rexmit in 0 time */
			rtt *= MSPTICK;
			update_rtt(tcb, rtt, 1);
		}
	}

done:
	if (qdiscard(s->wq, acked) < acked) {
		tcb->flgcnt--;
		/* This happened due to another bug where acked was very large
		 * (negative), which was interpreted as "hey, one less flag, since they
		 * acked one of our flags (like a SYN).  If flgcnt goes negative,
		 * get_xmit_segment() will attempt to send out large packets. */
		assert(tcb->flgcnt >= 0);
	}

	if (seq_gt(seg->ack, tcb->snd.urg))
		tcb->snd.urg = seg->ack;

	/* Keep the rxmit timer running while data is outstanding. */
	if (tcb->snd.una != tcb->snd.nxt)
		tcpgo(tpriv, &tcb->timer);
	else
		tcphalt(tpriv, &tcb->timer);

	tcb->backoff = 0;
	tcb->backedoff = 0;
}
2121
2122 static void update_tcb_ts(Tcpctl *tcb, Tcp *seg)
2123 {
2124         /* Get timestamp info from the tcp header.  Even though the timestamps
2125          * aren't sequence numbers, we still need to protect for wraparound.  Though
2126          * if the values were 0, assume that means we need an update.  We could have
2127          * an initial ts_val that appears negative (signed). */
2128         if (!tcb->ts_recent || !tcb->last_ack_sent ||
2129             (seq_ge(seg->ts_val, tcb->ts_recent) &&
2130              seq_le(seg->seq, tcb->last_ack_sent)))
2131                 tcb->ts_recent = seg->ts_val;
2132 }
2133
2134 /* Overlap happens when one sack's left edge is inside another sack. */
2135 static bool sacks_overlap(struct sack_block *x, struct sack_block *y)
2136 {
2137         return (seq_le(x->left, y->left) && seq_le(y->left, x->right)) ||
2138                (seq_le(y->left, x->left) && seq_le(x->left, y->right));
2139 }
2140
2141 static void make_sack_first(Tcpctl *tcb, struct sack_block *tcb_sack)
2142 {
2143         struct sack_block temp;
2144
2145         if (tcb_sack == &tcb->rcv.sacks[0])
2146                 return;
2147         temp = tcb->rcv.sacks[0];
2148         tcb->rcv.sacks[0] = *tcb_sack;
2149         *tcb_sack = temp;
2150 }
2151
2152 /* Track sack in our tcb for a block of data we received.  This handles all the
2153  * stuff: making sure sack is first (since it's the most recent sack change),
2154  * updating or merging sacks, and dropping excess sacks (we only need to
2155  * maintain 3).  Unlike on the snd side, our tcb sacks are *not* sorted. */
static void track_rcv_sack(Tcpctl *tcb, uint32_t left, uint32_t right)
{
	struct sack_block *tcb_sack;
	struct sack_block sack[1];

	if (!tcb->sack_ok)
		return;
	/* Empty block: nothing to track. */
	if (left == right)
		return;
	assert(seq_lt(left, right));
	sack->left = left;
	sack->right = right;
	/* We can reuse an existing sack if we're merging or overlapping. */
	for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
		tcb_sack = &tcb->rcv.sacks[i];
		if (sacks_overlap(tcb_sack, sack)) {
			tcb_sack->left = seq_min(tcb_sack->left, sack->left);
			tcb_sack->right = seq_max(tcb_sack->right, sack->right);
			/* Most recently changed sack goes first. */
			make_sack_first(tcb, tcb_sack);
			return;
		}
	}
	/* We can discard the last sack (right shift) - we should have sent it at
	 * least once by now.  If not, oh well. */
	memmove(tcb->rcv.sacks + 1, tcb->rcv.sacks, sizeof(struct sack_block) *
		MIN(MAX_NR_RCV_SACKS - 1, tcb->rcv.nr_sacks));
	tcb->rcv.sacks[0] = *sack;
	if (tcb->rcv.nr_sacks < MAX_NR_RCV_SACKS)
		tcb->rcv.nr_sacks++;
}
2186
2187 /* Once we receive everything and move rcv.nxt past a sack, we don't need to
2188  * track it.  I've seen Linux report sacks in the past, but we probably
2189  * shouldn't. */
2190 static void drop_old_rcv_sacks(Tcpctl *tcb)
2191 {
2192         struct sack_block *tcb_sack;
2193
2194         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2195                 tcb_sack = &tcb->rcv.sacks[i];
2196                 /* Moving up to or past the left is enough to drop it. */
2197                 if (seq_ge(tcb->rcv.nxt, tcb_sack->left)) {
2198                         memmove(tcb->rcv.sacks + i, tcb->rcv.sacks + i + 1,
2199                                 sizeof(struct sack_block) * (tcb->rcv.nr_sacks - i - 1));
2200                         tcb->rcv.nr_sacks--;
2201                         i--;
2202                 }
2203         }
2204 }
2205
2206 static void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
2207 {
2208         ERRSTACK(1);
2209         Tcp seg;
2210         Tcp4hdr *h4;
2211         Tcp6hdr *h6;
2212         int hdrlen;
2213         Tcpctl *tcb;
2214         uint16_t length;
2215         uint8_t source[IPaddrlen], dest[IPaddrlen];
2216         struct conv *s;
2217         struct Fs *f;
2218         struct tcppriv *tpriv;
2219         uint8_t version;
2220
2221         f = tcp->f;
2222         tpriv = tcp->priv;
2223
2224         tpriv->stats[InSegs]++;
2225
2226         h4 = (Tcp4hdr *) (bp->rp);
2227         h6 = (Tcp6hdr *) (bp->rp);
2228
2229         if ((h4->vihl & 0xF0) == IP_VER4) {
2230                 uint8_t ttl;
2231
2232                 version = V4;
2233                 length = nhgets(h4->length);
2234                 v4tov6(dest, h4->tcpdst);
2235                 v4tov6(source, h4->tcpsrc);
2236
2237                 /* ttl isn't part of the xsum pseudo header, but bypass needs it. */
2238                 ttl = h4->Unused;
2239                 h4->Unused = 0;
2240                 hnputs(h4->tcplen, length - TCP4_PKT);
2241                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2242                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
2243                         tpriv->stats[CsumErrs]++;
2244                         tpriv->stats[InErrs]++;
2245                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2246                         freeblist(bp);
2247                         return;
2248                 }
2249                 h4->Unused = ttl;
2250
2251                 hdrlen = ntohtcp4(&seg, &bp);
2252                 if (hdrlen < 0) {
2253                         tpriv->stats[HlenErrs]++;
2254                         tpriv->stats[InErrs]++;
2255                         netlog(f, Logtcp, "bad tcp hdr len\n");
2256                         return;
2257                 }
2258
2259                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2260                 if (s && s->state == Bypass) {
2261                         bypass_or_drop(s, bp);
2262                         return;
2263                 }
2264
2265                 /* trim the packet to the size claimed by the datagram */
2266                 length -= hdrlen + TCP4_PKT;
2267                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
2268                 if (bp == NULL) {
2269                         tpriv->stats[LenErrs]++;
2270                         tpriv->stats[InErrs]++;
2271                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2272                         return;
2273                 }
2274         } else {
2275                 int ttl = h6->ttl;
2276                 int proto = h6->proto;
2277
2278                 version = V6;
2279                 length = nhgets(h6->ploadlen);
2280                 ipmove(dest, h6->tcpdst);
2281                 ipmove(source, h6->tcpsrc);
2282
2283                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2284                 h6->ttl = proto;
2285                 hnputl(h6->vcf, length);
2286                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2287                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
2288                         tpriv->stats[CsumErrs]++;
2289                         tpriv->stats[InErrs]++;
2290                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2291                         freeblist(bp);
2292                         return;
2293                 }
2294                 h6->ttl = ttl;
2295                 h6->proto = proto;
2296                 hnputs(h6->ploadlen, length);
2297
2298                 hdrlen = ntohtcp6(&seg, &bp);
2299                 if (hdrlen < 0) {
2300                         tpriv->stats[HlenErrs]++;
2301                         tpriv->stats[InErrs]++;
2302                         netlog(f, Logtcp, "bad tcp hdr len\n");
2303                         return;
2304                 }
2305
2306                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2307                 if (s && s->state == Bypass) {
2308                         bypass_or_drop(s, bp);
2309                         return;
2310                 }
2311
2312                 /* trim the packet to the size claimed by the datagram */
2313                 length -= hdrlen;
2314                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2315                 if (bp == NULL) {
2316                         tpriv->stats[LenErrs]++;
2317                         tpriv->stats[InErrs]++;
2318                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2319                         return;
2320                 }
2321         }
2322
2323         /* s, the conv matching the n-tuple, was set above */
2324         if (s == NULL) {
2325                 netlog(f, Logtcpreset, "iphtlook failed: src %I:%u, dst %I:%u\n",
2326                        source, seg.source, dest, seg.dest);
2327 reset:
2328                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2329                 freeblist(bp);
2330                 return;
2331         }
2332
2333         /* lock protocol for unstate Plan 9 invariants.  funcs like limbo or
2334          * incoming might rely on it. */
2335         qlock(&tcp->qlock);
2336
2337         /* if it's a listener, look for the right flags and get a new conv */
2338         tcb = (Tcpctl *) s->ptcl;
2339         if (tcb->state == Listen) {
2340                 if (seg.flags & RST) {
2341                         limborst(s, &seg, source, dest, version);
2342                         qunlock(&tcp->qlock);
2343                         freeblist(bp);
2344                         return;
2345                 }
2346
2347                 /* if this is a new SYN, put the call into limbo */
2348                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2349                         limbo(s, source, dest, &seg, version);
2350                         qunlock(&tcp->qlock);
2351                         freeblist(bp);
2352                         return;
2353                 }
2354
2355                 /* if there's a matching call in limbo, tcpincoming will return it */
2356                 s = tcpincoming(s, &seg, source, dest, version);
2357                 if (s == NULL) {
2358                         qunlock(&tcp->qlock);
2359                         goto reset;
2360                 }
2361         }
2362
2363         /* The rest of the input state machine is run with the control block
2364          * locked and implements the state machine directly out of the RFC.
2365          * Out-of-band data is ignored - it was always a bad idea.
2366          */
2367         tcb = (Tcpctl *) s->ptcl;
2368         if (waserror()) {
2369                 qunlock(&s->qlock);
2370                 nexterror();
2371         }
2372         qlock(&s->qlock);
2373         qunlock(&tcp->qlock);
2374
2375         update_tcb_ts(tcb, &seg);
2376         /* fix up window */
2377         seg.wnd <<= tcb->rcv.scale;
2378
2379         /* every input packet in puts off the keep alive time out */
2380         tcpsetkacounter(tcb);
2381
2382         switch (tcb->state) {
2383                 case Closed:
2384                         sndrst(tcp, source, dest, length, &seg, version,
2385                                    "sending to Closed");
2386                         goto raise;
2387                 case Syn_sent:
2388                         if (seg.flags & ACK) {
2389                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2390                                         sndrst(tcp, source, dest, length, &seg, version,
2391                                                    "bad seq in Syn_sent");
2392                                         goto raise;
2393                                 }
2394                         }
2395                         if (seg.flags & RST) {
2396                                 if (seg.flags & ACK)
2397                                         localclose(s, "connection refused");
2398                                 goto raise;
2399                         }
2400
2401                         if (seg.flags & SYN) {
2402                                 procsyn(s, &seg);
2403                                 if (seg.flags & ACK) {
2404                                         update(s, &seg);
2405                                         tcpsynackrtt(s);
2406                                         tcpsetstate(s, Established);
2407                                         /* Here's where we get the results of header option
2408                                          * negotiations for connections we started. (SYNACK has the
2409                                          * response) */
2410                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2411                                         tcb->sack_ok = seg.sack_ok;
2412                                 } else {
2413                                         sndrst(tcp, source, dest, length, &seg, version,
2414                                                    "Got SYN with no ACK");
2415                                         goto raise;
2416                                 }
2417
2418                                 if (length != 0 || (seg.flags & FIN))
2419                                         break;
2420
2421                                 freeblist(bp);
2422                                 goto output;
2423                         } else
2424                                 freeblist(bp);
2425
2426                         qunlock(&s->qlock);
2427                         poperror();
2428                         return;
2429         }
2430
2431         /*
2432          *  One DOS attack is to open connections to us and then forget about them,
2433          *  thereby tying up a conv at no long term cost to the attacker.
2434          *  This is an attempt to defeat these stateless DOS attacks.  See
2435          *  corresponding code in tcpsendka().
2436          */
2437         if ((seg.flags & RST) == 0) {
2438                 if (tcpporthogdefense
2439                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2440                                                   tcb->snd.una - (1 << 29))) {
2441                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2442                                    source, seg.source, dest, seg.dest, seg.flags,
2443                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2444                         localclose(s, "stateless hog");
2445                 }
2446         }
2447
2448         /* Cut the data to fit the receive window */
2449         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2450                 netlog(f, Logtcp, "%I.%d -> %I.%d: tcp len < 0, %lu %d\n",
2451                        s->raddr, s->rport, s->laddr, s->lport, seg.seq, length);
2452                 update(s, &seg);
2453                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2454                         tcphalt(tpriv, &tcb->rtt_timer);
2455                         tcphalt(tpriv, &tcb->acktimer);
2456                         tcphalt(tpriv, &tcb->katimer);
2457                         tcpsetstate(s, Time_wait);
2458                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2459                         tcpgo(tpriv, &tcb->timer);
2460                 }
2461                 if (!(seg.flags & RST)) {
2462                         tcb->flags |= FORCE;
2463                         goto output;
2464                 }
2465                 qunlock(&s->qlock);
2466                 poperror();
2467                 return;
2468         }
2469
2470         /* Cannot accept so answer with a rst */
2471         if (length && tcb->state == Closed) {
2472                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2473                 goto raise;
2474         }
2475
2476         /* The segment is beyond the current receive pointer so
2477          * queue the data in the resequence queue
2478          */
2479         if (seg.seq != tcb->rcv.nxt)
2480                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2481                         update(s, &seg);
2482                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2483                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2484                                            s->lport);
2485                         tcb->flags |= FORCE;
2486                         goto output;
2487                 }
2488
2489         /*
2490          *  keep looping till we've processed this packet plus any
2491          *  adjacent packets in the resequence queue
2492          */
2493         for (;;) {
2494                 if (seg.flags & RST) {
2495                         if (tcb->state == Established) {
2496                                 tpriv->stats[EstabResets]++;
2497                                 if (tcb->rcv.nxt != seg.seq)
2498                                         printd
2499                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2500                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2501                                                  seg.seq);
2502                         }
2503                         localclose(s, "connection refused");
2504                         goto raise;
2505                 }
2506
2507                 if ((seg.flags & ACK) == 0)
2508                         goto raise;
2509
2510                 switch (tcb->state) {
2511                         case Established:
2512                         case Close_wait:
2513                                 update(s, &seg);
2514                                 break;
2515                         case Finwait1:
2516                                 update(s, &seg);
2517                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2518                                         tcphalt(tpriv, &tcb->rtt_timer);
2519                                         tcphalt(tpriv, &tcb->acktimer);
2520                                         tcpsetkacounter(tcb);
2521                                         tcb->time = NOW;
2522                                         tcpsetstate(s, Finwait2);
2523                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2524                                         tcpgo(tpriv, &tcb->katimer);
2525                                 }
2526                                 break;
2527                         case Finwait2:
2528                                 update(s, &seg);
2529                                 break;
2530                         case Closing:
2531                                 update(s, &seg);
2532                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2533                                         tcphalt(tpriv, &tcb->rtt_timer);
2534                                         tcphalt(tpriv, &tcb->acktimer);
2535                                         tcphalt(tpriv, &tcb->katimer);
2536                                         tcpsetstate(s, Time_wait);
2537                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2538                                         tcpgo(tpriv, &tcb->timer);
2539                                 }
2540                                 break;
2541                         case Last_ack:
2542                                 update(s, &seg);
2543                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2544                                         localclose(s, NULL);
2545                                         goto raise;
2546                                 }
2547                         case Time_wait:
2548                                 tcb->flags |= FORCE;
2549                                 if (tcb->timer.state != TcptimerON)
2550                                         tcpgo(tpriv, &tcb->timer);
2551                 }
2552
2553                 if ((seg.flags & URG) && seg.urg) {
2554                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2555                                 tcb->rcv.urg = seg.urg + seg.seq;
2556                                 pullblock(&bp, seg.urg);
2557                         }
2558                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2559                         tcb->rcv.urg = tcb->rcv.nxt;
2560
2561                 if (length == 0) {
2562                         if (bp != NULL)
2563                                 freeblist(bp);
2564                 } else {
2565                         switch (tcb->state) {
2566                                 default:
2567                                         /* Ignore segment text */
2568                                         if (bp != NULL)
2569                                                 freeblist(bp);
2570                                         break;
2571
2572                                 case Established:
2573                                 case Finwait1:
2574                                         /* If we still have some data place on
2575                                          * receive queue
2576                                          */
2577                                         if (bp) {
2578                                                 bp = packblock(bp);
2579                                                 if (bp == NULL)
2580                                                         panic("tcp packblock");
2581                                                 qpassnolim(s->rq, bp);
2582                                                 bp = NULL;
2583
2584                                                 /*
2585                                                  *  Force an ack every 2 data messages.  This is
2586                                                  *  a hack for rob to make his home system run
2587                                                  *  faster.
2588                                                  *
2589                                                  *  this also keeps the standard TCP congestion
2590                                                  *  control working since it needs an ack every
2591                                                  *  2 max segs worth.  This is not quite that,
2592                                                  *  but under a real stream is equivalent since
2593                                                  *  every packet has a max seg in it.
2594                                                  */
2595                                                 if (++(tcb->rcv.una) >= 2)
2596                                                         tcb->flags |= FORCE;
2597                                         }
2598                                         tcb->rcv.nxt += length;
2599                                         drop_old_rcv_sacks(tcb);
2600
2601                                         /*
2602                                          *  update our rcv window
2603                                          */
2604                                         tcprcvwin(s);
2605
2606                                         /*
2607                                          *  turn on the acktimer if there's something
2608                                          *  to ack
2609                                          */
2610                                         if (tcb->acktimer.state != TcptimerON)
2611                                                 tcpgo(tpriv, &tcb->acktimer);
2612
2613                                         break;
2614                                 case Finwait2:
2615                                         /* no process to read the data, send a reset */
2616                                         if (bp != NULL)
2617                                                 freeblist(bp);
2618                                         sndrst(tcp, source, dest, length, &seg, version,
2619                                                    "send to Finwait2");
2620                                         qunlock(&s->qlock);
2621                                         poperror();
2622                                         return;
2623                         }
2624                 }
2625
2626                 if (seg.flags & FIN) {
2627                         tcb->flags |= FORCE;
2628
2629                         switch (tcb->state) {
2630                                 case Established:
2631                                         tcb->rcv.nxt++;
2632                                         tcpsetstate(s, Close_wait);
2633                                         break;
2634                                 case Finwait1:
2635                                         tcb->rcv.nxt++;
2636                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2637                                                 tcphalt(tpriv, &tcb->rtt_timer);
2638                                                 tcphalt(tpriv, &tcb->acktimer);
2639                                                 tcphalt(tpriv, &tcb->katimer);
2640                                                 tcpsetstate(s, Time_wait);
2641                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2642                                                 tcpgo(tpriv, &tcb->timer);
2643                                         } else
2644                                                 tcpsetstate(s, Closing);
2645                                         break;
2646                                 case Finwait2:
2647                                         tcb->rcv.nxt++;
2648                                         tcphalt(tpriv, &tcb->rtt_timer);
2649                                         tcphalt(tpriv, &tcb->acktimer);
2650                                         tcphalt(tpriv, &tcb->katimer);
2651                                         tcpsetstate(s, Time_wait);
2652                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2653                                         tcpgo(tpriv, &tcb->timer);
2654                                         break;
2655                                 case Close_wait:
2656                                 case Closing:
2657                                 case Last_ack:
2658                                         break;
2659                                 case Time_wait:
2660                                         tcpgo(tpriv, &tcb->timer);
2661                                         break;
2662                         }
2663                 }
2664
2665                 /*
2666                  *  get next adjacent segment from the resequence queue.
2667                  *  dump/trim any overlapping segments
2668                  */
2669                 for (;;) {
2670                         if (tcb->reseq == NULL)
2671                                 goto output;
2672
2673                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2674                                 goto output;
2675
2676                         getreseq(tcb, &seg, &bp, &length);
2677
2678                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2679                                 break;
2680                 }
2681         }
2682 output:
2683         tcpoutput(s);
2684         qunlock(&s->qlock);
2685         poperror();
2686         return;
2687 raise:
2688         qunlock(&s->qlock);
2689         poperror();
2690         freeblist(bp);
2691         tcpkick(s);
2692 }
2693
2694 /* The advertised mss = data + TCP headers */
2695 static uint16_t derive_payload_mss(Tcpctl *tcb)
2696 {
2697         uint16_t payload_mss = tcb->mss;
2698         uint16_t opt_size = 0;
2699
2700         if (tcb->ts_recent) {
2701                 opt_size += TS_LENGTH;
2702                 /* Note that when we're a SYN, we overestimate slightly.  This is safe,
2703                  * and not really a problem. */
2704                 opt_size += TS_SEND_PREPAD;
2705         }
2706         if (tcb->rcv.nr_sacks)
2707                 opt_size += 2 + tcb->rcv.nr_sacks * 8;
2708         opt_size = ROUNDUP(opt_size, 4);
2709         payload_mss -= opt_size;
2710         return payload_mss;
2711 }
2712
2713 /* Decreases the xmit amt, given the MSS / TSO. */
2714 static uint32_t throttle_for_mss(Tcpctl *tcb, uint32_t ssize,
2715                                  uint16_t payload_mss, bool retrans)
2716 {
2717         if (ssize > payload_mss) {
2718                 if ((tcb->flags & TSO) == 0) {
2719                         ssize = payload_mss;
2720                 } else {
2721                         /* Don't send too much.  32K is arbitrary.. */
2722                         if (ssize > 32 * 1024)
2723                                 ssize = 32 * 1024;
2724                         if (!retrans) {
2725                                 /* Clamp xmit to an integral MSS to avoid ragged tail segments
2726                                  * causing poor link utilization. */
2727                                 ssize = ROUNDDOWN(ssize, payload_mss);
2728                         }
2729                 }
2730         }
2731         return ssize;
2732 }
2733
2734 /* Reduces ssize for a variety of reasons.  Returns FALSE if we should abort
2735  * sending the packet.  o/w returns TRUE and modifies ssize by reference. */
2736 static bool throttle_ssize(struct conv *s, Tcpctl *tcb, uint32_t *ssize_p,
2737                            uint16_t payload_mss, bool retrans)
2738 {
2739         struct Fs *f = s->p->f;
2740         uint32_t usable;
2741         uint32_t ssize = *ssize_p;
2742
2743         /* Compute usable segment based on offered window and limit
2744          * window probes to one */
2745         if (tcb->snd.wnd == 0) {
2746                 if (tcb->snd.in_flight != 0) {
2747                         if ((tcb->flags & FORCE) == 0)
2748                                 return FALSE;
2749                 }
2750                 usable = 1;
2751         } else {
2752                 usable = tcb->cwind;
2753                 if (tcb->snd.wnd < usable)
2754                         usable = tcb->snd.wnd;
2755                 if (usable > tcb->snd.in_flight)
2756                         usable -= tcb->snd.in_flight;
2757                 else
2758                         usable = 0;
2759                 /* Avoid Silly Window Syndrome.  This is a little different thant RFC
2760                  * 813.  I took their additional enhancement of "< MSS" as an AND, not
2761                  * an OR.  25% of a large snd.wnd is pretty large, and our main goal is
2762                  * to avoid packets smaller than MSS.  I still use the 25% threshold,
2763                  * because it is important that there is *some* data in_flight.  If
2764                  * usable < MSS because snd.wnd is very small (but not 0), we might
2765                  * never get an ACK and would need to set up a timer.
2766                  *
2767                  * Also, I'm using 'ssize' as a proxy for a PSH point.  If there's just
2768                  * a small blob in the qio (or retrans!), then we might as well just
2769                  * send it. */
2770                 if ((usable < tcb->typical_mss) && (usable < tcb->snd.wnd >> 2)
2771                     && (usable < ssize)) {
2772                         return FALSE;
2773                 }
2774         }
2775         if (ssize && usable < 2)
2776                 netlog(s->p->f, Logtcpverbose,
2777                        "%I.%d -> %I.%d: throttled snd.wnd %lu cwind %lu\n",
2778                        s->laddr, s->lport, s->raddr, s->rport,
2779                        tcb->snd.wnd, tcb->cwind);
2780         if (usable < ssize)
2781                 ssize = usable;
2782
2783         ssize = throttle_for_mss(tcb, ssize, payload_mss, retrans);
2784
2785         *ssize_p = ssize;
2786         return TRUE;
2787 }
2788
/* Helper, picks the next segment to send, which is possibly a retransmission.
 * Returns TRUE if we have a segment, FALSE o/w.  Returns ssize, from_seq, and
 * sent by reference.
 *
 * from_seq is the seq number we are transmitting from.
 *
 * sent includes all seq from una to from_seq *including* any previously sent
 * flags (part of tcb->flgcnt), for instance an unacknowledged SYN (which counts
 * as a seq number).  Those flags are in the e.g. snd.nxt - snd.una range, and
 * they get dropped after qdiscard.
 *
 * ssize is the amount of data we are sending, starting from from_seq, and it
 * will include any *new* flags, which haven't been accounted for yet.
 *
 * tcb->flgcnt consists of the flags both in ssize and in sent.
 *
 * Note that we could be in recovery and not sack_retrans a segment. */
static bool get_xmit_segment(struct conv *s, Tcpctl *tcb, uint16_t payload_mss,
                             uint32_t *from_seq_p, uint32_t *sent_p,
                             uint32_t *ssize_p)
{
	struct Fs *f = s->p->f;
	struct tcppriv *tpriv = s->p->priv;
	uint32_t ssize, sent, from_seq;
	bool sack_retrans = FALSE;
	struct sack_block *tcb_sack = 0;

	/* First preference: a SACK retrans — fill the earliest hole, i.e. the
	 * range from snd.rtx up to the left edge of the first sack block that
	 * lies beyond it. */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb->snd.rtx, tcb_sack->left)) {
			/* So ssize is supposed to include any *new* flags to flgcnt, which
			 * at this point would be a FIN.
			 *
			 * It might be possible that flgcnt is incremented so we send a FIN,
			 * even for an intermediate sack retrans.  Perhaps the user closed
			 * the conv.
			 *
			 * However, the way the "flgcnt for FIN" works is that it inflates
			 * the desired amount we'd like to send (qlen + flgcnt).
			 * Eventually, we reach the end of the queue and fail to extract all
			 * of dsize.  At that point, we put on the FIN, and that's where the
			 * extra 'byte' comes from.
			 *
			 * For sack retrans, since we're extracting from parts of the qio
			 * that aren't the right-most edge, we don't need to consider flgcnt
			 * when setting ssize. */
			from_seq = tcb->snd.rtx;
			sent = from_seq - tcb->snd.una;
			ssize = tcb_sack->left - from_seq;
			sack_retrans = TRUE;
			break;
		}
	}
	/* SACK holes have first dibs, but we can still opportunistically send new
	 * data.
	 *
	 * During other types of recovery, we'll just send from the retrans point.
	 * If we're in an RTO while we still have sacks, we could be resending data
	 * that wasn't lost.  Consider a sack that is still growing (usually the
	 * right-most), but we haven't received the ACK yet.  rxt may be included in
	 * that area.  Given we had two losses or otherwise timed out, I'm not too
	 * concerned.
	 *
	 * Note that Fast and RTO can send data beyond nxt.  If we change that,
	 * change the accounting below. */
	if (!sack_retrans) {
		switch (tcb->snd.recovery) {
		default:
		case SACK_RETRANS_RECOVERY:
			from_seq = tcb->snd.nxt;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			from_seq = tcb->snd.rtx;
			break;
		}
		sent = from_seq - tcb->snd.una;
		/* qlen + flgcnt is every seq we want to have sent, including unack'd
		 * data, unacked flags, and new flags. */
		ssize = qlen(s->wq) + tcb->flgcnt - sent;
	}

	if (!throttle_ssize(s, tcb, &ssize, payload_mss, sack_retrans))
		return FALSE;

	/* This counts flags, which is a little hokey, but it's okay since in_flight
	 * gets reset on each ACK */
	tcb->snd.in_flight += ssize;
	/* Log and track rxmit.  This covers both SACK (retrans) and fast rxmit. */
	if (ssize && seq_lt(tcb->snd.rtx, tcb->snd.nxt)) {
		netlog(f, Logtcpverbose,
		       "%I.%d -> %I.%d: rxmit: rtx %u amt %u, nxt %u\n",
		       s->laddr, s->lport, s->raddr, s->rport,
		       tcb->snd.rtx, MIN(tcb->snd.nxt - tcb->snd.rtx, ssize),
		       tcb->snd.nxt);
		tpriv->stats[RetransSegs]++;
	}
	if (sack_retrans) {
		/* If we'll send up to the left edge, advance snd.rtx to the right.
		 *
		 * This includes the largest sack.  It might get removed later, in which
		 * case we'll underestimate the amount in-flight.  The alternative is to
		 * not count the rightmost sack, but when it gets removed, we'll retrans
		 * it anyway.  No matter what, we'd count it. */
		tcb->snd.rtx += ssize;
		if (tcb->snd.rtx == tcb_sack->left)
			tcb->snd.rtx = tcb_sack->right;
		/* RFC 6675 says we MAY rearm the RTO timer on each retrans, since we
		 * might not be getting ACKs for a while. */
		tcpsettimer(tcb);
	} else {
		switch (tcb->snd.recovery) {
		default:
			/* under normal op, we drag rtx along with nxt.  this prevents us
			 * from sending sacks too early (up above), since rtx doesn't get
			 * reset to una until we have a loss (e.g. 3 dupacks/sacks). */
			tcb->snd.nxt += ssize;
			tcb->snd.rtx = tcb->snd.nxt;
			break;
		case SACK_RETRANS_RECOVERY:
			/* We explicitly do not want to increase rtx here.  We might still
			 * need it to fill in a sack gap below nxt if we get new, higher
			 * sacks. */
			tcb->snd.nxt += ssize;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			tcb->snd.rtx += ssize;
			/* Fast and RTO can send new data, advancing nxt. */
			if (seq_gt(tcb->snd.rtx, tcb->snd.nxt))
				tcb->snd.nxt = tcb->snd.rtx;
			break;
		}
	}
	*from_seq_p = from_seq;
	*sent_p = sent;
	*ssize_p = ssize;

	return TRUE;
}
2929
2930 /*
2931  *  always enters and exits with the s locked.  We drop
2932  *  the lock to ipoput the packet so some care has to be
2933  *  taken by callers.
2934  */
2935 static void tcpoutput(struct conv *s)
2936 {
2937         Tcp seg;
2938         int msgs;
2939         int next_yield = 1;
2940         Tcpctl *tcb;
2941         struct block *hbp, *bp;
2942         uint32_t ssize, dsize, sent, from_seq;
2943         struct Fs *f;
2944         struct tcppriv *tpriv;
2945         uint8_t version;
2946         uint16_t payload_mss;
2947
2948         f = s->p->f;
2949         tpriv = s->p->priv;
2950         version = s->ipversion;
2951
2952         for (msgs = 0; msgs < 100; msgs++) {
2953                 tcb = (Tcpctl *) s->ptcl;
2954
2955                 switch (tcb->state) {
2956                         case Listen:
2957                         case Closed:
2958                         case Finwait2:
2959                                 return;
2960                 }
2961
2962                 /* force an ack when a window has opened up */
2963                 if (tcb->rcv.blocked && tcb->rcv.wnd >= tcb->mss) {
2964                         tcb->rcv.blocked = 0;
2965                         tcb->flags |= FORCE;
2966                 }
2967
2968                 /* Don't send anything else until our SYN has been acked */
2969                 if (tcb->snd.nxt != tcb->iss && (tcb->flags & SYNACK) == 0)
2970                         break;
2971
2972                 /* payload_mss is the actual amount of data in the packet, which is the
2973                  * advertised (mss - header opts).  This varies from packet to packet,
2974                  * based on the options that might be present (e.g. always timestamps,
2975                  * sometimes SACKs) */
2976                 payload_mss = derive_payload_mss(tcb);
2977
2978                 if (!get_xmit_segment(s, tcb, payload_mss, &from_seq, &sent, &ssize))
2979                         break;
2980
2981                 dsize = ssize;
2982                 seg.urg = 0;
2983
2984                 if (ssize == 0)
2985                         if ((tcb->flags & FORCE) == 0)
2986                                 break;
2987
2988                 tcb->flags &= ~FORCE;
2989                 tcprcvwin(s);
2990
2991                 /* By default we will generate an ack, so we can normally turn off the
2992                  * timer.  If we're blocked, we'll want the timer so we can send a
2993                  * window update. */
2994                 if (!tcb->rcv.blocked)
2995                         tcphalt(tpriv, &tcb->acktimer);
2996                 tcb->rcv.una = 0;
2997                 seg.source = s->lport;
2998                 seg.dest = s->rport;
2999                 seg.flags = ACK;
3000                 seg.mss = 0;
3001                 seg.ws = 0;
3002                 seg.sack_ok = FALSE;
3003                 seg.nr_sacks = 0;
3004                 /* When outputting, Syn_sent means "send the Syn", for connections we
3005                  * initiate.  SYNACKs are sent from sndsynack directly. */
3006                 if (tcb->state == Syn_sent) {
3007                         seg.flags = 0;
3008                         seg.sack_ok = SACK_SUPPORTED;   /* here's where we advertise SACK */
3009                         if (tcb->snd.nxt - ssize == tcb->iss) {
3010                                 seg.flags |= SYN;
3011                                 dsize--;
3012                                 seg.mss = tcb->mss;
3013                                 seg.ws = tcb->scale;
3014                         } else {
3015                                 /* TODO: Not sure why we'd get here. */
3016                                 warn("TCP: weird Syn_sent state, tell someone you saw this");
3017                         }
3018                 }
3019                 seg.seq = from_seq;
3020                 seg.ack = tcb->rcv.nxt;
3021                 tcb->last_ack_sent = seg.ack;
3022                 seg.wnd = tcb->rcv.wnd;
3023                 seg.ts_val = tcb->ts_recent;
3024
3025                 /* Pull out data to send */
3026                 bp = NULL;
3027                 if (dsize != 0) {
3028                         bp = qcopy(s->wq, dsize, sent);
3029                         if (BLEN(bp) != dsize) {
3030                                 /* Here's where the flgcnt kicked in.  Note dsize is
3031                                  * decremented, but ssize isn't.  Not that we use ssize for much
3032                                  * anymore.  Decrementing dsize prevents us from sending a PSH
3033                                  * with the FIN. */
3034                                 seg.flags |= FIN;
3035                                 dsize--;
3036                         }
3037                         if (BLEN(bp) > payload_mss) {
3038                                 bp->flag |= Btso;
3039                                 bp->mss = payload_mss;
3040                         }
3041                 }
3042
3043                 if (sent + dsize == qlen(s->wq) + tcb->flgcnt)
3044                         seg.flags |= PSH;
3045
3046                 /* Build header, link data and compute cksum */
3047                 switch (version) {
3048                         case V4:
3049                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3050                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
3051                                 if (hbp == NULL) {
3052                                         freeblist(bp);
3053                                         return;
3054                                 }
3055                                 break;
3056                         case V6:
3057                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3058                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
3059                                 if (hbp == NULL) {
3060                                         freeblist(bp);
3061                                         return;
3062                                 }
3063                                 break;
3064                         default:
3065                                 hbp = NULL;     /* to suppress a warning */
3066                                 panic("tcpoutput: version %d", version);
3067                 }
3068
3069                 /* Start the transmission timers if there is new data and we
3070                  * expect acknowledges
3071                  */
3072                 if (ssize != 0) {
3073                         if (tcb->timer.state != TcptimerON)
3074                                 tcpgo(tpriv, &tcb->timer);
3075
3076                         if (!tcb->ts_recent && (tcb->rtt_timer.state != TcptimerON)) {
3077                                 /* If round trip timer isn't running, start it. */
3078                                 tcpgo(tpriv, &tcb->rtt_timer);
3079                                 tcb->rttseq = from_seq + ssize;
3080                         }
3081                 }
3082
3083                 tpriv->stats[OutSegs]++;
3084
3085                 /* put off the next keep alive */
3086                 tcpgo(tpriv, &tcb->katimer);
3087
3088                 switch (version) {
3089                         case V4:
3090                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3091                                         /* a negative return means no route */
3092                                         localclose(s, "no route");
3093                                 }
3094                                 break;
3095                         case V6:
3096                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3097                                         /* a negative return means no route */
3098                                         localclose(s, "no route");
3099                                 }
3100                                 break;
3101                         default:
3102                                 panic("tcpoutput2: version %d", version);
3103                 }
3104                 if (ssize) {
3105                         /* The outer loop thinks we sent one packet.  If we used TSO, we
3106                          * might have sent several.  Minus one for the loop increment. */
3107                         msgs += DIV_ROUND_UP(ssize, payload_mss) - 1;
3108                 }
3109                 /* Old Plan 9 tidbit - yield every four messages.  We want to break out
3110                  * and unlock so we can process inbound ACKs which might do things like
3111                  * say "slow down". */
3112                 if (msgs >= next_yield) {
3113                         next_yield = msgs + 4;
3114                         qunlock(&s->qlock);
3115                         kthread_yield();
3116                         qlock(&s->qlock);
3117                 }
3118         }
3119 }
3120
3121 /*
3122  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
3123  */
3124 static void tcpsendka(struct conv *s)
3125 {
3126         Tcp seg;
3127         Tcpctl *tcb;
3128         struct block *hbp, *dbp;
3129
3130         tcb = (Tcpctl *) s->ptcl;
3131
3132         dbp = NULL;
3133         seg.urg = 0;
3134         seg.source = s->lport;
3135         seg.dest = s->rport;
3136         seg.flags = ACK | PSH;
3137         seg.mss = 0;
3138         seg.ws = 0;
3139         seg.sack_ok = FALSE;
3140         seg.nr_sacks = 0;
3141         if (tcpporthogdefense)
3142                 urandom_read(&seg.seq, sizeof(seg.seq));
3143         else
3144                 seg.seq = tcb->snd.una - 1;
3145         seg.ack = tcb->rcv.nxt;
3146         tcb->last_ack_sent = seg.ack;
3147         tcb->rcv.una = 0;
3148         seg.wnd = tcb->rcv.wnd;
3149         seg.ts_val = tcb->ts_recent;
3150         if (tcb->state == Finwait2) {
3151                 seg.flags |= FIN;
3152         } else {
3153                 dbp = block_alloc(1, MEM_WAIT);
3154                 dbp->wp++;
3155         }
3156
3157         if (isv4(s->raddr)) {
3158                 /* Build header, link data and compute cksum */
3159                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3160                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
3161                 if (hbp == NULL) {
3162                         freeblist(dbp);
3163                         return;
3164                 }
3165                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
3166         } else {
3167                 /* Build header, link data and compute cksum */
3168                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3169                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
3170                 if (hbp == NULL) {
3171                         freeblist(dbp);
3172                         return;
3173                 }
3174                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
3175         }
3176 }
3177
3178 /*
3179  *  set connection to time out after 12 minutes
3180  */
3181 static void tcpsetkacounter(Tcpctl *tcb)
3182 {
3183         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
3184         if (tcb->kacounter < 3)
3185                 tcb->kacounter = 3;
3186 }
3187
3188 /*
3189  *  if we've timed out, close the connection
3190  *  otherwise, send a keepalive and restart the timer
3191  */
3192 static void tcpkeepalive(void *v)
3193 {
3194         ERRSTACK(1);
3195         Tcpctl *tcb;
3196         struct conv *s;
3197
3198         s = v;
3199         tcb = (Tcpctl *) s->ptcl;
3200         qlock(&s->qlock);
3201         if (waserror()) {
3202                 qunlock(&s->qlock);
3203                 nexterror();
3204         }
3205         if (tcb->state != Closed) {
3206                 if (--(tcb->kacounter) <= 0) {
3207                         localclose(s, "connection timed out");
3208                 } else {
3209                         tcpsendka(s);
3210                         tcpgo(s->p->priv, &tcb->katimer);
3211                 }
3212         }
3213         qunlock(&s->qlock);
3214         poperror();
3215 }
3216
3217 /*
3218  *  start keepalive timer
3219  */
3220 static void tcpstartka(struct conv *s, char **f, int n)
3221 {
3222         Tcpctl *tcb;
3223         int x;
3224
3225         tcb = (Tcpctl *) s->ptcl;
3226         if (tcb->state != Established)
3227                 error(ENOTCONN, "connection must be in Establised state");
3228         if (n > 1) {
3229                 x = atoi(f[1]);
3230                 if (x >= MSPTICK)
3231                         tcb->katimer.start = x / MSPTICK;
3232         }
3233         tcpsetkacounter(tcb);
3234         tcpgo(s->p->priv, &tcb->katimer);
3235 }
3236
3237 /*
3238  *  turn checksums on/off
3239  */
3240 static void tcpsetchecksum(struct conv *s, char **f, int unused)
3241 {
3242         Tcpctl *tcb;
3243
3244         tcb = (Tcpctl *) s->ptcl;
3245         tcb->nochecksum = !atoi(f[1]);
3246 }
3247
3248 static void tcp_loss_event(struct conv *s, Tcpctl *tcb)
3249 {
3250         uint32_t old_cwnd = tcb->cwind;
3251
3252         /* Reno */
3253         tcb->ssthresh = tcb->cwind / 2;
3254         tcb->cwind = tcb->ssthresh;
3255         netlog(s->p->f, Logtcprxmt,
3256                "%I.%d -> %I.%d: loss event, cwnd was %d, now %d\n",
3257                s->laddr, s->lport, s->raddr, s->rport,
3258                old_cwnd, tcb->cwind);
3259 }
3260
/* Called when we need to retrans the entire outstanding window (everything
 * previously sent, but unacknowledged). */
static void tcprxmit(struct conv *s)
{
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;

	/* FORCE makes tcpoutput() transmit even if it computes ssize == 0. */
	tcb->flags |= FORCE;
	/* Rewind the retransmit pointer to the oldest unacked byte and
	 * recompute what we consider in flight from there. */
	tcb->snd.rtx = tcb->snd.una;
	set_in_flight(tcb);

	tcpoutput(s);
}
3275
3276 /* The original RFC said to drop sacks on a timeout, since the receiver could
3277  * renege.  Later RFCs say we can keep them around, so long as we are careful.
3278  *
3279  * We'll go with a "flush if we have two timeouts" plan.  This doesn't have to
3280  * be perfect - there might be cases where we accidentally flush the sacks too
3281  * often.  Perhaps we never get dup_acks to start fast/sack rxmit.  The main
3282  * thing is that after multiple timeouts we flush the sacks, since the receiver
3283  * might renege.
3284  *
3285  * We also have an Akaros-specific problem.  We use the sacks to determine
3286  * in_flight.  Specifically, the (snd.nxt - upper right edge) is tracked as in
3287  * flight.  Usually the receiver will keep sacking that right edge all the way
3288  * up to snd.nxt, but they might not, and the gap might be quite large.  After a
3289  * timeout, that data is definitely not in flight.  If that block's size is
3290  * greater than cwnd, we'll never transmit.  This should be rare, and in that
3291  * case we can just dump the sacks.  The typical_mss fudge factor is so we can
3292  * send a reasonably-sized packet. */
3293 static void timeout_handle_sacks(Tcpctl *tcb)
3294 {
3295         struct sack_block *last_sack;
3296
3297         if (tcb->snd.nr_sacks) {
3298                 last_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
3299                 if (tcb->snd.flush_sacks || (tcb->snd.nxt - last_sack->right >=
3300                                              tcb->cwind - tcb->typical_mss)) {
3301                         tcb->snd.nr_sacks = 0;
3302                         tcb->snd.flush_sacks = FALSE;
3303                 } else {
3304                         tcb->snd.flush_sacks = TRUE;
3305                 }
3306         }
3307 }
3308
/* Retransmission timeout (RTO) timer callback.  For active states, backs
 * off, declares a loss event, and retransmits the outstanding window;
 * closes the connection once we've backed off past the limit. */
static void tcptimeout(void *arg)
{
	ERRSTACK(1);
	struct conv *s;
	Tcpctl *tcb;
	int maxback;
	struct tcppriv *tpriv;

	s = (struct conv *)arg;
	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	switch (tcb->state) {
		default:
			/* All states other than Time_wait/Closed: rxmit path. */
			tcb->backoff++;
			/* Give up in half the time if our SYN was never answered. */
			if (tcb->state == Syn_sent)
				maxback = MAXBACKMS / 2;
			else
				maxback = MAXBACKMS;
			tcb->backedoff += tcb->timer.start * MSPTICK;
			if (tcb->backedoff >= maxback) {
				localclose(s, "connection timed out");
				break;
			}
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: timeout rxmit una %u, rtx %u, nxt %u, in_flight %u, timer.start %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, tcb->snd.in_flight,
			       tcb->timer.start);
			tcpsettimer(tcb);
			tcp_loss_event(s, tcb);
			/* Advance the recovery point.  Any dupacks/sacks below this won't
			 * trigger a new loss, since we won't reset_recovery() until we ack
			 * past recovery_pt. */
			tcb->snd.recovery = RTO_RETRANS_RECOVERY;
			tcb->snd.recovery_pt = tcb->snd.nxt;
			timeout_handle_sacks(tcb);
			tcprxmit(s);
			tpriv->stats[RetransTimeouts]++;
			break;
		case Time_wait:
			/* Time_wait timer fired: finish closing quietly. */
			localclose(s, NULL);
			break;
		case Closed:
			break;
	}
	qunlock(&s->qlock);
	poperror();
}
3363
3364 static int inwindow(Tcpctl *tcb, int seq)
3365 {
3366         return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1);
3367 }
3368
3369 /*
3370  *  set up state for a received SYN (or SYN ACK) packet
3371  */
3372 static void procsyn(struct conv *s, Tcp *seg)
3373 {
3374         Tcpctl *tcb;
3375
3376         tcb = (Tcpctl *) s->ptcl;
3377         tcb->flags |= FORCE;
3378
3379         tcb->rcv.nxt = seg->seq + 1;
3380         tcb->rcv.urg = tcb->rcv.nxt;
3381         tcb->irs = seg->seq;
3382
3383         /* our sending max segment size cannot be bigger than what he asked for */
3384         if (seg->mss != 0 && seg->mss < tcb->mss) {
3385                 tcb->mss = seg->mss;
3386                 tcb->typical_mss = tcb->mss;
3387         }
3388         adjust_typical_mss_for_opts(seg, tcb);
3389
3390         tcb->snd.wnd = seg->wnd;
3391         tcb->cwind = tcb->typical_mss * CWIND_SCALE;
3392 }
3393
3394 static int addreseq(Tcpctl *tcb, struct tcppriv *tpriv, Tcp *seg,
3395                     struct block *bp, uint16_t length)
3396 {
3397         Reseq *rp, *rp1;
3398         int i, rqlen, qmax;
3399
3400         rp = kzmalloc(sizeof(Reseq), 0);
3401         if (rp == NULL) {
3402                 freeblist(bp);  /* bp always consumed by add_reseq */
3403                 return 0;
3404         }
3405
3406         rp->seg = *seg;
3407         rp->bp = bp;
3408         rp->length = length;
3409
3410         track_rcv_sack(tcb, seg->seq, seg->seq + length);
3411         /* Place on reassembly list sorting by starting seq number */
3412         rp1 = tcb->reseq;