vmm: refactor userspace's emsr_fakewrite()
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2017 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <slab.h>
30 #include <kmalloc.h>
31 #include <kref.h>
32 #include <string.h>
33 #include <stdio.h>
34 #include <assert.h>
35 #include <error.h>
36 #include <cpio.h>
37 #include <pmap.h>
38 #include <smp.h>
39 #include <net/ip.h>
40 #include <net/tcp.h>
41
42 /* Must correspond to the enumeration in tcp.h */
43 static char *tcpstates[] = {
44         "Closed", "Listen", "Syn_sent",
45         "Established", "Finwait1", "Finwait2", "Close_wait",
46         "Closing", "Last_ack", "Time_wait"
47 };
48
49 static int tcp_irtt = DEF_RTT;          /* Initial guess at round trip time */
50 static uint16_t tcp_mss = DEF_MSS;      /* Maximum segment size to be sent */
51
52 /* Must correspond to the enumeration in tcp.h */
53 static char *statnames[] = {
54         [MaxConn] "MaxConn",
55         [ActiveOpens] "ActiveOpens",
56         [PassiveOpens] "PassiveOpens",
57         [EstabResets] "EstabResets",
58         [CurrEstab] "CurrEstab",
59         [InSegs] "InSegs",
60         [OutSegs] "OutSegs",
61         [RetransSegs] "RetransSegs",
62         [RetransTimeouts] "RetransTimeouts",
63         [InErrs] "InErrs",
64         [OutRsts] "OutRsts",
65         [CsumErrs] "CsumErrs",
66         [HlenErrs] "HlenErrs",
67         [LenErrs] "LenErrs",
68         [OutOfOrder] "OutOfOrder",
69 };
70
71 /*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
79  */
80 static int tcpporthogdefense = 0;
81
82 static int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *,
83                     uint16_t);
84 static void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
85 static void localclose(struct conv *, char *unused_char_p_t);
86 static void procsyn(struct conv *, Tcp *);
87 static void tcpiput(struct Proto *, struct Ipifc *, struct block *);
88 static void tcpoutput(struct conv *);
89 static int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
90 static void tcpstart(struct conv *, int);
91 static void tcptimeout(void *);
92 static void tcpsndsyn(struct conv *, Tcpctl *);
93 static void tcprcvwin(struct conv *);
94 static void tcpacktimer(void *);
95 static void tcpkeepalive(void *);
96 static void tcpsetkacounter(Tcpctl *);
97 static void tcprxmit(struct conv *);
98 static void tcpsettimer(Tcpctl *);
99 static void tcpsynackrtt(struct conv *);
100 static void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
101 static void tcp_loss_event(struct conv *s, Tcpctl *tcb);
102 static uint16_t derive_payload_mss(Tcpctl *tcb);
103 static void set_in_flight(Tcpctl *tcb);
104
105 static void limborexmit(struct Proto *);
106 static void limbo(struct conv *, uint8_t *unused_uint8_p_t, uint8_t *, Tcp *,
107                   int);
108
109 static void tcpsetstate(struct conv *s, uint8_t newstate)
110 {
111         Tcpctl *tcb;
112         uint8_t oldstate;
113         struct tcppriv *tpriv;
114
115         tpriv = s->p->priv;
116
117         tcb = (Tcpctl *) s->ptcl;
118
119         oldstate = tcb->state;
120         if (oldstate == newstate)
121                 return;
122
123         if (oldstate == Established)
124                 tpriv->stats[CurrEstab]--;
125         if (newstate == Established)
126                 tpriv->stats[CurrEstab]++;
127
128         /**
129         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
130         tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
131         **/
132
133         switch (newstate) {
134         case Closed:
135                 qclose(s->rq);
136                 qclose(s->wq);
137                 qclose(s->eq);
138                 break;
139
140         case Close_wait:        /* Remote closes */
141                 qhangup(s->rq, NULL);
142                 break;
143         }
144
145         tcb->state = newstate;
146
147         if (oldstate == Syn_sent && newstate != Closed)
148                 Fsconnected(s, NULL);
149 }
150
151 static void tcpconnect(struct conv *c, char **argv, int argc)
152 {
153         Fsstdconnect(c, argv, argc);
154         tcpstart(c, TCP_CONNECT);
155 }
156
157 static int tcpstate(struct conv *c, char *state, int n)
158 {
159         Tcpctl *s;
160
161         s = (Tcpctl *) (c->ptcl);
162
163         return snprintf(state, n,
164                         "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
165                         tcpstates[s->state],
166                         c->rq ? qlen(c->rq) : 0,
167                         c->wq ? qlen(c->wq) : 0,
168                         s->srtt, s->mdev,
169                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
170                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
171                         s->katimer.start, s->katimer.count);
172 }
173
174 static int tcpinuse(struct conv *c)
175 {
176         Tcpctl *s;
177
178         s = (Tcpctl *) (c->ptcl);
179         return s->state != Closed;
180 }
181
182 static void tcpannounce(struct conv *c, char **argv, int argc)
183 {
184         Fsstdannounce(c, argv, argc);
185         tcpstart(c, TCP_LISTEN);
186         Fsconnected(c, NULL);
187 }
188
189 static void tcpbypass(struct conv *cv, char **argv, int argc)
190 {
191         struct tcppriv *tpriv = cv->p->priv;
192
193         Fsstdbypass(cv, argv, argc);
194         iphtadd(&tpriv->ht, cv);
195 }
196
197 static void tcpshutdown(struct conv *c, int how)
198 {
199         Tcpctl *tcb = (Tcpctl*)c->ptcl;
200
201         /* Do nothing for the read side */
202         if (how == SHUT_RD)
203                 return;
204         /* Sends a FIN.  If we're in another state (like Listen), we'll run into
205          * issues, since we'll never send the FIN.  We'll be shutdown on our
206          * end, but we'll never tell the distant end.  Might just be an app
207          * issue. */
208         switch (tcb->state) {
209         case Established:
210                 tcb->flgcnt++;
211                 tcpsetstate(c, Finwait1);
212                 tcpoutput(c);
213                 break;
214         }
215 }
216
217 /*
218  *  tcpclose is always called with the q locked
219  */
220 static void tcpclose(struct conv *c)
221 {
222         Tcpctl *tcb;
223
224         tcb = (Tcpctl *) c->ptcl;
225
226         qhangup(c->rq, NULL);
227         qhangup(c->wq, NULL);
228         qhangup(c->eq, NULL);
229         qflush(c->rq);
230
231         switch (tcb->state) {
232         case Listen:
233                 /*
234                  *  reset any incoming calls to this listener
235                  */
236                 Fsconnected(c, "Hangup");
237
238                 localclose(c, NULL);
239                 break;
240         case Closed:
241         case Syn_sent:
242                 localclose(c, NULL);
243                 break;
244         case Established:
245                 tcb->flgcnt++;
246                 tcpsetstate(c, Finwait1);
247                 tcpoutput(c);
248                 break;
249         case Close_wait:
250                 tcb->flgcnt++;
251                 tcpsetstate(c, Last_ack);
252                 tcpoutput(c);
253                 break;
254         }
255 }
256
257 static void tcpkick(void *x)
258 {
259         ERRSTACK(1);
260         struct conv *s = x;
261         Tcpctl *tcb;
262
263         tcb = (Tcpctl *) s->ptcl;
264
265         qlock(&s->qlock);
266         if (waserror()) {
267                 qunlock(&s->qlock);
268                 nexterror();
269         }
270
271         switch (tcb->state) {
272         case Syn_sent:
273         case Established:
274         case Close_wait:
275                 /*
276                  * Push data
277                  */
278                 tcprcvwin(s);
279                 tcpoutput(s);
280                 break;
281         default:
282                 localclose(s, "Hangup");
283                 break;
284         }
285
286         qunlock(&s->qlock);
287         poperror();
288 }
289
290 static void tcprcvwin(struct conv *s)
291 {
292         /* Call with tcb locked */
293         int w;
294         Tcpctl *tcb;
295
296         tcb = (Tcpctl *) s->ptcl;
297         w = tcb->window - qlen(s->rq);
298         if (w < 0)
299                 w = 0;
300
301         /* RFC 813: Avoid SWS.  We'll always reduce the window (because the qio
302          * increased - that's legit), and we'll always advertise the window
303          * increases (corresponding to qio drains) when those are greater than
304          * MSS.  But we don't advertise increases less than MSS.
305          *
306          * Note we don't shrink the window at all - that'll result in tcptrim()
307          * dropping packets that were sent before the sender gets our update. */
308         if ((w < tcb->rcv.wnd) || (w >= tcb->mss))
309                 tcb->rcv.wnd = w;
310         /* We've delayed sending an update to rcv.wnd, and we might never get
311          * another ACK to drive the TCP stack after the qio is drained.  We
312          * could replace this stuff with qio kicks or callbacks, but that might
313          * be trickier with the MSS limitation.  (and 'edge' isn't empty or
314          * not). */
315         if (w < tcb->mss)
316                 tcb->rcv.blocked = 1;
317 }
318
319 static void tcpacktimer(void *v)
320 {
321         ERRSTACK(1);
322         Tcpctl *tcb;
323         struct conv *s;
324
325         s = v;
326         tcb = (Tcpctl *) s->ptcl;
327
328         qlock(&s->qlock);
329         if (waserror()) {
330                 qunlock(&s->qlock);
331                 nexterror();
332         }
333         if (tcb->state != Closed) {
334                 tcb->flags |= FORCE;
335                 tcprcvwin(s);
336                 tcpoutput(s);
337         }
338         qunlock(&s->qlock);
339         poperror();
340 }
341
342 static void tcpcreate(struct conv *c)
343 {
344         /* We don't use qio limits.  Instead, TCP manages flow control on its
345          * own.  We only use qpassnolim().  Note for qio that 0 doesn't mean no
346          * limit. */
347         c->rq = qopen(0, Qcoalesce, 0, 0);
348         c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
349 }
350
351 static void timerstate(struct tcppriv *priv, Tcptimer *t, int newstate)
352 {
353         if (newstate != TcptimerON) {
354                 if (t->state == TcptimerON) {
355                         // unchain
356                         if (priv->timers == t) {
357                                 priv->timers = t->next;
358                                 if (t->prev != NULL)
359                                         panic("timerstate1");
360                         }
361                         if (t->next)
362                                 t->next->prev = t->prev;
363                         if (t->prev)
364                                 t->prev->next = t->next;
365                         t->next = t->prev = NULL;
366                 }
367         } else {
368                 if (t->state != TcptimerON) {
369                         // chain
370                         if (t->prev != NULL || t->next != NULL)
371                                 panic("timerstate2");
372                         t->prev = NULL;
373                         t->next = priv->timers;
374                         if (t->next)
375                                 t->next->prev = t;
376                         priv->timers = t;
377                 }
378         }
379         t->state = newstate;
380 }
381
382 static void tcpackproc(void *a)
383 {
384         ERRSTACK(1);
385         Tcptimer *t, *tp, *timeo;
386         struct Proto *tcp;
387         struct tcppriv *priv;
388         int loop;
389
390         tcp = a;
391         priv = tcp->priv;
392
393         for (;;) {
394                 kthread_usleep(MSPTICK * 1000);
395
396                 qlock(&priv->tl);
397                 timeo = NULL;
398                 loop = 0;
399                 for (t = priv->timers; t != NULL; t = tp) {
400                         if (loop++ > 10000)
401                                 panic("tcpackproc1");
402                         tp = t->next;
403                         if (t->state == TcptimerON) {
404                                 t->count--;
405                                 if (t->count == 0) {
406                                         timerstate(priv, t, TcptimerDONE);
407                                         t->readynext = timeo;
408                                         timeo = t;
409                                 }
410                         }
411                 }
412                 qunlock(&priv->tl);
413
414                 loop = 0;
415                 for (t = timeo; t != NULL; t = t->readynext) {
416                         if (loop++ > 10000)
417                                 panic("tcpackproc2");
418                         if (t->state == TcptimerDONE && t->func != NULL) {
419                                 /* discard error style */
420                                 if (!waserror())
421                                         (*t->func) (t->arg);
422                                 poperror();
423                         }
424                 }
425
426                 limborexmit(tcp);
427         }
428 }
429
430 static void tcpgo(struct tcppriv *priv, Tcptimer *t)
431 {
432         if (t == NULL || t->start == 0)
433                 return;
434
435         qlock(&priv->tl);
436         t->count = t->start;
437         timerstate(priv, t, TcptimerON);
438         qunlock(&priv->tl);
439 }
440
441 static void tcphalt(struct tcppriv *priv, Tcptimer *t)
442 {
443         if (t == NULL)
444                 return;
445
446         qlock(&priv->tl);
447         timerstate(priv, t, TcptimerOFF);
448         qunlock(&priv->tl);
449 }
450
451 static int backoff(int n)
452 {
453         return 1 << n;
454 }
455
456 static void localclose(struct conv *s, char *reason)
457 {
458         /* called with tcb locked */
459         Tcpctl *tcb;
460         Reseq *rp, *rp1;
461         struct tcppriv *tpriv;
462
463         tpriv = s->p->priv;
464         tcb = (Tcpctl *) s->ptcl;
465
466         iphtrem(&tpriv->ht, s);
467
468         tcphalt(tpriv, &tcb->timer);
469         tcphalt(tpriv, &tcb->rtt_timer);
470         tcphalt(tpriv, &tcb->acktimer);
471         tcphalt(tpriv, &tcb->katimer);
472
473         /* Flush reassembly queue; nothing more can arrive */
474         for (rp = tcb->reseq; rp != NULL; rp = rp1) {
475                 rp1 = rp->next;
476                 freeblist(rp->bp);
477                 kfree(rp);
478         }
479         tcb->reseq = NULL;
480
481         if (tcb->state == Syn_sent)
482                 Fsconnected(s, reason);
483
484         qhangup(s->rq, reason);
485         qhangup(s->wq, reason);
486
487         tcpsetstate(s, Closed);
488
489         /* listener will check the rq state */
490         if (s->state == Announced)
491                 rendez_wakeup(&s->listenr);
492 }
493
494 /* mtu (- TCP + IP hdr len) of 1st hop */
495 static int tcpmtu(struct Ipifc *ifc, int version, int *scale)
496 {
497         int mtu;
498
499         switch (version) {
500         default:
501         case V4:
502                 mtu = DEF_MSS;
503                 if (ifc != NULL)
504                         mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT +
505                                                             TCP4_HDRSIZE);
506                 break;
507         case V6:
508                 mtu = DEF_MSS6;
509                 if (ifc != NULL)
510                         mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT +
511                                                             TCP6_HDRSIZE);
512                 break;
513         }
514         *scale = HaveWS | 7;
515
516         return mtu;
517 }
518
519 static void tcb_check_tso(Tcpctl *tcb)
520 {
521         /* This can happen if the netdev isn't up yet. */
522         if (!tcb->ifc)
523                 return;
524         if (tcb->ifc->feat & NETF_TSO)
525                 tcb->flags |= TSO;
526         else
527                 tcb->flags &= ~TSO;
528 }
529
530 static void inittcpctl(struct conv *s, int mode)
531 {
532         Tcpctl *tcb;
533         Tcp4hdr *h4;
534         Tcp6hdr *h6;
535         int mss;
536
537         tcb = (Tcpctl *) s->ptcl;
538
539         memset(tcb, 0, sizeof(Tcpctl));
540
541         tcb->ssthresh = UINT32_MAX;
542         tcb->srtt = tcp_irtt;
543         tcb->mdev = 0;
544
545         /* setup timers */
546         tcb->timer.start = tcp_irtt / MSPTICK;
547         tcb->timer.func = tcptimeout;
548         tcb->timer.arg = s;
549         tcb->rtt_timer.start = MAX_TIME;
550         tcb->acktimer.start = TCP_ACK / MSPTICK;
551         tcb->acktimer.func = tcpacktimer;
552         tcb->acktimer.arg = s;
553         tcb->katimer.start = DEF_KAT / MSPTICK;
554         tcb->katimer.func = tcpkeepalive;
555         tcb->katimer.arg = s;
556
557         mss = DEF_MSS;
558
559         /* create a prototype(pseudo) header */
560         if (mode != TCP_LISTEN) {
561                 if (ipcmp(s->laddr, IPnoaddr) == 0)
562                         findlocalip(s->p->f, s->laddr, s->raddr);
563
564                 switch (s->ipversion) {
565                 case V4:
566                         h4 = &tcb->protohdr.tcp4hdr;
567                         memset(h4, 0, sizeof(*h4));
568                         h4->proto = IP_TCPPROTO;
569                         hnputs(h4->tcpsport, s->lport);
570                         hnputs(h4->tcpdport, s->rport);
571                         v6tov4(h4->tcpsrc, s->laddr);
572                         v6tov4(h4->tcpdst, s->raddr);
573                         break;
574                 case V6:
575                         h6 = &tcb->protohdr.tcp6hdr;
576                         memset(h6, 0, sizeof(*h6));
577                         h6->proto = IP_TCPPROTO;
578                         hnputs(h6->tcpsport, s->lport);
579                         hnputs(h6->tcpdport, s->rport);
580                         ipmove(h6->tcpsrc, s->laddr);
581                         ipmove(h6->tcpdst, s->raddr);
582                         mss = DEF_MSS6;
583                         break;
584                 default:
585                         panic("inittcpctl: version %d", s->ipversion);
586                 }
587         }
588
589         tcb->ifc = findipifc(s->p->f, s->laddr, 0);
590         tcb->mss = mss;
591         tcb->typical_mss = mss;
592         tcb->cwind = tcb->typical_mss * CWIND_SCALE;
593
594         /* default is no window scaling */
595         tcb->window = QMAX;
596         tcb->rcv.wnd = QMAX;
597         tcb->rcv.scale = 0;
598         tcb->snd.scale = 0;
599         tcb_check_tso(tcb);
600 }
601
602 /*
603  *  called with s qlocked
604  */
605 static void tcpstart(struct conv *s, int mode)
606 {
607         Tcpctl *tcb;
608         struct tcppriv *tpriv;
609         char *kpname;
610
611         tpriv = s->p->priv;
612
613         if (tpriv->ackprocstarted == 0) {
614                 qlock(&tpriv->apl);
615                 if (tpriv->ackprocstarted == 0) {
616                         /* tcpackproc needs to free this if it ever exits */
617                         kpname = kmalloc(KNAMELEN, MEM_WAIT);
618                         snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
619                         ktask(kpname, tcpackproc, s->p);
620                         tpriv->ackprocstarted = 1;
621                 }
622                 qunlock(&tpriv->apl);
623         }
624
625         tcb = (Tcpctl *) s->ptcl;
626
627         inittcpctl(s, mode);
628
629         iphtadd(&tpriv->ht, s);
630         switch (mode) {
631         case TCP_LISTEN:
632                 tpriv->stats[PassiveOpens]++;
633                 tcb->flags |= CLONE;
634                 tcpsetstate(s, Listen);
635                 break;
636
637         case TCP_CONNECT:
638                 tpriv->stats[ActiveOpens]++;
639                 tcb->flags |= ACTIVE;
640                 tcpsndsyn(s, tcb);
641                 tcpsetstate(s, Syn_sent);
642                 tcpoutput(s);
643                 break;
644         }
645 }
646
647 static char *tcpflag(uint16_t flag)
648 {
649         static char buf[128];
650
651         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
652         if (flag & URG)
653                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
654         if (flag & ACK)
655                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
656         if (flag & PSH)
657                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
658         if (flag & RST)
659                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
660         if (flag & SYN)
661                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
662         if (flag & FIN)
663                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
664
665         return buf;
666 }
667
668 /* Helper, determine if we should send a TCP timestamp.  ts_val was the
669  * timestamp from our distant end.  We'll also send a TS on SYN (no ACK). */
670 static bool tcp_seg_has_ts(Tcp *tcph)
671 {
672         return tcph->ts_val || ((tcph->flags & SYN) && !(tcph->flags & ACK));
673 }
674
675 /* Given a TCP header/segment and default header size (e.g. TCP4_HDRSIZE),
676  * return the actual hdr_len and opt_pad */
677 static void compute_hdrlen_optpad(Tcp *tcph, uint16_t default_hdrlen,
678                                   uint16_t *ret_hdrlen, uint16_t *ret_optpad,
679                                   Tcpctl *tcb)
680 {
681         uint16_t hdrlen = default_hdrlen;
682         uint16_t optpad = 0;
683
684         if (tcph->flags & SYN) {
685                 if (tcph->mss)
686                         hdrlen += MSS_LENGTH;
687                 if (tcph->ws)
688                         hdrlen += WS_LENGTH;
689                 if (tcph->sack_ok)
690                         hdrlen += SACK_OK_LENGTH;
691         }
692         if (tcp_seg_has_ts(tcph)) {
693                 hdrlen += TS_LENGTH;
694                 /* SYNs have other opts, don't do the PREPAD NOOP optimization.
695                  */
696                 if (!(tcph->flags & SYN))
697                         hdrlen += TS_SEND_PREPAD;
698         }
699         if (tcb && tcb->rcv.nr_sacks)
700                 hdrlen += 2 + tcb->rcv.nr_sacks * 8;
701         optpad = hdrlen & 3;
702         if (optpad)
703                 optpad = 4 - optpad;
704         hdrlen += optpad;
705         *ret_hdrlen = hdrlen;
706         *ret_optpad = optpad;
707 }
708
709 /* Writes the TCP options for tcph to opt. */
710 static void write_opts(Tcp *tcph, uint8_t *opt, uint16_t optpad, Tcpctl *tcb)
711 {
712         if (tcph->flags & SYN) {
713                 if (tcph->mss != 0) {
714                         *opt++ = MSSOPT;
715                         *opt++ = MSS_LENGTH;
716                         hnputs(opt, tcph->mss);
717                         opt += 2;
718                 }
719                 if (tcph->ws != 0) {
720                         *opt++ = WSOPT;
721                         *opt++ = WS_LENGTH;
722                         *opt++ = tcph->ws;
723                 }
724                 if (tcph->sack_ok) {
725                         *opt++ = SACK_OK_OPT;
726                         *opt++ = SACK_OK_LENGTH;
727                 }
728         }
729         if (tcp_seg_has_ts(tcph)) {
730                 if (!(tcph->flags & SYN)) {
731                         *opt++ = NOOPOPT;
732                         *opt++ = NOOPOPT;
733                 }
734                 *opt++ = TS_OPT;
735                 *opt++ = TS_LENGTH;
736                 /* Setting TSval, our time */
737                 hnputl(opt, milliseconds());
738                 opt += 4;
739                 /* Setting TSecr, the time we last saw from them, stored in
740                  * ts_val */
741                 hnputl(opt, tcph->ts_val);
742                 opt += 4;
743         }
744         if (tcb && tcb->rcv.nr_sacks) {
745                 *opt++ = SACK_OPT;
746                 *opt++ = 2 + tcb->rcv.nr_sacks * 8;
747                 for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
748                         hnputl(opt, tcb->rcv.sacks[i].left);
749                         opt += 4;
750                         hnputl(opt, tcb->rcv.sacks[i].right);
751                         opt += 4;
752                 }
753         }
754         while (optpad-- > 0)
755                 *opt++ = NOOPOPT;
756 }
757
758 /* Given a data block (or NULL) returns a block with enough header room that we
759  * can send out.  block->wp is set to the beginning of the payload.  Returns
760  * NULL on some sort of error. */
761 static struct block *alloc_or_pad_block(struct block *data,
762                                         uint16_t total_hdr_size)
763 {
764         if (data) {
765                 data = padblock(data, total_hdr_size);
766                 if (data == NULL)
767                         return NULL;
768         } else {
769                 /* the 64 pad is to meet mintu's */
770                 data = block_alloc(total_hdr_size + 64, MEM_WAIT);
771                 if (data == NULL)
772                         return NULL;
773                 data->wp += total_hdr_size;
774         }
775         return data;
776 }
777
778 static struct block *htontcp6(Tcp *tcph, struct block *data, Tcp6hdr *ph,
779                               Tcpctl *tcb)
780 {
781         int dlen = blocklen(data);
782         Tcp6hdr *h;
783         uint16_t csum;
784         uint16_t hdrlen, optpad;
785
786         compute_hdrlen_optpad(tcph, TCP6_HDRSIZE, &hdrlen, &optpad, tcb);
787
788         data = alloc_or_pad_block(data, hdrlen + TCP6_PKT);
789         if (data == NULL)
790                 return NULL;
791         /* relative to the block start (bp->rp).  Note TCP structs include IP.
792          */
793         data->network_offset = 0;
794         data->transport_offset = offsetof(Tcp6hdr, tcpsport);
795
796         /* copy in pseudo ip header plus port numbers */
797         h = (Tcp6hdr *) (data->rp);
798         memmove(h, ph, TCP6_TCBPHDRSZ);
799
800         /* compose pseudo tcp header, do cksum calculation */
801         hnputl(h->vcf, hdrlen + dlen);
802         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
803         h->ttl = ph->proto;
804
805         /* copy in variable bits */
806         hnputl(h->tcpseq, tcph->seq);
807         hnputl(h->tcpack, tcph->ack);
808         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
809         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
810         hnputs(h->tcpurg, tcph->urg);
811
812         write_opts(tcph, h->tcpopt, optpad, tcb);
813
814         if (tcb != NULL && tcb->nochecksum) {
815                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
816         } else {
817                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen +
818                                 TCP6_PHDRSIZE);
819                 hnputs(h->tcpcksum, csum);
820         }
821
822         /* move from pseudo header back to normal ip header */
823         memset(h->vcf, 0, 4);
824         h->vcf[0] = IP_VER6;
825         hnputs(h->ploadlen, hdrlen + dlen);
826         h->proto = ph->proto;
827
828         return data;
829 }
830
831 static struct block *htontcp4(Tcp *tcph, struct block *data, Tcp4hdr *ph,
832                               Tcpctl *tcb)
833 {
834         int dlen = blocklen(data);
835         Tcp4hdr *h;
836         uint16_t csum;
837         uint16_t hdrlen, optpad;
838
839         compute_hdrlen_optpad(tcph, TCP4_HDRSIZE, &hdrlen, &optpad, tcb);
840
841         data = alloc_or_pad_block(data, hdrlen + TCP4_PKT);
842         if (data == NULL)
843                 return NULL;
844         /* relative to the block start (bp->rp).  Note TCP structs include IP.*/
845         data->network_offset = 0;
846         data->transport_offset = offsetof(Tcp4hdr, tcpsport);
847
848         /* copy in pseudo ip header plus port numbers */
849         h = (Tcp4hdr *) (data->rp);
850         memmove(h, ph, TCP4_TCBPHDRSZ);
851
852         /* copy in variable bits */
853         hnputs(h->tcplen, hdrlen + dlen);
854         hnputl(h->tcpseq, tcph->seq);
855         hnputl(h->tcpack, tcph->ack);
856         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
857         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
858         hnputs(h->tcpurg, tcph->urg);
859
860         write_opts(tcph, h->tcpopt, optpad, tcb);
861
862         if (tcb != NULL && tcb->nochecksum) {
863                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
864         } else {
865                 assert(data->transport_offset == TCP4_IPLEN + TCP4_PHDRSIZE);
866                 csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
867                 hnputs(h->tcpcksum, csum);
868                 data->tx_csum_offset = ph->tcpcksum - ph->tcpsport;
869                 data->flag |= Btcpck;
870         }
871
872         return data;
873 }
874
875 static void parse_inbound_sacks(Tcp *tcph, uint8_t *opt, uint16_t optlen)
876 {
877         uint8_t nr_sacks;
878         uint32_t left, right;
879
880         nr_sacks = (optlen - 2) / 8;
881         if (nr_sacks > MAX_NR_SACKS_PER_PACKET)
882                 return;
883         opt += 2;
884         for (int i = 0; i < nr_sacks; i++, opt += 8) {
885                 left = nhgetl(opt);
886                 right = nhgetl(opt + 4);
887                 if (seq_ge(left, right)) {
888                         /* bad / malicious SACK.  Skip it, and adjust. */
889                         nr_sacks--;
890                         i--;    /* stay on this array element next loop */
891                         continue;
892                 }
893                 tcph->sacks[i].left = left;
894                 tcph->sacks[i].right = right;
895         }
896         tcph->nr_sacks = nr_sacks;
897 }
898
899 static void parse_inbound_opts(Tcp *tcph, uint8_t *opt, uint16_t optsize)
900 {
901         uint16_t optlen;
902
903         while (optsize > 0 && *opt != EOLOPT) {
904                 if (*opt == NOOPOPT) {
905                         optsize--;
906                         opt++;
907                         continue;
908                 }
909                 optlen = opt[1];
910                 if (optlen < 2 || optlen > optsize)
911                         break;
912                 switch (*opt) {
913                 case MSSOPT:
914                         if (optlen == MSS_LENGTH)
915                                 tcph->mss = nhgets(opt + 2);
916                         break;
917                 case WSOPT:
918                         if (optlen == WS_LENGTH && *(opt + 2) <= MAX_WS_VALUE)
919                                 tcph->ws = HaveWS | *(opt + 2);
920                         break;
921                 case SACK_OK_OPT:
922                         if (optlen == SACK_OK_LENGTH)
923                                 tcph->sack_ok = TRUE;
924                         break;
925                 case SACK_OPT:
926                         parse_inbound_sacks(tcph, opt, optlen);
927                         break;
928                 case TS_OPT:
929                         if (optlen == TS_LENGTH) {
930                                 tcph->ts_val = nhgetl(opt + 2);
931                                 tcph->ts_ecr = nhgetl(opt + 6);
932                         }
933                         break;
934                 }
935                 optsize -= optlen;
936                 opt += optlen;
937         }
938 }
939
940 /* Helper, clears the opts.  We'll later set them with e.g. parse_inbound_opts,
941  * set them manually, or something else. */
942 static void clear_tcph_opts(Tcp *tcph)
943 {
944         tcph->mss = 0;
945         tcph->ws = 0;
946         tcph->sack_ok = FALSE;
947         tcph->nr_sacks = 0;
948         tcph->ts_val = 0;
949         tcph->ts_ecr = 0;
950 }
951
952 static int ntohtcp6(Tcp *tcph, struct block **bpp)
953 {
954         Tcp6hdr *h;
955         uint16_t hdrlen;
956
957         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
958         if (*bpp == NULL)
959                 return -1;
960
961         h = (Tcp6hdr *) ((*bpp)->rp);
962         tcph->source = nhgets(h->tcpsport);
963         tcph->dest = nhgets(h->tcpdport);
964         tcph->seq = nhgetl(h->tcpseq);
965         tcph->ack = nhgetl(h->tcpack);
966         hdrlen = (h->tcpflag[0] >> 2) & ~3;
967         if (hdrlen < TCP6_HDRSIZE) {
968                 freeblist(*bpp);
969                 return -1;
970         }
971
972         tcph->flags = h->tcpflag[1];
973         tcph->wnd = nhgets(h->tcpwin);
974         tcph->urg = nhgets(h->tcpurg);
975         clear_tcph_opts(tcph);
976         tcph->len = nhgets(h->ploadlen) - hdrlen;
977
978         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
979         if (*bpp == NULL)
980                 return -1;
981         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP6_HDRSIZE);
982         return hdrlen;
983 }
984
985 static int ntohtcp4(Tcp *tcph, struct block **bpp)
986 {
987         Tcp4hdr *h;
988         uint16_t hdrlen;
989
990         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
991         if (*bpp == NULL)
992                 return -1;
993
994         h = (Tcp4hdr *) ((*bpp)->rp);
995         tcph->source = nhgets(h->tcpsport);
996         tcph->dest = nhgets(h->tcpdport);
997         tcph->seq = nhgetl(h->tcpseq);
998         tcph->ack = nhgetl(h->tcpack);
999
1000         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1001         if (hdrlen < TCP4_HDRSIZE) {
1002                 freeblist(*bpp);
1003                 return -1;
1004         }
1005
1006         tcph->flags = h->tcpflag[1];
1007         tcph->wnd = nhgets(h->tcpwin);
1008         tcph->urg = nhgets(h->tcpurg);
1009         clear_tcph_opts(tcph);
1010         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1011
1012         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1013         if (*bpp == NULL)
1014                 return -1;
1015         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP4_HDRSIZE);
1016         return hdrlen;
1017 }
1018
1019 /*
1020  *  For outgoing calls, generate an initial sequence
1021  *  number and put a SYN on the send queue
1022  */
1023 static void tcpsndsyn(struct conv *s, Tcpctl *tcb)
1024 {
1025         urandom_read(&tcb->iss, sizeof(tcb->iss));
1026         tcb->rttseq = tcb->iss;
1027         tcb->snd.wl2 = tcb->iss;
1028         tcb->snd.una = tcb->iss;
1029         tcb->snd.rtx = tcb->rttseq;
1030         tcb->snd.nxt = tcb->rttseq;
1031         tcb->flgcnt++;
1032         tcb->flags |= FORCE;
1033         tcb->sndsyntime = NOW;
1034
1035         /* set desired mss and scale */
1036         tcb->mss = tcpmtu(tcb->ifc, s->ipversion, &tcb->scale);
1037 }
1038
/* Send a RST in response to segment 'seg', which arrived from 'source' to
 * 'dest' carrying 'length' bytes of data.  There is no conversation for this
 * packet, so the header is built from scratch.  Following RFC 793, the RST's
 * seq/ack are derived from the offending segment so the peer accepts it as
 * in-band.  'reason' is used only for netlog. */
static void sndrst(struct Proto *tcp, uint8_t *source, uint8_t *dest,
                   uint16_t length, Tcp *seg, uint8_t version, char *reason)
{
	struct block *hbp;
	uint8_t rflags;
	struct tcppriv *tpriv;
	Tcp4hdr ph4;
	Tcp6hdr ph6;

	netlog(tcp->f, Logtcpreset, "sndrst: %s\n", reason);

	tpriv = tcp->priv;

	/* never answer a RST with a RST */
	if (seg->flags & RST)
		return;

	/* make pseudo header */
	switch (version) {
	case V4:
		memset(&ph4, 0, sizeof(ph4));
		ph4.vihl = IP_VER4;
		/* src/dst and ports are swapped: we reply to the sender */
		v6tov4(ph4.tcpsrc, dest);
		v6tov4(ph4.tcpdst, source);
		ph4.proto = IP_TCPPROTO;
		hnputs(ph4.tcplen, TCP4_HDRSIZE);
		hnputs(ph4.tcpsport, seg->dest);
		hnputs(ph4.tcpdport, seg->source);
		break;
	case V6:
		memset(&ph6, 0, sizeof(ph6));
		ph6.vcf[0] = IP_VER6;
		ipmove(ph6.tcpsrc, dest);
		ipmove(ph6.tcpdst, source);
		ph6.proto = IP_TCPPROTO;
		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
		hnputs(ph6.tcpsport, seg->dest);
		hnputs(ph6.tcpdport, seg->source);
		break;
	default:
		panic("sndrst: version %d", version);
	}

	tpriv->stats[OutRsts]++;
	rflags = RST;

	/* convince the other end that this reset is in band */
	if (seg->flags & ACK) {
		seg->seq = seg->ack;
		seg->ack = 0;
	} else {
		rflags |= ACK;
		seg->ack = seg->seq;
		seg->seq = 0;
		/* SYN and FIN each consume one sequence number */
		if (seg->flags & SYN)
			seg->ack++;
		seg->ack += length;
		if (seg->flags & FIN)
			seg->ack++;
	}
	/* seg is reused as the outgoing segment; everything but seq/ack and
	 * the peer's timestamp is cleared */
	seg->flags = rflags;
	seg->wnd = 0;
	seg->urg = 0;
	seg->mss = 0;
	seg->ws = 0;
	seg->sack_ok = FALSE;
	seg->nr_sacks = 0;
	/* seg->ts_val is already set with their timestamp */
	switch (version) {
	case V4:
		hbp = htontcp4(seg, NULL, &ph4, NULL);
		if (hbp == NULL)
			return;
		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
		break;
	case V6:
		hbp = htontcp6(seg, NULL, &ph6, NULL);
		if (hbp == NULL)
			return;
		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
		break;
	default:
		panic("sndrst2: version %d", version);
	}
}
1123
1124 /*
1125  *  send a reset to the remote side and close the conversation
1126  *  called with s qlocked
1127  */
static void tcphangup(struct conv *s)
{
	ERRSTACK(1);
	Tcp seg;
	Tcpctl *tcb;
	struct block *hbp;

	tcb = (Tcpctl *) s->ptcl;
	/* only send the RST if we were actually talking to someone */
	if (ipcmp(s->raddr, IPnoaddr)) {
		/* discard error style, poperror regardless */
		if (!waserror()) {
			seg.flags = RST | ACK;
			seg.ack = tcb->rcv.nxt;
			tcb->last_ack_sent = seg.ack;
			tcb->rcv.una = 0;
			seg.seq = tcb->snd.nxt;
			seg.wnd = 0;
			seg.urg = 0;
			seg.mss = 0;
			seg.ws = 0;
			seg.sack_ok = FALSE;
			seg.nr_sacks = 0;
			seg.ts_val = tcb->ts_recent;
			/* reuse the cached protocol header in the tcb */
			switch (s->ipversion) {
			case V4:
				tcb->protohdr.tcp4hdr.vihl = IP_VER4;
				hbp = htontcp4(&seg, NULL,
				               &tcb->protohdr.tcp4hdr, tcb);
				ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
				break;
			case V6:
				tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
				hbp = htontcp6(&seg, NULL,
				               &tcb->protohdr.tcp6hdr, tcb);
				ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
				break;
			default:
				panic("tcphangup: version %d", s->ipversion);
			}
		}
		poperror();
	}
	localclose(s, NULL);
}
1172
1173 /*
1174  *  (re)send a SYN ACK
1175  */
1176 static int sndsynack(struct Proto *tcp, Limbo *lp)
1177 {
1178         struct block *hbp;
1179         Tcp4hdr ph4;
1180         Tcp6hdr ph6;
1181         Tcp seg;
1182         int scale;
1183         uint8_t flag = 0;
1184
1185         /* make pseudo header */
1186         switch (lp->version) {
1187         case V4:
1188                 memset(&ph4, 0, sizeof(ph4));
1189                 ph4.vihl = IP_VER4;
1190                 v6tov4(ph4.tcpsrc, lp->laddr);
1191                 v6tov4(ph4.tcpdst, lp->raddr);
1192                 ph4.proto = IP_TCPPROTO;
1193                 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1194                 hnputs(ph4.tcpsport, lp->lport);
1195                 hnputs(ph4.tcpdport, lp->rport);
1196                 break;
1197         case V6:
1198                 memset(&ph6, 0, sizeof(ph6));
1199                 ph6.vcf[0] = IP_VER6;
1200                 ipmove(ph6.tcpsrc, lp->laddr);
1201                 ipmove(ph6.tcpdst, lp->raddr);
1202                 ph6.proto = IP_TCPPROTO;
1203                 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1204                 hnputs(ph6.tcpsport, lp->lport);
1205                 hnputs(ph6.tcpdport, lp->rport);
1206                 break;
1207         default:
1208                 panic("sndrst: version %d", lp->version);
1209         }
1210         lp->ifc = findipifc(tcp->f, lp->laddr, 0);
1211
1212         seg.seq = lp->iss;
1213         seg.ack = lp->irs + 1;
1214         seg.flags = SYN | ACK;
1215         seg.urg = 0;
1216         seg.mss = tcpmtu(lp->ifc, lp->version, &scale);
1217         seg.wnd = QMAX;
1218         seg.ts_val = lp->ts_val;
1219         seg.nr_sacks = 0;
1220
1221         /* if the other side set scale, we should too */
1222         if (lp->rcvscale) {
1223                 seg.ws = scale;
1224                 lp->sndscale = scale;
1225         } else {
1226                 seg.ws = 0;
1227                 lp->sndscale = 0;
1228         }
1229         if (SACK_SUPPORTED)
1230                 seg.sack_ok = lp->sack_ok;
1231         else
1232                 seg.sack_ok = FALSE;
1233
1234         switch (lp->version) {
1235         case V4:
1236                 hbp = htontcp4(&seg, NULL, &ph4, NULL);
1237                 if (hbp == NULL)
1238                         return -1;
1239                 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1240                 break;
1241         case V6:
1242                 hbp = htontcp6(&seg, NULL, &ph6, NULL);
1243                 if (hbp == NULL)
1244                         return -1;
1245                 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1246                 break;
1247         default:
1248                 panic("sndsnack: version %d", lp->version);
1249         }
1250         lp->lastsend = NOW;
1251         return 0;
1252 }
1253
/* Hash the low two bytes of IP address 'a' plus port 'p' into a limbo
 * hash-table bucket index (masked to the table size by LHTMASK). */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
1255
1256 /*
1257  *  put a call into limbo and respond with a SYN ACK
1258  *
1259  *  called with proto locked
1260  */
1261 static void limbo(struct conv *s, uint8_t *source, uint8_t *dest, Tcp *seg,
1262                   int version)
1263 {
1264         Limbo *lp, **l;
1265         struct tcppriv *tpriv;
1266         int h;
1267
1268         tpriv = s->p->priv;
1269         h = hashipa(source, seg->source);
1270
1271         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1272                 lp = *l;
1273                 if (lp->lport != seg->dest || lp->rport != seg->source
1274                         || lp->version != version)
1275                         continue;
1276                 if (ipcmp(lp->raddr, source) != 0)
1277                         continue;
1278                 if (ipcmp(lp->laddr, dest) != 0)
1279                         continue;
1280
1281                 /* each new SYN restarts the retransmits */
1282                 lp->irs = seg->seq;
1283                 break;
1284         }
1285         lp = *l;
1286         if (lp == NULL) {
1287                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1288                         lp = tpriv->lht[h];
1289                         tpriv->lht[h] = lp->next;
1290                         lp->next = NULL;
1291                 } else {
1292                         lp = kzmalloc(sizeof(*lp), 0);
1293                         if (lp == NULL)
1294                                 return;
1295                         tpriv->nlimbo++;
1296                 }
1297                 *l = lp;
1298                 lp->version = version;
1299                 ipmove(lp->laddr, dest);
1300                 ipmove(lp->raddr, source);
1301                 lp->lport = seg->dest;
1302                 lp->rport = seg->source;
1303                 lp->mss = seg->mss;
1304                 lp->rcvscale = seg->ws;
1305                 lp->sack_ok = seg->sack_ok;
1306                 lp->irs = seg->seq;
1307                 lp->ts_val = seg->ts_val;
1308                 urandom_read(&lp->iss, sizeof(lp->iss));
1309         }
1310
1311         if (sndsynack(s->p, lp) < 0) {
1312                 *l = lp->next;
1313                 tpriv->nlimbo--;
1314                 kfree(lp);
1315         }
1316 }
1317
1318 /*
1319  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1320  */
1321 static void limborexmit(struct Proto *tcp)
1322 {
1323         struct tcppriv *tpriv;
1324         Limbo **l, *lp;
1325         int h;
1326         int seen;
1327         uint64_t now;
1328
1329         tpriv = tcp->priv;
1330
1331         if (!canqlock(&tcp->qlock))
1332                 return;
1333         seen = 0;
1334         now = NOW;
1335         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1336                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1337                         lp = *l;
1338                         seen++;
1339                         if (now - lp->lastsend <
1340                             (lp->rexmits + 1) * SYNACK_RXTIMER)
1341                                 continue;
1342
1343                         /* time it out after 1 second */
1344                         if (++(lp->rexmits) > 5) {
1345                                 tpriv->nlimbo--;
1346                                 *l = lp->next;
1347                                 kfree(lp);
1348                                 continue;
1349                         }
1350
1351                         /* if we're being attacked, don't bother resending SYN
1352                          * ACK's */
1353                         if (tpriv->nlimbo > 100)
1354                                 continue;
1355
1356                         if (sndsynack(tcp, lp) < 0) {
1357                                 tpriv->nlimbo--;
1358                                 *l = lp->next;
1359                                 kfree(lp);
1360                                 continue;
1361                         }
1362
1363                         l = &lp->next;
1364                 }
1365         }
1366         qunlock(&tcp->qlock);
1367 }
1368
1369 /*
1370  *  lookup call in limbo.  if found, throw it out.
1371  *
1372  *  called with proto locked
1373  */
1374 static void limborst(struct conv *s, Tcp *segp, uint8_t *src, uint8_t *dst,
1375                      uint8_t version)
1376 {
1377         Limbo *lp, **l;
1378         int h;
1379         struct tcppriv *tpriv;
1380
1381         tpriv = s->p->priv;
1382
1383         /* find a call in limbo */
1384         h = hashipa(src, segp->source);
1385         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1386                 lp = *l;
1387                 if (lp->lport != segp->dest || lp->rport != segp->source
1388                         || lp->version != version)
1389                         continue;
1390                 if (ipcmp(lp->laddr, dst) != 0)
1391                         continue;
1392                 if (ipcmp(lp->raddr, src) != 0)
1393                         continue;
1394
1395                 /* RST can only follow the SYN */
1396                 if (segp->seq == lp->irs + 1) {
1397                         tpriv->nlimbo--;
1398                         *l = lp->next;
1399                         kfree(lp);
1400                 }
1401                 break;
1402         }
1403 }
1404
1405 /* The advertised MSS (e.g. 1460) includes any per-packet TCP options, such as
1406  * TCP timestamps.  A given packet will contain mss bytes, but only typical_mss
1407  * bytes of *data*.  If we know we'll use those options, we should adjust our
1408  * typical_mss, which will affect the cwnd. */
1409 static void adjust_typical_mss_for_opts(Tcp *tcph, Tcpctl *tcb)
1410 {
1411         uint16_t opt_size = 0;
1412
1413         if (tcph->ts_val)
1414                 opt_size += TS_LENGTH + TS_SEND_PREPAD;
1415         opt_size = ROUNDUP(opt_size, 4);
1416         tcb->typical_mss -= opt_size;
1417 }
1418
1419 /*
1420  *  come here when we finally get an ACK to our SYN-ACK.
1421  *  lookup call in limbo.  if found, create a new conversation
1422  *
1423  *  called with proto locked
1424  */
/* The handshake-completing ACK for one of our SYN ACKs has arrived.  Find
 * the matching limbo entry, create a new conversation cloned from the
 * listener s, and move it straight to Established.  Returns the new conv, or
 * NULL if the segment doesn't match anything in limbo.  Called with the
 * proto locked. */
static struct conv *tcpincoming(struct conv *s, Tcp *segp, uint8_t *src,
								uint8_t *dst, uint8_t version)
{
	struct conv *new;
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	Limbo *lp, **l;
	int h;

	/* unless it's just an ack, it can't be someone coming out of limbo */
	if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
		return NULL;

	tpriv = s->p->priv;

	/* find a call in limbo */
	h = hashipa(src, segp->source);
	for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
		netlog(s->p->f, Logtcp,
			   "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n",
			   src, segp->source, lp->raddr, lp->rport, dst,
			   segp->dest, lp->laddr, lp->lport, version,
			   lp->version);

		if (lp->lport != segp->dest || lp->rport != segp->source
			|| lp->version != version)
			continue;
		if (ipcmp(lp->laddr, dst) != 0)
			continue;
		if (ipcmp(lp->raddr, src) != 0)
			continue;

		/* we're assuming no data with the initial SYN */
		if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
			netlog(s->p->f, Logtcp,
			       "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
			       segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
			lp = NULL;
		} else {
			/* match: unlink from limbo; lp is freed below */
			tpriv->nlimbo--;
			*l = lp->next;
		}
		break;
	}
	if (lp == NULL)
		return NULL;

	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
	if (new == NULL)
		return NULL;

	/* clone the listener's ptcl, then fix up everything per-connection */
	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
	tcb = (Tcpctl *) new->ptcl;
	tcb->flags &= ~CLONE;
	/* the copied timers still point at the listener; re-aim them */
	tcb->timer.arg = new;
	tcb->timer.state = TcptimerOFF;
	tcb->acktimer.arg = new;
	tcb->acktimer.state = TcptimerOFF;
	tcb->katimer.arg = new;
	tcb->katimer.state = TcptimerOFF;
	tcb->rtt_timer.arg = new;
	tcb->rtt_timer.state = TcptimerOFF;

	tcb->irs = lp->irs;
	tcb->rcv.nxt = tcb->irs + 1;
	tcb->rcv.urg = tcb->rcv.nxt;

	/* +1 accounts for our SYN, which their ACK covered */
	tcb->iss = lp->iss;
	tcb->rttseq = tcb->iss;
	tcb->snd.wl2 = tcb->iss;
	tcb->snd.una = tcb->iss + 1;
	tcb->snd.rtx = tcb->iss + 1;
	tcb->snd.nxt = tcb->iss + 1;
	tcb->flgcnt = 0;
	tcb->flags |= SYNACK;

	/* our sending max segment size cannot be bigger than what he asked for
	 */
	if (lp->mss != 0 && lp->mss < tcb->mss) {
		tcb->mss = lp->mss;
		tcb->typical_mss = tcb->mss;
	}
	adjust_typical_mss_for_opts(segp, tcb);

	/* Here's where we record the previously-decided header options.  They
	 * were actually decided on when we agreed to them in the SYNACK we
	 * sent.  We didn't create an actual TCB until now, so we can copy those
	 * decisions out of the limbo tracker and into the TCB. */
	tcb->ifc = lp->ifc;
	tcb->sack_ok = lp->sack_ok;
	/* window scaling */
	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
	tcb_check_tso(tcb);

	tcb->snd.wnd = segp->wnd;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* set initial round trip time */
	tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
	tcpsynackrtt(new);

	kfree(lp);

	/* set up proto header */
	switch (version) {
	case V4:
		h4 = &tcb->protohdr.tcp4hdr;
		memset(h4, 0, sizeof(*h4));
		h4->proto = IP_TCPPROTO;
		hnputs(h4->tcpsport, new->lport);
		hnputs(h4->tcpdport, new->rport);
		v6tov4(h4->tcpsrc, dst);
		v6tov4(h4->tcpdst, src);
		break;
	case V6:
		h6 = &tcb->protohdr.tcp6hdr;
		memset(h6, 0, sizeof(*h6));
		h6->proto = IP_TCPPROTO;
		hnputs(h6->tcpsport, new->lport);
		hnputs(h6->tcpdport, new->rport);
		ipmove(h6->tcpsrc, dst);
		ipmove(h6->tcpdst, src);
		break;
	default:
		panic("tcpincoming: version %d", new->ipversion);
	}

	tcpsetstate(new, Established);

	iphtadd(&tpriv->ht, new);

	return new;
}
1560
1561 /*
1562  *  use the time between the first SYN and it's ack as the
1563  *  initial round trip time
1564  */
1565 static void tcpsynackrtt(struct conv *s)
1566 {
1567         Tcpctl *tcb;
1568         uint64_t delta;
1569         struct tcppriv *tpriv;
1570
1571         tcb = (Tcpctl *) s->ptcl;
1572         tpriv = s->p->priv;
1573
1574         delta = NOW - tcb->sndsyntime;
1575         tcb->srtt = delta;
1576         tcb->mdev = delta / 2;
1577
1578         /* halt round trip timer */
1579         tcphalt(tpriv, &tcb->rtt_timer);
1580 }
1581
1582 /* For LFNs (long/fat), our default tx queue doesn't hold enough data, and TCP
1583  * blocks on the application - even if the app already has the data ready to go.
1584  * We need to hold the sent, unacked data (1x cwnd), plus all the data we might
1585  * send next RTT (1x cwnd).  Note this is called after cwnd was expanded. */
1586 static void adjust_tx_qio_limit(struct conv *s)
1587 {
1588         Tcpctl *tcb = (Tcpctl *) s->ptcl;
1589         size_t ideal_limit = tcb->cwind * 2;
1590
1591         /* This is called for every ACK, and it's not entirely free to update
1592          * the limit (locks, CVs, taps).  Updating in chunks of mss seems
1593          * reasonable.  During SS, we'll update this on most ACKs (given each
1594          * ACK increased the cwind by > MSS).
1595          *
1596          * We also don't want a lot of tiny blocks from the user, but the way
1597          * qio works, you can put in as much as you want (Maxatomic) and then
1598          * get flow-controlled. */
1599         if (qgetlimit(s->wq) + tcb->typical_mss < ideal_limit)
1600                 qsetlimit(s->wq, ideal_limit);
1601         /* TODO: we could shrink the qio limit too, if we had a better idea what
1602          * the actual threshold was.  We want the limit to be the 'stable' cwnd
1603          * times 2. */
1604 }
1605
1606 /* Attempts to merge later sacks into sack 'into' (index in the array) */
/* Attempts to merge later sacks into sack 'into' (index in the array) */
static void merge_sacks_into(Tcpctl *tcb, int into)
{
	struct sack_block *into_sack = &tcb->snd.sacks[into];
	struct sack_block *tcb_sack;
	int shift = 0;

	/* The sacks are ordered by left edge (see sack_asserter's invariant),
	 * so scan the ones after 'into' and absorb every sack that overlaps
	 * or abuts into_sack's right edge, possibly extending that edge. */
	for (int i = into + 1; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(into_sack->right, tcb_sack->left))
			break;
		if (seq_gt(tcb_sack->right, into_sack->right))
			into_sack->right = tcb_sack->right;
		shift++;
	}
	/* Close the gap left by the 'shift' absorbed sacks. */
	if (shift) {
		memmove(tcb->snd.sacks + into + 1,
			tcb->snd.sacks + into + 1 + shift,
			sizeof(struct sack_block) * (tcb->snd.nr_sacks - into -
						     1 - shift));
		tcb->snd.nr_sacks -= shift;
	}
}
1629
1630 /* If we update a sack, it means they received a packet (possibly out of order),
1631  * but they have not received earlier packets.  Otherwise, they would do a full
1632  * ACK.
1633  *
1634  * The trick is in knowing whether the reception growing this sack is due to a
1635  * retrans or due to packets from before our last loss event.  The rightmost
1636  * sack tends to grow a lot with packets we sent before the loss.  However,
1637  * intermediate sacks that grow are signs of a loss, since they only grow as a
1638  * result of retrans.
1639  *
1640  * This is only true for the first time through a retrans.  After we've gone
1641  * through a full retrans blast, the sack that hinted at the retrans loss (and
1642  * there could be multiple of them!) will continue to grow.  We could come up
1643  * with some tracking for this, but instead we'll just do a one-time deal.  You
1644  * can recover from one detected sack retrans loss.  After that, you'll have to
1645  * use the RTO.
1646  *
1647  * This won't catch some things, like a sack that grew and merged with the
1648  * rightmost sack.  This also won't work if you have a single sack.  We can't
1649  * tell where the retrans ends and the sending begins. */
1650 static bool sack_hints_at_loss(Tcpctl *tcb, struct sack_block *tcb_sack)
1651 {
1652         if (tcb->snd.recovery != SACK_RETRANS_RECOVERY)
1653                 return FALSE;
1654         return &tcb->snd.sacks[tcb->snd.nr_sacks - 1] != tcb_sack;
1655 }
1656
1657 static bool sack_contains(struct sack_block *tcb_sack, uint32_t seq)
1658 {
1659         return seq_le(tcb_sack->left, seq) && seq_lt(seq, tcb_sack->right);
1660 }
1661
1662 /* Debugging helper! */
1663 static void sack_asserter(Tcpctl *tcb, char *str)
1664 {
1665         struct sack_block *tcb_sack;
1666
1667         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
1668                 tcb_sack = &tcb->snd.sacks[i];
1669                 /* Checking invariants: snd.rtx is never inside a sack, sacks
1670                  * are always mutually exclusive. */
1671                 if (sack_contains(tcb_sack, tcb->snd.rtx) ||
1672                     ((i + 1 < tcb->snd.nr_sacks) &&
1673                      seq_ge(tcb_sack->right, (tcb_sack + 1)->left))) {
1674                         printk("SACK ASSERT ERROR at %s\n", str);
1675                         printk("rtx %u una %u nxt %u, sack [%u, %u)\n",
1676                                tcb->snd.rtx, tcb->snd.una, tcb->snd.nxt,
1677                                tcb_sack->left, tcb_sack->right);
1678                         for (int i = 0; i < tcb->snd.nr_sacks; i++)
1679                                 printk("\t %d: [%u, %u)\n", i,
1680                                        tcb->snd.sacks[i].left,
1681                                        tcb->snd.sacks[i].right);
1682                         backtrace();
1683                         panic("");
1684                 }
1685         }
1686 }
1687
1688 /* Updates bookkeeping whenever a sack is added or updated */
static void sack_has_changed(struct conv *s, Tcpctl *tcb,
                             struct sack_block *tcb_sack)
{
	/* Due to the change, snd.rtx might be in the middle of this sack.
	 * Advance it to the right edge. */
	if (sack_contains(tcb_sack, tcb->snd.rtx))
		tcb->snd.rtx = tcb_sack->right;

	/* This is a sack for something we retransed and we think it means there
	 * was another loss.  Instead of waiting for the RTO, we can take
	 * action. */
	if (sack_hints_at_loss(tcb, tcb_sack)) {
		/* Require TCPREXMTTHRESH hints before declaring a loss, like
		 * dupack counting. */
		if (++tcb->snd.sack_loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: sack rxmit loss: snd.rtx %u, sack [%u,%u), una %u, recovery_pt %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.rtx, tcb_sack->left, tcb_sack->right,
			       tcb->snd.una, tcb->snd.recovery_pt);
			/* Redo retrans, but keep the sacks and recovery point*/
			tcp_loss_event(s, tcb);
			tcb->snd.rtx = tcb->snd.una;
			tcb->snd.sack_loss_hint = 0;
			/* Act like an RTO.  We just detected it earlier.  This
			 * prevents us from getting another sack hint loss this
			 * recovery period and from advancing the opportunistic
			 * right edge. */
			tcb->snd.recovery = RTO_RETRANS_RECOVERY;
			/* We didn't actually time out yet and we expect to keep
			 * getting sacks, so we don't want to flush or worry
			 * about in_flight.  If we messed something up, the RTO
			 * will still fire. */
			set_in_flight(tcb);
		}
	}
}
1724
1725 /* Advances tcb_sack's right edge, if new_right is farther, and updates the
1726  * bookkeeping due to the change. */
1727 static void update_right_edge(struct conv *s, Tcpctl *tcb,
1728                               struct sack_block *tcb_sack, uint32_t new_right)
1729 {
1730         if (seq_le(new_right, tcb_sack->right))
1731                 return;
1732         tcb_sack->right = new_right;
1733         merge_sacks_into(tcb, tcb_sack - tcb->snd.sacks);
1734         sack_has_changed(s, tcb, tcb_sack);
1735 }
1736
/* Folds seg_sack (one sack option from an inbound segment) into the TCB's
 * sorted snd.sacks array: grow an existing sack it touches, insert it in
 * sorted position, or - when the array is full - take over the rightmost
 * slot so we never lose track of the highest sack. */
static void update_or_insert_sack(struct conv *s, Tcpctl *tcb,
                                  struct sack_block *seg_sack)
{
	struct sack_block *tcb_sack;

	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb_sack->left, seg_sack->left)) {
			/* This includes adjacent (which I've seen!) and
			 * overlap. */
			if (seq_le(seg_sack->left, tcb_sack->right)) {
				update_right_edge(s, tcb, tcb_sack,
						  seg_sack->right);
				return;
			}
			continue;
		}
		/* Update existing sack */
		if (tcb_sack->left == seg_sack->left) {
			update_right_edge(s, tcb, tcb_sack, seg_sack->right);
			return;
		}
		/* Found our slot */
		if (seq_gt(tcb_sack->left, seg_sack->left)) {
			if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
				/* Out of room, but it is possible this sack
				 * overlaps later sacks, including the max
				 * sack's right edge. */
				if (seq_ge(seg_sack->right, tcb_sack->left)) {
					/* Take over the sack */
					tcb_sack->left = seg_sack->left;
					update_right_edge(s, tcb, tcb_sack,
							  seg_sack->right);
				}
				return;
			}
			/* O/W, it's our slot and we have room (at least one
			 * spot). */
			memmove(&tcb->snd.sacks[i + 1], &tcb->snd.sacks[i],
				sizeof(struct sack_block) * (tcb->snd.nr_sacks -
							     i));
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			tcb->snd.nr_sacks++;
			merge_sacks_into(tcb, i);
			sack_has_changed(s, tcb, tcb_sack);
			return;
		}
	}
	/* seg_sack is to the right of every existing sack. */
	if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
		/* We didn't find space in the sack array. */
		tcb_sack = &tcb->snd.sacks[MAX_NR_SND_SACKS - 1];
		/* Need to always maintain the rightmost sack, discarding the
		 * prev */
		if (seq_gt(seg_sack->right, tcb_sack->right)) {
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			sack_has_changed(s, tcb, tcb_sack);
		}
		return;
	}
	/* Append as the new rightmost sack in the next free slot. */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks];
	tcb->snd.nr_sacks++;
	tcb_sack->left = seg_sack->left;
	tcb_sack->right = seg_sack->right;
	sack_has_changed(s, tcb, tcb_sack);
}
1804
1805 /* Given the packet seg, track the sacks in TCB.  There are a few things: if seg
1806  * acks new data, some sacks might no longer be needed.  Some sacks might grow,
1807  * we might add new sacks, either of which can cause a merger.
1808  *
1809  * The important thing is that we always have the max sack entry: it must be
1810  * inserted for sure and findable.  We need that for our measurement of what
1811  * packets are in the network.
1812  *
1813  * Note that we keep sacks that are below snd.rtx (and above
1814  * seg.ack/tcb->snd.una) as best we can - we don't prune them.  We'll need those
1815  * for the in_flight estimate.
1816  *
1817  * When we run out of room, we'll have to throw away a sack.  Anything we throw
1818  * away below snd.rtx will be counted as 'in flight', even though it isn't.  If
1819  * we throw away something greater than snd.rtx, we'll also retrans it.  For
1820  * simplicity, we throw-away / replace the rightmost sack, since we're always
1821  * maintaining a highest sack. */
static void update_sacks(struct conv *s, Tcpctl *tcb, Tcp *seg)
{
	int prune = 0;
	struct sack_block *tcb_sack;

	/* Count the leading sacks whose left edge the cumulative ack reached
	 * or passed; those are fully acked (or reneged) and get dropped. */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		/* For the equality case, if they acked up to, but not including
		 * an old sack, they must have reneged it.  Otherwise they would
		 * have acked beyond the sack. */
		if (seq_lt(seg->ack, tcb_sack->left))
			break;
		prune++;
	}
	if (prune) {
		/* Shift the survivors down over the pruned entries. */
		memmove(tcb->snd.sacks, tcb->snd.sacks + prune,
			sizeof(struct sack_block) * (tcb->snd.nr_sacks -
						     prune));
		tcb->snd.nr_sacks -= prune;
	}
	/* Fold each sane sack from the segment into our tracking. */
	for (int i = 0; i < seg->nr_sacks; i++) {
		/* old sacks */
		if (seq_lt(seg->sacks[i].left, seg->ack))
			continue;
		/* buggy sack: out of range */
		if (seq_gt(seg->sacks[i].right, tcb->snd.nxt))
			continue;
		update_or_insert_sack(s, tcb, &seg->sacks[i]);
	}
}
1852
1853 /* This is a little bit of an under estimate, since we assume a packet is lost
1854  * once we have any sacks above it.  Overall, it's at most 2 * MSS of an
1855  * overestimate.
1856  *
1857  * If we have no sacks (either reneged or never used) we'll assume all packets
1858  * above snd.rtx are lost.  This will be the case for sackless fast rxmit
1859  * (Dong's stuff) or for a timeout.  In the former case, this is probably not
1860  * true, and in_flight should be higher, but we have no knowledge without the
1861  * sacks. */
static void set_in_flight(Tcpctl *tcb)
{
	struct sack_block *tcb_sack;
	uint32_t in_flight = 0;
	uint32_t from;

	/* No sacks: everything we've (re)transmitted and not had acked, i.e.
	 * [una, rtx), counts as in flight. */
	if (!tcb->snd.nr_sacks) {
		tcb->snd.in_flight = tcb->snd.rtx - tcb->snd.una;
		return;
	}

	/* Everything to the right of the unsacked */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
	in_flight += tcb->snd.nxt - tcb_sack->right;

	/* Everything retransed (from una to snd.rtx, minus sacked regions.
	 * Note we only retrans at most the last sack's left edge.  snd.rtx will
	 * be advanced to the right edge of some sack (possibly the last one).
	 * */
	from = tcb->snd.una;
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_ge(tcb_sack->left, tcb->snd.rtx))
			break;
		assert(seq_ge(tcb->snd.rtx, tcb_sack->right));
		/* Count the unsacked gap below this sack, then hop over the
		 * sacked region. */
		in_flight += tcb_sack->left - from;
		from = tcb_sack->right;
	}
	/* The remaining retransmitted tail, below rtx but above the last sack
	 * we walked past. */
	in_flight += tcb->snd.rtx - from;

	tcb->snd.in_flight = in_flight;
}
1894
1895 static void reset_recovery(struct conv *s, Tcpctl *tcb)
1896 {
1897         netlog(s->p->f, Logtcprxmt,
1898                "%I.%d -> %I.%d: recovery complete, una %u, rtx %u, nxt %u, recovery %u\n",
1899                s->laddr, s->lport, s->raddr, s->rport,
1900                tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, tcb->snd.recovery_pt);
1901         tcb->snd.recovery = 0;
1902         tcb->snd.recovery_pt = 0;
1903         tcb->snd.loss_hint = 0;
1904         tcb->snd.flush_sacks = FALSE;
1905         tcb->snd.sack_loss_hint = 0;
1906 }
1907
1908 static bool is_dup_ack(Tcpctl *tcb, Tcp *seg)
1909 {
1910         /* this is a pure ack w/o window update */
1911         return (seg->ack == tcb->snd.una) &&
1912                (tcb->snd.una != tcb->snd.nxt) &&
1913                (seg->len == 0) &&
1914                (seg->wnd == tcb->snd.wnd);
1915 }
1916
1917 /* If we have sacks, we'll ignore dupacks and look at the sacks ahead of una
1918  * (which are managed by the TCB).  The tcb will not have old sacks (below
1919  * ack/snd.rtx).  Receivers often send sacks below their ack point when we are
1920  * coming out of a loss, and we don't want those to count.
1921  *
1922  * Note the tcb could have sacks (in the future), but the receiver stopped using
1923  * them (reneged).  We'll catch that with the RTO.  If we try to catch it here,
1924  * we could get in a state where we never allow them to renege. */
1925 static bool is_potential_loss(Tcpctl *tcb, Tcp *seg)
1926 {
1927         if (seg->nr_sacks > 0)
1928                 return tcb->snd.nr_sacks > 0;
1929         else
1930                 return is_dup_ack(tcb, seg);
1931 }
1932
1933 /* When we use timestamps for RTTM, RFC 7323 suggests scaling by
1934  * expected_samples (per cwnd).  They say:
1935  *
1936  * ExpectedSamples = ceiling(FlightSize / (SMSS * 2))
1937  *
1938  * However, SMMS * 2 is really "number of bytes expected to be acked in a
1939  * packet.".  We'll use 'acked' to approximate that.  When the receiver uses
1940  * LRO, they'll send back large ACKs, which decreases the number of samples.
1941  *
1942  * If it turns out that all the divides are bad, we can just go back to not
1943  * using expected_samples at all. */
1944 static int expected_samples_ts(Tcpctl *tcb, uint32_t acked)
1945 {
1946         assert(acked);
1947         return MAX(DIV_ROUND_UP(tcb->snd.nxt - tcb->snd.una, acked), 1);
1948 }
1949
/* Updates the RTT, given the currently sampled RTT and the number samples per
 * cwnd.  For non-TS RTTM, that'll be 1.
 *
 * srtt and mdev are exponential moving averages (shifted by RTTM_ALPHA_SHIFT
 * and RTTM_BRAVO_SHIFT); each delta is further divided by expected_samples so
 * many per-ack timestamp samples don't move the averages faster than one
 * classic sample per window would. */
static void update_rtt(Tcpctl *tcb, int rtt_sample, int expected_samples)
{
	int delta;

	tcb->backoff = 0;
	tcb->backedoff = 0;
	if (tcb->srtt == 0) {
		/* First sample: seed srtt directly, mdev at half the sample. */
		tcb->srtt = rtt_sample;
		tcb->mdev = rtt_sample / 2;
	} else {
		delta = rtt_sample - tcb->srtt;
		tcb->srtt += (delta >> RTTM_ALPHA_SHIFT) / expected_samples;
		/* Clamp both estimators at a floor of 1 (0 would break the
		 * timeout math). */
		if (tcb->srtt <= 0)
			tcb->srtt = 1;
		tcb->mdev += ((abs(delta) - tcb->mdev) >> RTTM_BRAVO_SHIFT) /
			     expected_samples;
		if (tcb->mdev <= 0)
			tcb->mdev = 1;
	}
	/* Recompute the timer settings from the new estimates. */
	tcpsettimer(tcb);
}
1973
/* Processes the ACK portion of an inbound segment against our send state:
 * advances snd.una/snd.rtx, updates sack and in-flight accounting, detects
 * loss hints and enters recovery, applies the peer's window update, grows
 * cwind (slow start / congestion avoidance), samples RTT, and discards acked
 * bytes from the send queue.  Called from the input path with s->qlock held
 * (see tcpiput). */
static void update(struct conv *s, Tcp *seg)
{
	int rtt;
	Tcpctl *tcb;
	uint32_t acked, expand;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	/* Ignore acks outside [una, nxt]: old duplicates or acks of data we
	 * never sent. */
	if (!seq_within(seg->ack, tcb->snd.una, tcb->snd.nxt))
		return;

	acked = seg->ack - tcb->snd.una;
	tcb->snd.una = seg->ack;
	/* The retransmit pointer never lags the cumulative ack. */
	if (seq_gt(seg->ack, tcb->snd.rtx))
		tcb->snd.rtx = seg->ack;

	update_sacks(s, tcb, seg);
	set_in_flight(tcb);

	/* We treat either a dupack or forward SACKs as a hint that there is a
	 * loss.  The RFCs suggest three dupacks before treating it as a loss
	 * (alternative is reordered packets).  We'll treat three SACKs the same
	 * way. */
	if (is_potential_loss(tcb, seg) && !tcb->snd.recovery) {
		tcb->snd.loss_hint++;
		if (tcb->snd.loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: loss hint thresh, nr sacks %u, nxt %u, una %u, cwnd %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.nr_sacks, tcb->snd.nxt, tcb->snd.una,
			       tcb->cwind);
			tcp_loss_event(s, tcb);
			tcb->snd.recovery_pt = tcb->snd.nxt;
			if (tcb->snd.nr_sacks) {
				tcb->snd.recovery = SACK_RETRANS_RECOVERY;
				tcb->snd.flush_sacks = FALSE;
				tcb->snd.sack_loss_hint = 0;
			} else {
				tcb->snd.recovery = FAST_RETRANS_RECOVERY;
			}
			tcprxmit(s);
		}
	}

	/*
	 *  update window
	 */
	if (seq_gt(seg->ack, tcb->snd.wl2)
		|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
		tcb->snd.wnd = seg->wnd;
		tcb->snd.wl2 = seg->ack;
	}

	if (!acked) {
		/*
		 *  don't let us hangup if sending into a closed window and
		 *  we're still getting acks
		 */
		if (tcb->snd.recovery && (tcb->snd.wnd == 0))
			tcb->backedoff = MAXBACKMS / 4;
		return;
	}
	/* At this point, they have acked something new. (positive ack, ack >
	 * una).
	 *
	 * If we hadn't reached the threshold for recovery yet, the positive ACK
	 * will reset our loss_hint count. */
	if (!tcb->snd.recovery)
		tcb->snd.loss_hint = 0;
	else if (seq_ge(seg->ack, tcb->snd.recovery_pt))
		reset_recovery(s, tcb);

	/* avoid slow start and timers for SYN acks */
	if ((tcb->flags & SYNACK) == 0) {
		tcb->flags |= SYNACK;
		acked--;
		tcb->flgcnt--;
		goto done;
	}

	/* slow start as long as we're not recovering from lost packets */
	if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
		if (tcb->cwind < tcb->ssthresh) {
			/* We increase the cwind by every byte we receive.  We
			 * want to increase the cwind by one MSS for every MSS
			 * that gets ACKed.  Note that multiple MSSs can be
			 * ACKed in a single ACK.  If we had a remainder of
			 * acked / MSS, we'd add just that remainder - not 0 or
			 * 1 MSS. */
			expand = acked;
		} else {
			/* Every RTT, which consists of CWND bytes, we're
			 * supposed to expand by MSS bytes.  The classic
			 * algorithm was
			 *      expand = (tcb->mss * tcb->mss) / tcb->cwind;
			 * which assumes the ACK was for MSS bytes.  Instead,
			 * for every 'acked' bytes, we increase the window by
			 * acked / CWND (in units of MSS). */
			expand = MAX(acked, tcb->typical_mss) * tcb->typical_mss
				 / tcb->cwind;
		}

		/* NOTE(review): the first test catches unsigned wraparound of
		 * cwind + expand; both clamps resolve to the snd.wnd limit. */
		if (tcb->cwind + expand < tcb->cwind)
			expand = tcb->snd.wnd - tcb->cwind;
		if (tcb->cwind + expand > tcb->snd.wnd)
			expand = tcb->snd.wnd - tcb->cwind;
		tcb->cwind += expand;
	}
	adjust_tx_qio_limit(s);

	/* Prefer timestamp-based RTT sampling; otherwise fall back to the
	 * rtt_timer tick count for the one timed sequence (rttseq). */
	if (tcb->ts_recent) {
		update_rtt(tcb, abs(milliseconds() - seg->ts_ecr),
			   expected_samples_ts(tcb, acked));
	} else if (tcb->rtt_timer.state == TcptimerON &&
		   seq_ge(seg->ack, tcb->rttseq)) {
		/* Adjust the timers according to the round trip time */
		tcphalt(tpriv, &tcb->rtt_timer);
		if (!tcb->snd.recovery) {
			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
			if (rtt == 0) {
				/* o/w all close systems will rxmit in 0 time */
				rtt = 1;
			}
			rtt *= MSPTICK;
			update_rtt(tcb, rtt, 1);
		}
	}

done:
	if (qdiscard(s->wq, acked) < acked) {
		tcb->flgcnt--;
		/* This happened due to another bug where acked was very large
		 * (negative), which was interpreted as "hey, one less flag,
		 * since they acked one of our flags (like a SYN).  If flgcnt
		 * goes negative, get_xmit_segment() will attempt to send out
		 * large packets. */
		assert(tcb->flgcnt >= 0);
	}

	if (seq_gt(seg->ack, tcb->snd.urg))
		tcb->snd.urg = seg->ack;

	/* Run the retransmit timer while data is outstanding; halt otherwise. */
	if (tcb->snd.una != tcb->snd.nxt)
		tcpgo(tpriv, &tcb->timer);
	else
		tcphalt(tpriv, &tcb->timer);

	tcb->backoff = 0;
	tcb->backedoff = 0;
}
2126
2127 static void update_tcb_ts(Tcpctl *tcb, Tcp *seg)
2128 {
2129         /* Get timestamp info from the tcp header.  Even though the timestamps
2130          * aren't sequence numbers, we still need to protect for wraparound.
2131          * Though if the values were 0, assume that means we need an update.  We
2132          * could have an initial ts_val that appears negative (signed). */
2133         if (!tcb->ts_recent || !tcb->last_ack_sent ||
2134             (seq_ge(seg->ts_val, tcb->ts_recent) &&
2135              seq_le(seg->seq, tcb->last_ack_sent)))
2136                 tcb->ts_recent = seg->ts_val;
2137 }
2138
2139 /* Overlap happens when one sack's left edge is inside another sack. */
2140 static bool sacks_overlap(struct sack_block *x, struct sack_block *y)
2141 {
2142         return (seq_le(x->left, y->left) && seq_le(y->left, x->right)) ||
2143                (seq_le(y->left, x->left) && seq_le(x->left, y->right));
2144 }
2145
2146 static void make_sack_first(Tcpctl *tcb, struct sack_block *tcb_sack)
2147 {
2148         struct sack_block temp;
2149
2150         if (tcb_sack == &tcb->rcv.sacks[0])
2151                 return;
2152         temp = tcb->rcv.sacks[0];
2153         tcb->rcv.sacks[0] = *tcb_sack;
2154         *tcb_sack = temp;
2155 }
2156
/* Track sack in our tcb for a block of data we received.  This handles all the
 * stuff: making sure sack is first (since it's the most recent sack change),
 * updating or merging sacks, and dropping excess sacks (we only need to
 * maintain 3).  Unlike on the snd side, our tcb sacks are *not* sorted. */
static void track_rcv_sack(Tcpctl *tcb, uint32_t left, uint32_t right)
{
	struct sack_block *tcb_sack;
	struct sack_block sack[1];

	/* Nothing to do if sacks weren't negotiated, or for an empty block. */
	if (!tcb->sack_ok)
		return;
	if (left == right)
		return;
	assert(seq_lt(left, right));
	sack->left = left;
	sack->right = right;
	/* We can reuse an existing sack if we're merging or overlapping. */
	for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
		tcb_sack = &tcb->rcv.sacks[i];
		if (sacks_overlap(tcb_sack, sack)) {
			/* Grow the existing sack to the union of the two, and
			 * promote it to the front. */
			tcb_sack->left = seq_min(tcb_sack->left, sack->left);
			tcb_sack->right = seq_max(tcb_sack->right, sack->right);
			make_sack_first(tcb, tcb_sack);
			return;
		}
	}
	/* We can discard the last sack (right shift) - we should have sent it
	 * at least once by now.  If not, oh well. */
	memmove(tcb->rcv.sacks + 1, tcb->rcv.sacks, sizeof(struct sack_block) *
		MIN(MAX_NR_RCV_SACKS - 1, tcb->rcv.nr_sacks));
	tcb->rcv.sacks[0] = *sack;
	if (tcb->rcv.nr_sacks < MAX_NR_RCV_SACKS)
		tcb->rcv.nr_sacks++;
}
2191
2192 /* Once we receive everything and move rcv.nxt past a sack, we don't need to
2193  * track it.  I've seen Linux report sacks in the past, but we probably
2194  * shouldn't. */
2195 static void drop_old_rcv_sacks(Tcpctl *tcb)
2196 {
2197         struct sack_block *tcb_sack;
2198
2199         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2200                 tcb_sack = &tcb->rcv.sacks[i];
2201                 /* Moving up to or past the left is enough to drop it. */
2202                 if (seq_ge(tcb->rcv.nxt, tcb_sack->left)) {
2203                         memmove(tcb->rcv.sacks + i, tcb->rcv.sacks + i + 1,
2204                                 sizeof(struct sack_block) * (tcb->rcv.nr_sacks -
2205                                                              i - 1));
2206                         tcb->rcv.nr_sacks--;
2207                         i--;
2208                 }
2209         }
2210 }
2211
2212 static void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
2213 {
2214         ERRSTACK(1);
2215         Tcp seg;
2216         Tcp4hdr *h4;
2217         Tcp6hdr *h6;
2218         int hdrlen;
2219         Tcpctl *tcb;
2220         uint16_t length;
2221         uint8_t source[IPaddrlen], dest[IPaddrlen];
2222         struct conv *s;
2223         struct Fs *f;
2224         struct tcppriv *tpriv;
2225         uint8_t version;
2226
2227         f = tcp->f;
2228         tpriv = tcp->priv;
2229
2230         tpriv->stats[InSegs]++;
2231
2232         h4 = (Tcp4hdr *) (bp->rp);
2233         h6 = (Tcp6hdr *) (bp->rp);
2234
2235         if ((h4->vihl & 0xF0) == IP_VER4) {
2236                 uint8_t ttl;
2237
2238                 version = V4;
2239                 length = nhgets(h4->length);
2240                 v4tov6(dest, h4->tcpdst);
2241                 v4tov6(source, h4->tcpsrc);
2242
2243                 /* ttl isn't part of the xsum pseudo header, but bypass needs
2244                  * it. */
2245                 ttl = h4->Unused;
2246                 h4->Unused = 0;
2247                 hnputs(h4->tcplen, length - TCP4_PKT);
2248                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1])
2249                     && ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
2250                         tpriv->stats[CsumErrs]++;
2251                         tpriv->stats[InErrs]++;
2252                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2253                         freeblist(bp);
2254                         return;
2255                 }
2256                 h4->Unused = ttl;
2257
2258                 hdrlen = ntohtcp4(&seg, &bp);
2259                 if (hdrlen < 0) {
2260                         tpriv->stats[HlenErrs]++;
2261                         tpriv->stats[InErrs]++;
2262                         netlog(f, Logtcp, "bad tcp hdr len\n");
2263                         return;
2264                 }
2265
2266                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2267                 if (s && s->state == Bypass) {
2268                         bypass_or_drop(s, bp);
2269                         return;
2270                 }
2271
2272                 /* trim the packet to the size claimed by the datagram */
2273                 length -= hdrlen + TCP4_PKT;
2274                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
2275                 if (bp == NULL) {
2276                         tpriv->stats[LenErrs]++;
2277                         tpriv->stats[InErrs]++;
2278                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2279                         return;
2280                 }
2281         } else {
2282                 int ttl = h6->ttl;
2283                 int proto = h6->proto;
2284
2285                 version = V6;
2286                 length = nhgets(h6->ploadlen);
2287                 ipmove(dest, h6->tcpdst);
2288                 ipmove(source, h6->tcpsrc);
2289
2290                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2291                 h6->ttl = proto;
2292                 hnputl(h6->vcf, length);
2293                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2294                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
2295                         tpriv->stats[CsumErrs]++;
2296                         tpriv->stats[InErrs]++;
2297                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2298                         freeblist(bp);
2299                         return;
2300                 }
2301                 h6->ttl = ttl;
2302                 h6->proto = proto;
2303                 hnputs(h6->ploadlen, length);
2304
2305                 hdrlen = ntohtcp6(&seg, &bp);
2306                 if (hdrlen < 0) {
2307                         tpriv->stats[HlenErrs]++;
2308                         tpriv->stats[InErrs]++;
2309                         netlog(f, Logtcp, "bad tcp hdr len\n");
2310                         return;
2311                 }
2312
2313                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2314                 if (s && s->state == Bypass) {
2315                         bypass_or_drop(s, bp);
2316                         return;
2317                 }
2318
2319                 /* trim the packet to the size claimed by the datagram */
2320                 length -= hdrlen;
2321                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2322                 if (bp == NULL) {
2323                         tpriv->stats[LenErrs]++;
2324                         tpriv->stats[InErrs]++;
2325                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2326                         return;
2327                 }
2328         }
2329
2330         /* s, the conv matching the n-tuple, was set above */
2331         if (s == NULL) {
2332                 netlog(f, Logtcpreset,
2333                        "iphtlook failed: src %I:%u, dst %I:%u\n",
2334                        source, seg.source, dest, seg.dest);
2335 reset:
2336                 sndrst(tcp, source, dest, length, &seg, version,
2337                        "no conversation");
2338                 freeblist(bp);
2339                 return;
2340         }
2341
2342         /* lock protocol for unstate Plan 9 invariants.  funcs like limbo or
2343          * incoming might rely on it. */
2344         qlock(&tcp->qlock);
2345
2346         /* if it's a listener, look for the right flags and get a new conv */
2347         tcb = (Tcpctl *) s->ptcl;
2348         if (tcb->state == Listen) {
2349                 if (seg.flags & RST) {
2350                         limborst(s, &seg, source, dest, version);
2351                         qunlock(&tcp->qlock);
2352                         freeblist(bp);
2353                         return;
2354                 }
2355
2356                 /* if this is a new SYN, put the call into limbo */
2357                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2358                         limbo(s, source, dest, &seg, version);
2359                         qunlock(&tcp->qlock);
2360                         freeblist(bp);
2361                         return;
2362                 }
2363
2364                 /* if there's a matching call in limbo, tcpincoming will return
2365                  * it */
2366                 s = tcpincoming(s, &seg, source, dest, version);
2367                 if (s == NULL) {
2368                         qunlock(&tcp->qlock);
2369                         goto reset;
2370                 }
2371         }
2372
2373         /* The rest of the input state machine is run with the control block
2374          * locked and implements the state machine directly out of the RFC.
2375          * Out-of-band data is ignored - it was always a bad idea.
2376          */
2377         tcb = (Tcpctl *) s->ptcl;
2378         if (waserror()) {
2379                 qunlock(&s->qlock);
2380                 nexterror();
2381         }
2382         qlock(&s->qlock);
2383         qunlock(&tcp->qlock);
2384
2385         update_tcb_ts(tcb, &seg);
2386         /* fix up window */
2387         seg.wnd <<= tcb->rcv.scale;
2388
2389         /* every input packet in puts off the keep alive time out */
2390         tcpsetkacounter(tcb);
2391
2392         switch (tcb->state) {
2393         case Closed:
2394                 sndrst(tcp, source, dest, length, &seg, version,
2395                            "sending to Closed");
2396                 goto raise;
2397         case Syn_sent:
2398                 if (seg.flags & ACK) {
2399                         if (!seq_within(seg.ack, tcb->iss + 1,
2400                                         tcb->snd.nxt)) {
2401                                 sndrst(tcp, source, dest, length, &seg,
2402                                        version, "bad seq in Syn_sent");
2403                                 goto raise;
2404                         }
2405                 }
2406                 if (seg.flags & RST) {
2407                         if (seg.flags & ACK)
2408                                 localclose(s, "connection refused");
2409                         goto raise;
2410                 }
2411
2412                 if (seg.flags & SYN) {
2413                         procsyn(s, &seg);
2414                         if (seg.flags & ACK) {
2415                                 update(s, &seg);
2416                                 tcpsynackrtt(s);
2417                                 tcpsetstate(s, Established);
2418                                 /* Here's where we get the results of
2419                                  * header option negotiations for
2420                                  * connections we started. (SYNACK has
2421                                  * the response) */
2422                                 tcpsetscale(s, tcb, seg.ws, tcb->scale);
2423                                 tcb->sack_ok = seg.sack_ok;
2424                         } else {
2425                                 sndrst(tcp, source, dest, length, &seg,
2426                                        version, "Got SYN with no ACK");
2427                                 goto raise;
2428                         }
2429
2430                         if (length != 0 || (seg.flags & FIN))
2431                                 break;
2432
2433                         freeblist(bp);
2434                         goto output;
2435                 } else
2436                         freeblist(bp);
2437
2438                 qunlock(&s->qlock);
2439                 poperror();
2440                 return;
2441         }
2442
2443         /*
2444          *  One DOS attack is to open connections to us and then forget about
2445          *  them, thereby tying up a conv at no long term cost to the attacker.
2446          *  This is an attempt to defeat these stateless DOS attacks.  See
2447          *  corresponding code in tcpsendka().
2448          */
2449         if ((seg.flags & RST) == 0) {
2450                 if (tcpporthogdefense
2451                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2452                                                   tcb->snd.una - (1 << 29))) {
2453                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2454                                source, seg.source, dest, seg.dest, seg.flags,
2455                                tcb->snd.una - (1 << 31), seg.ack,
2456                                tcb->snd.una - (1 << 29));
2457                         localclose(s, "stateless hog");
2458                 }
2459         }
2460
2461         /* Cut the data to fit the receive window */
2462         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2463                 netlog(f, Logtcp, "%I.%d -> %I.%d: tcp len < 0, %lu %d\n",
2464                        s->raddr, s->rport, s->laddr, s->lport, seg.seq, length);
2465                 update(s, &seg);
2466                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2467                         tcphalt(tpriv, &tcb->rtt_timer);
2468                         tcphalt(tpriv, &tcb->acktimer);
2469                         tcphalt(tpriv, &tcb->katimer);
2470                         tcpsetstate(s, Time_wait);
2471                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2472                         tcpgo(tpriv, &tcb->timer);
2473                 }
2474                 if (!(seg.flags & RST)) {
2475                         tcb->flags |= FORCE;
2476                         goto output;
2477                 }
2478                 qunlock(&s->qlock);
2479                 poperror();
2480                 return;
2481         }
2482
2483         /* Cannot accept so answer with a rst */
2484         if (length && tcb->state == Closed) {
2485                 sndrst(tcp, source, dest, length, &seg, version,
2486                        "sending to Closed");
2487                 goto raise;
2488         }
2489
2490         /* The segment is beyond the current receive pointer so
2491          * queue the data in the resequence queue
2492          */
2493         if (seg.seq != tcb->rcv.nxt)
2494                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2495                         update(s, &seg);
2496                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2497                                 printd("reseq %I.%d -> %I.%d\n", s->raddr,
2498                                        s->rport, s->laddr, s->lport);
2499                         tcb->flags |= FORCE;
2500                         goto output;
2501                 }
2502
2503         /*
2504          *  keep looping till we've processed this packet plus any
2505          *  adjacent packets in the resequence queue
2506          */
2507         for (;;) {
2508                 if (seg.flags & RST) {
2509                         if (tcb->state == Established) {
2510                                 tpriv->stats[EstabResets]++;
2511                                 if (tcb->rcv.nxt != seg.seq)
2512                                         printd("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2513                                                s->raddr, s->rport, s->laddr,
2514                                                s->lport, tcb->rcv.nxt, seg.seq);
2515                         }
2516                         localclose(s, "connection refused");
2517                         goto raise;
2518                 }
2519
2520                 if ((seg.flags & ACK) == 0)
2521                         goto raise;
2522
2523                 switch (tcb->state) {
2524                 case Established:
2525                 case Close_wait:
2526                         update(s, &seg);
2527                         break;
2528                 case Finwait1:
2529                         update(s, &seg);
2530                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2531                                 tcphalt(tpriv, &tcb->rtt_timer);
2532                                 tcphalt(tpriv, &tcb->acktimer);
2533                                 tcpsetkacounter(tcb);
2534                                 tcb->time = NOW;
2535                                 tcpsetstate(s, Finwait2);
2536                                 tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2537                                 tcpgo(tpriv, &tcb->katimer);
2538                         }
2539                         break;
2540                 case Finwait2:
2541                         update(s, &seg);
2542                         break;
2543                 case Closing:
2544                         update(s, &seg);
2545                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2546                                 tcphalt(tpriv, &tcb->rtt_timer);
2547                                 tcphalt(tpriv, &tcb->acktimer);
2548                                 tcphalt(tpriv, &tcb->katimer);
2549                                 tcpsetstate(s, Time_wait);
2550                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2551                                 tcpgo(tpriv, &tcb->timer);
2552                         }
2553                         break;
2554                 case Last_ack:
2555                         update(s, &seg);
2556                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2557                                 localclose(s, NULL);
2558                                 goto raise;
2559                         }
2560                 case Time_wait:
2561                         if (seg.flags & FIN)
2562                                 tcb->flags |= FORCE;
2563                         if (tcb->timer.state != TcptimerON)
2564                                 tcpgo(tpriv, &tcb->timer);
2565                 }
2566
2567                 if ((seg.flags & URG) && seg.urg) {
2568                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2569                                 tcb->rcv.urg = seg.urg + seg.seq;
2570                                 pullblock(&bp, seg.urg);
2571                         }
2572                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2573                         tcb->rcv.urg = tcb->rcv.nxt;
2574
2575                 if (length == 0) {
2576                         if (bp != NULL)
2577                                 freeblist(bp);
2578                 } else {
2579                         switch (tcb->state) {
2580                         default:
2581                                 /* Ignore segment text */
2582                                 if (bp != NULL)
2583                                         freeblist(bp);
2584                                 break;
2585
2586                         case Established:
2587                         case Finwait1:
2588                                 /* If we still have some data place on
2589                                  * receive queue
2590                                  */
2591                                 if (bp) {
2592                                         bp = packblock(bp);
2593                                         if (bp == NULL)
2594                                                 panic("tcp packblock");
2595                                         qpassnolim(s->rq, bp);
2596                                         bp = NULL;
2597
2598                                         /*
2599                                          * Force an ack every 2 data messages.
2600                                          * This is a hack for rob to make his
2601                                          * home system run faster.
2602                                          *
2603                                          * this also keeps the standard TCP
2604                                          * congestion control working since it
2605                                          * needs an ack every 2 max segs worth.
2606                                          * This is not quite that, but under a
2607                                          * real stream is equivalent since every
2608                                          * packet has a max seg in it.
2609                                          */
2610                                         if (++(tcb->rcv.una) >= 2)
2611                                                 tcb->flags |= FORCE;
2612                                 }
2613                                 tcb->rcv.nxt += length;
2614                                 drop_old_rcv_sacks(tcb);
2615
2616                                 /*
2617                                  *  update our rcv window
2618                                  */
2619                                 tcprcvwin(s);
2620
2621                                 /*
2622                                  *  turn on the acktimer if there's something
2623                                  *  to ack
2624                                  */
2625                                 if (tcb->acktimer.state != TcptimerON)
2626                                         tcpgo(tpriv, &tcb->acktimer);
2627
2628                                 break;
2629                         case Finwait2:
2630                                 /* no process to read the data, send a reset */
2631                                 if (bp != NULL)
2632                                         freeblist(bp);
2633                                 sndrst(tcp, source, dest, length, &seg, version,
2634                                            "send to Finwait2");
2635                                 qunlock(&s->qlock);
2636                                 poperror();
2637                                 return;
2638                         }
2639                 }
2640
2641                 if (seg.flags & FIN) {
2642                         tcb->flags |= FORCE;
2643
2644                         switch (tcb->state) {
2645                         case Established:
2646                                 tcb->rcv.nxt++;
2647                                 tcpsetstate(s, Close_wait);
2648                                 break;
2649                         case Finwait1:
2650                                 tcb->rcv.nxt++;
2651                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2652                                         tcphalt(tpriv, &tcb->rtt_timer);
2653                                         tcphalt(tpriv, &tcb->acktimer);
2654                                         tcphalt(tpriv, &tcb->katimer);
2655                                         tcpsetstate(s, Time_wait);
2656                                         tcb->timer.start = MSL2 * (1000 /
2657                                                                    MSPTICK);
2658                                         tcpgo(tpriv, &tcb->timer);
2659                                 } else
2660                                         tcpsetstate(s, Closing);
2661                                 break;
2662                         case Finwait2:
2663                                 tcb->rcv.nxt++;
2664                                 tcphalt(tpriv, &tcb->rtt_timer);
2665                                 tcphalt(tpriv, &tcb->acktimer);
2666                                 tcphalt(tpriv, &tcb->katimer);
2667                                 tcpsetstate(s, Time_wait);
2668                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2669                                 tcpgo(tpriv, &tcb->timer);
2670                                 break;
2671                         case Close_wait:
2672                         case Closing:
2673                         case Last_ack:
2674                                 break;
2675                         case Time_wait:
2676                                 tcpgo(tpriv, &tcb->timer);
2677                                 break;
2678                         }
2679                 }
2680
2681                 /*
2682                  *  get next adjacent segment from the resequence queue.
2683                  *  dump/trim any overlapping segments
2684                  */
2685                 for (;;) {
2686                         if (tcb->reseq == NULL)
2687                                 goto output;
2688
2689                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2690                                 goto output;
2691
2692                         getreseq(tcb, &seg, &bp, &length);
2693
2694                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2695                                 break;
2696                 }
2697         }
2698 output:
2699         tcpoutput(s);
2700         qunlock(&s->qlock);
2701         poperror();
2702         return;
2703 raise:
2704         qunlock(&s->qlock);
2705         poperror();
2706         freeblist(bp);
2707         tcpkick(s);
2708 }
2709
2710 /* The advertised mss = data + TCP headers */
2711 static uint16_t derive_payload_mss(Tcpctl *tcb)
2712 {
2713         uint16_t payload_mss = tcb->mss;
2714         uint16_t opt_size = 0;
2715
2716         if (tcb->ts_recent) {
2717                 opt_size += TS_LENGTH;
2718                 /* Note that when we're a SYN, we overestimate slightly.  This
2719                  * is safe, and not really a problem. */
2720                 opt_size += TS_SEND_PREPAD;
2721         }
2722         if (tcb->rcv.nr_sacks)
2723                 opt_size += 2 + tcb->rcv.nr_sacks * 8;
2724         opt_size = ROUNDUP(opt_size, 4);
2725         payload_mss -= opt_size;
2726         return payload_mss;
2727 }
2728
2729 /* Decreases the xmit amt, given the MSS / TSO. */
2730 static uint32_t throttle_for_mss(Tcpctl *tcb, uint32_t ssize,
2731                                  uint16_t payload_mss, bool retrans)
2732 {
2733         if (ssize > payload_mss) {
2734                 if ((tcb->flags & TSO) == 0) {
2735                         ssize = payload_mss;
2736                 } else {
2737                         /* Don't send too much.  32K is arbitrary.. */
2738                         if (ssize > 32 * 1024)
2739                                 ssize = 32 * 1024;
2740                         if (!retrans) {
2741                                 /* Clamp xmit to an integral MSS to avoid ragged
2742                                  * tail segments causing poor link utilization.
2743                                  */
2744                                 ssize = ROUNDDOWN(ssize, payload_mss);
2745                         }
2746                 }
2747         }
2748         return ssize;
2749 }
2750
2751 /* Reduces ssize for a variety of reasons.  Returns FALSE if we should abort
2752  * sending the packet.  o/w returns TRUE and modifies ssize by reference. */
2753 static bool throttle_ssize(struct conv *s, Tcpctl *tcb, uint32_t *ssize_p,
2754                            uint16_t payload_mss, bool retrans)
2755 {
2756         struct Fs *f = s->p->f;
2757         uint32_t usable;
2758         uint32_t ssize = *ssize_p;
2759
2760         /* Compute usable segment based on offered window and limit
2761          * window probes to one */
2762         if (tcb->snd.wnd == 0) {
2763                 if (tcb->snd.in_flight != 0) {
2764                         if ((tcb->flags & FORCE) == 0)
2765                                 return FALSE;
2766                 }
2767                 usable = 1;
2768         } else {
2769                 usable = tcb->cwind;
2770                 if (tcb->snd.wnd < usable)
2771                         usable = tcb->snd.wnd;
2772                 if (usable > tcb->snd.in_flight)
2773                         usable -= tcb->snd.in_flight;
2774                 else
2775                         usable = 0;
2776                 /* Avoid Silly Window Syndrome.  This is a little different
2777                  * thant RFC 813.  I took their additional enhancement of "<
2778                  * MSS" as an AND, not an OR.  25% of a large snd.wnd is pretty
2779                  * large, and our main goal is to avoid packets smaller than
2780                  * MSS.  I still use the 25% threshold, because it is important
2781                  * that there is *some* data in_flight.  If usable < MSS because
2782                  * snd.wnd is very small (but not 0), we might never get an ACK
2783                  * and would need to set up a timer.
2784                  *
2785                  * Also, I'm using 'ssize' as a proxy for a PSH point.  If
2786                  * there's just a small blob in the qio (or retrans!), then we
2787                  * might as well just send it. */
2788                 if ((usable < tcb->typical_mss) && (usable < tcb->snd.wnd >> 2)
2789                     && (usable < ssize)) {
2790                         return FALSE;
2791                 }
2792         }
2793         if (ssize && usable < 2)
2794                 netlog(s->p->f, Logtcpverbose,
2795                        "%I.%d -> %I.%d: throttled snd.wnd %lu cwind %lu\n",
2796                        s->laddr, s->lport, s->raddr, s->rport,
2797                        tcb->snd.wnd, tcb->cwind);
2798         if (usable < ssize)
2799                 ssize = usable;
2800
2801         ssize = throttle_for_mss(tcb, ssize, payload_mss, retrans);
2802
2803         *ssize_p = ssize;
2804         return TRUE;
2805 }
2806
/* Helper, picks the next segment to send, which is possibly a retransmission.
 * Returns TRUE if we have a segment, FALSE o/w.  Returns ssize, from_seq, and
 * sent by reference.
 *
 * from_seq is the seq number we are transmitting from.
 *
 * sent includes all seq from una to from_seq *including* any previously sent
 * flags (part of tcb->flgcnt), for instance an unacknowledged SYN (which counts
 * as a seq number).  Those flags are in the e.g. snd.nxt - snd.una range, and
 * they get dropped after qdiscard.
 *
 * ssize is the amount of data we are sending, starting from from_seq, and it
 * will include any *new* flags, which haven't been accounted for yet.
 *
 * tcb->flgcnt consists of the flags both in ssize and in sent.
 *
 * Note that we could be in recovery and not sack_retrans a segment. */
static bool get_xmit_segment(struct conv *s, Tcpctl *tcb, uint16_t payload_mss,
                             uint32_t *from_seq_p, uint32_t *sent_p,
                             uint32_t *ssize_p)
{
	struct Fs *f = s->p->f;
	struct tcppriv *tpriv = s->p->priv;
	uint32_t ssize, sent, from_seq;
	bool sack_retrans = FALSE;
	struct sack_block *tcb_sack = 0;

	/* First preference: retransmit into the left-most SACK hole, i.e.
	 * unsacked data below a peer-reported sack block that snd.rtx has not
	 * yet covered. */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb->snd.rtx, tcb_sack->left)) {
			/* So ssize is supposed to include any *new* flags to
			 * flgcnt, which at this point would be a FIN.
			 *
			 * It might be possible that flgcnt is incremented so we
			 * send a FIN, even for an intermediate sack retrans.
			 * Perhaps the user closed the conv.
			 *
			 * However, the way the "flgcnt for FIN" works is that
			 * it inflates the desired amount we'd like to send
			 * (qlen + flgcnt).  Eventually, we reach the end of the
			 * queue and fail to extract all of dsize.  At that
			 * point, we put on the FIN, and that's where the extra
			 * 'byte' comes from.
			 *
			 * For sack retrans, since we're extracting from parts
			 * of the qio that aren't the right-most edge, we don't
			 * need to consider flgcnt when setting ssize. */
			from_seq = tcb->snd.rtx;
			sent = from_seq - tcb->snd.una;
			ssize = tcb_sack->left - from_seq;
			sack_retrans = TRUE;
			break;
		}
	}
	/* SACK holes have first dibs, but we can still opportunisitically send
	 * new data.
	 *
	 * During other types of recovery, we'll just send from the retrans
	 * point.  If we're in an RTO while we still have sacks, we could be
	 * resending data that wasn't lost.  Consider a sack that is still
	 * growing (usually the right-most), but we haven't received the ACK
	 * yet.  rxt may be included in that area.  Given we had two losses or
	 * otherwise timed out, I'm not too concerned.
	 *
	 * Note that Fast and RTO can send data beyond nxt.  If we change that,
	 * change the accounting below. */
	if (!sack_retrans) {
		/* default and SACK share a label: no sack hole was found, so
		 * SACK recovery sends from nxt just like normal operation. */
		switch (tcb->snd.recovery) {
		default:
		case SACK_RETRANS_RECOVERY:
			from_seq = tcb->snd.nxt;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			from_seq = tcb->snd.rtx;
			break;
		}
		sent = from_seq - tcb->snd.una;
		/* qlen + flgcnt is every seq we want to have sent, including
		 * unack'd data, unacked flags, and new flags. */
		ssize = qlen(s->wq) + tcb->flgcnt - sent;
	}

	/* May shrink ssize (window, cwind, SWS avoidance, MSS/TSO clamping) or
	 * tell us not to send at all. */
	if (!throttle_ssize(s, tcb, &ssize, payload_mss, sack_retrans))
		return FALSE;

	/* This counts flags, which is a little hokey, but it's okay since
	 * in_flight gets reset on each ACK */
	tcb->snd.in_flight += ssize;
	/* Log and track rxmit.  This covers both SACK (retrans) and fast rxmit.
	 */
	if (ssize && seq_lt(tcb->snd.rtx, tcb->snd.nxt)) {
		netlog(f, Logtcpverbose,
		       "%I.%d -> %I.%d: rxmit: rtx %u amt %u, nxt %u\n",
		       s->laddr, s->lport, s->raddr, s->rport,
		       tcb->snd.rtx, MIN(tcb->snd.nxt - tcb->snd.rtx, ssize),
		       tcb->snd.nxt);
		tpriv->stats[RetransSegs]++;
	}
	if (sack_retrans) {
		/* If we'll send up to the left edge, advance snd.rtx to the
		 * right.
		 *
		 * This includes the largest sack.  It might get removed later,
		 * in which case we'll underestimate the amount in-flight.  The
		 * alternative is to not count the rightmost sack, but when it
		 * gets removed, we'll retrans it anyway.  No matter what, we'd
		 * count it. */
		tcb->snd.rtx += ssize;
		if (tcb->snd.rtx == tcb_sack->left)
			tcb->snd.rtx = tcb_sack->right;
		/* RFC 6675 says we MAY rearm the RTO timer on each retrans,
		 * since we might not be getting ACKs for a while. */
		tcpsettimer(tcb);
	} else {
		switch (tcb->snd.recovery) {
		default:
			/* under normal op, we drag rtx along with nxt.  this
			 * prevents us from sending sacks too early (up above),
			 * since rtx doesn't get reset to una until we have a
			 * loss (e.g. 3 dupacks/sacks). */
			tcb->snd.nxt += ssize;
			tcb->snd.rtx = tcb->snd.nxt;
			break;
		case SACK_RETRANS_RECOVERY:
			/* We explicitly do not want to increase rtx here.  We
			 * might still need it to fill in a sack gap below nxt
			 * if we get new, higher sacks. */
			tcb->snd.nxt += ssize;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			tcb->snd.rtx += ssize;
			/* Fast and RTO can send new data, advancing nxt. */
			if (seq_gt(tcb->snd.rtx, tcb->snd.nxt))
				tcb->snd.nxt = tcb->snd.rtx;
			break;
		}
	}
	*from_seq_p = from_seq;
	*sent_p = sent;
	*ssize_p = ssize;

	return TRUE;
}
2952
2953 /*
2954  *  always enters and exits with the s locked.  We drop
2955  *  the lock to ipoput the packet so some care has to be
2956  *  taken by callers.
2957  */
2958 static void tcpoutput(struct conv *s)
2959 {
2960         Tcp seg;
2961         int msgs;
2962         int next_yield = 1;
2963         Tcpctl *tcb;
2964         struct block *hbp, *bp;
2965         uint32_t ssize, dsize, sent, from_seq;
2966         struct Fs *f;
2967         struct tcppriv *tpriv;
2968         uint8_t version;
2969         uint16_t payload_mss;
2970
2971         f = s->p->f;
2972         tpriv = s->p->priv;
2973         version = s->ipversion;
2974
2975         for (msgs = 0; msgs < 100; msgs++) {
2976                 tcb = (Tcpctl *) s->ptcl;
2977
2978                 switch (tcb->state) {
2979                 case Listen:
2980                 case Closed:
2981                 case Finwait2:
2982                         return;
2983                 }
2984
2985                 /* force an ack when a window has opened up */
2986                 if (tcb->rcv.blocked && tcb->rcv.wnd >= tcb->mss) {
2987                         tcb->rcv.blocked = 0;
2988                         tcb->flags |= FORCE;
2989                 }
2990
2991                 /* Don't send anything else until our SYN has been acked */
2992                 if (tcb->snd.nxt != tcb->iss && (tcb->flags & SYNACK) == 0)
2993                         break;
2994
2995                 /* payload_mss is the actual amount of data in the packet, which
2996                  * is the advertised (mss - header opts).  This varies from
2997                  * packet to packet, based on the options that might be present
2998                  * (e.g. always timestamps, sometimes SACKs) */
2999                 payload_mss = derive_payload_mss(tcb);
3000
3001                 if (!get_xmit_segment(s, tcb, payload_mss, &from_seq, &sent,
3002                                       &ssize))
3003                         break;
3004
3005                 dsize = ssize;
3006                 seg.urg = 0;
3007
3008                 if (ssize == 0)
3009                         if ((tcb->flags & FORCE) == 0)
3010                                 break;
3011
3012                 tcb->flags &= ~FORCE;
3013                 tcprcvwin(s);
3014
3015                 /* By default we will generate an ack, so we can normally turn
3016                  * off the timer.  If we're blocked, we'll want the timer so we
3017                  * can send a window update. */
3018                 if (!tcb->rcv.blocked)
3019                         tcphalt(tpriv, &tcb->acktimer);
3020                 tcb->rcv.una = 0;
3021                 seg.source = s->lport;
3022                 seg.dest = s->rport;
3023                 seg.flags = ACK;
3024                 seg.mss = 0;
3025                 seg.ws = 0;
3026                 seg.sack_ok = FALSE;
3027                 seg.nr_sacks = 0;
3028                 /* When outputting, Syn_sent means "send the Syn", for
3029                  * connections we initiate.  SYNACKs are sent from sndsynack
3030                  * directly. */
3031                 if (tcb->state == Syn_sent) {
3032                         seg.flags = 0;
3033                         /* here's where we advertise SACK */
3034                         seg.sack_ok = SACK_SUPPORTED;
3035                         if (tcb->snd.nxt - ssize == tcb->iss) {
3036                                 seg.flags |= SYN;
3037                                 dsize--;
3038                                 seg.mss = tcb->mss;
3039                                 seg.ws = tcb->scale;
3040                         } else {
3041                                 /* TODO: Not sure why we'd get here. */
3042                                 warn("TCP: weird Syn_sent state, tell someone you saw this");
3043                         }
3044                 }
3045                 seg.seq = from_seq;
3046                 seg.ack = tcb->rcv.nxt;
3047                 tcb->last_ack_sent = seg.ack;
3048                 seg.wnd = tcb->rcv.wnd;
3049                 seg.ts_val = tcb->ts_recent;
3050
3051                 /* Pull out data to send */
3052                 bp = NULL;
3053                 if (dsize != 0) {
3054                         bp = qcopy(s->wq, dsize, sent);
3055                         if (BLEN(bp) != dsize) {
3056                                 /* Here's where the flgcnt kicked in.  Note
3057                                  * dsize is decremented, but ssize isn't.  Not
3058                                  * that we use ssize for much anymore.
3059                                  * Decrementing dsize prevents us from sending a
3060                                  * PSH with the FIN. */
3061                                 seg.flags |= FIN;
3062                                 dsize--;
3063                         }
3064                         if (BLEN(bp) > payload_mss) {
3065                                 bp->flag |= Btso;
3066                                 bp->mss = payload_mss;
3067                         }
3068                 }
3069
3070                 if (sent + dsize == qlen(s->wq) + tcb->flgcnt)
3071                         seg.flags |= PSH;
3072
3073                 /* Build header, link data and compute cksum */
3074                 switch (version) {
3075                 case V4:
3076                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3077                         hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
3078                         if (hbp == NULL) {
3079                                 freeblist(bp);
3080                                 return;
3081                         }
3082                         break;
3083                 case V6:
3084                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3085                         hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
3086                         if (hbp == NULL) {
3087                                 freeblist(bp);
3088                                 return;
3089                         }
3090                         break;
3091                 default:
3092                         hbp = NULL;     /* to suppress a warning */
3093                         panic("tcpoutput: version %d", version);
3094                 }
3095
3096                 /* Start the transmission timers if there is new data and we
3097                  * expect acknowledges
3098                  */
3099                 if (ssize != 0) {
3100                         if (tcb->timer.state != TcptimerON)
3101                                 tcpgo(tpriv, &tcb->timer);
3102
3103                         if (!tcb->ts_recent && (tcb->rtt_timer.state !=
3104                                                 TcptimerON)) {
3105                                 tcpgo(tpriv, &tcb->rtt_timer);
3106                                 tcb->rttseq = from_seq + ssize;
3107                         }
3108                 }
3109
3110                 tpriv->stats[OutSegs]++;
3111
3112                 /* put off the next keep alive */
3113                 tcpgo(tpriv, &tcb->katimer);
3114
3115                 switch (version) {
3116                 case V4:
3117                         if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3118                                 /* a negative return means no route */
3119                                 localclose(s, "no route");
3120                         }
3121                         break;
3122                 case V6:
3123                         if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3124                                 /* a negative return means no route */
3125                                 localclose(s, "no route");
3126                         }
3127                         break;
3128                 default:
3129                         panic("tcpoutput2: version %d", version);
3130                 }
3131                 if (ssize) {
3132                         /* The outer loop thinks we sent one packet.  If we used
3133                          * TSO, we might have sent several.  Minus one for the
3134                          * loop increment. */
3135                         msgs += DIV_ROUND_UP(ssize, payload_mss) - 1;
3136                 }
3137                 /* Old Plan 9 tidbit - yield every four messages.  We want to
3138                  * break out and unlock so we can process inbound ACKs which
3139                  * might do things like say "slow down". */
3140                 if (msgs >= next_yield) {
3141                         next_yield = msgs + 4;
3142                         qunlock(&s->qlock);
3143                         kthread_yield();
3144                         qlock(&s->qlock);
3145                 }
3146         }
3147 }
3148
3149 /*
 *  the BSD convention (hack?) for keep alives: resend the last byte acked.
3151  */
3152 static void tcpsendka(struct conv *s)
3153 {
3154         Tcp seg;
3155         Tcpctl *tcb;
3156         struct block *hbp, *dbp;
3157
3158         tcb = (Tcpctl *) s->ptcl;
3159
3160         dbp = NULL;
3161         seg.urg = 0;
3162         seg.source = s->lport;
3163         seg.dest = s->rport;
3164         seg.flags = ACK | PSH;
3165         seg.mss = 0;
3166         seg.ws = 0;
3167         seg.sack_ok = FALSE;
3168         seg.nr_sacks = 0;
3169         if (tcpporthogdefense)
3170                 urandom_read(&seg.seq, sizeof(seg.seq));
3171         else
3172                 seg.seq = tcb->snd.una - 1;
3173         seg.ack = tcb->rcv.nxt;
3174         tcb->last_ack_sent = seg.ack;
3175         tcb->rcv.una = 0;
3176         seg.wnd = tcb->rcv.wnd;
3177         seg.ts_val = tcb->ts_recent;
3178         if (tcb->state == Finwait2) {
3179                 seg.flags |= FIN;
3180         } else {
3181                 dbp = block_alloc(1, MEM_WAIT);
3182                 dbp->wp++;
3183         }
3184
3185         if (isv4(s->raddr)) {
3186                 /* Build header, link data and compute cksum */
3187                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3188                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
3189                 if (hbp == NULL) {
3190                         freeblist(dbp);
3191                         return;
3192                 }
3193                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
3194         } else {
3195                 /* Build header, link data and compute cksum */
3196                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3197                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
3198                 if (hbp == NULL) {
3199                         freeblist(dbp);
3200                         return;
3201                 }
3202                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
3203         }
3204 }
3205
3206 /*
3207  *  set connection to time out after 12 minutes
3208  */
3209 static void tcpsetkacounter(Tcpctl *tcb)
3210 {
3211         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
3212         if (tcb->kacounter < 3)
3213                 tcb->kacounter = 3;
3214 }
3215
3216 /*
3217  *  if we've timed out, close the connection
3218  *  otherwise, send a keepalive and restart the timer
3219  */
static void tcpkeepalive(void *v)
{
	ERRSTACK(1);
	Tcpctl *tcb;
	struct conv *s;

	/* Keepalive timer callback; v is the conversation. */
	s = v;
	tcb = (Tcpctl *) s->ptcl;
	qlock(&s->qlock);
	/* anything below may throw; make sure we drop the qlock on that path */
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	if (tcb->state != Closed) {
		/* kacounter is the remaining probe budget (see
		 * tcpsetkacounter()); when it runs out, the peer has been
		 * silent the whole time and we give up. */
		if (--(tcb->kacounter) <= 0) {
			localclose(s, "connection timed out");
		} else {
			tcpsendka(s);
			/* re-arm for the next keepalive interval */
			tcpgo(s->p->priv, &tcb->katimer);
		}
	}
	qunlock(&s->qlock);
	poperror();
}
3244
3245 /*
3246  *  start keepalive timer
3247  */
3248 static void tcpstartka(struct conv *s, char **f, int n)
3249 {
3250         Tcpctl *tcb;
3251         int x;
3252
3253         tcb = (Tcpctl *) s->ptcl;
3254         if (tcb->state != Established)
3255                 error(ENOTCONN, "connection must be in Establised state");
3256         if (n > 1) {
3257                 x = atoi(f[1]);
3258                 if (x >= MSPTICK)
3259                         tcb->katimer.start = x / MSPTICK;
3260         }
3261         tcpsetkacounter(tcb);
3262         tcpgo(s->p->priv, &tcb->katimer);
3263 }
3264
3265 /*
3266  *  turn checksums on/off
3267  */
3268 static void tcpsetchecksum(struct conv *s, char **f, int unused)
3269 {
3270         Tcpctl *tcb;
3271
3272         tcb = (Tcpctl *) s->ptcl;
3273         tcb->nochecksum = !atoi(f[1]);
3274 }
3275
3276 static void tcp_loss_event(struct conv *s, Tcpctl *tcb)
3277 {
3278         uint32_t old_cwnd = tcb->cwind;
3279
3280         /* Reno */
3281         tcb->ssthresh = tcb->cwind / 2;
3282         tcb->cwind = tcb->ssthresh;
3283         netlog(s->p->f, Logtcprxmt,
3284                "%I.%d -> %I.%d: loss event, cwnd was %d, now %d\n",
3285                s->laddr, s->lport, s->raddr, s->rport,
3286                old_cwnd, tcb->cwind);
3287 }
3288
3289 /* Called when we need to retrans the entire outstanding window (everything
3290  * previously sent, but unacknowledged). */
static void tcprxmit(struct conv *s)
{
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;

	/* FORCE presumably makes tcpoutput() send even when it otherwise
	 * would hold back — confirm against tcpoutput(). */
	tcb->flags |= FORCE;
	/* Rewind the retransmit pointer to the oldest unacked byte, then
	 * recompute what we consider in flight before transmitting. */
	tcb->snd.rtx = tcb->snd.una;
	set_in_flight(tcb);

	tcpoutput(s);
}
3303
3304 /* The original RFC said to drop sacks on a timeout, since the receiver could
3305  * renege.  Later RFCs say we can keep them around, so long as we are careful.
3306  *
3307  * We'll go with a "flush if we have two timeouts" plan.  This doesn't have to
3308  * be perfect - there might be cases where we accidentally flush the sacks too
3309  * often.  Perhaps we never get dup_acks to start fast/sack rxmit.  The main
3310  * thing is that after multiple timeouts we flush the sacks, since the receiver
3311  * might renege.
3312  *
3313  * We also have an Akaros-specific problem.  We use the sacks to determine
3314  * in_flight.  Specifically, the (snd.nxt - upper right edge) is tracked as in
3315  * flight.  Usually the receiver will keep sacking that right edge all the way
3316  * up to snd.nxt, but they might not, and the gap might be quite large.  After a
3317  * timeout, that data is definitely not in flight.  If that block's size is
3318  * greater than cwnd, we'll never transmit.  This should be rare, and in that
3319  * case we can just dump the sacks.  The typical_mss fudge factor is so we can
3320  * send a reasonably-sized packet. */
3321 static void timeout_handle_sacks(Tcpctl *tcb)
3322 {
3323         struct sack_block *last_sack;
3324
3325         if (tcb->snd.nr_sacks) {
3326                 last_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
3327                 if (tcb->snd.flush_sacks || (tcb->snd.nxt - last_sack->right >=
3328                                              tcb->cwind - tcb->typical_mss)) {
3329                         tcb->snd.nr_sacks = 0;
3330                         tcb->snd.flush_sacks = FALSE;
3331                 } else {
3332                         tcb->snd.flush_sacks = TRUE;
3333                 }
3334         }
3335 }
3336
/* Retransmission / Time_wait timer callback; arg is the conversation. */
static void tcptimeout(void *arg)
{
	ERRSTACK(1);
	struct conv *s;
	Tcpctl *tcb;
	int maxback;
	struct tcppriv *tpriv;

	s = (struct conv *)arg;
	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	/* drop the qlock if anything below throws */
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	switch (tcb->state) {
	default:
		/* Retransmission timeout.  Back off exponentially; give up
		 * once the accumulated backed-off time exceeds the cap
		 * (half the cap for connections still in Syn_sent). */
		tcb->backoff++;
		if (tcb->state == Syn_sent)
			maxback = MAXBACKMS / 2;
		else
			maxback = MAXBACKMS;
		tcb->backedoff += tcb->timer.start * MSPTICK;
		if (tcb->backedoff >= maxback) {
			localclose(s, "connection timed out");
			break;
		}
		netlog(s->p->f, Logtcprxmt,
		       "%I.%d -> %I.%d: timeout rxmit una %u, rtx %u, nxt %u, in_flight %u, timer.start %u\n",
		       s->laddr, s->lport, s->raddr, s->rport,
		       tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt,
		       tcb->snd.in_flight, tcb->timer.start);
		tcpsettimer(tcb);
		tcp_loss_event(s, tcb);
		/* Advance the recovery point.  Any dupacks/sacks below this
		 * won't trigger a new loss, since we won't reset_recovery()
		 * until we ack past recovery_pt. */
		tcb->snd.recovery = RTO_RETRANS_RECOVERY;
		tcb->snd.recovery_pt = tcb->snd.nxt;
		timeout_handle_sacks(tcb);
		tcprxmit(s);
		tpriv->stats[RetransTimeouts]++;
		break;
	case Time_wait:
		/* the Time_wait timer fired; finish closing quietly */
		localclose(s, NULL);
		break;
	case Closed:
		break;
	}
	qunlock(&s->qlock);
	poperror();
}
3391
/* Returns nonzero iff seq lies within the receive window, i.e. the
 * sequence-space range [rcv.nxt, rcv.nxt + rcv.wnd - 1]. */
static int inwindow(Tcpctl *tcb, int seq)
{
	return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1);
}
3396
3397 /*
3398  *  set up state for a received SYN (or SYN ACK) packet
3399  */
3400 static void procsyn(struct conv *s, Tcp *seg)
3401 {
3402         Tcpctl *tcb;
3403
3404         tcb = (Tcpctl *) s->ptcl;
3405         tcb->flags |= FORCE;
3406
3407         tcb->rcv.nxt = seg->seq + 1;
3408         tcb->rcv.urg = tcb->rcv.nxt;
3409         tcb->irs = seg->seq;
3410
3411         /* our sending max segment size cannot be bigger than what he asked for
3412          */
3413         if (seg->mss != 0 && seg->mss < tcb->mss) {
3414                 tcb->mss = seg->mss;
3415                 tcb->typical_mss = tcb->mss;
3416         }
3417         adjust_typical_mss_for_opts(seg, tcb);
3418
3419         tcb->snd.wnd = seg->wnd;
3420         tcb->cwind = tcb->typical_mss * CWIND_SCALE;
3421 }
3422
3423 static int addreseq(Tcpctl *tcb, struct tcppriv *tpriv, Tcp *seg,
3424                     struct block *bp, uint16_t length)
3425 {
3426         Reseq *rp, *rp1;
3427         int i, rqlen, qmax;
3428
3429         rp = kzmalloc(sizeof(Reseq), 0);
3430         if (rp == NULL) {
3431                 freeblist(bp);  /* bp always consumed by add_reseq */
3432                 return 0;
3433         }
3434
3435         rp->seg = *seg;
3436         rp->bp = bp;
3437         rp->length = length;
3438
3439         track_rcv_sack(tcb, seg->seq, seg->seq + length);
3440         /* Place on reassembly list sorting by starting seq number */
3441         rp1 = tcb->reseq;
3442         if (rp1 == NULL || seq_lt(seg->seq, rp1->seg.seq)) {
3443                 rp->next = rp1;
3444                 tcb->reseq = rp;
3445                 if (rp->next != NULL)
3446                         tpriv->stats[OutOfOrder]++;
3447                 return 0;
3448         }
3449
3450         rqlen = 0;
3451         for (i = 0;; i++) {
3452                 rqlen += rp1->length;
3453                 if (rp1->next == NULL || seq_lt(seg->seq, rp1->next->seg.seq)) {
3454                         rp->next = rp1->next;
3455                         rp1->next = rp;
3456                         if (rp->next != NULL)
3457                                 tpriv->stats[OutOfOrder]++;
3458                         break;
3459                 }
3460                 rp1 = rp1->next;
3461         }
3462         qmax = QMAX << tcb->rcv.scale;
3463         /* Here's where we're reneging on previously reported sacks. */
3464         if (rqlen > qmax) {
3465                 printd("resequence queue > window: %d > %d\n", rqlen, qmax);
3466                 i = 0;
3467                 for (rp1 = tcb->reseq; rp1 != NULL; rp1 = rp1->next) {
3468                         printd("0x%#lx 0x%#lx 0x%#x\n", rp1->seg.seq,
3469                                    rp1->seg.ack, rp1->seg.flags);
3470                         if (i++ > 10) {
3471                                 printd("...\n");
3472                                 break;
3473                         }
3474                 }
3475
3476                 // delete entire reassembly queue; wait for retransmit.
3477                 // - should we be smarter and only delete the tail?
3478                 for (rp = tcb->reseq; rp != NULL; rp = rp1) {
3479                         rp1 = rp->next;
3480                         freeblist(rp->bp);
3481                         kfree(rp);
3482                 }
3483                 tcb->reseq = NULL;
3484                 tcb->rcv.nr_sacks = 0;
3485
3486                 return -1;
3487         }
3488         return 0;
3489 }
3490
3491 static void getreseq(Tcpctl *tcb, Tcp *seg, struct block **bp, uint16_t *length)
3492 {
3493         Reseq *rp;
3494
3495         rp = tcb->reseq;
3496         if (rp == NULL)
3497                 return;
3498
3499         tcb->reseq = rp->next;
3500
3501         *seg = rp->seg;
3502         *bp = rp->bp;
3503         *length = rp->length;
3504
3505         kfree(rp);
3506 }
3507
/* Trims a received segment to fit the receive window.  Returns 0 if at least
 * part of the segment is acceptable (with *bp, *length, and seg adjusted so
 * the remaining data starts at rcv.nxt and fits in the window), or -1 if the
 * segment lies entirely outside the window, in which case *bp is freed. */
static int tcptrim(Tcpctl *tcb, Tcp *seg, struct block **bp, uint16_t *length)
{
	uint16_t len;
	uint8_t accept;
	int dupcnt, excess;

	accept = 0;
	len = *length;
	/* SYN and FIN each occupy one unit of sequence space */
	if (seg->flags & SYN)
		len++;
	if (seg->flags & FIN)
		len++;

	if (tcb->rcv.wnd == 0) {
		/* zero window: only a zero-length segment at exactly rcv.nxt
		 * (e.g. a bare ACK) is acceptable */
		if (len == 0 && seg->seq == tcb->rcv.nxt)
			return 0;
	} else {
		/* Some part of the segment should be in the window */
		if (inwindow(tcb, seg->seq))
			accept++;
		else if (len != 0) {
			if (inwindow(tcb, seg->seq + len - 1) ||
				seq_within(tcb->rcv.nxt, seg->seq,
					   seg->seq + len - 1))
				accept++;
		}
	}
	if (!accept) {
		freeblist(*bp);
		return -1;
	}
	/* drop the prefix we've already received (seq below rcv.nxt) */
	dupcnt = tcb->rcv.nxt - seg->seq;
	if (dupcnt > 0) {
		tcb->rerecv += dupcnt;
		if (seg->flags & SYN) {
			/* the SYN itself accounts for one duplicated unit */
			seg->flags &= ~SYN;
			seg->seq++;

			if (seg->urg > 1)
				seg->urg--;
			else
				seg->flags &= ~URG;
			dupcnt--;
		}
		if (dupcnt > 0) {
			pullblock(bp, (uint16_t) dupcnt);
			seg->seq += dupcnt;
			*length -= dupcnt;

			/* the urgent pointer is relative to seg->seq; slide it
			 * with the trimmed prefix */
			if (seg->urg > dupcnt)
				seg->urg -= dupcnt;
			else {
				seg->flags &= ~URG;
				seg->urg = 0;
			}
		}
	}
	/* chop anything extending past the right edge of the window */
	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
	if (excess > 0) {
		tcb->rerecv += excess;
		*length -= excess;
		*bp = trimblock(*bp, 0, *length);
		if (*bp == NULL)
			panic("presotto is a boofhead");
		/* the FIN, if any, was beyond the window edge */
		seg->flags &= ~FIN;
	}
	return 0;
}
3576
3577 static void tcpadvise(struct Proto *tcp, struct block *bp, char *msg)
3578 {
3579         Tcp4hdr *h4;
3580         Tcp6hdr *h6;
3581         Tcpctl *tcb;
3582         uint8_t source[IPaddrlen];
3583         uint8_t dest[IPaddrlen];
3584         uint16_t psource, pdest;
3585         struct conv *s, **p;
3586
3587         h4 = (Tcp4hdr *) (bp->rp);
3588         h6 = (Tcp6hdr *) (bp->rp);
3589
3590         if ((h4->vihl & 0xF0) == IP_VER4) {
3591                 v4tov6(dest, h4->tcpdst);
3592                 v4tov6(source, h4->tcpsrc);
3593                 psource = nhgets(h4->tcpsport);
3594                 pdest = nhgets(h4->tcpdport);
3595         } else {
3596                 ipmove(dest, h6->tcpdst);
3597                 ipmove(source, h6->tcpsrc);
3598                 psource = nhgets(h6->tcpsport);
3599                 pdest = nhgets(h6->tcpdport);
3600         }
3601
3602         /* Look for a connection */
3603         for (p = tcp->conv; *p; p++) {
3604                 s = *p;
3605                 tcb = (Tcpctl *) s->ptcl;
3606                 if ((s->rport == pdest) && (s->lport == psource)
3607                     && (tcb->state != Closed) && (ipcmp(s->raddr, dest) == 0)
3608                     && (ipcmp(s->laddr, source) == 0)) {
3609                         qlock(&s->qlock);
3610                         switch (tcb->state) {
3611                         case Syn_sent:
3612                                 localclose(s, msg);
3613                                 break;
3614                         }
3615                         qunlock(&s->qlock);
3616                         freeblist(bp);
3617                         return;
3618                 }
3619         }
3620         freeblist(bp);
3621 }
3622
3623 static void tcpporthogdefensectl(char *val)
3624 {
3625         if (strcmp(val, "on") == 0)
3626                 tcpporthogdefense = 1;
3627         else if (strcmp(val, "off") == 0)
3628                 tcpporthogdefense = 0;
3629         else
3630                 error(EINVAL, "unknown value for tcpporthogdefense");
3631 }
3632
3633 /* called with c qlocked */
3634 static void tcpctl(struct conv *c, char **f, int n)
3635 {
3636         if (n == 1 && strcmp(f[0], "hangup") == 0)
3637                 tcphangup(c);
3638         else if (n >= 1 && strcmp(f[0], "keepalive") == 0)
3639                 tcpstartka(c, f, n);
3640         else if (n >= 1 && strcmp(f[0], "checksum") == 0)
3641                 tcpsetchecksum(c, f, n);
3642         else if (n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3643                 tcpporthogdefensectl(f[1]);
3644         else
3645                 error(EINVAL, "unknown command to %s", __func__);
3646 }
3647
3648 static int tcpstats(struct Proto *tcp, char *buf, int len)
3649 {
3650         struct tcppriv *priv;
3651         char *p, *e;
3652         int i;
3653
3654         priv = tcp->priv;
3655         p = buf;
3656         e = p + len;
3657         for (i = 0; i < Nstats; i++)
3658                 p = seprintf(p, e, "%s: %u\n", statnames[i], priv->stats[i]);
3659         return p - buf;
3660 }
3661
3662 /*
3663  *  garbage collect any stale conversations:
3664  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3665  *      - Finwait2 after 5 minutes
3666  *
3667  *  this is called whenever we run out of channels.  Both checks are
3668  *  of questionable validity so we try to use them only when we're
3669  *  up against the wall.
3670  */
3671 static int tcpgc(struct Proto *tcp)
3672 {
3673         struct conv *c, **pp, **ep;
3674         int n;
3675         Tcpctl *tcb;
3676
3677         n = 0;
3678         ep = &tcp->conv[tcp->nc];
3679         for (pp = tcp->conv; pp < ep; pp++) {
3680                 c = *pp;
3681                 if (c == NULL)
3682                         break;
3683                 if (!canqlock(&c->qlock))
3684                         continue;
3685                 tcb = (Tcpctl *) c->ptcl;
3686                 if (tcb->state == Finwait2) {
3687                         if (NOW - tcb->time > 5 * 60 * 1000) {
3688                                 localclose(c, "timed out");
3689                                 n++;
3690                         }
3691                 }
3692                 qunlock(&c->qlock);
3693         }
3694         return n;
3695 }
3696
3697 static void tcpsettimer(Tcpctl *tcb)
3698 {
3699         int x;
3700
3701         /* round trip dependency */
3702         x = backoff(tcb->backoff) * (tcb->srtt + MAX(4 * tcb->mdev, MSPTICK));
3703         x = DIV_ROUND_UP(x, MSPTICK);
3704
3705         /* Bounded twixt 1/2 and 64 seconds.  RFC 6298 suggested min is 1
3706          * second. */
3707         if (x < 500 / MSPTICK)
3708                 x = 500 / MSPTICK;
3709         else if (x > (64000 / MSPTICK))
3710                 x = 64000 / MSPTICK;
3711         tcb->timer.start = x;
3712 }
3713
/* Stashed pointer to the TCP protocol's private state, set by tcpinit();
 * exists solely so the debug helper below can reach the hash table. */
static struct tcppriv *debug_priv;

/* Kfunc this */
/* Dumps the TCP per-connection hash table.  Returns -1 if tcpinit() hasn't
 * run yet (debug_priv unset), 0 on success. */
int dump_tcp_ht(void)
{
	if (!debug_priv)
		return -1;
	dump_ipht(&debug_priv->ht);
	return 0;
}
3724
/* Registers TCP with the IP stack fs: allocates the Proto and its private
 * state, fills in the protocol operation table, and hands it to Fsproto(). */
void tcpinit(struct Fs *fs)
{
	struct Proto *tcp;
	struct tcppriv *tpriv;

	/* NOTE(review): both kzmalloc results are used unchecked — presumably
	 * boot-time allocation failure is fatal anyway; confirm the policy. */
	tcp = kzmalloc(sizeof(struct Proto), 0);
	tpriv = tcp->priv = kzmalloc(sizeof(struct tcppriv), 0);
	debug_priv = tpriv;
	qlock_init(&tpriv->tl);
	qlock_init(&tpriv->apl);
	tcp->name = "tcp";
	tcp->connect = tcpconnect;
	tcp->announce = tcpannounce;
	tcp->bypass = tcpbypass;
	tcp->ctl = tcpctl;
	tcp->state = tcpstate;
	tcp->create = tcpcreate;
	tcp->close = tcpclose;
	tcp->shutdown = tcpshutdown;
	tcp->rcv = tcpiput;
	tcp->advise = tcpadvise;
	tcp->stats = tcpstats;
	tcp->inuse = tcpinuse;
	tcp->gc = tcpgc;
	tcp->ipproto = IP_TCPPROTO;
	/* up to 4096 simultaneous conversations */
	tcp->nc = 4096;
	tcp->ptclsize = sizeof(Tcpctl);
	tpriv->stats[MaxConn] = tcp->nc;

	Fsproto(fs, tcp);
}
3756
3757 static void tcpsetscale(struct conv *s, Tcpctl *tcb, uint16_t rcvscale,
3758                         uint16_t sndscale)
3759 {
3760         if (rcvscale) {
3761                 tcb->rcv.scale = rcvscale & 0xff;
3762                 tcb->snd.scale = sndscale & 0xff;
3763                 tcb->window = QMAX << tcb->rcv.scale;
3764         } else {
3765                 tcb->rcv.scale = 0;
3766                 tcb->snd.scale = 0;
3767                 tcb->window = QMAX;
3768         }
3769 }