net: tcp: Handle reseqs of length 0
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2017 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <net/ip.h>
42 #include <net/tcp.h>
43
/* Printable names for each TCB state, indexed by the state value.
 * Must correspond to the enumeration in tcp.h */
static char *tcpstates[] = {
	"Closed", "Listen", "Syn_sent",
	"Established", "Finwait1", "Finwait2", "Close_wait",
	"Closing", "Last_ack", "Time_wait"
};

static int tcp_irtt = DEF_RTT;			/* Initial guess at round trip time */
static uint16_t tcp_mss = DEF_MSS;		/* Maximum segment size to be sent */
53
54 /* Must correspond to the enumeration in tcp.h */
55 static char *statnames[] = {
56         [MaxConn] "MaxConn",
57         [ActiveOpens] "ActiveOpens",
58         [PassiveOpens] "PassiveOpens",
59         [EstabResets] "EstabResets",
60         [CurrEstab] "CurrEstab",
61         [InSegs] "InSegs",
62         [OutSegs] "OutSegs",
63         [RetransSegs] "RetransSegs",
64         [RetransTimeouts] "RetransTimeouts",
65         [InErrs] "InErrs",
66         [OutRsts] "OutRsts",
67         [CsumErrs] "CsumErrs",
68         [HlenErrs] "HlenErrs",
69         [LenErrs] "LenErrs",
70         [OutOfOrder] "OutOfOrder",
71 };
72
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
static int tcpporthogdefense = 0;
83
/* Forward declarations: resequencing (out-of-order) queue handling, local
 * teardown, segment input/output, timers, and the SYN+ACK "limbo"
 * machinery, all defined later in this file. */
static int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *,
                    uint16_t);
static void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
static void localclose(struct conv *, char *unused_char_p_t);
static void procsyn(struct conv *, Tcp *);
static void tcpiput(struct Proto *, struct Ipifc *, struct block *);
static void tcpoutput(struct conv *);
static int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
static void tcpstart(struct conv *, int);
static void tcptimeout(void *);
static void tcpsndsyn(struct conv *, Tcpctl *);
static void tcprcvwin(struct conv *);
static void tcpacktimer(void *);
static void tcpkeepalive(void *);
static void tcpsetkacounter(Tcpctl *);
static void tcprxmit(struct conv *);
static void tcpsettimer(Tcpctl *);
static void tcpsynackrtt(struct conv *);
static void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
static void tcp_loss_event(struct conv *s, Tcpctl *tcb);
static uint16_t derive_payload_mss(Tcpctl *tcb);
static void set_in_flight(Tcpctl *tcb);

static void limborexmit(struct Proto *);
static void limbo(struct conv *, uint8_t *unused_uint8_p_t, uint8_t *, Tcp *,
                                  int);
110
111 static void tcpsetstate(struct conv *s, uint8_t newstate)
112 {
113         Tcpctl *tcb;
114         uint8_t oldstate;
115         struct tcppriv *tpriv;
116
117         tpriv = s->p->priv;
118
119         tcb = (Tcpctl *) s->ptcl;
120
121         oldstate = tcb->state;
122         if (oldstate == newstate)
123                 return;
124
125         if (oldstate == Established)
126                 tpriv->stats[CurrEstab]--;
127         if (newstate == Established)
128                 tpriv->stats[CurrEstab]++;
129
130         /**
131         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
132                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
133         **/
134
135         switch (newstate) {
136                 case Closed:
137                         qclose(s->rq);
138                         qclose(s->wq);
139                         qclose(s->eq);
140                         break;
141
142                 case Close_wait:        /* Remote closes */
143                         qhangup(s->rq, NULL);
144                         break;
145         }
146
147         tcb->state = newstate;
148
149         if (oldstate == Syn_sent && newstate != Closed)
150                 Fsconnected(s, NULL);
151 }
152
/* "connect" control op: parse the remote address arguments into the conv,
 * then do an active open (send a SYN). */
static void tcpconnect(struct conv *c, char **argv, int argc)
{
	Fsstdconnect(c, argv, argc);
	tcpstart(c, TCP_CONNECT);
}
158
/* Status-file read: render a one-line, human-readable summary of the
 * conversation's TCP state into 'state' (at most 'n' bytes).  Returns
 * snprintf's count, which may exceed 'n' if the output was truncated. */
static int tcpstate(struct conv *c, char *state, int n)
{
	Tcpctl *s;

	s = (Tcpctl *) (c->ptcl);

	return snprintf(state, n,
					"%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
					tcpstates[s->state],
					c->rq ? qlen(c->rq) : 0,	/* queues may not exist yet */
					c->wq ? qlen(c->wq) : 0,
					s->srtt, s->mdev,
					s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
					s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
					s->katimer.start, s->katimer.count);
}
175
176 static int tcpinuse(struct conv *c)
177 {
178         Tcpctl *s;
179
180         s = (Tcpctl *) (c->ptcl);
181         return s->state != Closed;
182 }
183
/* "announce" control op: bind the local address/port, enter Listen, and
 * report readiness to the announcer. */
static void tcpannounce(struct conv *c, char **argv, int argc)
{
	Fsstdannounce(c, argv, argc);
	tcpstart(c, TCP_LISTEN);
	Fsconnected(c, NULL);
}
190
/* "bypass" control op: set the conv up for the bypass path and hash it so
 * inbound segments are demuxed to it. */
static void tcpbypass(struct conv *cv, char **argv, int argc)
{
	struct tcppriv *tpriv = cv->p->priv;

	Fsstdbypass(cv, argv, argc);
	iphtadd(&tpriv->ht, cv);
}
198
199 static void tcpshutdown(struct conv *c, int how)
200 {
201         Tcpctl *tcb = (Tcpctl*)c->ptcl;
202
203         /* Do nothing for the read side */
204         if (how == SHUT_RD)
205                 return;
206         /* Sends a FIN.  If we're in another state (like Listen), we'll run into
207          * issues, since we'll never send the FIN.  We'll be shutdown on our end,
208          * but we'll never tell the distant end.  Might just be an app issue. */
209         switch (tcb->state) {
210         case Established:
211                 tcb->flgcnt++;
212                 tcpsetstate(c, Finwait1);
213                 tcpoutput(c);
214                 break;
215         }
216 }
217
218 /*
219  *  tcpclose is always called with the q locked
220  */
221 static void tcpclose(struct conv *c)
222 {
223         Tcpctl *tcb;
224
225         tcb = (Tcpctl *) c->ptcl;
226
227         qhangup(c->rq, NULL);
228         qhangup(c->wq, NULL);
229         qhangup(c->eq, NULL);
230         qflush(c->rq);
231
232         switch (tcb->state) {
233                 case Listen:
234                         /*
235                          *  reset any incoming calls to this listener
236                          */
237                         Fsconnected(c, "Hangup");
238
239                         localclose(c, NULL);
240                         break;
241                 case Closed:
242                 case Syn_sent:
243                         localclose(c, NULL);
244                         break;
245                 case Established:
246                         tcb->flgcnt++;
247                         tcpsetstate(c, Finwait1);
248                         tcpoutput(c);
249                         break;
250                 case Close_wait:
251                         tcb->flgcnt++;
252                         tcpsetstate(c, Last_ack);
253                         tcpoutput(c);
254                         break;
255         }
256 }
257
258 static void tcpkick(void *x)
259 {
260         ERRSTACK(1);
261         struct conv *s = x;
262         Tcpctl *tcb;
263
264         tcb = (Tcpctl *) s->ptcl;
265
266         qlock(&s->qlock);
267         if (waserror()) {
268                 qunlock(&s->qlock);
269                 nexterror();
270         }
271
272         switch (tcb->state) {
273                 case Syn_sent:
274                 case Established:
275                 case Close_wait:
276                         /*
277                          * Push data
278                          */
279                         tcprcvwin(s);
280                         tcpoutput(s);
281                         break;
282                 default:
283                         localclose(s, "Hangup");
284                         break;
285         }
286
287         qunlock(&s->qlock);
288         poperror();
289 }
290
291 static void tcprcvwin(struct conv *s)
292 {
293         /* Call with tcb locked */
294         int w;
295         Tcpctl *tcb;
296
297         tcb = (Tcpctl *) s->ptcl;
298         w = tcb->window - qlen(s->rq);
299         if (w < 0)
300                 w = 0;
301
302         /* RFC 813: Avoid SWS.  We'll always reduce the window (because the qio
303          * increased - that's legit), and we'll always advertise the window
304          * increases (corresponding to qio drains) when those are greater than MSS.
305          * But we don't advertise increases less than MSS.
306          *
307          * Note we don't shrink the window at all - that'll result in tcptrim()
308          * dropping packets that were sent before the sender gets our update. */
309         if ((w < tcb->rcv.wnd) || (w >= tcb->mss))
310                 tcb->rcv.wnd = w;
311         /* We've delayed sending an update to rcv.wnd, and we might never get
312          * another ACK to drive the TCP stack after the qio is drained.  We could
313          * replace this stuff with qio kicks or callbacks, but that might be
314          * trickier with the MSS limitation.  (and 'edge' isn't empty or not). */
315         if (w < tcb->mss)
316                 tcb->rcv.blocked = 1;
317 }
318
319 static void tcpacktimer(void *v)
320 {
321         ERRSTACK(1);
322         Tcpctl *tcb;
323         struct conv *s;
324
325         s = v;
326         tcb = (Tcpctl *) s->ptcl;
327
328         qlock(&s->qlock);
329         if (waserror()) {
330                 qunlock(&s->qlock);
331                 nexterror();
332         }
333         if (tcb->state != Closed) {
334                 tcb->flags |= FORCE;
335                 tcprcvwin(s);
336                 tcpoutput(s);
337         }
338         qunlock(&s->qlock);
339         poperror();
340 }
341
/* Allocate the per-conversation read and write queues.
 *
 * We don't use qio limits.  Instead, TCP manages flow control on its own.
 * We only use qpassnolim().  Note for qio that 0 doesn't mean no limit. */
static void tcpcreate(struct conv *c)
{
	c->rq = qopen(0, Qcoalesce, 0, 0);
	c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
}
349
/* Move timer 't' to 'newstate', maintaining priv->timers, the doubly
 * linked list of running (TcptimerON) timers.  Caller must hold priv->tl.
 * Leaving TcptimerON unchains the timer; entering it chains the timer at
 * the head; other transitions just record the state. */
static void timerstate(struct tcppriv *priv, Tcptimer *t, int newstate)
{
	if (newstate != TcptimerON) {
		if (t->state == TcptimerON) {
			// unchain
			if (priv->timers == t) {
				priv->timers = t->next;
				/* the list head must have no prev link */
				if (t->prev != NULL)
					panic("timerstate1");
			}
			if (t->next)
				t->next->prev = t->prev;
			if (t->prev)
				t->prev->next = t->next;
			t->next = t->prev = NULL;
		}
	} else {
		if (t->state != TcptimerON) {
			// chain
			/* an off-list timer must not have stale links */
			if (t->prev != NULL || t->next != NULL)
				panic("timerstate2");
			t->prev = NULL;
			t->next = priv->timers;
			if (t->next)
				t->next->prev = t;
			priv->timers = t;
		}
	}
	t->state = newstate;
}
380
/* Per-protocol timer kthread: every MSPTICK ms, decrement all running
 * timers under priv->tl; timers that reach zero are collected onto a
 * private ready list and their callbacks are fired after the lock is
 * dropped (so callbacks may take qlocks freely).  Also drives limbo
 * (SYN+ACK) retransmission. */
static void tcpackproc(void *a)
{
	ERRSTACK(1);
	Tcptimer *t, *tp, *timeo;
	struct Proto *tcp;
	struct tcppriv *priv;
	int loop;

	tcp = a;
	priv = tcp->priv;

	for (;;) {
		kthread_usleep(MSPTICK * 1000);

		qlock(&priv->tl);
		timeo = NULL;
		loop = 0;
		for (t = priv->timers; t != NULL; t = tp) {
			/* sanity check against a corrupted/looping list */
			if (loop++ > 10000)
				panic("tcpackproc1");
			tp = t->next;
			if (t->state == TcptimerON) {
				t->count--;
				if (t->count == 0) {
					timerstate(priv, t, TcptimerDONE);
					t->readynext = timeo;
					timeo = t;
				}
			}
		}
		qunlock(&priv->tl);

		/* fire the expired timers without holding the timer lock */
		loop = 0;
		for (t = timeo; t != NULL; t = t->readynext) {
			if (loop++ > 10000)
				panic("tcpackproc2");
			if (t->state == TcptimerDONE && t->func != NULL) {
				/* discard error style */
				if (!waserror())
					(*t->func) (t->arg);
				poperror();
			}
		}

		limborexmit(tcp);
	}
}
428
/* Arm timer 't': reload its tick count from its start interval and chain
 * it onto the running list.  A NULL timer or zero start interval is a
 * no-op. */
static void tcpgo(struct tcppriv *priv, Tcptimer *t)
{
	if (t == NULL || t->start == 0)
		return;

	qlock(&priv->tl);
	t->count = t->start;
	timerstate(priv, t, TcptimerON);
	qunlock(&priv->tl);
}
439
/* Disarm timer 't', unchaining it from the running list.  NULL is a
 * no-op. */
static void tcphalt(struct tcppriv *priv, Tcptimer *t)
{
	if (t == NULL)
		return;

	qlock(&priv->tl);
	timerstate(priv, t, TcptimerOFF);
	qunlock(&priv->tl);
}
449
/* Exponential retransmit backoff factor: returns 2^n. */
static int backoff(int n)
{
	int factor = 1;

	while (n-- > 0)
		factor <<= 1;
	return factor;
}
454
/* Tear down conversation 's' on our side: stop routing inbound segments to
 * it, halt all timers, free the out-of-order resequencing queue, hang up
 * the user queues, and enter Closed.  'reason' is the error string
 * surfaced to users (NULL for a clean close). */
static void localclose(struct conv *s, char *reason)
{
	/* called with tcb locked */
	Tcpctl *tcb;
	Reseq *rp, *rp1;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	/* remove from the demux hash table: no more inbound segments */
	iphtrem(&tpriv->ht, s);

	tcphalt(tpriv, &tcb->timer);
	tcphalt(tpriv, &tcb->rtt_timer);
	tcphalt(tpriv, &tcb->acktimer);
	tcphalt(tpriv, &tcb->katimer);

	/* Flush reassembly queue; nothing more can arrive */
	for (rp = tcb->reseq; rp != NULL; rp = rp1) {
		rp1 = rp->next;
		freeblist(rp->bp);
		kfree(rp);
	}
	tcb->reseq = NULL;

	/* a failed active open reports its reason to the connector */
	if (tcb->state == Syn_sent)
		Fsconnected(s, reason);

	qhangup(s->rq, reason);
	qhangup(s->wq, reason);

	tcpsetstate(s, Closed);

	/* listener will check the rq state */
	if (s->state == Announced)
		rendez_wakeup(&s->listenr);
}
492
493 /* mtu (- TCP + IP hdr len) of 1st hop */
494 static int tcpmtu(struct Ipifc *ifc, int version, int *scale)
495 {
496         int mtu;
497
498         switch (version) {
499                 default:
500                 case V4:
501                         mtu = DEF_MSS;
502                         if (ifc != NULL)
503                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
504                         break;
505                 case V6:
506                         mtu = DEF_MSS6;
507                         if (ifc != NULL)
508                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
509                         break;
510         }
511         *scale = HaveWS | 7;
512
513         return mtu;
514 }
515
516 static void tcb_check_tso(Tcpctl *tcb)
517 {
518         /* This can happen if the netdev isn't up yet. */
519         if (!tcb->ifc)
520                 return;
521         if (tcb->ifc->feat & NETF_TSO)
522                 tcb->flags |= TSO;
523         else
524                 tcb->flags &= ~TSO;
525 }
526
/* Reset conv 's's protocol control block for 'mode' (TCP_LISTEN or
 * TCP_CONNECT): zero the tcb, set initial RTT and ssthresh, arm the timer
 * callbacks, and - for non-listeners - build the prototype pseudo-header
 * later used to compose outbound segments. */
static void inittcpctl(struct conv *s, int mode)
{
	Tcpctl *tcb;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int mss;

	tcb = (Tcpctl *) s->ptcl;

	memset(tcb, 0, sizeof(Tcpctl));

	tcb->ssthresh = UINT32_MAX;	/* effectively "no threshold" yet */
	tcb->srtt = tcp_irtt;
	tcb->mdev = 0;

	/* setup timers */
	tcb->timer.start = tcp_irtt / MSPTICK;
	tcb->timer.func = tcptimeout;
	tcb->timer.arg = s;
	tcb->rtt_timer.start = MAX_TIME;
	tcb->acktimer.start = TCP_ACK / MSPTICK;
	tcb->acktimer.func = tcpacktimer;
	tcb->acktimer.arg = s;
	tcb->katimer.start = DEF_KAT / MSPTICK;
	tcb->katimer.func = tcpkeepalive;
	tcb->katimer.arg = s;

	mss = DEF_MSS;

	/* create a prototype(pseudo) header */
	if (mode != TCP_LISTEN) {
		/* pick a source address if the caller didn't bind one */
		if (ipcmp(s->laddr, IPnoaddr) == 0)
			findlocalip(s->p->f, s->laddr, s->raddr);

		switch (s->ipversion) {
			case V4:
				h4 = &tcb->protohdr.tcp4hdr;
				memset(h4, 0, sizeof(*h4));
				h4->proto = IP_TCPPROTO;
				hnputs(h4->tcpsport, s->lport);
				hnputs(h4->tcpdport, s->rport);
				v6tov4(h4->tcpsrc, s->laddr);
				v6tov4(h4->tcpdst, s->raddr);
				break;
			case V6:
				h6 = &tcb->protohdr.tcp6hdr;
				memset(h6, 0, sizeof(*h6));
				h6->proto = IP_TCPPROTO;
				hnputs(h6->tcpsport, s->lport);
				hnputs(h6->tcpdport, s->rport);
				ipmove(h6->tcpsrc, s->laddr);
				ipmove(h6->tcpdst, s->raddr);
				mss = DEF_MSS6;
				break;
			default:
				panic("inittcpctl: version %d", s->ipversion);
		}
	}

	tcb->ifc = findipifc(s->p->f, s->laddr, 0);
	tcb->mss = mss;
	tcb->typical_mss = mss;
	/* initial congestion window is a multiple of the MSS */
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* default is no window scaling */
	tcb->window = QMAX;
	tcb->rcv.wnd = QMAX;
	tcb->rcv.scale = 0;
	tcb->snd.scale = 0;
	tcb_check_tso(tcb);
}
598
/*
 *  called with s qlocked
 *
 *  Common entry for both passive (TCP_LISTEN) and active (TCP_CONNECT)
 *  opens: lazily start the shared timer kthread, initialize the tcb, hash
 *  the conv for inbound demux, and kick off the state machine.
 */
static void tcpstart(struct conv *s, int mode)
{
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	char *kpname;

	tpriv = s->p->priv;

	/* double-checked (under apl) one-time start of tcpackproc */
	if (tpriv->ackprocstarted == 0) {
		qlock(&tpriv->apl);
		if (tpriv->ackprocstarted == 0) {
			/* tcpackproc needs to free this if it ever exits */
			kpname = kmalloc(KNAMELEN, MEM_WAIT);
			snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
			ktask(kpname, tcpackproc, s->p);
			tpriv->ackprocstarted = 1;
		}
		qunlock(&tpriv->apl);
	}

	tcb = (Tcpctl *) s->ptcl;

	inittcpctl(s, mode);

	iphtadd(&tpriv->ht, s);
	switch (mode) {
		case TCP_LISTEN:
			tpriv->stats[PassiveOpens]++;
			tcb->flags |= CLONE;
			tcpsetstate(s, Listen);
			break;

		case TCP_CONNECT:
			tpriv->stats[ActiveOpens]++;
			tcb->flags |= ACTIVE;
			tcpsndsyn(s, tcb);
			tcpsetstate(s, Syn_sent);
			tcpoutput(s);
			break;
	}
}
643
644 static char *tcpflag(uint16_t flag)
645 {
646         static char buf[128];
647
648         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
649         if (flag & URG)
650                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
651         if (flag & ACK)
652                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
653         if (flag & PSH)
654                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
655         if (flag & RST)
656                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
657         if (flag & SYN)
658                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
659         if (flag & FIN)
660                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
661
662         return buf;
663 }
664
665 /* Helper, determine if we should send a TCP timestamp.  ts_val was the
666  * timestamp from our distant end.  We'll also send a TS on SYN (no ACK). */
667 static bool tcp_seg_has_ts(Tcp *tcph)
668 {
669         return tcph->ts_val || ((tcph->flags & SYN) && !(tcph->flags & ACK));
670 }
671
/* Given a TCP header/segment and default header size (e.g. TCP4_HDRSIZE),
 * return the actual hdr_len and opt_pad.  The sizing must mirror the
 * layout write_opts() emits: MSS/WS/SACK-permitted options on SYNs, a
 * timestamp option when applicable, any SACK blocks from tcb->rcv, plus
 * NOOP padding to a 4-byte boundary.  'tcb' may be NULL (then no SACK
 * blocks are counted). */
static void compute_hdrlen_optpad(Tcp *tcph, uint16_t default_hdrlen,
                                  uint16_t *ret_hdrlen, uint16_t *ret_optpad,
                                  Tcpctl *tcb)
{
	uint16_t hdrlen = default_hdrlen;
	uint16_t optpad = 0;

	if (tcph->flags & SYN) {
		if (tcph->mss)
			hdrlen += MSS_LENGTH;
		if (tcph->ws)
			hdrlen += WS_LENGTH;
		if (tcph->sack_ok)
			hdrlen += SACK_OK_LENGTH;
	}
	if (tcp_seg_has_ts(tcph)) {
		hdrlen += TS_LENGTH;
		/* SYNs have other opts, don't do the PREPAD NOOP optimization. */
		if (!(tcph->flags & SYN))
			hdrlen += TS_SEND_PREPAD;
	}
	/* SACK option: kind + length + 8 bytes per block */
	if (tcb && tcb->rcv.nr_sacks)
		hdrlen += 2 + tcb->rcv.nr_sacks * 8;
	/* round up to a multiple of 4; the remainder becomes NOOP padding */
	optpad = hdrlen & 3;
	if (optpad)
		optpad = 4 - optpad;
	hdrlen += optpad;
	*ret_hdrlen = hdrlen;
	*ret_optpad = optpad;
}
704
/* Writes the TCP options for tcph to opt.  The layout must match what
 * compute_hdrlen_optpad() sized: SYN-only options first, then the
 * timestamp, then any SACK blocks from tcb->rcv, then 'optpad' bytes of
 * NOOP padding. */
static void write_opts(Tcp *tcph, uint8_t *opt, uint16_t optpad, Tcpctl *tcb)
{
	if (tcph->flags & SYN) {
		if (tcph->mss != 0) {
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		if (tcph->ws != 0) {
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		if (tcph->sack_ok) {
			*opt++ = SACK_OK_OPT;
			*opt++ = SACK_OK_LENGTH;
		}
	}
	if (tcp_seg_has_ts(tcph)) {
		/* two leading NOOPs align the TS option on non-SYN segments
		 * (the TS_SEND_PREPAD counted by compute_hdrlen_optpad) */
		if (!(tcph->flags & SYN)) {
			*opt++ = NOOPOPT;
			*opt++ = NOOPOPT;
		}
		*opt++ = TS_OPT;
		*opt++ = TS_LENGTH;
		/* Setting TSval, our time */
		hnputl(opt, milliseconds());
		opt += 4;
		/* Setting TSecr, the time we last saw from them, stored in ts_val */
		hnputl(opt, tcph->ts_val);
		opt += 4;
	}
	if (tcb && tcb->rcv.nr_sacks) {
		*opt++ = SACK_OPT;
		/* option length: kind + length + 8 bytes per SACK block */
		*opt++ = 2 + tcb->rcv.nr_sacks * 8;
		for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
			hnputl(opt, tcb->rcv.sacks[i].left);
			opt += 4;
			hnputl(opt, tcb->rcv.sacks[i].right);
			opt += 4;
		}
	}
	/* fill the rest of the option space with NOOPs */
	while (optpad-- > 0)
		*opt++ = NOOPOPT;
}
752
753 /* Given a data block (or NULL) returns a block with enough header room that we
754  * can send out.  block->wp is set to the beginning of the payload.  Returns
755  * NULL on some sort of error. */
756 static struct block *alloc_or_pad_block(struct block *data,
757                                         uint16_t total_hdr_size)
758 {
759         if (data) {
760                 data = padblock(data, total_hdr_size);
761                 if (data == NULL)
762                         return NULL;
763         } else {
764                 /* the 64 pad is to meet mintu's */
765                 data = block_alloc(total_hdr_size + 64, MEM_WAIT);
766                 if (data == NULL)
767                         return NULL;
768                 data->wp += total_hdr_size;
769         }
770         return data;
771 }
772
/* Compose an outbound IPv6 TCP segment: prepend header room to 'data'
 * (which may be NULL), copy in the prototype pseudo-header 'ph', write the
 * TCP header and options from 'tcph', checksum over the pseudo-header
 * form, then convert the pseudo-header back into a real IPv6 header.
 * Returns the finished block, or NULL on allocation failure. */
static struct block *htontcp6(Tcp *tcph, struct block *data, Tcp6hdr *ph,
                              Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp6hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP6_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP6_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp).  Note TCP structs include IP. */
	data->network_offset = 0;
	data->transport_offset = offsetof(Tcp6hdr, tcpsport);

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp6hdr *) (data->rp);
	memmove(h, ph, TCP6_TCBPHDRSZ);

	/* compose pseudo tcp header, do cksum calculation */
	hnputl(h->vcf, hdrlen + dlen);
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
	h->ttl = ph->proto;

	/* copy in variable bits */
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* hdrlen is in bytes; << 10 puts hdrlen/4 in the data-offset field */
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	/* the advertised window is post-scaling */
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	/* move from pseudo header back to normal ip header */
	memset(h->vcf, 0, 4);
	h->vcf[0] = IP_VER6;
	hnputs(h->ploadlen, hdrlen + dlen);
	h->proto = ph->proto;

	return data;
}
823
/* Compose an outbound IPv4 TCP segment, analogous to htontcp6(): prepend
 * header room to 'data' (may be NULL), copy the prototype pseudo-header
 * 'ph', write the TCP header and options from 'tcph', and leave a partial
 * pseudo-header checksum in place.  Returns the block, or NULL on
 * allocation failure. */
static struct block *htontcp4(Tcp *tcph, struct block *data, Tcp4hdr *ph,
                              Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp4hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP4_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP4_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp).  Note TCP structs include IP. */
	data->network_offset = 0;
	data->transport_offset = offsetof(Tcp4hdr, tcpsport);

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp4hdr *) (data->rp);
	memmove(h, ph, TCP4_TCBPHDRSZ);

	/* copy in variable bits */
	hnputs(h->tcplen, hdrlen + dlen);
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* hdrlen is in bytes; << 10 puts hdrlen/4 in the data-offset field */
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	/* the advertised window is post-scaling */
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		assert(data->transport_offset == TCP4_IPLEN + TCP4_PHDRSIZE);
		/* Partial csum over the pseudo-header only; Btcpck +
		 * tx_csum_offset flag it for completion later (presumably by
		 * csum offload hardware or the IP layer - confirm there). */
		csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
		data->tx_csum_offset = ph->tcpcksum - ph->tcpsport;
		data->flag |= Btcpck;
	}

	return data;
}
867
/* Parse a SACK option into tcph->sacks.  'opt' points at the option's kind
 * byte, 'optlen' is the option length byte.  Malformed blocks
 * (left >= right) are skipped, compacting the array; an implausible block
 * count (> MAX_NR_SACKS_PER_PACKET) causes the whole option to be
 * ignored. */
static void parse_inbound_sacks(Tcp *tcph, uint8_t *opt, uint16_t optlen)
{
	uint8_t nr_sacks;
	uint32_t left, right;

	nr_sacks = (optlen - 2) / 8;
	if (nr_sacks > MAX_NR_SACKS_PER_PACKET)
		return;
	opt += 2;	/* skip the kind and length bytes */
	for (int i = 0; i < nr_sacks; i++, opt += 8) {
		left = nhgetl(opt);
		right = nhgetl(opt + 4);
		if (seq_ge(left, right)) {
			/* bad / malicious SACK.  Skip it, and adjust. */
			nr_sacks--;
			i--;	/* stay on this array element next loop */
			continue;
		}
		tcph->sacks[i].left = left;
		tcph->sacks[i].right = right;
	}
	tcph->nr_sacks = nr_sacks;
}
891
892 static void parse_inbound_opts(Tcp *tcph, uint8_t *opt, uint16_t optsize)
893 {
894         uint16_t optlen;
895
896         while (optsize > 0 && *opt != EOLOPT) {
897                 if (*opt == NOOPOPT) {
898                         optsize--;
899                         opt++;
900                         continue;
901                 }
902                 optlen = opt[1];
903                 if (optlen < 2 || optlen > optsize)
904                         break;
905                 switch (*opt) {
906                         case MSSOPT:
907                                 if (optlen == MSS_LENGTH)
908                                         tcph->mss = nhgets(opt + 2);
909                                 break;
910                         case WSOPT:
911                                 if (optlen == WS_LENGTH && *(opt + 2) <= MAX_WS_VALUE)
912                                         tcph->ws = HaveWS | *(opt + 2);
913                                 break;
914                         case SACK_OK_OPT:
915                                 if (optlen == SACK_OK_LENGTH)
916                                         tcph->sack_ok = TRUE;
917                                 break;
918                         case SACK_OPT:
919                                 parse_inbound_sacks(tcph, opt, optlen);
920                                 break;
921                         case TS_OPT:
922                                 if (optlen == TS_LENGTH) {
923                                         tcph->ts_val = nhgetl(opt + 2);
924                                         tcph->ts_ecr = nhgetl(opt + 6);
925                                 }
926                                 break;
927                 }
928                 optsize -= optlen;
929                 opt += optlen;
930         }
931 }
932
933 /* Helper, clears the opts.  We'll later set them with e.g. parse_inbound_opts,
934  * set them manually, or something else. */
935 static void clear_tcph_opts(Tcp *tcph)
936 {
937         tcph->mss = 0;
938         tcph->ws = 0;
939         tcph->sack_ok = FALSE;
940         tcph->nr_sacks = 0;
941         tcph->ts_val = 0;
942         tcph->ts_ecr = 0;
943 }
944
945 static int ntohtcp6(Tcp *tcph, struct block **bpp)
946 {
947         Tcp6hdr *h;
948         uint16_t hdrlen;
949
950         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
951         if (*bpp == NULL)
952                 return -1;
953
954         h = (Tcp6hdr *) ((*bpp)->rp);
955         tcph->source = nhgets(h->tcpsport);
956         tcph->dest = nhgets(h->tcpdport);
957         tcph->seq = nhgetl(h->tcpseq);
958         tcph->ack = nhgetl(h->tcpack);
959         hdrlen = (h->tcpflag[0] >> 2) & ~3;
960         if (hdrlen < TCP6_HDRSIZE) {
961                 freeblist(*bpp);
962                 return -1;
963         }
964
965         tcph->flags = h->tcpflag[1];
966         tcph->wnd = nhgets(h->tcpwin);
967         tcph->urg = nhgets(h->tcpurg);
968         clear_tcph_opts(tcph);
969         tcph->len = nhgets(h->ploadlen) - hdrlen;
970
971         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
972         if (*bpp == NULL)
973                 return -1;
974         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP6_HDRSIZE);
975         return hdrlen;
976 }
977
978 static int ntohtcp4(Tcp *tcph, struct block **bpp)
979 {
980         Tcp4hdr *h;
981         uint16_t hdrlen;
982
983         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
984         if (*bpp == NULL)
985                 return -1;
986
987         h = (Tcp4hdr *) ((*bpp)->rp);
988         tcph->source = nhgets(h->tcpsport);
989         tcph->dest = nhgets(h->tcpdport);
990         tcph->seq = nhgetl(h->tcpseq);
991         tcph->ack = nhgetl(h->tcpack);
992
993         hdrlen = (h->tcpflag[0] >> 2) & ~3;
994         if (hdrlen < TCP4_HDRSIZE) {
995                 freeblist(*bpp);
996                 return -1;
997         }
998
999         tcph->flags = h->tcpflag[1];
1000         tcph->wnd = nhgets(h->tcpwin);
1001         tcph->urg = nhgets(h->tcpurg);
1002         clear_tcph_opts(tcph);
1003         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1004
1005         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1006         if (*bpp == NULL)
1007                 return -1;
1008         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP4_HDRSIZE);
1009         return hdrlen;
1010 }
1011
1012 /*
1013  *  For outgoing calls, generate an initial sequence
1014  *  number and put a SYN on the send queue
1015  */
static void tcpsndsyn(struct conv *s, Tcpctl *tcb)
{
	/* random ISS, standard practice to resist sequence prediction */
	urandom_read(&tcb->iss, sizeof(tcb->iss));
	tcb->rttseq = tcb->iss;
	tcb->snd.wl2 = tcb->iss;
	tcb->snd.una = tcb->iss;
	tcb->snd.rtx = tcb->rttseq;
	tcb->snd.nxt = tcb->rttseq;
	/* the SYN consumes one sequence number; flgcnt accounts for it */
	tcb->flgcnt++;
	/* FORCE gets the output machinery to send even with no data */
	tcb->flags |= FORCE;
	tcb->sndsyntime = NOW;

	/* set desired mss and scale */
	tcb->mss = tcpmtu(tcb->ifc, s->ipversion, &tcb->scale);
}
1031
/* Send a RST in response to the packet described by seg, without any
 * conversation state.  source/dest are the addresses from the offending
 * packet, so the reply swaps them.  seg is clobbered in the process.
 * 'reason' is only for the netlog. */
static void sndrst(struct Proto *tcp, uint8_t *source, uint8_t *dest,
                   uint16_t length, Tcp *seg, uint8_t version, char *reason)
{
	struct block *hbp;
	uint8_t rflags;
	struct tcppriv *tpriv;
	Tcp4hdr ph4;
	Tcp6hdr ph6;

	netlog(tcp->f, Logtcpreset, "sndrst: %s\n", reason);

	tpriv = tcp->priv;

	/* never answer a RST with a RST (avoids RST storms) */
	if (seg->flags & RST)
		return;

	/* make pseudo header */
	switch (version) {
		case V4:
			memset(&ph4, 0, sizeof(ph4));
			ph4.vihl = IP_VER4;
			/* swapped: we are replying to the sender */
			v6tov4(ph4.tcpsrc, dest);
			v6tov4(ph4.tcpdst, source);
			ph4.proto = IP_TCPPROTO;
			hnputs(ph4.tcplen, TCP4_HDRSIZE);
			hnputs(ph4.tcpsport, seg->dest);
			hnputs(ph4.tcpdport, seg->source);
			break;
		case V6:
			memset(&ph6, 0, sizeof(ph6));
			ph6.vcf[0] = IP_VER6;
			ipmove(ph6.tcpsrc, dest);
			ipmove(ph6.tcpdst, source);
			ph6.proto = IP_TCPPROTO;
			hnputs(ph6.ploadlen, TCP6_HDRSIZE);
			hnputs(ph6.tcpsport, seg->dest);
			hnputs(ph6.tcpdport, seg->source);
			break;
		default:
			panic("sndrst: version %d", version);
	}

	tpriv->stats[OutRsts]++;
	rflags = RST;

	/* convince the other end that this reset is in band */
	if (seg->flags & ACK) {
		/* they told us where they are; reset from there */
		seg->seq = seg->ack;
		seg->ack = 0;
	} else {
		/* no ACK to anchor on: ack everything they sent.  SYN and
		 * FIN each consume one sequence number beyond the data. */
		rflags |= ACK;
		seg->ack = seg->seq;
		seg->seq = 0;
		if (seg->flags & SYN)
			seg->ack++;
		seg->ack += length;
		if (seg->flags & FIN)
			seg->ack++;
	}
	/* strip window/urg and all options; a bare RST (or RST|ACK) goes out */
	seg->flags = rflags;
	seg->wnd = 0;
	seg->urg = 0;
	seg->mss = 0;
	seg->ws = 0;
	seg->sack_ok = FALSE;
	seg->nr_sacks = 0;
	/* seg->ts_val is already set with their timestamp */
	switch (version) {
		case V4:
			hbp = htontcp4(seg, NULL, &ph4, NULL);
			if (hbp == NULL)
				return;
			ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		case V6:
			hbp = htontcp6(seg, NULL, &ph6, NULL);
			if (hbp == NULL)
				return;
			ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		default:
			panic("sndrst2: version %d", version);
	}
}
1116
1117 /*
1118  *  send a reset to the remote side and close the conversation
1119  *  called with s qlocked
1120  */
static void tcphangup(struct conv *s)
{
	ERRSTACK(1);
	Tcp seg;
	Tcpctl *tcb;
	struct block *hbp;

	tcb = (Tcpctl *) s->ptcl;
	/* only worth sending a RST if we ever had a remote address */
	if (ipcmp(s->raddr, IPnoaddr)) {
		/* discard error style, poperror regardless */
		if (!waserror()) {
			/* best-effort RST|ACK at our current seq/ack state;
			 * errors (e.g. from the output path) are ignored */
			seg.flags = RST | ACK;
			seg.ack = tcb->rcv.nxt;
			tcb->last_ack_sent = seg.ack;
			tcb->rcv.una = 0;
			seg.seq = tcb->snd.nxt;
			seg.wnd = 0;
			seg.urg = 0;
			seg.mss = 0;
			seg.ws = 0;
			seg.sack_ok = FALSE;
			seg.nr_sacks = 0;
			seg.ts_val = tcb->ts_recent;
			switch (s->ipversion) {
				case V4:
					tcb->protohdr.tcp4hdr.vihl = IP_VER4;
					hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
					ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
					break;
				case V6:
					tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
					hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
					ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
					break;
				default:
					panic("tcphangup: version %d", s->ipversion);
			}
		}
		poperror();
	}
	/* tear down local state regardless of whether the RST went out */
	localclose(s, NULL);
}
1163
1164 /*
1165  *  (re)send a SYN ACK
1166  */
/* Send (or resend) the SYN-ACK for the limbo entry lp.  Option decisions
 * made here (window scale, SACK) are recorded in lp so tcpincoming can copy
 * them into the real TCB later.  Returns 0 on success, -1 if the header
 * block could not be built. */
static int sndsynack(struct Proto *tcp, Limbo *lp)
{
	struct block *hbp;
	Tcp4hdr ph4;
	Tcp6hdr ph6;
	Tcp seg;
	int scale;
	uint8_t flag = 0;

	/* make pseudo header */
	switch (lp->version) {
		case V4:
			memset(&ph4, 0, sizeof(ph4));
			ph4.vihl = IP_VER4;
			v6tov4(ph4.tcpsrc, lp->laddr);
			v6tov4(ph4.tcpdst, lp->raddr);
			ph4.proto = IP_TCPPROTO;
			hnputs(ph4.tcplen, TCP4_HDRSIZE);
			hnputs(ph4.tcpsport, lp->lport);
			hnputs(ph4.tcpdport, lp->rport);
			break;
		case V6:
			memset(&ph6, 0, sizeof(ph6));
			ph6.vcf[0] = IP_VER6;
			ipmove(ph6.tcpsrc, lp->laddr);
			ipmove(ph6.tcpdst, lp->raddr);
			ph6.proto = IP_TCPPROTO;
			hnputs(ph6.ploadlen, TCP6_HDRSIZE);
			hnputs(ph6.tcpsport, lp->lport);
			hnputs(ph6.tcpdport, lp->rport);
			break;
		default:
			panic("sndrst: version %d", lp->version);
	}
	/* remember the interface so the TCB can use it later */
	lp->ifc = findipifc(tcp->f, lp->laddr, 0);

	/* ack their SYN (irs + 1), offer our ISS and MTU-derived MSS */
	seg.seq = lp->iss;
	seg.ack = lp->irs + 1;
	seg.flags = SYN | ACK;
	seg.urg = 0;
	seg.mss = tcpmtu(lp->ifc, lp->version, &scale);
	seg.wnd = QMAX;
	seg.ts_val = lp->ts_val;
	seg.nr_sacks = 0;

	/* if the other side set scale, we should too */
	if (lp->rcvscale) {
		seg.ws = scale;
		lp->sndscale = scale;
	} else {
		seg.ws = 0;
		lp->sndscale = 0;
	}
	/* only agree to SACK if they offered it and we support it */
	if (SACK_SUPPORTED)
		seg.sack_ok = lp->sack_ok;
	else
		seg.sack_ok = FALSE;

	switch (lp->version) {
		case V4:
			hbp = htontcp4(&seg, NULL, &ph4, NULL);
			if (hbp == NULL)
				return -1;
			ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		case V6:
			hbp = htontcp6(&seg, NULL, &ph6, NULL);
			if (hbp == NULL)
				return -1;
			ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		default:
			panic("sndsnack: version %d", lp->version);
	}
	/* timestamp for limborexmit's retransmit backoff */
	lp->lastsend = NOW;
	return 0;
}
1244
1245 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
1246
1247 /*
1248  *  put a call into limbo and respond with a SYN ACK
1249  *
1250  *  called with proto locked
1251  */
static void limbo(struct conv *s, uint8_t *source, uint8_t *dest, Tcp *seg,
                  int version)
{
	Limbo *lp, **l;
	struct tcppriv *tpriv;
	int h;

	tpriv = s->p->priv;
	h = hashipa(source, seg->source);

	/* look for an existing limbo entry for this 4-tuple */
	for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
		lp = *l;
		if (lp->lport != seg->dest || lp->rport != seg->source
			|| lp->version != version)
			continue;
		if (ipcmp(lp->raddr, source) != 0)
			continue;
		if (ipcmp(lp->laddr, dest) != 0)
			continue;

		/* each new SYN restarts the retransmits */
		lp->irs = seg->seq;
		break;
	}
	lp = *l;
	if (lp == NULL) {
		/* no match: get an entry, recycling the head of this bucket
		 * if we're at the global limbo cap */
		if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
			lp = tpriv->lht[h];
			tpriv->lht[h] = lp->next;
			lp->next = NULL;
		} else {
			lp = kzmalloc(sizeof(*lp), 0);
			if (lp == NULL)
				return;
			tpriv->nlimbo++;
		}
		/* record everything from the SYN that the eventual TCB needs */
		*l = lp;
		lp->version = version;
		ipmove(lp->laddr, dest);
		ipmove(lp->raddr, source);
		lp->lport = seg->dest;
		lp->rport = seg->source;
		lp->mss = seg->mss;
		lp->rcvscale = seg->ws;
		lp->sack_ok = seg->sack_ok;
		lp->irs = seg->seq;
		lp->ts_val = seg->ts_val;
		urandom_read(&lp->iss, sizeof(lp->iss));
	}

	/* couldn't even send the SYN-ACK: drop the entry */
	if (sndsynack(s->p, lp) < 0) {
		*l = lp->next;
		tpriv->nlimbo--;
		kfree(lp);
	}
}
1308
1309 /*
1310  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1311  */
1312 static void limborexmit(struct Proto *tcp)
1313 {
1314         struct tcppriv *tpriv;
1315         Limbo **l, *lp;
1316         int h;
1317         int seen;
1318         uint64_t now;
1319
1320         tpriv = tcp->priv;
1321
1322         if (!canqlock(&tcp->qlock))
1323                 return;
1324         seen = 0;
1325         now = NOW;
1326         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1327                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1328                         lp = *l;
1329                         seen++;
1330                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1331                                 continue;
1332
1333                         /* time it out after 1 second */
1334                         if (++(lp->rexmits) > 5) {
1335                                 tpriv->nlimbo--;
1336                                 *l = lp->next;
1337                                 kfree(lp);
1338                                 continue;
1339                         }
1340
1341                         /* if we're being attacked, don't bother resending SYN ACK's */
1342                         if (tpriv->nlimbo > 100)
1343                                 continue;
1344
1345                         if (sndsynack(tcp, lp) < 0) {
1346                                 tpriv->nlimbo--;
1347                                 *l = lp->next;
1348                                 kfree(lp);
1349                                 continue;
1350                         }
1351
1352                         l = &lp->next;
1353                 }
1354         }
1355         qunlock(&tcp->qlock);
1356 }
1357
1358 /*
1359  *  lookup call in limbo.  if found, throw it out.
1360  *
1361  *  called with proto locked
1362  */
1363 static void limborst(struct conv *s, Tcp *segp, uint8_t *src, uint8_t *dst,
1364                      uint8_t version)
1365 {
1366         Limbo *lp, **l;
1367         int h;
1368         struct tcppriv *tpriv;
1369
1370         tpriv = s->p->priv;
1371
1372         /* find a call in limbo */
1373         h = hashipa(src, segp->source);
1374         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1375                 lp = *l;
1376                 if (lp->lport != segp->dest || lp->rport != segp->source
1377                         || lp->version != version)
1378                         continue;
1379                 if (ipcmp(lp->laddr, dst) != 0)
1380                         continue;
1381                 if (ipcmp(lp->raddr, src) != 0)
1382                         continue;
1383
1384                 /* RST can only follow the SYN */
1385                 if (segp->seq == lp->irs + 1) {
1386                         tpriv->nlimbo--;
1387                         *l = lp->next;
1388                         kfree(lp);
1389                 }
1390                 break;
1391         }
1392 }
1393
1394 /* The advertised MSS (e.g. 1460) includes any per-packet TCP options, such as
1395  * TCP timestamps.  A given packet will contain mss bytes, but only typical_mss
1396  * bytes of *data*.  If we know we'll use those options, we should adjust our
1397  * typical_mss, which will affect the cwnd. */
1398 static void adjust_typical_mss_for_opts(Tcp *tcph, Tcpctl *tcb)
1399 {
1400         uint16_t opt_size = 0;
1401
1402         if (tcph->ts_val)
1403                 opt_size += TS_LENGTH + TS_SEND_PREPAD;
1404         opt_size = ROUNDUP(opt_size, 4);
1405         tcb->typical_mss -= opt_size;
1406 }
1407
1408 /*
1409  *  come here when we finally get an ACK to our SYN-ACK.
1410  *  lookup call in limbo.  if found, create a new conversation
1411  *
1412  *  called with proto locked
1413  */
static struct conv *tcpincoming(struct conv *s, Tcp *segp, uint8_t *src,
                                uint8_t *dst, uint8_t version)
{
	struct conv *new;
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	Limbo *lp, **l;
	int h;

	/* unless it's just an ack, it can't be someone coming out of limbo */
	if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
		return NULL;

	tpriv = s->p->priv;

	/* find a call in limbo */
	h = hashipa(src, segp->source);
	for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
		netlog(s->p->f, Logtcp,
		       "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
		       segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
		       lp->lport, version, lp->version);

		if (lp->lport != segp->dest || lp->rport != segp->source
			|| lp->version != version)
			continue;
		if (ipcmp(lp->laddr, dst) != 0)
			continue;
		if (ipcmp(lp->raddr, src) != 0)
			continue;

		/* we're assuming no data with the initial SYN */
		if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
			netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
			       segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
			lp = NULL;
		} else {
			/* good handshake: take the entry off the limbo list */
			tpriv->nlimbo--;
			*l = lp->next;
		}
		break;
	}
	if (lp == NULL)
		return NULL;

	/* make a new conversation cloned from the listener */
	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
	if (new == NULL)
		return NULL;

	/* start from a copy of the listener's TCB, then fix up everything
	 * that must be per-connection (timers, seq state, options) */
	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
	tcb = (Tcpctl *) new->ptcl;
	tcb->flags &= ~CLONE;
	tcb->timer.arg = new;
	tcb->timer.state = TcptimerOFF;
	tcb->acktimer.arg = new;
	tcb->acktimer.state = TcptimerOFF;
	tcb->katimer.arg = new;
	tcb->katimer.state = TcptimerOFF;
	tcb->rtt_timer.arg = new;
	tcb->rtt_timer.state = TcptimerOFF;

	/* receive side: next expected byte is one past their SYN */
	tcb->irs = lp->irs;
	tcb->rcv.nxt = tcb->irs + 1;
	tcb->rcv.urg = tcb->rcv.nxt;

	/* send side: our SYN (iss) is already acked, so una/rtx/nxt = iss+1 */
	tcb->iss = lp->iss;
	tcb->rttseq = tcb->iss;
	tcb->snd.wl2 = tcb->iss;
	tcb->snd.una = tcb->iss + 1;
	tcb->snd.rtx = tcb->iss + 1;
	tcb->snd.nxt = tcb->iss + 1;
	tcb->flgcnt = 0;
	tcb->flags |= SYNACK;

	/* our sending max segment size cannot be bigger than what he asked for */
	if (lp->mss != 0 && lp->mss < tcb->mss) {
		tcb->mss = lp->mss;
		tcb->typical_mss = tcb->mss;
	}
	adjust_typical_mss_for_opts(segp, tcb);

	/* Here's where we record the previously-decided header options.  They were
	 * actually decided on when we agreed to them in the SYNACK we sent.  We
	 * didn't create an actual TCB until now, so we can copy those decisions out
	 * of the limbo tracker and into the TCB. */
	tcb->ifc = lp->ifc;
	tcb->sack_ok = lp->sack_ok;
	/* window scaling */
	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
	tcb_check_tso(tcb);

	tcb->snd.wnd = segp->wnd;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* set initial round trip time */
	tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
	tcpsynackrtt(new);

	kfree(lp);

	/* set up proto header */
	switch (version) {
		case V4:
			h4 = &tcb->protohdr.tcp4hdr;
			memset(h4, 0, sizeof(*h4));
			h4->proto = IP_TCPPROTO;
			hnputs(h4->tcpsport, new->lport);
			hnputs(h4->tcpdport, new->rport);
			v6tov4(h4->tcpsrc, dst);
			v6tov4(h4->tcpdst, src);
			break;
		case V6:
			h6 = &tcb->protohdr.tcp6hdr;
			memset(h6, 0, sizeof(*h6));
			h6->proto = IP_TCPPROTO;
			hnputs(h6->tcpsport, new->lport);
			hnputs(h6->tcpdport, new->rport);
			ipmove(h6->tcpsrc, dst);
			ipmove(h6->tcpdst, src);
			break;
		default:
			panic("tcpincoming: version %d", new->ipversion);
	}

	tcpsetstate(new, Established);

	/* make the new conv findable by the demux hash */
	iphtadd(&tpriv->ht, new);

	return new;
}
1546
1547 /*
1548  *  use the time between the first SYN and it's ack as the
1549  *  initial round trip time
1550  */
1551 static void tcpsynackrtt(struct conv *s)
1552 {
1553         Tcpctl *tcb;
1554         uint64_t delta;
1555         struct tcppriv *tpriv;
1556
1557         tcb = (Tcpctl *) s->ptcl;
1558         tpriv = s->p->priv;
1559
1560         delta = NOW - tcb->sndsyntime;
1561         tcb->srtt = delta;
1562         tcb->mdev = delta / 2;
1563
1564         /* halt round trip timer */
1565         tcphalt(tpriv, &tcb->rtt_timer);
1566 }
1567
1568 /* For LFNs (long/fat), our default tx queue doesn't hold enough data, and TCP
1569  * blocks on the application - even if the app already has the data ready to go.
1570  * We need to hold the sent, unacked data (1x cwnd), plus all the data we might
1571  * send next RTT (1x cwnd).  Note this is called after cwnd was expanded. */
1572 static void adjust_tx_qio_limit(struct conv *s)
1573 {
1574         Tcpctl *tcb = (Tcpctl *) s->ptcl;
1575         size_t ideal_limit = tcb->cwind * 2;
1576
1577         /* This is called for every ACK, and it's not entirely free to update the
1578          * limit (locks, CVs, taps).  Updating in chunks of mss seems reasonable.
1579          * During SS, we'll update this on most ACKs (given each ACK increased the
1580          * cwind by > MSS).
1581          *
1582          * We also don't want a lot of tiny blocks from the user, but the way qio
1583          * works, you can put in as much as you want (Maxatomic) and then get
1584          * flow-controlled. */
1585         if (qgetlimit(s->wq) + tcb->typical_mss < ideal_limit)
1586                 qsetlimit(s->wq, ideal_limit);
1587         /* TODO: we could shrink the qio limit too, if we had a better idea what the
1588          * actual threshold was.  We want the limit to be the 'stable' cwnd * 2. */
1589 }
1590
1591 /* Attempts to merge later sacks into sack 'into' (index in the array) */
static void merge_sacks_into(Tcpctl *tcb, int into)
{
	struct sack_block *into_sack = &tcb->snd.sacks[into];
	struct sack_block *tcb_sack;
	int shift = 0;	/* number of later sacks absorbed into 'into' */

	for (int i = into + 1; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		/* sacks are kept sorted; first gap ends the merge run */
		if (seq_lt(into_sack->right, tcb_sack->left))
			break;
		/* overlap/adjacency: extend 'into' to cover this sack */
		if (seq_gt(tcb_sack->right, into_sack->right))
			into_sack->right = tcb_sack->right;
		shift++;
	}
	/* slide the surviving tail down over the absorbed entries */
	if (shift) {
		memmove(tcb->snd.sacks + into + 1,
		        tcb->snd.sacks + into + 1 + shift,
		        sizeof(struct sack_block) * (tcb->snd.nr_sacks - into - 1
		                                     - shift));
		tcb->snd.nr_sacks -= shift;
	}
}
1614
1615 /* If we update a sack, it means they received a packet (possibly out of order),
1616  * but they have not received earlier packets.  Otherwise, they would do a full
1617  * ACK.
1618  *
1619  * The trick is in knowing whether the reception growing this sack is due to a
1620  * retrans or due to packets from before our last loss event.  The rightmost
1621  * sack tends to grow a lot with packets we sent before the loss.  However,
1622  * intermediate sacks that grow are signs of a loss, since they only grow as a
1623  * result of retrans.
1624  *
1625  * This is only true for the first time through a retrans.  After we've gone
1626  * through a full retrans blast, the sack that hinted at the retrans loss (and
1627  * there could be multiple of them!) will continue to grow.  We could come up
1628  * with some tracking for this, but instead we'll just do a one-time deal.  You
1629  * can recover from one detected sack retrans loss.  After that, you'll have to
1630  * use the RTO.
1631  *
1632  * This won't catch some things, like a sack that grew and merged with the
1633  * rightmost sack.  This also won't work if you have a single sack.  We can't
1634  * tell where the retrans ends and the sending begins. */
1635 static bool sack_hints_at_loss(Tcpctl *tcb, struct sack_block *tcb_sack)
1636 {
1637         if (tcb->snd.recovery != SACK_RETRANS_RECOVERY)
1638                 return FALSE;
1639         return &tcb->snd.sacks[tcb->snd.nr_sacks - 1] != tcb_sack;
1640 }
1641
1642 static bool sack_contains(struct sack_block *tcb_sack, uint32_t seq)
1643 {
1644         return seq_le(tcb_sack->left, seq) && seq_lt(seq, tcb_sack->right);
1645 }
1646
1647 /* Debugging helper! */
1648 static void sack_asserter(Tcpctl *tcb, char *str)
1649 {
1650         struct sack_block *tcb_sack;
1651
1652         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
1653                 tcb_sack = &tcb->snd.sacks[i];
1654                 /* Checking invariants: snd.rtx is never inside a sack, sacks are always
1655                  * mutually exclusive. */
1656                 if (sack_contains(tcb_sack, tcb->snd.rtx) ||
1657                     ((i + 1 < tcb->snd.nr_sacks) && seq_ge(tcb_sack->right,
1658                                                                (tcb_sack + 1)->left))) {
1659                         printk("SACK ASSERT ERROR at %s\n", str);
1660                         printk("rtx %u una %u nxt %u, sack [%u, %u)\n",
1661                                tcb->snd.rtx, tcb->snd.una, tcb->snd.nxt, tcb_sack->left,
1662                                    tcb_sack->right);
1663                         for (int i = 0; i < tcb->snd.nr_sacks; i++)
1664                                 printk("\t %d: [%u, %u)\n", i, tcb->snd.sacks[i].left,
1665                                        tcb->snd.sacks[i].right);
1666                         backtrace();
1667                         panic("");
1668                 }
1669         }
1670 }
1671
/* Updates bookkeeping whenever a sack is added or updated: keeps snd.rtx out
 * of sacked ranges and, if the new sack hints that a retransmitted segment was
 * itself lost, takes a loss action early instead of waiting for the RTO. */
static void sack_has_changed(struct conv *s, Tcpctl *tcb,
                             struct sack_block *tcb_sack)
{
	/* Due to the change, snd.rtx might be in the middle of this sack.  Advance
	 * it to the right edge. */
	if (sack_contains(tcb_sack, tcb->snd.rtx))
		tcb->snd.rtx = tcb_sack->right;

	/* This is a sack for something we retransed and we think it means there was
	 * another loss.  Instead of waiting for the RTO, we can take action. */
	if (sack_hints_at_loss(tcb, tcb_sack)) {
		/* Like dupacks, require TCPREXMTTHRESH hints before reacting, so plain
		 * reordering doesn't trigger a spurious loss event. */
		if (++tcb->snd.sack_loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: sack rxmit loss: snd.rtx %u, sack [%u,%u), una %u, recovery_pt %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.rtx, tcb_sack->left, tcb_sack->right, tcb->snd.una,
			       tcb->snd.recovery_pt);
			/* Redo retrans, but keep the sacks and recovery point */
			tcp_loss_event(s, tcb);
			tcb->snd.rtx = tcb->snd.una;
			tcb->snd.sack_loss_hint = 0;
			/* Act like an RTO.  We just detected it earlier.  This prevents us
			 * from getting another sack hint loss this recovery period and from
			 * advancing the opportunistic right edge. */
			tcb->snd.recovery = RTO_RETRANS_RECOVERY;
			/* We didn't actually time out yet and we expect to keep getting
			 * sacks, so we don't want to flush or worry about in_flight.  If we
			 * messed something up, the RTO will still fire. */
			set_in_flight(tcb);
		}
	}
}
1705
1706 /* Advances tcb_sack's right edge, if new_right is farther, and updates the
1707  * bookkeeping due to the change. */
1708 static void update_right_edge(struct conv *s, Tcpctl *tcb,
1709                               struct sack_block *tcb_sack, uint32_t new_right)
1710 {
1711         if (seq_le(new_right, tcb_sack->right))
1712                 return;
1713         tcb_sack->right = new_right;
1714         merge_sacks_into(tcb, tcb_sack - tcb->snd.sacks);
1715         sack_has_changed(s, tcb, tcb_sack);
1716 }
1717
/* Inserts seg_sack into the sorted snd.sacks array, or grows an existing sack
 * that seg_sack overlaps or abuts.  Maintains the sorted, mutually-exclusive
 * invariant.  When the array is full, we sacrifice older information so the
 * rightmost sack is always tracked (needed for the in-flight estimate). */
static void update_or_insert_sack(struct conv *s, Tcpctl *tcb,
                                  struct sack_block *seg_sack)
{
	struct sack_block *tcb_sack;

	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb_sack->left, seg_sack->left)) {
			/* This includes adjacent (which I've seen!) and overlap. */
			if (seq_le(seg_sack->left, tcb_sack->right)) {
				update_right_edge(s, tcb, tcb_sack, seg_sack->right);
				return;
			}
			/* seg_sack is wholly right of this sack; keep scanning. */
			continue;
		}
		/* Update existing sack */
		if (tcb_sack->left == seg_sack->left) {
			update_right_edge(s, tcb, tcb_sack, seg_sack->right);
			return;
		}
		/* Found our slot */
		if (seq_gt(tcb_sack->left, seg_sack->left)) {
			if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
				/* Out of room, but it is possible this sack overlaps later
				 * sacks, including the max sack's right edge. */
				if (seq_ge(seg_sack->right, tcb_sack->left)) {
					/* Take over the sack */
					tcb_sack->left = seg_sack->left;
					update_right_edge(s, tcb, tcb_sack, seg_sack->right);
				}
				return;
			}
			/* O/W, it's our slot and we have room (at least one spot). */
			memmove(&tcb->snd.sacks[i + 1], &tcb->snd.sacks[i],
				sizeof(struct sack_block) * (tcb->snd.nr_sacks - i));
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			tcb->snd.nr_sacks++;
			/* The inserted sack might subsume the ones after it. */
			merge_sacks_into(tcb, i);
			sack_has_changed(s, tcb, tcb_sack);
			return;
		}
	}
	/* seg_sack is right of every tracked sack. */
	if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
		/* We didn't find space in the sack array. */
		tcb_sack = &tcb->snd.sacks[MAX_NR_SND_SACKS - 1];
		/* Need to always maintain the rightmost sack, discarding the prev */
		if (seq_gt(seg_sack->right, tcb_sack->right)) {
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			sack_has_changed(s, tcb, tcb_sack);
		}
		return;
	}
	/* Append as the new rightmost sack. */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks];
	tcb->snd.nr_sacks++;
	tcb_sack->left = seg_sack->left;
	tcb_sack->right = seg_sack->right;
	sack_has_changed(s, tcb, tcb_sack);
}
1778
/* Given the packet seg, track the sacks in TCB.  There are a few things: if seg
 * acks new data, some sacks might no longer be needed.  Some sacks might grow,
 * we might add new sacks, either of which can cause a merger.
 *
 * The important thing is that we always have the max sack entry: it must be
 * inserted for sure and findable.  We need that for our measurement of what
 * packets are in the network.
 *
 * Note that we keep sacks that are below snd.rtx (and above
 * seg.ack/tcb->snd.una) as best we can - we don't prune them.  We'll need those
 * for the in_flight estimate.
 *
 * When we run out of room, we'll have to throw away a sack.  Anything we throw
 * away below snd.rtx will be counted as 'in flight', even though it isn't.  If
 * we throw away something greater than snd.rtx, we'll also retrans it.  For
 * simplicity, we throw-away / replace the rightmost sack, since we're always
 * maintaining a highest sack. */
static void update_sacks(struct conv *s, Tcpctl *tcb, Tcp *seg)
{
	int prune = 0;
	struct sack_block *tcb_sack;

	/* First pass: count leading sacks made obsolete (or reneged) by seg->ack.
	 * The array is sorted, so we stop at the first sack beyond the ack. */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		/* For the equality case, if they acked up to, but not including an old
		 * sack, they must have reneged it.  Otherwise they would have acked
		 * beyond the sack. */
		if (seq_lt(seg->ack, tcb_sack->left))
			break;
		prune++;
	}
	if (prune) {
		memmove(tcb->snd.sacks, tcb->snd.sacks + prune,
			sizeof(struct sack_block) * (tcb->snd.nr_sacks - prune));
		tcb->snd.nr_sacks -= prune;
	}
	/* Second pass: fold the segment's sack blocks into our array, ignoring
	 * stale or malformed blocks. */
	for (int i = 0; i < seg->nr_sacks; i++) {
		/* old sacks */
		if (seq_lt(seg->sacks[i].left, seg->ack))
			continue;
		/* buggy sack: out of range */
		if (seq_gt(seg->sacks[i].right, tcb->snd.nxt))
			continue;
		update_or_insert_sack(s, tcb, &seg->sacks[i]);
	}
}
1825
/* This is a little bit of an under estimate, since we assume a packet is lost
 * once we have any sacks above it.  Overall, it's at most 2 * MSS of an
 * overestimate.
 *
 * If we have no sacks (either reneged or never used) we'll assume all packets
 * above snd.rtx are lost.  This will be the case for sackless fast rxmit
 * (Dong's stuff) or for a timeout.  In the former case, this is probably not
 * true, and in_flight should be higher, but we have no knowledge without the
 * sacks. */
static void set_in_flight(Tcpctl *tcb)
{
	struct sack_block *tcb_sack;
	uint32_t in_flight = 0;
	uint32_t from;

	if (!tcb->snd.nr_sacks) {
		/* No sack info: everything sent-but-not-yet-retransmitted counts. */
		tcb->snd.in_flight = tcb->snd.rtx - tcb->snd.una;
		return;
	}

	/* Everything to the right of the unsacked */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
	in_flight += tcb->snd.nxt - tcb_sack->right;

	/* Everything retransed (from una to snd.rtx, minus sacked regions.  Note
	 * we only retrans at most the last sack's left edge.  snd.rtx will be
	 * advanced to the right edge of some sack (possibly the last one). */
	from = tcb->snd.una;
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_ge(tcb_sack->left, tcb->snd.rtx))
			break;
		/* Invariant: snd.rtx never sits inside a sack (sack_has_changed
		 * advances it past the right edge). */
		assert(seq_ge(tcb->snd.rtx, tcb_sack->right));
		/* Count the unsacked gap [from, sack.left), then skip the sack. */
		in_flight += tcb_sack->left - from;
		from = tcb_sack->right;
	}
	/* Tail of the retransmitted region after the last skipped sack. */
	in_flight += tcb->snd.rtx - from;

	tcb->snd.in_flight = in_flight;
}
1866
1867 static void reset_recovery(struct conv *s, Tcpctl *tcb)
1868 {
1869         netlog(s->p->f, Logtcprxmt,
1870                "%I.%d -> %I.%d: recovery complete, una %u, rtx %u, nxt %u, recovery %u\n",
1871                s->laddr, s->lport, s->raddr, s->rport,
1872                tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, tcb->snd.recovery_pt);
1873         tcb->snd.recovery = 0;
1874         tcb->snd.recovery_pt = 0;
1875         tcb->snd.loss_hint = 0;
1876         tcb->snd.flush_sacks = FALSE;
1877         tcb->snd.sack_loss_hint = 0;
1878 }
1879
1880 static bool is_dup_ack(Tcpctl *tcb, Tcp *seg)
1881 {
1882         /* this is a pure ack w/o window update */
1883         return (seg->ack == tcb->snd.una) &&
1884                (tcb->snd.una != tcb->snd.nxt) &&
1885                (seg->len == 0) &&
1886                (seg->wnd == tcb->snd.wnd);
1887 }
1888
1889 /* If we have sacks, we'll ignore dupacks and look at the sacks ahead of una
1890  * (which are managed by the TCB).  The tcb will not have old sacks (below
1891  * ack/snd.rtx).  Receivers often send sacks below their ack point when we are
1892  * coming out of a loss, and we don't want those to count.
1893  *
1894  * Note the tcb could have sacks (in the future), but the receiver stopped using
1895  * them (reneged).  We'll catch that with the RTO.  If we try to catch it here,
1896  * we could get in a state where we never allow them to renege. */
1897 static bool is_potential_loss(Tcpctl *tcb, Tcp *seg)
1898 {
1899         if (seg->nr_sacks > 0)
1900                 return tcb->snd.nr_sacks > 0;
1901         else
1902                 return is_dup_ack(tcb, seg);
1903 }
1904
1905 /* When we use timestamps for RTTM, RFC 7323 suggests scaling by
1906  * expected_samples (per cwnd).  They say:
1907  *
1908  * ExpectedSamples = ceiling(FlightSize / (SMSS * 2))
1909  *
1910  * However, SMMS * 2 is really "number of bytes expected to be acked in a
1911  * packet.".  We'll use 'acked' to approximate that.  When the receiver uses
1912  * LRO, they'll send back large ACKs, which decreases the number of samples.
1913  *
1914  * If it turns out that all the divides are bad, we can just go back to not
1915  * using expected_samples at all. */
1916 static int expected_samples_ts(Tcpctl *tcb, uint32_t acked)
1917 {
1918         assert(acked);
1919         return MAX(DIV_ROUND_UP(tcb->snd.nxt - tcb->snd.una, acked), 1);
1920 }
1921
1922 /* Updates the RTT, given the currently sampled RTT and the number samples per
1923  * cwnd.  For non-TS RTTM, that'll be 1. */
1924 static void update_rtt(Tcpctl *tcb, int rtt_sample, int expected_samples)
1925 {
1926         int delta;
1927
1928         tcb->backoff = 0;
1929         tcb->backedoff = 0;
1930         if (tcb->srtt == 0) {
1931                 tcb->srtt = rtt_sample;
1932                 tcb->mdev = rtt_sample / 2;
1933         } else {
1934                 delta = rtt_sample - tcb->srtt;
1935                 tcb->srtt += (delta >> RTTM_ALPHA_SHIFT) / expected_samples;
1936                 if (tcb->srtt <= 0)
1937                         tcb->srtt = 1;
1938                 tcb->mdev += ((abs(delta) - tcb->mdev) >> RTTM_BRAVO_SHIFT) /
1939                              expected_samples;
1940                 if (tcb->mdev <= 0)
1941                         tcb->mdev = 1;
1942         }
1943         tcpsettimer(tcb);
1944 }
1945
/* Processes the ACK fields of an inbound segment: advances snd.una, updates
 * sack and in-flight bookkeeping, detects loss (dupacks / forward sacks),
 * tracks the peer's advertised window, grows cwnd (slow start / congestion
 * avoidance), samples RTT, and discards newly-acked bytes from the write
 * queue.  Called with the conversation qlocked. */
static void update(struct conv *s, Tcp *seg)
{
	int rtt;
	Tcpctl *tcb;
	uint32_t acked, expand;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	/* Ignore acks outside [una, nxt]: old duplicates or acks of unsent data */
	if (!seq_within(seg->ack, tcb->snd.una, tcb->snd.nxt))
		return;

	acked = seg->ack - tcb->snd.una;
	tcb->snd.una = seg->ack;
	if (seq_gt(seg->ack, tcb->snd.rtx))
		tcb->snd.rtx = seg->ack;

	update_sacks(s, tcb, seg);
	set_in_flight(tcb);

	/* We treat either a dupack or forward SACKs as a hint that there is a loss.
	 * The RFCs suggest three dupacks before treating it as a loss (alternative
	 * is reordered packets).  We'll treat three SACKs the same way. */
	if (is_potential_loss(tcb, seg) && !tcb->snd.recovery) {
		tcb->snd.loss_hint++;
		if (tcb->snd.loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: loss hint thresh, nr sacks %u, nxt %u, una %u, cwnd %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.nr_sacks, tcb->snd.nxt, tcb->snd.una, tcb->cwind);
			tcp_loss_event(s, tcb);
			tcb->snd.recovery_pt = tcb->snd.nxt;
			if (tcb->snd.nr_sacks) {
				tcb->snd.recovery = SACK_RETRANS_RECOVERY;
				tcb->snd.flush_sacks = FALSE;
				tcb->snd.sack_loss_hint = 0;
			} else {
				tcb->snd.recovery = FAST_RETRANS_RECOVERY;
			}
			tcprxmit(s);
		}
	}

	/*
	 *  update window
	 */
	if (seq_gt(seg->ack, tcb->snd.wl2)
		|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
		tcb->snd.wnd = seg->wnd;
		tcb->snd.wl2 = seg->ack;
	}

	if (!acked) {
		/*
		 *  don't let us hangup if sending into a closed window and
		 *  we're still getting acks
		 */
		if (tcb->snd.recovery && (tcb->snd.wnd == 0))
			tcb->backedoff = MAXBACKMS / 4;
		return;
	}
	/* At this point, they have acked something new. (positive ack, ack > una).
	 *
	 * If we hadn't reached the threshold for recovery yet, the positive ACK
	 * will reset our loss_hint count. */
	if (!tcb->snd.recovery)
		tcb->snd.loss_hint = 0;
	else if (seq_ge(seg->ack, tcb->snd.recovery_pt))
		reset_recovery(s, tcb);

	/* avoid slow start and timers for SYN acks */
	if ((tcb->flags & SYNACK) == 0) {
		tcb->flags |= SYNACK;
		/* The SYN consumed one sequence number but no queue bytes */
		acked--;
		tcb->flgcnt--;
		goto done;
	}

	/* slow start as long as we're not recovering from lost packets */
	if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
		if (tcb->cwind < tcb->ssthresh) {
			/* We increase the cwind by every byte we receive.  We want to
			 * increase the cwind by one MSS for every MSS that gets ACKed.
			 * Note that multiple MSSs can be ACKed in a single ACK.  If we had
			 * a remainder of acked / MSS, we'd add just that remainder - not 0
			 * or 1 MSS. */
			expand = acked;
		} else {
			/* Every RTT, which consists of CWND bytes, we're supposed to expand
			 * by MSS bytes.  The classic algorithm was
			 *              expand = (tcb->mss * tcb->mss) / tcb->cwind;
			 * which assumes the ACK was for MSS bytes.  Instead, for every
			 * 'acked' bytes, we increase the window by acked / CWND (in units
			 * of MSS). */
			expand = MAX(acked, tcb->typical_mss) * tcb->typical_mss
				 / tcb->cwind;
		}

		/* First check guards against unsigned overflow of cwind + expand;
		 * second clamps cwind to the peer's advertised window. */
		if (tcb->cwind + expand < tcb->cwind)
			expand = tcb->snd.wnd - tcb->cwind;
		if (tcb->cwind + expand > tcb->snd.wnd)
			expand = tcb->snd.wnd - tcb->cwind;
		tcb->cwind += expand;
	}
	adjust_tx_qio_limit(s);

	if (tcb->ts_recent) {
		/* Timestamp-based RTT sample (RFC 7323 RTTM), scaled by the
		 * expected number of samples per cwnd. */
		update_rtt(tcb, abs(milliseconds() - seg->ts_ecr),
			   expected_samples_ts(tcb, acked));
	} else if (tcb->rtt_timer.state == TcptimerON &&
		   seq_ge(seg->ack, tcb->rttseq)) {
		/* Adjust the timers according to the round trip time */
		tcphalt(tpriv, &tcb->rtt_timer);
		if (!tcb->snd.recovery) {
			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
			if (rtt == 0)
				rtt = 1;	/* o/w all close systems will rexmit in 0 time */
			rtt *= MSPTICK;
			update_rtt(tcb, rtt, 1);
		}
	}

done:
	if (qdiscard(s->wq, acked) < acked) {
		/* Fewer bytes in the queue than acked: a flag (SYN/FIN) consumed
		 * part of the sequence space. */
		tcb->flgcnt--;
		/* This happened due to another bug where acked was very large
		 * (negative), which was interpreted as "hey, one less flag, since they
		 * acked one of our flags (like a SYN).  If flgcnt goes negative,
		 * get_xmit_segment() will attempt to send out large packets. */
		assert(tcb->flgcnt >= 0);
	}

	if (seq_gt(seg->ack, tcb->snd.urg))
		tcb->snd.urg = seg->ack;

	/* Keep the rexmit timer running iff data is still outstanding */
	if (tcb->snd.una != tcb->snd.nxt)
		tcpgo(tpriv, &tcb->timer);
	else
		tcphalt(tpriv, &tcb->timer);

	tcb->backoff = 0;
	tcb->backedoff = 0;
}
2090
2091 static void update_tcb_ts(Tcpctl *tcb, Tcp *seg)
2092 {
2093         /* Get timestamp info from the tcp header.  Even though the timestamps
2094          * aren't sequence numbers, we still need to protect for wraparound.  Though
2095          * if the values were 0, assume that means we need an update.  We could have
2096          * an initial ts_val that appears negative (signed). */
2097         if (!tcb->ts_recent || !tcb->last_ack_sent ||
2098             (seq_ge(seg->ts_val, tcb->ts_recent) &&
2099              seq_le(seg->seq, tcb->last_ack_sent)))
2100                 tcb->ts_recent = seg->ts_val;
2101 }
2102
2103 /* Overlap happens when one sack's left edge is inside another sack. */
2104 static bool sacks_overlap(struct sack_block *x, struct sack_block *y)
2105 {
2106         return (seq_le(x->left, y->left) && seq_le(y->left, x->right)) ||
2107                (seq_le(y->left, x->left) && seq_le(x->left, y->right));
2108 }
2109
2110 static void make_sack_first(Tcpctl *tcb, struct sack_block *tcb_sack)
2111 {
2112         struct sack_block temp;
2113
2114         if (tcb_sack == &tcb->rcv.sacks[0])
2115                 return;
2116         temp = tcb->rcv.sacks[0];
2117         tcb->rcv.sacks[0] = *tcb_sack;
2118         *tcb_sack = temp;
2119 }
2120
/* Track sack in our tcb for a block of data we received.  This handles all the
 * stuff: making sure sack is first (since it's the most recent sack change),
 * updating or merging sacks, and dropping excess sacks (we only need to
 * maintain 3).  Unlike on the snd side, our tcb sacks are *not* sorted. */
static void track_rcv_sack(Tcpctl *tcb, uint32_t left, uint32_t right)
{
	struct sack_block *tcb_sack;
	struct sack_block sack[1];

	if (!tcb->sack_ok)
		return;
	/* Zero-length block (e.g. a reseq of length 0): nothing to report */
	if (left == right)
		return;
	assert(seq_lt(left, right));
	sack->left = left;
	sack->right = right;
	/* We can reuse an existing sack if we're merging or overlapping. */
	for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
		tcb_sack = &tcb->rcv.sacks[i];
		if (sacks_overlap(tcb_sack, sack)) {
			/* Grow the existing sack to cover the union of the two */
			tcb_sack->left = seq_min(tcb_sack->left, sack->left);
			tcb_sack->right = seq_max(tcb_sack->right, sack->right);
			make_sack_first(tcb, tcb_sack);
			return;
		}
	}
	/* We can discard the last sack (right shift) - we should have sent it at
	 * least once by now.  If not, oh well. */
	memmove(tcb->rcv.sacks + 1, tcb->rcv.sacks, sizeof(struct sack_block) *
		MIN(MAX_NR_RCV_SACKS - 1, tcb->rcv.nr_sacks));
	tcb->rcv.sacks[0] = *sack;
	if (tcb->rcv.nr_sacks < MAX_NR_RCV_SACKS)
		tcb->rcv.nr_sacks++;
}
2155
2156 /* Once we receive everything and move rcv.nxt past a sack, we don't need to
2157  * track it.  I've seen Linux report sacks in the past, but we probably
2158  * shouldn't. */
2159 static void drop_old_rcv_sacks(Tcpctl *tcb)
2160 {
2161         struct sack_block *tcb_sack;
2162
2163         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2164                 tcb_sack = &tcb->rcv.sacks[i];
2165                 /* Moving up to or past the left is enough to drop it. */
2166                 if (seq_ge(tcb->rcv.nxt, tcb_sack->left)) {
2167                         memmove(tcb->rcv.sacks + i, tcb->rcv.sacks + i + 1,
2168                                 sizeof(struct sack_block) * (tcb->rcv.nr_sacks - i - 1));
2169                         tcb->rcv.nr_sacks--;
2170                         i--;
2171                 }
2172         }
2173 }
2174
2175 static void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
2176 {
2177         ERRSTACK(1);
2178         Tcp seg;
2179         Tcp4hdr *h4;
2180         Tcp6hdr *h6;
2181         int hdrlen;
2182         Tcpctl *tcb;
2183         uint16_t length;
2184         uint8_t source[IPaddrlen], dest[IPaddrlen];
2185         struct conv *s;
2186         struct Fs *f;
2187         struct tcppriv *tpriv;
2188         uint8_t version;
2189
2190         f = tcp->f;
2191         tpriv = tcp->priv;
2192
2193         tpriv->stats[InSegs]++;
2194
2195         h4 = (Tcp4hdr *) (bp->rp);
2196         h6 = (Tcp6hdr *) (bp->rp);
2197
2198         if ((h4->vihl & 0xF0) == IP_VER4) {
2199                 uint8_t ttl;
2200
2201                 version = V4;
2202                 length = nhgets(h4->length);
2203                 v4tov6(dest, h4->tcpdst);
2204                 v4tov6(source, h4->tcpsrc);
2205
2206                 /* ttl isn't part of the xsum pseudo header, but bypass needs it. */
2207                 ttl = h4->Unused;
2208                 h4->Unused = 0;
2209                 hnputs(h4->tcplen, length - TCP4_PKT);
2210                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2211                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
2212                         tpriv->stats[CsumErrs]++;
2213                         tpriv->stats[InErrs]++;
2214                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2215                         freeblist(bp);
2216                         return;
2217                 }
2218                 h4->Unused = ttl;
2219
2220                 hdrlen = ntohtcp4(&seg, &bp);
2221                 if (hdrlen < 0) {
2222                         tpriv->stats[HlenErrs]++;
2223                         tpriv->stats[InErrs]++;
2224                         netlog(f, Logtcp, "bad tcp hdr len\n");
2225                         return;
2226                 }
2227
2228                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2229                 if (s && s->state == Bypass) {
2230                         bypass_or_drop(s, bp);
2231                         return;
2232                 }
2233
2234                 /* trim the packet to the size claimed by the datagram */
2235                 length -= hdrlen + TCP4_PKT;
2236                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
2237                 if (bp == NULL) {
2238                         tpriv->stats[LenErrs]++;
2239                         tpriv->stats[InErrs]++;
2240                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2241                         return;
2242                 }
2243         } else {
2244                 int ttl = h6->ttl;
2245                 int proto = h6->proto;
2246
2247                 version = V6;
2248                 length = nhgets(h6->ploadlen);
2249                 ipmove(dest, h6->tcpdst);
2250                 ipmove(source, h6->tcpsrc);
2251
2252                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2253                 h6->ttl = proto;
2254                 hnputl(h6->vcf, length);
2255                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2256                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
2257                         tpriv->stats[CsumErrs]++;
2258                         tpriv->stats[InErrs]++;
2259                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2260                         freeblist(bp);
2261                         return;
2262                 }
2263                 h6->ttl = ttl;
2264                 h6->proto = proto;
2265                 hnputs(h6->ploadlen, length);
2266
2267                 hdrlen = ntohtcp6(&seg, &bp);
2268                 if (hdrlen < 0) {
2269                         tpriv->stats[HlenErrs]++;
2270                         tpriv->stats[InErrs]++;
2271                         netlog(f, Logtcp, "bad tcp hdr len\n");
2272                         return;
2273                 }
2274
2275                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2276                 if (s && s->state == Bypass) {
2277                         bypass_or_drop(s, bp);
2278                         return;
2279                 }
2280
2281                 /* trim the packet to the size claimed by the datagram */
2282                 length -= hdrlen;
2283                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2284                 if (bp == NULL) {
2285                         tpriv->stats[LenErrs]++;
2286                         tpriv->stats[InErrs]++;
2287                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2288                         return;
2289                 }
2290         }
2291
2292         /* s, the conv matching the n-tuple, was set above */
2293         if (s == NULL) {
2294                 netlog(f, Logtcpreset, "iphtlook failed: src %I:%u, dst %I:%u\n",
2295                        source, seg.source, dest, seg.dest);
2296 reset:
2297                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2298                 freeblist(bp);
2299                 return;
2300         }
2301
2302         /* lock protocol for unstate Plan 9 invariants.  funcs like limbo or
2303          * incoming might rely on it. */
2304         qlock(&tcp->qlock);
2305
2306         /* if it's a listener, look for the right flags and get a new conv */
2307         tcb = (Tcpctl *) s->ptcl;
2308         if (tcb->state == Listen) {
2309                 if (seg.flags & RST) {
2310                         limborst(s, &seg, source, dest, version);
2311                         qunlock(&tcp->qlock);
2312                         freeblist(bp);
2313                         return;
2314                 }
2315
2316                 /* if this is a new SYN, put the call into limbo */
2317                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2318                         limbo(s, source, dest, &seg, version);
2319                         qunlock(&tcp->qlock);
2320                         freeblist(bp);
2321                         return;
2322                 }
2323
2324                 /* if there's a matching call in limbo, tcpincoming will return it */
2325                 s = tcpincoming(s, &seg, source, dest, version);
2326                 if (s == NULL) {
2327                         qunlock(&tcp->qlock);
2328                         goto reset;
2329                 }
2330         }
2331
2332         /* The rest of the input state machine is run with the control block
2333          * locked and implements the state machine directly out of the RFC.
2334          * Out-of-band data is ignored - it was always a bad idea.
2335          */
2336         tcb = (Tcpctl *) s->ptcl;
2337         if (waserror()) {
2338                 qunlock(&s->qlock);
2339                 nexterror();
2340         }
2341         qlock(&s->qlock);
2342         qunlock(&tcp->qlock);
2343
2344         update_tcb_ts(tcb, &seg);
2345         /* fix up window */
2346         seg.wnd <<= tcb->rcv.scale;
2347
2348         /* every input packet in puts off the keep alive time out */
2349         tcpsetkacounter(tcb);
2350
2351         switch (tcb->state) {
2352                 case Closed:
2353                         sndrst(tcp, source, dest, length, &seg, version,
2354                                    "sending to Closed");
2355                         goto raise;
2356                 case Syn_sent:
2357                         if (seg.flags & ACK) {
2358                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2359                                         sndrst(tcp, source, dest, length, &seg, version,
2360                                                    "bad seq in Syn_sent");
2361                                         goto raise;
2362                                 }
2363                         }
2364                         if (seg.flags & RST) {
2365                                 if (seg.flags & ACK)
2366                                         localclose(s, "connection refused");
2367                                 goto raise;
2368                         }
2369
2370                         if (seg.flags & SYN) {
2371                                 procsyn(s, &seg);
2372                                 if (seg.flags & ACK) {
2373                                         update(s, &seg);
2374                                         tcpsynackrtt(s);
2375                                         tcpsetstate(s, Established);
2376                                         /* Here's where we get the results of header option
2377                                          * negotiations for connections we started. (SYNACK has the
2378                                          * response) */
2379                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2380                                         tcb->sack_ok = seg.sack_ok;
2381                                 } else {
2382                                         sndrst(tcp, source, dest, length, &seg, version,
2383                                                    "Got SYN with no ACK");
2384                                         goto raise;
2385                                 }
2386
2387                                 if (length != 0 || (seg.flags & FIN))
2388                                         break;
2389
2390                                 freeblist(bp);
2391                                 goto output;
2392                         } else
2393                                 freeblist(bp);
2394
2395                         qunlock(&s->qlock);
2396                         poperror();
2397                         return;
2398         }
2399
2400         /*
2401          *  One DOS attack is to open connections to us and then forget about them,
2402          *  thereby tying up a conv at no long term cost to the attacker.
2403          *  This is an attempt to defeat these stateless DOS attacks.  See
2404          *  corresponding code in tcpsendka().
2405          */
2406         if ((seg.flags & RST) == 0) {
2407                 if (tcpporthogdefense
2408                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2409                                                   tcb->snd.una - (1 << 29))) {
2410                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2411                                    source, seg.source, dest, seg.dest, seg.flags,
2412                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2413                         localclose(s, "stateless hog");
2414                 }
2415         }
2416
2417         /* Cut the data to fit the receive window */
2418         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2419                 netlog(f, Logtcp, "%I.%d -> %I.%d: tcp len < 0, %lu %d\n",
2420                        s->raddr, s->rport, s->laddr, s->lport, seg.seq, length);
2421                 update(s, &seg);
2422                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2423                         tcphalt(tpriv, &tcb->rtt_timer);
2424                         tcphalt(tpriv, &tcb->acktimer);
2425                         tcphalt(tpriv, &tcb->katimer);
2426                         tcpsetstate(s, Time_wait);
2427                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2428                         tcpgo(tpriv, &tcb->timer);
2429                 }
2430                 if (!(seg.flags & RST)) {
2431                         tcb->flags |= FORCE;
2432                         goto output;
2433                 }
2434                 qunlock(&s->qlock);
2435                 poperror();
2436                 return;
2437         }
2438
2439         /* Cannot accept so answer with a rst */
2440         if (length && tcb->state == Closed) {
2441                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2442                 goto raise;
2443         }
2444
2445         /* The segment is beyond the current receive pointer so
2446          * queue the data in the resequence queue
2447          */
2448         if (seg.seq != tcb->rcv.nxt)
2449                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2450                         update(s, &seg);
2451                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2452                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2453                                            s->lport);
2454                         tcb->flags |= FORCE;
2455                         goto output;
2456                 }
2457
2458         /*
2459          *  keep looping till we've processed this packet plus any
2460          *  adjacent packets in the resequence queue
2461          */
2462         for (;;) {
2463                 if (seg.flags & RST) {
2464                         if (tcb->state == Established) {
2465                                 tpriv->stats[EstabResets]++;
2466                                 if (tcb->rcv.nxt != seg.seq)
2467                                         printd
2468                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2469                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2470                                                  seg.seq);
2471                         }
2472                         localclose(s, "connection refused");
2473                         goto raise;
2474                 }
2475
2476                 if ((seg.flags & ACK) == 0)
2477                         goto raise;
2478
2479                 switch (tcb->state) {
2480                         case Established:
2481                         case Close_wait:
2482                                 update(s, &seg);
2483                                 break;
2484                         case Finwait1:
2485                                 update(s, &seg);
2486                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2487                                         tcphalt(tpriv, &tcb->rtt_timer);
2488                                         tcphalt(tpriv, &tcb->acktimer);
2489                                         tcpsetkacounter(tcb);
2490                                         tcb->time = NOW;
2491                                         tcpsetstate(s, Finwait2);
2492                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2493                                         tcpgo(tpriv, &tcb->katimer);
2494                                 }
2495                                 break;
2496                         case Finwait2:
2497                                 update(s, &seg);
2498                                 break;
2499                         case Closing:
2500                                 update(s, &seg);
2501                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2502                                         tcphalt(tpriv, &tcb->rtt_timer);
2503                                         tcphalt(tpriv, &tcb->acktimer);
2504                                         tcphalt(tpriv, &tcb->katimer);
2505                                         tcpsetstate(s, Time_wait);
2506                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2507                                         tcpgo(tpriv, &tcb->timer);
2508                                 }
2509                                 break;
2510                         case Last_ack:
2511                                 update(s, &seg);
2512                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2513                                         localclose(s, NULL);
2514                                         goto raise;
2515                                 }
2516                         case Time_wait:
2517                                 tcb->flags |= FORCE;
2518                                 if (tcb->timer.state != TcptimerON)
2519                                         tcpgo(tpriv, &tcb->timer);
2520                 }
2521
2522                 if ((seg.flags & URG) && seg.urg) {
2523                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2524                                 tcb->rcv.urg = seg.urg + seg.seq;
2525                                 pullblock(&bp, seg.urg);
2526                         }
2527                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2528                         tcb->rcv.urg = tcb->rcv.nxt;
2529
2530                 if (length == 0) {
2531                         if (bp != NULL)
2532                                 freeblist(bp);
2533                 } else {
2534                         switch (tcb->state) {
2535                                 default:
2536                                         /* Ignore segment text */
2537                                         if (bp != NULL)
2538                                                 freeblist(bp);
2539                                         break;
2540
2541                                 case Established:
2542                                 case Finwait1:
2543                                         /* If we still have some data place on
2544                                          * receive queue
2545                                          */
2546                                         if (bp) {
2547                                                 bp = packblock(bp);
2548                                                 if (bp == NULL)
2549                                                         panic("tcp packblock");
2550                                                 qpassnolim(s->rq, bp);
2551                                                 bp = NULL;
2552
2553                                                 /*
2554                                                  *  Force an ack every 2 data messages.  This is
2555                                                  *  a hack for rob to make his home system run
2556                                                  *  faster.
2557                                                  *
2558                                                  *  this also keeps the standard TCP congestion
2559                                                  *  control working since it needs an ack every
2560                                                  *  2 max segs worth.  This is not quite that,
2561                                                  *  but under a real stream is equivalent since
2562                                                  *  every packet has a max seg in it.
2563                                                  */
2564                                                 if (++(tcb->rcv.una) >= 2)
2565                                                         tcb->flags |= FORCE;
2566                                         }
2567                                         tcb->rcv.nxt += length;
2568                                         drop_old_rcv_sacks(tcb);
2569
2570                                         /*
2571                                          *  update our rcv window
2572                                          */
2573                                         tcprcvwin(s);
2574
2575                                         /*
2576                                          *  turn on the acktimer if there's something
2577                                          *  to ack
2578                                          */
2579                                         if (tcb->acktimer.state != TcptimerON)
2580                                                 tcpgo(tpriv, &tcb->acktimer);
2581
2582                                         break;
2583                                 case Finwait2:
2584                                         /* no process to read the data, send a reset */
2585                                         if (bp != NULL)
2586                                                 freeblist(bp);
2587                                         sndrst(tcp, source, dest, length, &seg, version,
2588                                                    "send to Finwait2");
2589                                         qunlock(&s->qlock);
2590                                         poperror();
2591                                         return;
2592                         }
2593                 }
2594
2595                 if (seg.flags & FIN) {
2596                         tcb->flags |= FORCE;
2597
2598                         switch (tcb->state) {
2599                                 case Established:
2600                                         tcb->rcv.nxt++;
2601                                         tcpsetstate(s, Close_wait);
2602                                         break;
2603                                 case Finwait1:
2604                                         tcb->rcv.nxt++;
2605                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2606                                                 tcphalt(tpriv, &tcb->rtt_timer);
2607                                                 tcphalt(tpriv, &tcb->acktimer);
2608                                                 tcphalt(tpriv, &tcb->katimer);
2609                                                 tcpsetstate(s, Time_wait);
2610                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2611                                                 tcpgo(tpriv, &tcb->timer);
2612                                         } else
2613                                                 tcpsetstate(s, Closing);
2614                                         break;
2615                                 case Finwait2:
2616                                         tcb->rcv.nxt++;
2617                                         tcphalt(tpriv, &tcb->rtt_timer);
2618                                         tcphalt(tpriv, &tcb->acktimer);
2619                                         tcphalt(tpriv, &tcb->katimer);
2620                                         tcpsetstate(s, Time_wait);
2621                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2622                                         tcpgo(tpriv, &tcb->timer);
2623                                         break;
2624                                 case Close_wait:
2625                                 case Closing:
2626                                 case Last_ack:
2627                                         break;
2628                                 case Time_wait:
2629                                         tcpgo(tpriv, &tcb->timer);
2630                                         break;
2631                         }
2632                 }
2633
2634                 /*
2635                  *  get next adjacent segment from the resequence queue.
2636                  *  dump/trim any overlapping segments
2637                  */
2638                 for (;;) {
2639                         if (tcb->reseq == NULL)
2640                                 goto output;
2641
2642                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2643                                 goto output;
2644
2645                         getreseq(tcb, &seg, &bp, &length);
2646
2647                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2648                                 break;
2649                 }
2650         }
2651 output:
2652         tcpoutput(s);
2653         qunlock(&s->qlock);
2654         poperror();
2655         return;
2656 raise:
2657         qunlock(&s->qlock);
2658         poperror();
2659         freeblist(bp);
2660         tcpkick(s);
2661 }
2662
2663 /* The advertised mss = data + TCP headers */
2664 static uint16_t derive_payload_mss(Tcpctl *tcb)
2665 {
2666         uint16_t payload_mss = tcb->mss;
2667         uint16_t opt_size = 0;
2668
2669         if (tcb->ts_recent) {
2670                 opt_size += TS_LENGTH;
2671                 /* Note that when we're a SYN, we overestimate slightly.  This is safe,
2672                  * and not really a problem. */
2673                 opt_size += TS_SEND_PREPAD;
2674         }
2675         if (tcb->rcv.nr_sacks)
2676                 opt_size += 2 + tcb->rcv.nr_sacks * 8;
2677         opt_size = ROUNDUP(opt_size, 4);
2678         payload_mss -= opt_size;
2679         return payload_mss;
2680 }
2681
2682 /* Decreases the xmit amt, given the MSS / TSO. */
2683 static uint32_t throttle_for_mss(Tcpctl *tcb, uint32_t ssize,
2684                                  uint16_t payload_mss, bool retrans)
2685 {
2686         if (ssize > payload_mss) {
2687                 if ((tcb->flags & TSO) == 0) {
2688                         ssize = payload_mss;
2689                 } else {
2690                         /* Don't send too much.  32K is arbitrary.. */
2691                         if (ssize > 32 * 1024)
2692                                 ssize = 32 * 1024;
2693                         if (!retrans) {
2694                                 /* Clamp xmit to an integral MSS to avoid ragged tail segments
2695                                  * causing poor link utilization. */
2696                                 ssize = ROUNDDOWN(ssize, payload_mss);
2697                         }
2698                 }
2699         }
2700         return ssize;
2701 }
2702
2703 /* Reduces ssize for a variety of reasons.  Returns FALSE if we should abort
2704  * sending the packet.  o/w returns TRUE and modifies ssize by reference. */
2705 static bool throttle_ssize(struct conv *s, Tcpctl *tcb, uint32_t *ssize_p,
2706                            uint16_t payload_mss, bool retrans)
2707 {
2708         struct Fs *f = s->p->f;
2709         uint32_t usable;
2710         uint32_t ssize = *ssize_p;
2711
2712         /* Compute usable segment based on offered window and limit
2713          * window probes to one */
2714         if (tcb->snd.wnd == 0) {
2715                 if (tcb->snd.in_flight != 0) {
2716                         if ((tcb->flags & FORCE) == 0)
2717                                 return FALSE;
2718                 }
2719                 usable = 1;
2720         } else {
2721                 usable = tcb->cwind;
2722                 if (tcb->snd.wnd < usable)
2723                         usable = tcb->snd.wnd;
2724                 if (usable > tcb->snd.in_flight)
2725                         usable -= tcb->snd.in_flight;
2726                 else
2727                         usable = 0;
2728                 /* Avoid Silly Window Syndrome.  This is a little different thant RFC
2729                  * 813.  I took their additional enhancement of "< MSS" as an AND, not
2730                  * an OR.  25% of a large snd.wnd is pretty large, and our main goal is
2731                  * to avoid packets smaller than MSS.  I still use the 25% threshold,
2732                  * because it is important that there is *some* data in_flight.  If
2733                  * usable < MSS because snd.wnd is very small (but not 0), we might
2734                  * never get an ACK and would need to set up a timer.
2735                  *
2736                  * Also, I'm using 'ssize' as a proxy for a PSH point.  If there's just
2737                  * a small blob in the qio (or retrans!), then we might as well just
2738                  * send it. */
2739                 if ((usable < tcb->typical_mss) && (usable < tcb->snd.wnd >> 2)
2740                     && (usable < ssize)) {
2741                         return FALSE;
2742                 }
2743         }
2744         if (ssize && usable < 2)
2745                 netlog(s->p->f, Logtcpverbose,
2746                        "%I.%d -> %I.%d: throttled snd.wnd %lu cwind %lu\n",
2747                        s->laddr, s->lport, s->raddr, s->rport,
2748                        tcb->snd.wnd, tcb->cwind);
2749         if (usable < ssize)
2750                 ssize = usable;
2751
2752         ssize = throttle_for_mss(tcb, ssize, payload_mss, retrans);
2753
2754         *ssize_p = ssize;
2755         return TRUE;
2756 }
2757
/* Helper, picks the next segment to send, which is possibly a retransmission.
 * Returns TRUE if we have a segment, FALSE o/w.  Returns ssize, from_seq, and
 * sent by reference.
 *
 * from_seq is the seq number we are transmitting from.
 *
 * sent includes all seq from una to from_seq *including* any previously sent
 * flags (part of tcb->flgcnt), for instance an unacknowledged SYN (which counts
 * as a seq number).  Those flags are in the e.g. snd.nxt - snd.una range, and
 * they get dropped after qdiscard.
 *
 * ssize is the amount of data we are sending, starting from from_seq, and it
 * will include any *new* flags, which haven't been accounted for yet.
 *
 * tcb->flgcnt consists of the flags both in ssize and in sent.
 *
 * Note that we could be in recovery and not sack_retrans a segment. */
static bool get_xmit_segment(struct conv *s, Tcpctl *tcb, uint16_t payload_mss,
                             uint32_t *from_seq_p, uint32_t *sent_p,
                             uint32_t *ssize_p)
{
	struct Fs *f = s->p->f;
	struct tcppriv *tpriv = s->p->priv;
	uint32_t ssize, sent, from_seq;
	bool sack_retrans = FALSE;
	struct sack_block *tcb_sack = 0;

	/* Scan our peer's SACKs (sorted; left/right edges in seq space) for
	 * the first hole to the right of snd.rtx: everything in
	 * [rtx, sack->left) is unSACKed and is our retransmit candidate. */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb->snd.rtx, tcb_sack->left)) {
			/* So ssize is supposed to include any *new* flags to flgcnt, which
			 * at this point would be a FIN.
			 *
			 * It might be possible that flgcnt is incremented so we send a FIN,
			 * even for an intermediate sack retrans.  Perhaps the user closed
			 * the conv.
			 *
			 * However, the way the "flgcnt for FIN" works is that it inflates
			 * the desired amount we'd like to send (qlen + flgcnt).
			 * Eventually, we reach the end of the queue and fail to extract all
			 * of dsize.  At that point, we put on the FIN, and that's where the
			 * extra 'byte' comes from.
			 *
			 * For sack retrans, since we're extracting from parts of the qio
			 * that aren't the right-most edge, we don't need to consider flgcnt
			 * when setting ssize. */
			from_seq = tcb->snd.rtx;
			sent = from_seq - tcb->snd.una;
			ssize = tcb_sack->left - from_seq;
			sack_retrans = TRUE;
			break;
		}
	}
	/* SACK holes have first dibs, but we can still opportunisitically send new
	 * data.
	 *
	 * During other types of recovery, we'll just send from the retrans point.
	 * If we're in an RTO while we still have sacks, we could be resending data
	 * that wasn't lost.  Consider a sack that is still growing (usually the
	 * right-most), but we haven't received the ACK yet.  rxt may be included in
	 * that area.  Given we had two losses or otherwise timed out, I'm not too
	 * concerned.
	 *
	 * Note that Fast and RTO can send data beyond nxt.  If we change that,
	 * change the accounting below. */
	if (!sack_retrans) {
		/* Pick the starting seq: new data goes out at nxt; Fast/RTO
		 * recovery resends from the retransmit pointer. */
		switch (tcb->snd.recovery) {
		default:
		case SACK_RETRANS_RECOVERY:
			from_seq = tcb->snd.nxt;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			from_seq = tcb->snd.rtx;
			break;
		}
		sent = from_seq - tcb->snd.una;
		/* qlen + flgcnt is every seq we want to have sent, including unack'd
		 * data, unacked flags, and new flags. */
		ssize = qlen(s->wq) + tcb->flgcnt - sent;
	}

	/* Window / SWS / MSS / TSO clamping; FALSE means "send nothing". */
	if (!throttle_ssize(s, tcb, &ssize, payload_mss, sack_retrans))
		return FALSE;

	/* This counts flags, which is a little hokey, but it's okay since in_flight
	 * gets reset on each ACK */
	tcb->snd.in_flight += ssize;
	/* Log and track rxmit.  This covers both SACK (retrans) and fast rxmit. */
	if (ssize && seq_lt(tcb->snd.rtx, tcb->snd.nxt)) {
		netlog(f, Logtcpverbose,
		       "%I.%d -> %I.%d: rxmit: rtx %u amt %u, nxt %u\n",
		       s->laddr, s->lport, s->raddr, s->rport,
		       tcb->snd.rtx, MIN(tcb->snd.nxt - tcb->snd.rtx, ssize),
		       tcb->snd.nxt);
		tpriv->stats[RetransSegs]++;
	}
	if (sack_retrans) {
		/* If we'll send up to the left edge, advance snd.rtx to the right.
		 *
		 * This includes the largest sack.  It might get removed later, in which
		 * case we'll underestimate the amount in-flight.  The alternative is to
		 * not count the rightmost sack, but when it gets removed, we'll retrans
		 * it anyway.  No matter what, we'd count it. */
		tcb->snd.rtx += ssize;
		if (tcb->snd.rtx == tcb_sack->left)
			tcb->snd.rtx = tcb_sack->right;
		/* RFC 6675 says we MAY rearm the RTO timer on each retrans, since we
		 * might not be getting ACKs for a while. */
		tcpsettimer(tcb);
	} else {
		switch (tcb->snd.recovery) {
		default:
			/* under normal op, we drag rtx along with nxt.  this prevents us
			 * from sending sacks too early (up above), since rtx doesn't get
			 * reset to una until we have a loss (e.g. 3 dupacks/sacks). */
			tcb->snd.nxt += ssize;
			tcb->snd.rtx = tcb->snd.nxt;
			break;
		case SACK_RETRANS_RECOVERY:
			/* We explicitly do not want to increase rtx here.  We might still
			 * need it to fill in a sack gap below nxt if we get new, higher
			 * sacks. */
			tcb->snd.nxt += ssize;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			tcb->snd.rtx += ssize;
			/* Fast and RTO can send new data, advancing nxt. */
			if (seq_gt(tcb->snd.rtx, tcb->snd.nxt))
				tcb->snd.nxt = tcb->snd.rtx;
			break;
		}
	}
	*from_seq_p = from_seq;
	*sent_p = sent;
	*ssize_p = ssize;

	return TRUE;
}
2898
2899 /*
2900  *  always enters and exits with the s locked.  We drop
2901  *  the lock to ipoput the packet so some care has to be
2902  *  taken by callers.
2903  */
2904 static void tcpoutput(struct conv *s)
2905 {
2906         Tcp seg;
2907         int msgs;
2908         int next_yield = 1;
2909         Tcpctl *tcb;
2910         struct block *hbp, *bp;
2911         uint32_t ssize, dsize, sent, from_seq;
2912         struct Fs *f;
2913         struct tcppriv *tpriv;
2914         uint8_t version;
2915         uint16_t payload_mss;
2916
2917         f = s->p->f;
2918         tpriv = s->p->priv;
2919         version = s->ipversion;
2920
2921         for (msgs = 0; msgs < 100; msgs++) {
2922                 tcb = (Tcpctl *) s->ptcl;
2923
2924                 switch (tcb->state) {
2925                         case Listen:
2926                         case Closed:
2927                         case Finwait2:
2928                                 return;
2929                 }
2930
2931                 /* force an ack when a window has opened up */
2932                 if (tcb->rcv.blocked && tcb->rcv.wnd >= tcb->mss) {
2933                         tcb->rcv.blocked = 0;
2934                         tcb->flags |= FORCE;
2935                 }
2936
2937                 /* Don't send anything else until our SYN has been acked */
2938                 if (tcb->snd.nxt != tcb->iss && (tcb->flags & SYNACK) == 0)
2939                         break;
2940
2941                 /* payload_mss is the actual amount of data in the packet, which is the
2942                  * advertised (mss - header opts).  This varies from packet to packet,
2943                  * based on the options that might be present (e.g. always timestamps,
2944                  * sometimes SACKs) */
2945                 payload_mss = derive_payload_mss(tcb);
2946
2947                 if (!get_xmit_segment(s, tcb, payload_mss, &from_seq, &sent, &ssize))
2948                         break;
2949
2950                 dsize = ssize;
2951                 seg.urg = 0;
2952
2953                 if (ssize == 0)
2954                         if ((tcb->flags & FORCE) == 0)
2955                                 break;
2956
2957                 tcb->flags &= ~FORCE;
2958                 tcprcvwin(s);
2959
2960                 /* By default we will generate an ack, so we can normally turn off the
2961                  * timer.  If we're blocked, we'll want the timer so we can send a
2962                  * window update. */
2963                 if (!tcb->rcv.blocked)
2964                         tcphalt(tpriv, &tcb->acktimer);
2965                 tcb->rcv.una = 0;
2966                 seg.source = s->lport;
2967                 seg.dest = s->rport;
2968                 seg.flags = ACK;
2969                 seg.mss = 0;
2970                 seg.ws = 0;
2971                 seg.sack_ok = FALSE;
2972                 seg.nr_sacks = 0;
2973                 /* When outputting, Syn_sent means "send the Syn", for connections we
2974                  * initiate.  SYNACKs are sent from sndsynack directly. */
2975                 if (tcb->state == Syn_sent) {
2976                         seg.flags = 0;
2977                         seg.sack_ok = SACK_SUPPORTED;   /* here's where we advertise SACK */
2978                         if (tcb->snd.nxt - ssize == tcb->iss) {
2979                                 seg.flags |= SYN;
2980                                 dsize--;
2981                                 seg.mss = tcb->mss;
2982                                 seg.ws = tcb->scale;
2983                         } else {
2984                                 /* TODO: Not sure why we'd get here. */
2985                                 warn("TCP: weird Syn_sent state, tell someone you saw this");
2986                         }
2987                 }
2988                 seg.seq = from_seq;
2989                 seg.ack = tcb->rcv.nxt;
2990                 tcb->last_ack_sent = seg.ack;
2991                 seg.wnd = tcb->rcv.wnd;
2992                 seg.ts_val = tcb->ts_recent;
2993
2994                 /* Pull out data to send */
2995                 bp = NULL;
2996                 if (dsize != 0) {
2997                         bp = qcopy(s->wq, dsize, sent);
2998                         if (BLEN(bp) != dsize) {
2999                                 /* Here's where the flgcnt kicked in.  Note dsize is
3000                                  * decremented, but ssize isn't.  Not that we use ssize for much
3001                                  * anymore.  Decrementing dsize prevents us from sending a PSH
3002                                  * with the FIN. */
3003                                 seg.flags |= FIN;
3004                                 dsize--;
3005                         }
3006                         if (BLEN(bp) > payload_mss) {
3007                                 bp->flag |= Btso;
3008                                 bp->mss = payload_mss;
3009                         }
3010                 }
3011
3012                 if (sent + dsize == qlen(s->wq) + tcb->flgcnt)
3013                         seg.flags |= PSH;
3014
3015                 /* Build header, link data and compute cksum */
3016                 switch (version) {
3017                         case V4:
3018                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3019                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
3020                                 if (hbp == NULL) {
3021                                         freeblist(bp);
3022                                         return;
3023                                 }
3024                                 break;
3025                         case V6:
3026                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3027                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
3028                                 if (hbp == NULL) {
3029                                         freeblist(bp);
3030                                         return;
3031                                 }
3032                                 break;
3033                         default:
3034                                 hbp = NULL;     /* to suppress a warning */
3035                                 panic("tcpoutput: version %d", version);
3036                 }
3037
3038                 /* Start the transmission timers if there is new data and we
3039                  * expect acknowledges
3040                  */
3041                 if (ssize != 0) {
3042                         if (tcb->timer.state != TcptimerON)
3043                                 tcpgo(tpriv, &tcb->timer);
3044
3045                         if (!tcb->ts_recent && (tcb->rtt_timer.state != TcptimerON)) {
3046                                 /* If round trip timer isn't running, start it. */
3047                                 tcpgo(tpriv, &tcb->rtt_timer);
3048                                 tcb->rttseq = from_seq + ssize;
3049                         }
3050                 }
3051
3052                 tpriv->stats[OutSegs]++;
3053
3054                 /* put off the next keep alive */
3055                 tcpgo(tpriv, &tcb->katimer);
3056
3057                 switch (version) {
3058                         case V4:
3059                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3060                                         /* a negative return means no route */
3061                                         localclose(s, "no route");
3062                                 }
3063                                 break;
3064                         case V6:
3065                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3066                                         /* a negative return means no route */
3067                                         localclose(s, "no route");
3068                                 }
3069                                 break;
3070                         default:
3071                                 panic("tcpoutput2: version %d", version);
3072                 }
3073                 if (ssize) {
3074                         /* The outer loop thinks we sent one packet.  If we used TSO, we
3075                          * might have sent several.  Minus one for the loop increment. */
3076                         msgs += DIV_ROUND_UP(ssize, payload_mss) - 1;
3077                 }
3078                 /* Old Plan 9 tidbit - yield every four messages.  We want to break out
3079                  * and unlock so we can process inbound ACKs which might do things like
3080                  * say "slow down". */
3081                 if (msgs >= next_yield) {
3082                         next_yield = msgs + 4;
3083                         qunlock(&s->qlock);
3084                         kthread_yield();
3085                         qlock(&s->qlock);
3086                 }
3087         }
3088 }
3089
3090 /*
3091  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
3092  */
3093 static void tcpsendka(struct conv *s)
3094 {
3095         Tcp seg;
3096         Tcpctl *tcb;
3097         struct block *hbp, *dbp;
3098
3099         tcb = (Tcpctl *) s->ptcl;
3100
3101         dbp = NULL;
3102         seg.urg = 0;
3103         seg.source = s->lport;
3104         seg.dest = s->rport;
3105         seg.flags = ACK | PSH;
3106         seg.mss = 0;
3107         seg.ws = 0;
3108         seg.sack_ok = FALSE;
3109         seg.nr_sacks = 0;
3110         if (tcpporthogdefense)
3111                 urandom_read(&seg.seq, sizeof(seg.seq));
3112         else
3113                 seg.seq = tcb->snd.una - 1;
3114         seg.ack = tcb->rcv.nxt;
3115         tcb->last_ack_sent = seg.ack;
3116         tcb->rcv.una = 0;
3117         seg.wnd = tcb->rcv.wnd;
3118         seg.ts_val = tcb->ts_recent;
3119         if (tcb->state == Finwait2) {
3120                 seg.flags |= FIN;
3121         } else {
3122                 dbp = block_alloc(1, MEM_WAIT);
3123                 dbp->wp++;
3124         }
3125
3126         if (isv4(s->raddr)) {
3127                 /* Build header, link data and compute cksum */
3128                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3129                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
3130                 if (hbp == NULL) {
3131                         freeblist(dbp);
3132                         return;
3133                 }
3134                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
3135         } else {
3136                 /* Build header, link data and comp