2954840407a93cd1115468312d0836c481fc175e
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2017 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <net/ip.h>
42 #include <net/tcp.h>
43
44 /* Must correspond to the enumeration in tcp.h */
45 static char *tcpstates[] = {
46         "Closed", "Listen", "Syn_sent",
47         "Established", "Finwait1", "Finwait2", "Close_wait",
48         "Closing", "Last_ack", "Time_wait"
49 };
50
51 static int tcp_irtt = DEF_RTT;                  /* Initial guess at round trip time */
52 static uint16_t tcp_mss = DEF_MSS;              /* Maximum segment size to be sent */
53
54 /* Must correspond to the enumeration in tcp.h */
55 static char *statnames[] = {
56         [MaxConn] "MaxConn",
57         [ActiveOpens] "ActiveOpens",
58         [PassiveOpens] "PassiveOpens",
59         [EstabResets] "EstabResets",
60         [CurrEstab] "CurrEstab",
61         [InSegs] "InSegs",
62         [OutSegs] "OutSegs",
63         [RetransSegs] "RetransSegs",
64         [RetransTimeouts] "RetransTimeouts",
65         [InErrs] "InErrs",
66         [OutRsts] "OutRsts",
67         [CsumErrs] "CsumErrs",
68         [HlenErrs] "HlenErrs",
69         [LenErrs] "LenErrs",
70         [OutOfOrder] "OutOfOrder",
71 };
72
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
82 static int tcpporthogdefense = 0;
83
84 static int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *,
85                     uint16_t);
86 static void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
87 static void localclose(struct conv *, char *unused_char_p_t);
88 static void procsyn(struct conv *, Tcp *);
89 static void tcpiput(struct Proto *, struct Ipifc *, struct block *);
90 static void tcpoutput(struct conv *);
91 static int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
92 static void tcpstart(struct conv *, int);
93 static void tcptimeout(void *);
94 static void tcpsndsyn(struct conv *, Tcpctl *);
95 static void tcprcvwin(struct conv *);
96 static void tcpacktimer(void *);
97 static void tcpkeepalive(void *);
98 static void tcpsetkacounter(Tcpctl *);
99 static void tcprxmit(struct conv *);
100 static void tcpsettimer(Tcpctl *);
101 static void tcpsynackrtt(struct conv *);
102 static void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
103 static void tcp_loss_event(struct conv *s, Tcpctl *tcb);
104 static uint16_t derive_payload_mss(Tcpctl *tcb);
105 static void set_in_flight(Tcpctl *tcb);
106
107 static void limborexmit(struct Proto *);
108 static void limbo(struct conv *, uint8_t *unused_uint8_p_t, uint8_t *, Tcp *,
109                                   int);
110
/* Transition conv s's TCP state machine to newstate.  Maintains the
 * CurrEstab statistic, closes or hangs up the conv's queues for terminal
 * states, and wakes a blocked active opener once the SYN handshake resolves.
 * No-op if the state is unchanged. */
static void tcpsetstate(struct conv *s, uint8_t newstate)
{
	Tcpctl *tcb;
	uint8_t oldstate;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;

	tcb = (Tcpctl *) s->ptcl;

	oldstate = tcb->state;
	if (oldstate == newstate)
		return;

	/* Keep the count of currently-established conversations accurate */
	if (oldstate == Established)
		tpriv->stats[CurrEstab]--;
	if (newstate == Established)
		tpriv->stats[CurrEstab]++;

	/**
	print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
		tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
	**/

	switch (newstate) {
		case Closed:
			/* fully dead: shut down all three queues */
			qclose(s->rq);
			qclose(s->wq);
			qclose(s->eq);
			break;

		case Close_wait:	/* Remote closes */
			qhangup(s->rq, NULL);
			break;
	}

	tcb->state = newstate;

	/* Leaving Syn_sent for any live state means the active open finished;
	 * notify/wake the connecting caller.  (Closed means it failed; that
	 * path reports the error elsewhere, e.g. localclose.) */
	if (oldstate == Syn_sent && newstate != Closed)
		Fsconnected(s, NULL);
}
152
153 static void tcpconnect(struct conv *c, char **argv, int argc)
154 {
155         Fsstdconnect(c, argv, argc);
156         tcpstart(c, TCP_CONNECT);
157 }
158
159 static int tcpstate(struct conv *c, char *state, int n)
160 {
161         Tcpctl *s;
162
163         s = (Tcpctl *) (c->ptcl);
164
165         return snprintf(state, n,
166                                         "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
167                                         tcpstates[s->state],
168                                         c->rq ? qlen(c->rq) : 0,
169                                         c->wq ? qlen(c->wq) : 0,
170                                         s->srtt, s->mdev,
171                                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
172                                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
173                                         s->katimer.start, s->katimer.count);
174 }
175
176 static int tcpinuse(struct conv *c)
177 {
178         Tcpctl *s;
179
180         s = (Tcpctl *) (c->ptcl);
181         return s->state != Closed;
182 }
183
184 static void tcpannounce(struct conv *c, char **argv, int argc)
185 {
186         Fsstdannounce(c, argv, argc);
187         tcpstart(c, TCP_LISTEN);
188         Fsconnected(c, NULL);
189 }
190
191 static void tcpbypass(struct conv *cv, char **argv, int argc)
192 {
193         struct tcppriv *tpriv = cv->p->priv;
194
195         Fsstdbypass(cv, argv, argc);
196         iphtadd(&tpriv->ht, cv);
197 }
198
199 static void tcpshutdown(struct conv *c, int how)
200 {
201         Tcpctl *tcb = (Tcpctl*)c->ptcl;
202
203         /* Do nothing for the read side */
204         if (how == SHUT_RD)
205                 return;
206         /* Sends a FIN.  If we're in another state (like Listen), we'll run into
207          * issues, since we'll never send the FIN.  We'll be shutdown on our end,
208          * but we'll never tell the distant end.  Might just be an app issue. */
209         switch (tcb->state) {
210         case Established:
211                 tcb->flgcnt++;
212                 tcpsetstate(c, Finwait1);
213                 tcpoutput(c);
214                 break;
215         }
216 }
217
/*
 *  tcpclose is always called with the q locked
 */
/* Tear down the local side of conv c: hang up and flush the qio queues, then
 * drive the state machine toward Closed.  Listeners and not-yet-established
 * convs are torn down immediately; Established/Close_wait convs begin an
 * orderly FIN exchange via tcpoutput(). */
static void tcpclose(struct conv *c)
{
	Tcpctl *tcb;

	tcb = (Tcpctl *) c->ptcl;

	qhangup(c->rq, NULL);
	qhangup(c->wq, NULL);
	qhangup(c->eq, NULL);
	qflush(c->rq);

	switch (tcb->state) {
		case Listen:
			/*
			 *  reset any incoming calls to this listener
			 */
			Fsconnected(c, "Hangup");

			localclose(c, NULL);
			break;
		case Closed:
		case Syn_sent:
			localclose(c, NULL);
			break;
		case Established:
			/* flgcnt++ pairs with the FIN we are about to send
			 * (same pattern as tcpshutdown) */
			tcb->flgcnt++;
			tcpsetstate(c, Finwait1);
			tcpoutput(c);
			break;
		case Close_wait:
			tcb->flgcnt++;
			tcpsetstate(c, Last_ack);
			tcpoutput(c);
			break;
	}
}
257
/* qio kick routine for the write queue (registered in tcpcreate): called when
 * a writer has queued new data on conv x.  For live states, refresh the
 * receive window and push output; any other state means the conv is dead, so
 * tear it down.  Takes the conv qlock; errors unwind via the errstack. */
static void tcpkick(void *x)
{
	ERRSTACK(1);
	struct conv *s = x;
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	if (waserror()) {
		/* drop the lock before re-raising */
		qunlock(&s->qlock);
		nexterror();
	}

	switch (tcb->state) {
		case Syn_sent:
		case Established:
		case Close_wait:
			/*
			 * Push data
			 */
			tcprcvwin(s);
			tcpoutput(s);
			break;
		default:
			localclose(s, "Hangup");
			break;
	}

	qunlock(&s->qlock);
	poperror();
}
290
/* Recompute the receive window we advertise (tcb->rcv.wnd) from the space
 * remaining in the conv's read queue. */
static void tcprcvwin(struct conv *s)
{
	/* Call with tcb locked */
	int w;
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;
	/* window = configured buffer size minus bytes sitting in the rq */
	w = tcb->window - qlen(s->rq);
	if (w < 0)
		w = 0;

	/* RFC 813: Avoid SWS.  We'll always reduce the window (because the qio
	 * increased - that's legit), and we'll always advertise the window
	 * increases (corresponding to qio drains) when those are greater than MSS.
	 * But we don't advertise increases less than MSS.
	 *
	 * Note we don't shrink the window at all - that'll result in tcptrim()
	 * dropping packets that were sent before the sender gets our update. */
	if ((w < tcb->rcv.wnd) || (w >= tcb->mss))
		tcb->rcv.wnd = w;
	/* We've delayed sending an update to rcv.wnd, and we might never get
	 * another ACK to drive the TCP stack after the qio is drained.  We could
	 * replace this stuff with qio kicks or callbacks, but that might be
	 * trickier with the MSS limitation.  (and 'edge' isn't empty or not). */
	if (w < tcb->mss)
		tcb->rcv.blocked = 1;
}
318
/* Delayed-ACK timer handler (armed as tcb->acktimer in inittcpctl).  If the
 * conv is still alive, set FORCE and run tcpoutput — presumably forcing out a
 * bare ACK/window update even with no data pending; see tcpoutput for the
 * FORCE semantics. */
static void tcpacktimer(void *v)
{
	ERRSTACK(1);
	Tcpctl *tcb;
	struct conv *s;

	s = v;
	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	if (waserror()) {
		/* release the lock before propagating the error */
		qunlock(&s->qlock);
		nexterror();
	}
	if (tcb->state != Closed) {
		tcb->flags |= FORCE;
		tcprcvwin(s);
		tcpoutput(s);
	}
	qunlock(&s->qlock);
	poperror();
}
341
342 static void tcpcreate(struct conv *c)
343 {
344         /* We don't use qio limits.  Instead, TCP manages flow control on its own.
345          * We only use qpassnolim().  Note for qio that 0 doesn't mean no limit. */
346         c->rq = qopen(0, Qcoalesce, 0, 0);
347         c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
348 }
349
/* Set timer t's state, maintaining priv->timers — the doubly-linked list of
 * all TcptimerON timers that tcpackproc ticks.  Entering ON chains t at the
 * list head; leaving ON unchains it.  Callers (tcpgo/tcphalt/tcpackproc) hold
 * priv->tl around this. */
static void timerstate(struct tcppriv *priv, Tcptimer *t, int newstate)
{
	if (newstate != TcptimerON) {
		if (t->state == TcptimerON) {
			// unchain
			if (priv->timers == t) {
				priv->timers = t->next;
				/* head of list must have no prev link */
				if (t->prev != NULL)
					panic("timerstate1");
			}
			if (t->next)
				t->next->prev = t->prev;
			if (t->prev)
				t->prev->next = t->next;
			t->next = t->prev = NULL;
		}
	} else {
		if (t->state != TcptimerON) {
			// chain
			/* an off-list timer must have clear links */
			if (t->prev != NULL || t->next != NULL)
				panic("timerstate2");
			t->prev = NULL;
			t->next = priv->timers;
			if (t->next)
				t->next->prev = t;
			priv->timers = t;
		}
	}
	t->state = newstate;
}
380
/* Per-protocol timer kproc (spawned once from tcpstart).  Every MSPTICK ms it
 * decrements all running timers; those reaching zero are moved to DONE and
 * collected on a private 'timeo' list (via readynext) while holding the timer
 * lock, then their handlers are fired after the lock is dropped so handlers
 * can rearm/halt timers freely.  Also drives retransmission of limbo
 * (half-open) connections each tick. */
static void tcpackproc(void *a)
{
	ERRSTACK(1);
	Tcptimer *t, *tp, *timeo;
	struct Proto *tcp;
	struct tcppriv *priv;
	int loop;

	tcp = a;
	priv = tcp->priv;

	for (;;) {
		kthread_usleep(MSPTICK * 1000);

		qlock(&priv->tl);
		timeo = NULL;
		loop = 0;
		for (t = priv->timers; t != NULL; t = tp) {
			/* runaway-list sanity check */
			if (loop++ > 10000)
				panic("tcpackproc1");
			tp = t->next;
			if (t->state == TcptimerON) {
				t->count--;
				if (t->count == 0) {
					/* expired: unchain and queue for firing */
					timerstate(priv, t, TcptimerDONE);
					t->readynext = timeo;
					timeo = t;
				}
			}
		}
		qunlock(&priv->tl);

		loop = 0;
		for (t = timeo; t != NULL; t = t->readynext) {
			if (loop++ > 10000)
				panic("tcpackproc2");
			/* skip timers halted/rearmed since expiring */
			if (t->state == TcptimerDONE && t->func != NULL) {
				/* discard error style */
				if (!waserror())
					(*t->func) (t->arg);
				poperror();
			}
		}

		limborexmit(tcp);
	}
}
428
429 static void tcpgo(struct tcppriv *priv, Tcptimer *t)
430 {
431         if (t == NULL || t->start == 0)
432                 return;
433
434         qlock(&priv->tl);
435         t->count = t->start;
436         timerstate(priv, t, TcptimerON);
437         qunlock(&priv->tl);
438 }
439
440 static void tcphalt(struct tcppriv *priv, Tcptimer *t)
441 {
442         if (t == NULL)
443                 return;
444
445         qlock(&priv->tl);
446         timerstate(priv, t, TcptimerOFF);
447         qunlock(&priv->tl);
448 }
449
/* Exponential backoff multiplier for retransmit timers: returns 2^n.
 *
 * Clamp the shift count: shifting a 32-bit signed int by >= 31 is undefined
 * behavior, so a runaway backoff counter would otherwise be UB rather than
 * merely a huge timeout.  Capping at 1<<30 keeps the result positive. */
static int backoff(int n)
{
	if (n < 0)
		n = 0;
	if (n > 30)
		n = 30;
	return 1 << n;
}
454
/* Fully close conv s on our side (called with tcb locked): unhash it from the
 * demux table, halt all four timers, free the out-of-order reassembly queue,
 * report 'reason' to a blocked active opener if we were mid-handshake, hang
 * up both queues, and enter Closed.  reason == NULL means a normal close;
 * otherwise it is the error string delivered to readers/writers. */
static void localclose(struct conv *s, char *reason)
{
	/* called with tcb locked */
	Tcpctl *tcb;
	Reseq *rp, *rp1;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	iphtrem(&tpriv->ht, s);

	tcphalt(tpriv, &tcb->timer);
	tcphalt(tpriv, &tcb->rtt_timer);
	tcphalt(tpriv, &tcb->acktimer);
	tcphalt(tpriv, &tcb->katimer);

	/* Flush reassembly queue; nothing more can arrive */
	for (rp = tcb->reseq; rp != NULL; rp = rp1) {
		rp1 = rp->next;
		freeblist(rp->bp);
		kfree(rp);
	}
	tcb->reseq = NULL;

	/* tell a connect()er why the handshake died */
	if (tcb->state == Syn_sent)
		Fsconnected(s, reason);

	qhangup(s->rq, reason);
	qhangup(s->wq, reason);

	tcpsetstate(s, Closed);

	/* listener will check the rq state */
	if (s->state == Announced)
		rendez_wakeup(&s->listenr);
}
492
/* mtu (- TCP + IP hdr len) of 1st hop */
/* Returns the MSS usable toward addr: the first-hop interface's MTU minus
 * link, IP, and TCP header overhead, or the per-version default (DEF_MSS /
 * DEF_MSS6) when no interface is found.  Also reports our window-scale
 * (always HaveWS | 7 here) via *scale, and sets or clears TSO in *flags
 * according to the interface's NETF_TSO feature bit. */
static int tcpmtu(struct Proto *tcp, uint8_t *addr, int version, int *scale,
		  uint8_t *flags)
{
	struct Ipifc *ifc;
	int mtu;

	ifc = findipifc(tcp->f, addr, 0);
	switch (version) {
		default:
		case V4:
			mtu = DEF_MSS;
			if (ifc != NULL)
				mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
			break;
		case V6:
			mtu = DEF_MSS6;
			if (ifc != NULL)
				mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
			break;
	}
	*flags &= ~TSO;
	if (ifc && (ifc->feat & NETF_TSO))
		*flags |= TSO;
	*scale = HaveWS | 7;

	return mtu;
}
521
/* Initialize the Tcpctl embedded in conv s for a fresh listener (TCP_LISTEN)
 * or active open: zero the tcb, seed the RTT estimate, set up the four
 * timers, and pick default MSS / congestion and receive windows.  For
 * non-listeners we also build the prototype pseudo header (protohdr) that
 * htontcp4/htontcp6 later copy into every outbound segment. */
static void inittcpctl(struct conv *s, int mode)
{
	Tcpctl *tcb;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int mss;

	tcb = (Tcpctl *) s->ptcl;

	memset(tcb, 0, sizeof(Tcpctl));

	tcb->ssthresh = UINT32_MAX;	/* effectively "no threshold yet" */
	tcb->srtt = tcp_irtt;
	tcb->mdev = 0;

	/* setup timers */
	tcb->timer.start = tcp_irtt / MSPTICK;
	tcb->timer.func = tcptimeout;
	tcb->timer.arg = s;
	tcb->rtt_timer.start = MAX_TIME;
	tcb->acktimer.start = TCP_ACK / MSPTICK;
	tcb->acktimer.func = tcpacktimer;
	tcb->acktimer.arg = s;
	tcb->katimer.start = DEF_KAT / MSPTICK;
	tcb->katimer.func = tcpkeepalive;
	tcb->katimer.arg = s;

	mss = DEF_MSS;

	/* create a prototype(pseudo) header */
	if (mode != TCP_LISTEN) {
		/* pick a source address if the caller didn't bind one */
		if (ipcmp(s->laddr, IPnoaddr) == 0)
			findlocalip(s->p->f, s->laddr, s->raddr);

		switch (s->ipversion) {
			case V4:
				h4 = &tcb->protohdr.tcp4hdr;
				memset(h4, 0, sizeof(*h4));
				h4->proto = IP_TCPPROTO;
				hnputs(h4->tcpsport, s->lport);
				hnputs(h4->tcpdport, s->rport);
				v6tov4(h4->tcpsrc, s->laddr);
				v6tov4(h4->tcpdst, s->raddr);
				break;
			case V6:
				h6 = &tcb->protohdr.tcp6hdr;
				memset(h6, 0, sizeof(*h6));
				h6->proto = IP_TCPPROTO;
				hnputs(h6->tcpsport, s->lport);
				hnputs(h6->tcpdport, s->rport);
				ipmove(h6->tcpsrc, s->laddr);
				ipmove(h6->tcpdst, s->raddr);
				mss = DEF_MSS6;
				break;
			default:
				panic("inittcpctl: version %d", s->ipversion);
		}
	}

	tcb->mss = mss;
	tcb->typical_mss = mss;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* default is no window scaling */
	tcb->window = QMAX;
	tcb->rcv.wnd = QMAX;
	tcb->rcv.scale = 0;
	tcb->snd.scale = 0;
}
591
/*
 *  called with s qlocked
 */
/* Launch conv s as a listener (TCP_LISTEN) or active open (TCP_CONNECT).
 * Lazily spawns the per-protocol tcpackproc timer kproc exactly once
 * (double-checked under the apl qlock), initializes the tcb, hashes the conv
 * for inbound demux, then enters Listen, or sends a SYN and enters Syn_sent. */
static void tcpstart(struct conv *s, int mode)
{
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	char *kpname;

	tpriv = s->p->priv;

	if (tpriv->ackprocstarted == 0) {
		qlock(&tpriv->apl);
		if (tpriv->ackprocstarted == 0) {
			/* tcpackproc needs to free this if it ever exits */
			kpname = kmalloc(KNAMELEN, MEM_WAIT);
			snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
			ktask(kpname, tcpackproc, s->p);
			tpriv->ackprocstarted = 1;
		}
		qunlock(&tpriv->apl);
	}

	tcb = (Tcpctl *) s->ptcl;

	inittcpctl(s, mode);

	iphtadd(&tpriv->ht, s);
	switch (mode) {
		case TCP_LISTEN:
			tpriv->stats[PassiveOpens]++;
			tcb->flags |= CLONE;
			tcpsetstate(s, Listen);
			break;

		case TCP_CONNECT:
			tpriv->stats[ActiveOpens]++;
			tcb->flags |= ACTIVE;
			tcpsndsyn(s, tcb);
			tcpsetstate(s, Syn_sent);
			tcpoutput(s);
			break;
	}
}
636
637 static char *tcpflag(uint16_t flag)
638 {
639         static char buf[128];
640
641         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
642         if (flag & URG)
643                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
644         if (flag & ACK)
645                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
646         if (flag & PSH)
647                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
648         if (flag & RST)
649                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
650         if (flag & SYN)
651                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
652         if (flag & FIN)
653                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
654
655         return buf;
656 }
657
658 /* Helper, determine if we should send a TCP timestamp.  ts_val was the
659  * timestamp from our distant end.  We'll also send a TS on SYN (no ACK). */
660 static bool tcp_seg_has_ts(Tcp *tcph)
661 {
662         return tcph->ts_val || ((tcph->flags & SYN) && !(tcph->flags & ACK));
663 }
664
/* Given a TCP header/segment and default header size (e.g. TCP4_HDRSIZE),
 * return the actual hdr_len and opt_pad */
/* Accounts for the MSS, window-scale, and SACK-permitted options on SYNs,
 * the timestamp option (with a 2-byte NOOP prepad on non-SYN segments), and
 * any SACK blocks we are reporting (tcb->rcv.nr_sacks).  The total is rounded
 * up to a multiple of 4; the rounding amount is returned in *ret_optpad so
 * write_opts can emit that many trailing NOOPs. */
static void compute_hdrlen_optpad(Tcp *tcph, uint16_t default_hdrlen,
                                  uint16_t *ret_hdrlen, uint16_t *ret_optpad,
                                  Tcpctl *tcb)
{
	uint16_t hdrlen = default_hdrlen;
	uint16_t optpad = 0;

	if (tcph->flags & SYN) {
		if (tcph->mss)
			hdrlen += MSS_LENGTH;
		if (tcph->ws)
			hdrlen += WS_LENGTH;
		if (tcph->sack_ok)
			hdrlen += SACK_OK_LENGTH;
	}
	if (tcp_seg_has_ts(tcph)) {
		hdrlen += TS_LENGTH;
		/* SYNs have other opts, don't do the PREPAD NOOP optimization. */
		if (!(tcph->flags & SYN))
			hdrlen += TS_SEND_PREPAD;
	}
	/* SACK option: 2 bytes of kind/len plus 8 bytes per block */
	if (tcb && tcb->rcv.nr_sacks)
		hdrlen += 2 + tcb->rcv.nr_sacks * 8;
	/* round up to a 4-byte boundary */
	optpad = hdrlen & 3;
	if (optpad)
		optpad = 4 - optpad;
	hdrlen += optpad;
	*ret_hdrlen = hdrlen;
	*ret_optpad = optpad;
}
697
/* Writes the TCP options for tcph to opt. */
/* Layout must agree exactly with compute_hdrlen_optpad: SYN-only options
 * first (MSS, window scale, SACK-permitted), then the timestamp option
 * (NOOP-prepadded on non-SYNs), then outbound SACK blocks, then optpad
 * trailing NOOPs to reach the 4-byte-aligned header length. */
static void write_opts(Tcp *tcph, uint8_t *opt, uint16_t optpad, Tcpctl *tcb)
{
	if (tcph->flags & SYN) {
		if (tcph->mss != 0) {
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		if (tcph->ws != 0) {
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		if (tcph->sack_ok) {
			*opt++ = SACK_OK_OPT;
			*opt++ = SACK_OK_LENGTH;
		}
	}
	if (tcp_seg_has_ts(tcph)) {
		/* the 2-NOOP prepad matches TS_SEND_PREPAD in
		 * compute_hdrlen_optpad */
		if (!(tcph->flags & SYN)) {
			*opt++ = NOOPOPT;
			*opt++ = NOOPOPT;
		}
		*opt++ = TS_OPT;
		*opt++ = TS_LENGTH;
		/* Setting TSval, our time */
		hnputl(opt, milliseconds());
		opt += 4;
		/* Setting TSecr, the time we last saw from them, stored in ts_val */
		hnputl(opt, tcph->ts_val);
		opt += 4;
	}
	if (tcb && tcb->rcv.nr_sacks) {
		*opt++ = SACK_OPT;
		*opt++ = 2 + tcb->rcv.nr_sacks * 8;
		for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
			hnputl(opt, tcb->rcv.sacks[i].left);
			opt += 4;
			hnputl(opt, tcb->rcv.sacks[i].right);
			opt += 4;
		}
	}
	/* pad out to the 4-byte-aligned header length */
	while (optpad-- > 0)
		*opt++ = NOOPOPT;
}
745
746 /* Given a data block (or NULL) returns a block with enough header room that we
747  * can send out.  block->wp is set to the beginning of the payload.  Returns
748  * NULL on some sort of error. */
749 static struct block *alloc_or_pad_block(struct block *data,
750                                         uint16_t total_hdr_size)
751 {
752         if (data) {
753                 data = padblock(data, total_hdr_size);
754                 if (data == NULL)
755                         return NULL;
756         } else {
757                 /* the 64 pad is to meet mintu's */
758                 data = block_alloc(total_hdr_size + 64, MEM_WAIT);
759                 if (data == NULL)
760                         return NULL;
761                 data->wp += total_hdr_size;
762         }
763         return data;
764 }
765
/* Build an outbound IPv6 TCP segment: prepend header room to 'data' (or
 * allocate a fresh block), fill in the TCP header/options from tcph and the
 * prototype header ph, and compute the checksum.  The v6 header fields are
 * temporarily rewritten as a pseudo header for checksumming, then restored.
 * tcb may be NULL (e.g. for resets); then no scaling/no-checksum shortcuts
 * apply.  Returns NULL on allocation failure. */
static struct block *htontcp6(Tcp *tcph, struct block *data, Tcp6hdr *ph,
                              Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp6hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP6_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP6_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp) */
	data->transport_header_end = hdrlen + TCP6_PKT;

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp6hdr *) (data->rp);
	memmove(h, ph, TCP6_TCBPHDRSZ);

	/* compose pseudo tcp header, do cksum calculation */
	hnputl(h->vcf, hdrlen + dlen);
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
	h->ttl = ph->proto;

	/* copy in variable bits */
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* hdrlen is bytes (multiple of 4); hdrlen << 10 puts the word count in
	 * the data-offset bits */
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	/* advertise the window scaled down by our send scale */
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	/* move from pseudo header back to normal ip header */
	memset(h->vcf, 0, 4);
	h->vcf[0] = IP_VER6;
	hnputs(h->ploadlen, hdrlen + dlen);
	h->proto = ph->proto;

	return data;
}
815
/* Build an outbound IPv4 TCP segment, mirroring htontcp6.  Unless
 * tcb->nochecksum is set, only a partial checksum (over the pseudo header,
 * note the leading ~) is stored, and the block is flagged Btcpck with
 * checksum_start/offset so a later stage (hardware or software) finalizes it.
 * Returns NULL on allocation failure. */
static struct block *htontcp4(Tcp *tcph, struct block *data, Tcp4hdr *ph,
                              Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp4hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP4_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP4_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp) */
	data->transport_header_end = hdrlen + TCP4_PKT;

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp4hdr *) (data->rp);
	memmove(h, ph, TCP4_TCBPHDRSZ);

	/* copy in variable bits */
	hnputs(h->tcplen, hdrlen + dlen);
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* hdrlen in bytes; << 10 places the 32-bit-word count in the
	 * data-offset bits */
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		/* partial (pseudo-header-only) checksum; finished later per
		 * the Btcpck flag below */
		csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
		data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
		data->checksum_offset = ph->tcpcksum - ph->tcpsport;
		data->flag |= Btcpck;
	}

	return data;
}
858
/* Parse an inbound SACK option into tcph->sacks/nr_sacks.  opt points at the
 * option's kind byte; optlen is the option's total length, so the block count
 * is (optlen - 2) / 8.  If the count exceeds our per-packet limit the whole
 * option is ignored.  Malformed blocks (left >= right) are dropped: opt still
 * advances to the next wire block while i stays put, compacting good blocks
 * into the array. */
static void parse_inbound_sacks(Tcp *tcph, uint8_t *opt, uint16_t optlen)
{
	uint8_t nr_sacks;
	uint32_t left, right;

	nr_sacks = (optlen - 2) / 8;
	if (nr_sacks > MAX_NR_SACKS_PER_PACKET)
		return;
	opt += 2;
	for (int i = 0; i < nr_sacks; i++, opt += 8) {
		left = nhgetl(opt);
		right = nhgetl(opt + 4);
		if (seq_ge(left, right)) {
			/* bad / malicious SACK.  Skip it, and adjust. */
			nr_sacks--;
			i--;	/* stay on this array element next loop */
			continue;
		}
		tcph->sacks[i].left = left;
		tcph->sacks[i].right = right;
	}
	tcph->nr_sacks = nr_sacks;
}
882
/* Walks the TCP option area (opt, optsize bytes) and records the options we
 * understand into tcph.  Stops at an EOL option or a malformed length;
 * unrecognized options are skipped via their length byte. */
static void parse_inbound_opts(Tcp *tcph, uint8_t *opt, uint16_t optsize)
{
	uint16_t optlen;

	while (optsize > 0 && *opt != EOLOPT) {
		if (*opt == NOOPOPT) {
			/* NOP is a one-byte pad with no length octet */
			optsize--;
			opt++;
			continue;
		}
		optlen = opt[1];
		/* optlen counts the kind and length octets too, so anything
		 * under 2 or running past the option area is malformed */
		if (optlen < 2 || optlen > optsize)
			break;
		switch (*opt) {
			case MSSOPT:
				if (optlen == MSS_LENGTH)
					tcph->mss = nhgets(opt + 2);
				break;
			case WSOPT:
				/* window scale shift, capped at MAX_WS_VALUE */
				if (optlen == WS_LENGTH && *(opt + 2) <= MAX_WS_VALUE)
					tcph->ws = HaveWS | *(opt + 2);
				break;
			case SACK_OK_OPT:
				if (optlen == SACK_OK_LENGTH)
					tcph->sack_ok = TRUE;
				break;
			case SACK_OPT:
				parse_inbound_sacks(tcph, opt, optlen);
				break;
			case TS_OPT:
				if (optlen == TS_LENGTH) {
					tcph->ts_val = nhgetl(opt + 2);
					tcph->ts_ecr = nhgetl(opt + 6);
				}
				break;
		}
		optsize -= optlen;
		opt += optlen;
	}
}
923
924 /* Helper, clears the opts.  We'll later set them with e.g. parse_inbound_opts,
925  * set them manually, or something else. */
926 static void clear_tcph_opts(Tcp *tcph)
927 {
928         tcph->mss = 0;
929         tcph->ws = 0;
930         tcph->sack_ok = FALSE;
931         tcph->nr_sacks = 0;
932         tcph->ts_val = 0;
933         tcph->ts_ecr = 0;
934 }
935
936 static int ntohtcp6(Tcp *tcph, struct block **bpp)
937 {
938         Tcp6hdr *h;
939         uint16_t hdrlen;
940
941         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
942         if (*bpp == NULL)
943                 return -1;
944
945         h = (Tcp6hdr *) ((*bpp)->rp);
946         tcph->source = nhgets(h->tcpsport);
947         tcph->dest = nhgets(h->tcpdport);
948         tcph->seq = nhgetl(h->tcpseq);
949         tcph->ack = nhgetl(h->tcpack);
950         hdrlen = (h->tcpflag[0] >> 2) & ~3;
951         if (hdrlen < TCP6_HDRSIZE) {
952                 freeblist(*bpp);
953                 return -1;
954         }
955
956         tcph->flags = h->tcpflag[1];
957         tcph->wnd = nhgets(h->tcpwin);
958         tcph->urg = nhgets(h->tcpurg);
959         clear_tcph_opts(tcph);
960         tcph->len = nhgets(h->ploadlen) - hdrlen;
961
962         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
963         if (*bpp == NULL)
964                 return -1;
965         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP6_HDRSIZE);
966         return hdrlen;
967 }
968
969 static int ntohtcp4(Tcp *tcph, struct block **bpp)
970 {
971         Tcp4hdr *h;
972         uint16_t hdrlen;
973
974         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
975         if (*bpp == NULL)
976                 return -1;
977
978         h = (Tcp4hdr *) ((*bpp)->rp);
979         tcph->source = nhgets(h->tcpsport);
980         tcph->dest = nhgets(h->tcpdport);
981         tcph->seq = nhgetl(h->tcpseq);
982         tcph->ack = nhgetl(h->tcpack);
983
984         hdrlen = (h->tcpflag[0] >> 2) & ~3;
985         if (hdrlen < TCP4_HDRSIZE) {
986                 freeblist(*bpp);
987                 return -1;
988         }
989
990         tcph->flags = h->tcpflag[1];
991         tcph->wnd = nhgets(h->tcpwin);
992         tcph->urg = nhgets(h->tcpurg);
993         clear_tcph_opts(tcph);
994         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
995
996         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
997         if (*bpp == NULL)
998                 return -1;
999         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP4_HDRSIZE);
1000         return hdrlen;
1001 }
1002
1003 /*
1004  *  For outgoing calls, generate an initial sequence
1005  *  number and put a SYN on the send queue
1006  */
static void tcpsndsyn(struct conv *s, Tcpctl *tcb)
{
	/* random initial send sequence number (iss) */
	urandom_read(&tcb->iss, sizeof(tcb->iss));
	/* seed all send-side sequence state from iss */
	tcb->rttseq = tcb->iss;
	tcb->snd.wl2 = tcb->iss;
	tcb->snd.una = tcb->iss;
	tcb->snd.rtx = tcb->rttseq;
	tcb->snd.nxt = tcb->rttseq;
	/* the SYN consumes one sequence number; flgcnt accounts for it */
	tcb->flgcnt++;
	tcb->flags |= FORCE;
	tcb->sndsyntime = NOW;

	/* set desired mss and scale */
	tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale,
			  &tcb->flags);
}
1023
/* Sends a RST in response to a segment (seg) that arrived for no valid
 * conversation.  source/dest are the segment's addresses (so the reply swaps
 * them), length is the amount of data the segment carried, and reason is for
 * the netlog only.  Never resets a reset. */
static void sndrst(struct Proto *tcp, uint8_t *source, uint8_t *dest,
		   uint16_t length, Tcp *seg, uint8_t version, char *reason)
{
	struct block *hbp;
	uint8_t rflags;
	struct tcppriv *tpriv;
	Tcp4hdr ph4;
	Tcp6hdr ph6;

	netlog(tcp->f, Logtcpreset, "sndrst: %s\n", reason);

	tpriv = tcp->priv;

	/* never answer a RST with a RST */
	if (seg->flags & RST)
		return;

	/* make pseudo header */
	switch (version) {
		case V4:
			memset(&ph4, 0, sizeof(ph4));
			ph4.vihl = IP_VER4;
			/* reply goes back the way it came: swap src/dst */
			v6tov4(ph4.tcpsrc, dest);
			v6tov4(ph4.tcpdst, source);
			ph4.proto = IP_TCPPROTO;
			hnputs(ph4.tcplen, TCP4_HDRSIZE);
			hnputs(ph4.tcpsport, seg->dest);
			hnputs(ph4.tcpdport, seg->source);
			break;
		case V6:
			memset(&ph6, 0, sizeof(ph6));
			ph6.vcf[0] = IP_VER6;
			ipmove(ph6.tcpsrc, dest);
			ipmove(ph6.tcpdst, source);
			ph6.proto = IP_TCPPROTO;
			hnputs(ph6.ploadlen, TCP6_HDRSIZE);
			hnputs(ph6.tcpsport, seg->dest);
			hnputs(ph6.tcpdport, seg->source);
			break;
		default:
			panic("sndrst: version %d", version);
	}

	tpriv->stats[OutRsts]++;
	rflags = RST;

	/* convince the other end that this reset is in band */
	if (seg->flags & ACK) {
		/* they told us what they've seen; use it as our seq */
		seg->seq = seg->ack;
		seg->ack = 0;
	} else {
		/* no ACK from them: ack everything they sent (SYN and FIN
		 * each consume a sequence number) and use seq 0 */
		rflags |= ACK;
		seg->ack = seg->seq;
		seg->seq = 0;
		if (seg->flags & SYN)
			seg->ack++;
		seg->ack += length;
		if (seg->flags & FIN)
			seg->ack++;
	}
	/* strip everything else from the reply segment */
	seg->flags = rflags;
	seg->wnd = 0;
	seg->urg = 0;
	seg->mss = 0;
	seg->ws = 0;
	seg->sack_ok = FALSE;
	seg->nr_sacks = 0;
	/* seg->ts_val is already set with their timestamp */
	switch (version) {
		case V4:
			hbp = htontcp4(seg, NULL, &ph4, NULL);
			if (hbp == NULL)
				return;
			ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		case V6:
			hbp = htontcp6(seg, NULL, &ph6, NULL);
			if (hbp == NULL)
				return;
			ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		default:
			panic("sndrst2: version %d", version);
	}
}
1108
1109 /*
1110  *  send a reset to the remote side and close the conversation
1111  *  called with s qlocked
1112  */
1113 static void tcphangup(struct conv *s)
1114 {
1115         ERRSTACK(1);
1116         Tcp seg;
1117         Tcpctl *tcb;
1118         struct block *hbp;
1119
1120         tcb = (Tcpctl *) s->ptcl;
1121         if (ipcmp(s->raddr, IPnoaddr)) {
1122                 /* discard error style, poperror regardless */
1123                 if (!waserror()) {
1124                         seg.flags = RST | ACK;
1125                         seg.ack = tcb->rcv.nxt;
1126                         tcb->last_ack_sent = seg.ack;
1127                         tcb->rcv.una = 0;
1128                         seg.seq = tcb->snd.nxt;
1129                         seg.wnd = 0;
1130                         seg.urg = 0;
1131                         seg.mss = 0;
1132                         seg.ws = 0;
1133                         seg.sack_ok = FALSE;
1134                         seg.nr_sacks = 0;
1135                         seg.ts_val = tcb->ts_recent;
1136                         switch (s->ipversion) {
1137                                 case V4:
1138                                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1139                                         hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
1140                                         ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1141                                         break;
1142                                 case V6:
1143                                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1144                                         hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
1145                                         ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1146                                         break;
1147                                 default:
1148                                         panic("tcphangup: version %d", s->ipversion);
1149                         }
1150                 }
1151                 poperror();
1152         }
1153         localclose(s, NULL);
1154 }
1155
1156 /*
1157  *  (re)send a SYN ACK
1158  */
/* Builds and transmits a SYN ACK for the half-open connection tracked by lp.
 * Returns 0 on success, -1 if the packet could not be built.  On success,
 * records the send time for the limbo retransmit machinery. */
static int sndsynack(struct Proto *tcp, Limbo *lp)
{
	struct block *hbp;
	Tcp4hdr ph4;
	Tcp6hdr ph6;
	Tcp seg;
	int scale;
	uint8_t flag = 0;

	/* make pseudo header */
	switch (lp->version) {
		case V4:
			memset(&ph4, 0, sizeof(ph4));
			ph4.vihl = IP_VER4;
			v6tov4(ph4.tcpsrc, lp->laddr);
			v6tov4(ph4.tcpdst, lp->raddr);
			ph4.proto = IP_TCPPROTO;
			hnputs(ph4.tcplen, TCP4_HDRSIZE);
			hnputs(ph4.tcpsport, lp->lport);
			hnputs(ph4.tcpdport, lp->rport);
			break;
		case V6:
			memset(&ph6, 0, sizeof(ph6));
			ph6.vcf[0] = IP_VER6;
			ipmove(ph6.tcpsrc, lp->laddr);
			ipmove(ph6.tcpdst, lp->raddr);
			ph6.proto = IP_TCPPROTO;
			hnputs(ph6.ploadlen, TCP6_HDRSIZE);
			hnputs(ph6.tcpsport, lp->lport);
			hnputs(ph6.tcpdport, lp->rport);
			break;
		default:
			panic("sndrst: version %d", lp->version);
	}

	/* our iss, acking their SYN (irs + 1) */
	seg.seq = lp->iss;
	seg.ack = lp->irs + 1;
	seg.flags = SYN | ACK;
	seg.urg = 0;
	seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag);
	seg.wnd = QMAX;
	/* echo their timestamp value */
	seg.ts_val = lp->ts_val;
	seg.nr_sacks = 0;

	/* if the other side set scale, we should too */
	if (lp->rcvscale) {
		seg.ws = scale;
		lp->sndscale = scale;
	} else {
		seg.ws = 0;
		lp->sndscale = 0;
	}
	/* offer SACK only if we support it and they asked for it */
	if (SACK_SUPPORTED)
		seg.sack_ok = lp->sack_ok;
	else
		seg.sack_ok = FALSE;

	switch (lp->version) {
		case V4:
			hbp = htontcp4(&seg, NULL, &ph4, NULL);
			if (hbp == NULL)
				return -1;
			ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		case V6:
			hbp = htontcp6(&seg, NULL, &ph6, NULL);
			if (hbp == NULL)
				return -1;
			ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		default:
			panic("sndsnack: version %d", lp->version);
	}
	lp->lastsend = NOW;
	return 0;
}
1235
/* Hashes an (IP address, port) pair into the limbo hash table: sums the low
 * two address bytes with the port and masks to the table size. */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
1237
1238 /*
1239  *  put a call into limbo and respond with a SYN ACK
1240  *
1241  *  called with proto locked
1242  */
1243 static void limbo(struct conv *s, uint8_t *source, uint8_t *dest, Tcp *seg,
1244                   int version)
1245 {
1246         Limbo *lp, **l;
1247         struct tcppriv *tpriv;
1248         int h;
1249
1250         tpriv = s->p->priv;
1251         h = hashipa(source, seg->source);
1252
1253         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1254                 lp = *l;
1255                 if (lp->lport != seg->dest || lp->rport != seg->source
1256                         || lp->version != version)
1257                         continue;
1258                 if (ipcmp(lp->raddr, source) != 0)
1259                         continue;
1260                 if (ipcmp(lp->laddr, dest) != 0)
1261                         continue;
1262
1263                 /* each new SYN restarts the retransmits */
1264                 lp->irs = seg->seq;
1265                 break;
1266         }
1267         lp = *l;
1268         if (lp == NULL) {
1269                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1270                         lp = tpriv->lht[h];
1271                         tpriv->lht[h] = lp->next;
1272                         lp->next = NULL;
1273                 } else {
1274                         lp = kzmalloc(sizeof(*lp), 0);
1275                         if (lp == NULL)
1276                                 return;
1277                         tpriv->nlimbo++;
1278                 }
1279                 *l = lp;
1280                 lp->version = version;
1281                 ipmove(lp->laddr, dest);
1282                 ipmove(lp->raddr, source);
1283                 lp->lport = seg->dest;
1284                 lp->rport = seg->source;
1285                 lp->mss = seg->mss;
1286                 lp->rcvscale = seg->ws;
1287                 lp->sack_ok = seg->sack_ok;
1288                 lp->irs = seg->seq;
1289                 lp->ts_val = seg->ts_val;
1290                 urandom_read(&lp->iss, sizeof(lp->iss));
1291         }
1292
1293         if (sndsynack(s->p, lp) < 0) {
1294                 *l = lp->next;
1295                 tpriv->nlimbo--;
1296                 kfree(lp);
1297         }
1298 }
1299
1300 /*
1301  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1302  */
1303 static void limborexmit(struct Proto *tcp)
1304 {
1305         struct tcppriv *tpriv;
1306         Limbo **l, *lp;
1307         int h;
1308         int seen;
1309         uint64_t now;
1310
1311         tpriv = tcp->priv;
1312
1313         if (!canqlock(&tcp->qlock))
1314                 return;
1315         seen = 0;
1316         now = NOW;
1317         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1318                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1319                         lp = *l;
1320                         seen++;
1321                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1322                                 continue;
1323
1324                         /* time it out after 1 second */
1325                         if (++(lp->rexmits) > 5) {
1326                                 tpriv->nlimbo--;
1327                                 *l = lp->next;
1328                                 kfree(lp);
1329                                 continue;
1330                         }
1331
1332                         /* if we're being attacked, don't bother resending SYN ACK's */
1333                         if (tpriv->nlimbo > 100)
1334                                 continue;
1335
1336                         if (sndsynack(tcp, lp) < 0) {
1337                                 tpriv->nlimbo--;
1338                                 *l = lp->next;
1339                                 kfree(lp);
1340                                 continue;
1341                         }
1342
1343                         l = &lp->next;
1344                 }
1345         }
1346         qunlock(&tcp->qlock);
1347 }
1348
1349 /*
1350  *  lookup call in limbo.  if found, throw it out.
1351  *
1352  *  called with proto locked
1353  */
static void limborst(struct conv *s, Tcp *segp, uint8_t *src, uint8_t *dst,
                     uint8_t version)
{
	Limbo *lp, **l;
	int h;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;

	/* find a call in limbo */
	h = hashipa(src, segp->source);
	for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
		lp = *l;
		/* match on the full 4-tuple plus IP version */
		if (lp->lport != segp->dest || lp->rport != segp->source
			|| lp->version != version)
			continue;
		if (ipcmp(lp->laddr, dst) != 0)
			continue;
		if (ipcmp(lp->raddr, src) != 0)
			continue;

		/* RST can only follow the SYN */
		if (segp->seq == lp->irs + 1) {
			/* unlink and free the half-open connection */
			tpriv->nlimbo--;
			*l = lp->next;
			kfree(lp);
		}
		break;
	}
}
1384
1385 /* The advertised MSS (e.g. 1460) includes any per-packet TCP options, such as
1386  * TCP timestamps.  A given packet will contain mss bytes, but only typical_mss
1387  * bytes of *data*.  If we know we'll use those options, we should adjust our
1388  * typical_mss, which will affect the cwnd. */
1389 static void adjust_typical_mss_for_opts(Tcp *tcph, Tcpctl *tcb)
1390 {
1391         uint16_t opt_size = 0;
1392
1393         if (tcph->ts_val)
1394                 opt_size += TS_LENGTH + TS_SEND_PREPAD;
1395         opt_size = ROUNDUP(opt_size, 4);
1396         tcb->typical_mss -= opt_size;
1397 }
1398
1399 /*
1400  *  come here when we finally get an ACK to our SYN-ACK.
1401  *  lookup call in limbo.  if found, create a new conversation
1402  *
1403  *  called with proto locked
1404  */
static struct conv *tcpincoming(struct conv *s, Tcp *segp, uint8_t *src,
								uint8_t *dst, uint8_t version)
{
	struct conv *new;
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	Limbo *lp, **l;
	int h;

	/* unless it's just an ack, it can't be someone coming out of limbo */
	if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
		return NULL;

	tpriv = s->p->priv;

	/* find a call in limbo */
	h = hashipa(src, segp->source);
	for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
		netlog(s->p->f, Logtcp,
			   "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
			   segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
			   lp->lport, version, lp->version);

		/* match the full 4-tuple plus IP version */
		if (lp->lport != segp->dest || lp->rport != segp->source
			|| lp->version != version)
			continue;
		if (ipcmp(lp->laddr, dst) != 0)
			continue;
		if (ipcmp(lp->raddr, src) != 0)
			continue;

		/* we're assuming no data with the initial SYN */
		if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
			netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
				   segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
			lp = NULL;
		} else {
			/* valid handshake completion: unlink from limbo */
			tpriv->nlimbo--;
			*l = lp->next;
		}
		break;
	}
	if (lp == NULL)
		return NULL;

	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
	if (new == NULL)
		return NULL;

	/* clone the listener's TCB, then detach timers etc. from the parent */
	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
	tcb = (Tcpctl *) new->ptcl;
	tcb->flags &= ~CLONE;
	tcb->timer.arg = new;
	tcb->timer.state = TcptimerOFF;
	tcb->acktimer.arg = new;
	tcb->acktimer.state = TcptimerOFF;
	tcb->katimer.arg = new;
	tcb->katimer.state = TcptimerOFF;
	tcb->rtt_timer.arg = new;
	tcb->rtt_timer.state = TcptimerOFF;

	/* receive state: their SYN consumed irs */
	tcb->irs = lp->irs;
	tcb->rcv.nxt = tcb->irs + 1;
	tcb->rcv.urg = tcb->rcv.nxt;

	/* send state: our SYN ACK consumed iss */
	tcb->iss = lp->iss;
	tcb->rttseq = tcb->iss;
	tcb->snd.wl2 = tcb->iss;
	tcb->snd.una = tcb->iss + 1;
	tcb->snd.rtx = tcb->iss + 1;
	tcb->snd.nxt = tcb->iss + 1;
	tcb->flgcnt = 0;
	tcb->flags |= SYNACK;

	/* our sending max segment size cannot be bigger than what he asked for */
	if (lp->mss != 0 && lp->mss < tcb->mss) {
		tcb->mss = lp->mss;
		tcb->typical_mss = tcb->mss;
	}
	adjust_typical_mss_for_opts(segp, tcb);

	/* Here's where we record the previously-decided header options.  They were
	 * actually decided on when we agreed to them in the SYNACK we sent.  We
	 * didn't create an actual TCB until now, so we can copy those decisions out
	 * of the limbo tracker and into the TCB. */
	tcb->sack_ok = lp->sack_ok;
	/* window scaling */
	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);

	tcb->snd.wnd = segp->wnd;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* set initial round trip time */
	tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
	tcpsynackrtt(new);

	kfree(lp);

	/* set up proto header */
	switch (version) {
		case V4:
			h4 = &tcb->protohdr.tcp4hdr;
			memset(h4, 0, sizeof(*h4));
			h4->proto = IP_TCPPROTO;
			hnputs(h4->tcpsport, new->lport);
			hnputs(h4->tcpdport, new->rport);
			v6tov4(h4->tcpsrc, dst);
			v6tov4(h4->tcpdst, src);
			break;
		case V6:
			h6 = &tcb->protohdr.tcp6hdr;
			memset(h6, 0, sizeof(*h6));
			h6->proto = IP_TCPPROTO;
			hnputs(h6->tcpsport, new->lport);
			hnputs(h6->tcpdport, new->rport);
			ipmove(h6->tcpsrc, dst);
			ipmove(h6->tcpdst, src);
			break;
		default:
			panic("tcpincoming: version %d", new->ipversion);
	}

	tcpsetstate(new, Established);

	iphtadd(&tpriv->ht, new);

	return new;
}
1535
1536 /*
1537  *  use the time between the first SYN and it's ack as the
1538  *  initial round trip time
1539  */
1540 static void tcpsynackrtt(struct conv *s)
1541 {
1542         Tcpctl *tcb;
1543         uint64_t delta;
1544         struct tcppriv *tpriv;
1545
1546         tcb = (Tcpctl *) s->ptcl;
1547         tpriv = s->p->priv;
1548
1549         delta = NOW - tcb->sndsyntime;
1550         tcb->srtt = delta;
1551         tcb->mdev = delta / 2;
1552
1553         /* halt round trip timer */
1554         tcphalt(tpriv, &tcb->rtt_timer);
1555 }
1556
1557 /* For LFNs (long/fat), our default tx queue doesn't hold enough data, and TCP
1558  * blocks on the application - even if the app already has the data ready to go.
1559  * We need to hold the sent, unacked data (1x cwnd), plus all the data we might
1560  * send next RTT (1x cwnd).  Note this is called after cwnd was expanded. */
1561 static void adjust_tx_qio_limit(struct conv *s)
1562 {
1563         Tcpctl *tcb = (Tcpctl *) s->ptcl;
1564         size_t ideal_limit = tcb->cwind * 2;
1565
1566         /* This is called for every ACK, and it's not entirely free to update the
1567          * limit (locks, CVs, taps).  Updating in chunks of mss seems reasonable.
1568          * During SS, we'll update this on most ACKs (given each ACK increased the
1569          * cwind by > MSS).
1570          *
1571          * We also don't want a lot of tiny blocks from the user, but the way qio
1572          * works, you can put in as much as you want (Maxatomic) and then get
1573          * flow-controlled. */
1574         if (qgetlimit(s->wq) + tcb->typical_mss < ideal_limit)
1575                 qsetlimit(s->wq, ideal_limit);
1576         /* TODO: we could shrink the qio limit too, if we had a better idea what the
1577          * actual threshold was.  We want the limit to be the 'stable' cwnd * 2. */
1578 }
1579
/* Attempts to merge later sacks into sack 'into' (index in the array) */
static void merge_sacks_into(Tcpctl *tcb, int into)
{
	struct sack_block *into_sack = &tcb->snd.sacks[into];
	struct sack_block *tcb_sack;
	int shift = 0;

	/* count how many following sacks overlap/abut 'into', extending its
	 * right edge as we absorb them */
	for (int i = into + 1; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(into_sack->right, tcb_sack->left))
			break;
		if (seq_gt(tcb_sack->right, into_sack->right))
			into_sack->right = tcb_sack->right;
		shift++;
	}
	if (shift) {
		/* slide the unabsorbed tail (starting at into + 1 + shift)
		 * down over the absorbed entries */
		memmove(tcb->snd.sacks + into + 1,
		        tcb->snd.sacks + into + 1 + shift,
		        sizeof(struct sack_block) * (tcb->snd.nr_sacks - into - 1
		                                             - shift));
		tcb->snd.nr_sacks -= shift;
	}
}
1603
1604 /* If we update a sack, it means they received a packet (possibly out of order),
1605  * but they have not received earlier packets.  Otherwise, they would do a full
1606  * ACK.
1607  *
1608  * The trick is in knowing whether the reception growing this sack is due to a
1609  * retrans or due to packets from before our last loss event.  The rightmost
1610  * sack tends to grow a lot with packets we sent before the loss.  However,
1611  * intermediate sacks that grow are signs of a loss, since they only grow as a
1612  * result of retrans.
1613  *
1614  * This is only true for the first time through a retrans.  After we've gone
1615  * through a full retrans blast, the sack that hinted at the retrans loss (and
1616  * there could be multiple of them!) will continue to grow.  We could come up
1617  * with some tracking for this, but instead we'll just do a one-time deal.  You
1618  * can recover from one detected sack retrans loss.  After that, you'll have to
1619  * use the RTO.
1620  *
1621  * This won't catch some things, like a sack that grew and merged with the
1622  * rightmost sack.  This also won't work if you have a single sack.  We can't
1623  * tell where the retrans ends and the sending begins. */
1624 static bool sack_hints_at_loss(Tcpctl *tcb, struct sack_block *tcb_sack)
1625 {
1626         if (tcb->snd.recovery != SACK_RETRANS_RECOVERY)
1627                 return FALSE;
1628         return &tcb->snd.sacks[tcb->snd.nr_sacks - 1] != tcb_sack;
1629 }
1630
1631 static bool sack_contains(struct sack_block *tcb_sack, uint32_t seq)
1632 {
1633         return seq_le(tcb_sack->left, seq) && seq_lt(seq, tcb_sack->right);
1634 }
1635
1636 /* Debugging helper! */
1637 static void sack_asserter(Tcpctl *tcb, char *str)
1638 {
1639         struct sack_block *tcb_sack;
1640
1641         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
1642                 tcb_sack = &tcb->snd.sacks[i];
1643                 /* Checking invariants: snd.rtx is never inside a sack, sacks are always
1644                  * mutually exclusive. */
1645                 if (sack_contains(tcb_sack, tcb->snd.rtx) ||
1646                     ((i + 1 < tcb->snd.nr_sacks) && seq_ge(tcb_sack->right,
1647                                                                (tcb_sack + 1)->left))) {
1648                         printk("SACK ASSERT ERROR at %s\n", str);
1649                         printk("rtx %u una %u nxt %u, sack [%u, %u)\n",
1650                                tcb->snd.rtx, tcb->snd.una, tcb->snd.nxt, tcb_sack->left,
1651                                    tcb_sack->right);
1652                         for (int i = 0; i < tcb->snd.nr_sacks; i++)
1653                                 printk("\t %d: [%u, %u)\n", i, tcb->snd.sacks[i].left,
1654                                        tcb->snd.sacks[i].right);
1655                         backtrace();
1656                         panic("");
1657                 }
1658         }
1659 }
1660
/* Updates bookkeeping whenever a sack is added or updated: keeps snd.rtx out
 * of the sacked range and watches for sack growth that hints at a loss of a
 * retransmitted segment. */
static void sack_has_changed(struct conv *s, Tcpctl *tcb,
                             struct sack_block *tcb_sack)
{
	/* Due to the change, snd.rtx might be in the middle of this sack.  Advance
	 * it to the right edge. */
	if (sack_contains(tcb_sack, tcb->snd.rtx))
		tcb->snd.rtx = tcb_sack->right;

	/* This is a sack for something we retransed and we think it means there was
	 * another loss.  Instead of waiting for the RTO, we can take action. */
	if (sack_hints_at_loss(tcb, tcb_sack)) {
		/* dup-ack-style threshold before reacting to the hint */
		if (++tcb->snd.sack_loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: sack rxmit loss: snd.rtx %u, sack [%u,%u), una %u, recovery_pt %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.rtx, tcb_sack->left, tcb_sack->right, tcb->snd.una,
			       tcb->snd.recovery_pt);
			/* Redo retrans, but keep the sacks and recovery point */
			tcp_loss_event(s, tcb);
			tcb->snd.rtx = tcb->snd.una;
			tcb->snd.sack_loss_hint = 0;
			/* Act like an RTO.  We just detected it earlier.  This prevents us
			 * from getting another sack hint loss this recovery period and from
			 * advancing the opportunistic right edge. */
			tcb->snd.recovery = RTO_RETRANS_RECOVERY;
			/* We didn't actually time out yet and we expect to keep getting
			 * sacks, so we don't want to flush or worry about in_flight.  If we
			 * messed something up, the RTO will still fire. */
			set_in_flight(tcb);
		}
	}
}
1694
1695 /* Advances tcb_sack's right edge, if new_right is farther, and updates the
1696  * bookkeeping due to the change. */
1697 static void update_right_edge(struct conv *s, Tcpctl *tcb,
1698                               struct sack_block *tcb_sack, uint32_t new_right)
1699 {
1700         if (seq_le(new_right, tcb_sack->right))
1701                 return;
1702         tcb_sack->right = new_right;
1703         merge_sacks_into(tcb, tcb_sack - tcb->snd.sacks);
1704         sack_has_changed(s, tcb, tcb_sack);
1705 }
1706
/* Inserts seg_sack into the tcb's sorted snd.sacks array, or grows an
 * existing sack to cover it.  Maintains the sort order and mutual exclusion
 * of the array.  When the array is full, the rightmost sack must still be
 * tracked (the in_flight estimate depends on it), so we may take over or
 * replace an existing slot rather than drop the new sack. */
static void update_or_insert_sack(struct conv *s, Tcpctl *tcb,
                                  struct sack_block *seg_sack)
{
        struct sack_block *tcb_sack;

        /* Scan the sorted array for a sack to extend or the slot to insert at. */
        for (int i = 0; i < tcb->snd.nr_sacks; i++) {
                tcb_sack = &tcb->snd.sacks[i];
                if (seq_lt(tcb_sack->left, seg_sack->left)) {
                        /* This includes adjacent (which I've seen!) and overlap. */
                        if (seq_le(seg_sack->left, tcb_sack->right)) {
                                update_right_edge(s, tcb, tcb_sack, seg_sack->right);
                                return;
                        }
                        continue;
                }
                /* Update existing sack */
                if (tcb_sack->left == seg_sack->left) {
                        update_right_edge(s, tcb, tcb_sack, seg_sack->right);
                        return;
                }
                /* Found our slot */
                if (seq_gt(tcb_sack->left, seg_sack->left)) {
                        if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
                                /* Out of room, but it is possible this sack overlaps later
                                 * sacks, including the max sack's right edge. */
                                if (seq_ge(seg_sack->right, tcb_sack->left)) {
                                        /* Take over the sack */
                                        tcb_sack->left = seg_sack->left;
                                        update_right_edge(s, tcb, tcb_sack, seg_sack->right);
                                }
                                return;
                        }
                        /* O/W, it's our slot and we have room (at least one spot). */
                        memmove(&tcb->snd.sacks[i + 1], &tcb->snd.sacks[i],
                                sizeof(struct sack_block) * (tcb->snd.nr_sacks - i));
                        tcb_sack->left = seg_sack->left;
                        tcb_sack->right = seg_sack->right;
                        tcb->snd.nr_sacks++;
                        merge_sacks_into(tcb, i);
                        sack_has_changed(s, tcb, tcb_sack);
                        return;
                }
        }
        /* seg_sack is to the right of every existing sack. */
        if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
                /* We didn't find space in the sack array. */
                tcb_sack = &tcb->snd.sacks[MAX_NR_SND_SACKS - 1];
                /* Need to always maintain the rightmost sack, discarding the prev */
                if (seq_gt(seg_sack->right, tcb_sack->right)) {
                        tcb_sack->left = seg_sack->left;
                        tcb_sack->right = seg_sack->right;
                        sack_has_changed(s, tcb, tcb_sack);
                }
                return;
        }
        /* Append as the new rightmost sack. */
        tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks];
        tcb->snd.nr_sacks++;
        tcb_sack->left = seg_sack->left;
        tcb_sack->right = seg_sack->right;
        sack_has_changed(s, tcb, tcb_sack);
}
1767
/* Given the packet seg, track the sacks in TCB.  There are a few things: if seg
 * acks new data, some sacks might no longer be needed.  Some sacks might grow,
 * we might add new sacks, either of which can cause a merger.
 *
 * The important thing is that we always have the max sack entry: it must be
 * inserted for sure and findable.  We need that for our measurement of what
 * packets are in the network.
 *
 * Note that we keep sacks that are below snd.rtx (and above
 * seg.ack/tcb->snd.una) as best we can - we don't prune them.  We'll need those
 * for the in_flight estimate.
 *
 * When we run out of room, we'll have to throw away a sack.  Anything we throw
 * away below snd.rtx will be counted as 'in flight', even though it isn't.  If
 * we throw away something greater than snd.rtx, we'll also retrans it.  For
 * simplicity, we throw-away / replace the rightmost sack, since we're always
 * maintaining a highest sack. */
static void update_sacks(struct conv *s, Tcpctl *tcb, Tcp *seg)
{
        int prune = 0;
        struct sack_block *tcb_sack;

        /* First pass: count leading sacks made obsolete by the new ack point. */
        for (int i = 0; i < tcb->snd.nr_sacks; i++) {
                tcb_sack = &tcb->snd.sacks[i];
                /* For the equality case, if they acked up to, but not including an old
                 * sack, they must have reneged it.  Otherwise they would have acked
                 * beyond the sack. */
                if (seq_lt(seg->ack, tcb_sack->left))
                        break;
                prune++;
        }
        if (prune) {
                /* Shift the surviving sacks down over the pruned ones. */
                memmove(tcb->snd.sacks, tcb->snd.sacks + prune,
                        sizeof(struct sack_block) * (tcb->snd.nr_sacks - prune));
                tcb->snd.nr_sacks -= prune;
        }
        /* Second pass: fold the segment's sack blocks into our array. */
        for (int i = 0; i < seg->nr_sacks; i++) {
                /* old sacks */
                if (seq_lt(seg->sacks[i].left, seg->ack))
                        continue;
                /* buggy sack: out of range */
                if (seq_gt(seg->sacks[i].right, tcb->snd.nxt))
                        continue;
                update_or_insert_sack(s, tcb, &seg->sacks[i]);
        }
}
1814
1815 /* This is a little bit of an under estimate, since we assume a packet is lost
1816  * once we have any sacks above it.  Overall, it's at most 2 * MSS of an
1817  * overestimate.
1818  *
1819  * If we have no sacks (either reneged or never used) we'll assume all packets
1820  * above snd.rtx are lost.  This will be the case for sackless fast rxmit
1821  * (Dong's stuff) or for a timeout.  In the former case, this is probably not
1822  * true, and in_flight should be higher, but we have no knowledge without the
1823  * sacks. */
1824 static void set_in_flight(Tcpctl *tcb)
1825 {
1826         struct sack_block *tcb_sack;
1827         uint32_t in_flight = 0;
1828         uint32_t from;
1829
1830         if (!tcb->snd.nr_sacks) {
1831                 tcb->snd.in_flight = tcb->snd.rtx - tcb->snd.una;
1832                 return;
1833         }
1834
1835         /* Everything to the right of the unsacked */
1836         tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
1837         in_flight += tcb->snd.nxt - tcb_sack->right;
1838
1839         /* Everything retransed (from una to snd.rtx, minus sacked regions.  Note
1840          * we only retrans at most the last sack's left edge.  snd.rtx will be
1841          * advanced to the right edge of some sack (possibly the last one). */
1842         from = tcb->snd.una;
1843         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
1844                 tcb_sack = &tcb->snd.sacks[i];
1845                 if (seq_ge(tcb_sack->left, tcb->snd.rtx))
1846                         break;
1847                 assert(seq_ge(tcb->snd.rtx, tcb_sack->right));
1848                 in_flight += tcb_sack->left - from;
1849                 from = tcb_sack->right;
1850         }
1851         in_flight += tcb->snd.rtx - from;
1852
1853         tcb->snd.in_flight = in_flight;
1854 }
1855
1856 static void reset_recovery(struct conv *s, Tcpctl *tcb)
1857 {
1858         netlog(s->p->f, Logtcprxmt,
1859                "%I.%d -> %I.%d: recovery complete, una %u, rtx %u, nxt %u, recovery %u\n",
1860                s->laddr, s->lport, s->raddr, s->rport,
1861                tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, tcb->snd.recovery_pt);
1862         tcb->snd.recovery = 0;
1863         tcb->snd.recovery_pt = 0;
1864         tcb->snd.loss_hint = 0;
1865         tcb->snd.flush_sacks = FALSE;
1866         tcb->snd.sack_loss_hint = 0;
1867 }
1868
1869 static bool is_dup_ack(Tcpctl *tcb, Tcp *seg)
1870 {
1871         /* this is a pure ack w/o window update */
1872         return (seg->ack == tcb->snd.una) &&
1873                (tcb->snd.una != tcb->snd.nxt) &&
1874                (seg->len == 0) &&
1875                (seg->wnd == tcb->snd.wnd);
1876 }
1877
1878 /* If we have sacks, we'll ignore dupacks and look at the sacks ahead of una
1879  * (which are managed by the TCB).  The tcb will not have old sacks (below
1880  * ack/snd.rtx).  Receivers often send sacks below their ack point when we are
1881  * coming out of a loss, and we don't want those to count.
1882  *
1883  * Note the tcb could have sacks (in the future), but the receiver stopped using
1884  * them (reneged).  We'll catch that with the RTO.  If we try to catch it here,
1885  * we could get in a state where we never allow them to renege. */
1886 static bool is_potential_loss(Tcpctl *tcb, Tcp *seg)
1887 {
1888         if (seg->nr_sacks > 0)
1889                 return tcb->snd.nr_sacks > 0;
1890         else
1891                 return is_dup_ack(tcb, seg);
1892 }
1893
1894 /* When we use timestamps for RTTM, RFC 7323 suggests scaling by
1895  * expected_samples (per cwnd).  They say:
1896  *
1897  * ExpectedSamples = ceiling(FlightSize / (SMSS * 2))
1898  *
1899  * However, SMMS * 2 is really "number of bytes expected to be acked in a
1900  * packet.".  We'll use 'acked' to approximate that.  When the receiver uses
1901  * LRO, they'll send back large ACKs, which decreases the number of samples.
1902  *
1903  * If it turns out that all the divides are bad, we can just go back to not
1904  * using expected_samples at all. */
1905 static int expected_samples_ts(Tcpctl *tcb, uint32_t acked)
1906 {
1907         assert(acked);
1908         return MAX(DIV_ROUND_UP(tcb->snd.nxt - tcb->snd.una, acked), 1);
1909 }
1910
1911 /* Updates the RTT, given the currently sampled RTT and the number samples per
1912  * cwnd.  For non-TS RTTM, that'll be 1. */
1913 static void update_rtt(Tcpctl *tcb, int rtt_sample, int expected_samples)
1914 {
1915         int delta;
1916
1917         tcb->backoff = 0;
1918         tcb->backedoff = 0;
1919         if (tcb->srtt == 0) {
1920                 tcb->srtt = rtt_sample;
1921                 tcb->mdev = rtt_sample / 2;
1922         } else {
1923                 delta = rtt_sample - tcb->srtt;
1924                 tcb->srtt += (delta >> RTTM_ALPHA_SHIFT) / expected_samples;
1925                 if (tcb->srtt <= 0)
1926                         tcb->srtt = 1;
1927                 tcb->mdev += ((abs(delta) - tcb->mdev) >> RTTM_BRAVO_SHIFT) /
1928                              expected_samples;
1929                 if (tcb->mdev <= 0)
1930                         tcb->mdev = 1;
1931         }
1932         tcpsettimer(tcb);
1933 }
1934
/* Processes the ACK-related fields of an incoming segment: advances snd.una,
 * maintains the sack list and in_flight estimate, detects loss (dupacks or
 * forward sacks) and enters recovery, updates the send window, grows cwnd
 * (slow start / congestion avoidance), takes an RTT sample, drops the
 * newly-acked bytes from the write queue, and manages the rexmit timer. */
static void update(struct conv *s, Tcp *seg)
{
        int rtt;
        Tcpctl *tcb;
        uint32_t acked, expand;
        struct tcppriv *tpriv;

        tpriv = s->p->priv;
        tcb = (Tcpctl *) s->ptcl;

        /* Ignore acks outside [snd.una, snd.nxt]: old or for unsent data. */
        if (!seq_within(seg->ack, tcb->snd.una, tcb->snd.nxt))
                return;

        acked = seg->ack - tcb->snd.una;
        tcb->snd.una = seg->ack;
        /* Keep rtx (retrans pointer) from falling behind the new ack point. */
        if (seq_gt(seg->ack, tcb->snd.rtx))
                tcb->snd.rtx = seg->ack;

        update_sacks(s, tcb, seg);
        set_in_flight(tcb);

        /* We treat either a dupack or forward SACKs as a hint that there is a loss.
         * The RFCs suggest three dupacks before treating it as a loss (alternative
         * is reordered packets).  We'll treat three SACKs the same way. */
        if (is_potential_loss(tcb, seg) && !tcb->snd.recovery) {
                tcb->snd.loss_hint++;
                if (tcb->snd.loss_hint == TCPREXMTTHRESH) {
                        netlog(s->p->f, Logtcprxmt,
                               "%I.%d -> %I.%d: loss hint thresh, nr sacks %u, nxt %u, una %u, cwnd %u\n",
                               s->laddr, s->lport, s->raddr, s->rport,
                               tcb->snd.nr_sacks, tcb->snd.nxt, tcb->snd.una, tcb->cwind);
                        tcp_loss_event(s, tcb);
                        tcb->snd.recovery_pt = tcb->snd.nxt;
                        if (tcb->snd.nr_sacks) {
                                tcb->snd.recovery = SACK_RETRANS_RECOVERY;
                                tcb->snd.flush_sacks = FALSE;
                                tcb->snd.sack_loss_hint = 0;
                        } else {
                                tcb->snd.recovery = FAST_RETRANS_RECOVERY;
                        }
                        tcprxmit(s);
                }
        }

        /*
         *  update window
         */
        if (seq_gt(seg->ack, tcb->snd.wl2)
                || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
                tcb->snd.wnd = seg->wnd;
                tcb->snd.wl2 = seg->ack;
        }

        if (!acked) {
                /*
                 *  don't let us hangup if sending into a closed window and
                 *  we're still getting acks
                 */
                if (tcb->snd.recovery && (tcb->snd.wnd == 0))
                        tcb->backedoff = MAXBACKMS / 4;
                return;
        }
        /* At this point, they have acked something new. (positive ack, ack > una).
         *
         * If we hadn't reached the threshold for recovery yet, the positive ACK
         * will reset our loss_hint count. */
        if (!tcb->snd.recovery)
                tcb->snd.loss_hint = 0;
        else if (seq_ge(seg->ack, tcb->snd.recovery_pt))
                reset_recovery(s, tcb);

        /* avoid slow start and timers for SYN acks */
        if ((tcb->flags & SYNACK) == 0) {
                tcb->flags |= SYNACK;
                /* The SYN consumed one sequence number; it isn't queue data. */
                acked--;
                tcb->flgcnt--;
                goto done;
        }

        /* slow start as long as we're not recovering from lost packets */
        if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
                if (tcb->cwind < tcb->ssthresh) {
                        /* We increase the cwind by every byte we receive.  We want to
                         * increase the cwind by one MSS for every MSS that gets ACKed.
                         * Note that multiple MSSs can be ACKed in a single ACK.  If we had
                         * a remainder of acked / MSS, we'd add just that remainder - not 0
                         * or 1 MSS. */
                        expand = acked;
                } else {
                        /* Every RTT, which consists of CWND bytes, we're supposed to expand
                         * by MSS bytes.  The classic algorithm was
                         *              expand = (tcb->mss * tcb->mss) / tcb->cwind;
                         * which assumes the ACK was for MSS bytes.  Instead, for every
                         * 'acked' bytes, we increase the window by acked / CWND (in units
                         * of MSS). */
                        expand = MAX(acked, tcb->typical_mss) * tcb->typical_mss
                                 / tcb->cwind;
                }

                /* Clamp on unsigned overflow, then cap cwind at the send window. */
                if (tcb->cwind + expand < tcb->cwind)
                        expand = tcb->snd.wnd - tcb->cwind;
                if (tcb->cwind + expand > tcb->snd.wnd)
                        expand = tcb->snd.wnd - tcb->cwind;
                tcb->cwind += expand;
        }
        adjust_tx_qio_limit(s);

        /* RTT sampling: prefer the timestamp echo; else fall back to the classic
         * single-timer method (only valid outside recovery - Karn's algorithm). */
        if (tcb->ts_recent) {
                update_rtt(tcb, abs(milliseconds() - seg->ts_ecr),
                           expected_samples_ts(tcb, acked));
        } else if (tcb->rtt_timer.state == TcptimerON &&
                   seq_ge(seg->ack, tcb->rttseq)) {
                /* Adjust the timers according to the round trip time */
                tcphalt(tpriv, &tcb->rtt_timer);
                if (!tcb->snd.recovery) {
                        rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
                        if (rtt == 0)
                                rtt = 1;        /* o/w all close systems will rexmit in 0 time */
                        rtt *= MSPTICK;
                        update_rtt(tcb, rtt, 1);
                }
        }

done:
        /* Drop the acked bytes from the write queue.  A shortfall means a flag
         * (SYN/FIN) was acked rather than queue data. */
        if (qdiscard(s->wq, acked) < acked) {
                tcb->flgcnt--;
                /* This happened due to another bug where acked was very large
                 * (negative), which was interpreted as "hey, one less flag, since they
                 * acked one of our flags (like a SYN).  If flgcnt goes negative,
                 * get_xmit_segment() will attempt to send out large packets. */
                assert(tcb->flgcnt >= 0);
        }

        if (seq_gt(seg->ack, tcb->snd.urg))
                tcb->snd.urg = seg->ack;

        /* Rexmit timer runs iff there is still unacked data outstanding. */
        if (tcb->snd.una != tcb->snd.nxt)
                tcpgo(tpriv, &tcb->timer);
        else
                tcphalt(tpriv, &tcb->timer);

        tcb->backoff = 0;
        tcb->backedoff = 0;
}
2079
2080 static void update_tcb_ts(Tcpctl *tcb, Tcp *seg)
2081 {
2082         /* Get timestamp info from the tcp header.  Even though the timestamps
2083          * aren't sequence numbers, we still need to protect for wraparound.  Though
2084          * if the values were 0, assume that means we need an update.  We could have
2085          * an initial ts_val that appears negative (signed). */
2086         if (!tcb->ts_recent || !tcb->last_ack_sent ||
2087             (seq_ge(seg->ts_val, tcb->ts_recent) &&
2088              seq_le(seg->seq, tcb->last_ack_sent)))
2089                 tcb->ts_recent = seg->ts_val;
2090 }
2091
2092 /* Overlap happens when one sack's left edge is inside another sack. */
2093 static bool sacks_overlap(struct sack_block *x, struct sack_block *y)
2094 {
2095         return (seq_le(x->left, y->left) && seq_le(y->left, x->right)) ||
2096                (seq_le(y->left, x->left) && seq_le(x->left, y->right));
2097 }
2098
2099 static void make_sack_first(Tcpctl *tcb, struct sack_block *tcb_sack)
2100 {
2101         struct sack_block temp;
2102
2103         if (tcb_sack == &tcb->rcv.sacks[0])
2104                 return;
2105         temp = tcb->rcv.sacks[0];
2106         tcb->rcv.sacks[0] = *tcb_sack;
2107         *tcb_sack = temp;
2108 }
2109
2110 /* Track sack in our tcb for a block of data we received.  This handles all the
2111  * stuff: making sure sack is first (since it's the most recent sack change),
2112  * updating or merging sacks, and dropping excess sacks (we only need to
2113  * maintain 3).  Unlike on the snd side, our tcb sacks are *not* sorted. */
2114 static void track_rcv_sack(Tcpctl *tcb, uint32_t left, uint32_t right)
2115 {
2116         struct sack_block *tcb_sack;
2117         struct sack_block sack[1];
2118
2119         if (!tcb->sack_ok)
2120                 return;
2121         assert(seq_lt(left, right));
2122         sack->left = left;
2123         sack->right = right;
2124         /* We can reuse an existing sack if we're merging or overlapping. */
2125         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2126                 tcb_sack = &tcb->rcv.sacks[i];
2127                 if (sacks_overlap(tcb_sack, sack)) {
2128                         tcb_sack->left = seq_min(tcb_sack->left, sack->left);
2129                         tcb_sack->right = seq_max(tcb_sack->right, sack->right);
2130                         make_sack_first(tcb, tcb_sack);
2131                         return;
2132                 }
2133         }
2134         /* We can discard the last sack (right shift) - we should have sent it at
2135          * least once by now.  If not, oh well. */
2136         memmove(tcb->rcv.sacks + 1, tcb->rcv.sacks, sizeof(struct sack_block) *
2137                 MIN(MAX_NR_RCV_SACKS - 1, tcb->rcv.nr_sacks));
2138         tcb->rcv.sacks[0] = *sack;
2139         if (tcb->rcv.nr_sacks < MAX_NR_RCV_SACKS)
2140                 tcb->rcv.nr_sacks++;
2141 }
2142
2143 /* Once we receive everything and move rcv.nxt past a sack, we don't need to
2144  * track it.  I've seen Linux report sacks in the past, but we probably
2145  * shouldn't. */
2146 static void drop_old_rcv_sacks(Tcpctl *tcb)
2147 {
2148         struct sack_block *tcb_sack;
2149
2150         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2151                 tcb_sack = &tcb->rcv.sacks[i];
2152                 /* Moving up to or past the left is enough to drop it. */
2153                 if (seq_ge(tcb->rcv.nxt, tcb_sack->left)) {
2154                         memmove(tcb->rcv.sacks + i, tcb->rcv.sacks + i + 1,
2155                                 sizeof(struct sack_block) * (tcb->rcv.nr_sacks - i - 1));
2156                         tcb->rcv.nr_sacks--;
2157                         i--;
2158                 }
2159         }
2160 }
2161
2162 static void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
2163 {
2164         ERRSTACK(1);
2165         Tcp seg;
2166         Tcp4hdr *h4;
2167         Tcp6hdr *h6;
2168         int hdrlen;
2169         Tcpctl *tcb;
2170         uint16_t length;
2171         uint8_t source[IPaddrlen], dest[IPaddrlen];
2172         struct conv *s;
2173         struct Fs *f;
2174         struct tcppriv *tpriv;
2175         uint8_t version;
2176
2177         f = tcp->f;
2178         tpriv = tcp->priv;
2179
2180         tpriv->stats[InSegs]++;
2181
2182         h4 = (Tcp4hdr *) (bp->rp);
2183         h6 = (Tcp6hdr *) (bp->rp);
2184
2185         if ((h4->vihl & 0xF0) == IP_VER4) {
2186                 uint8_t ttl;
2187
2188                 version = V4;
2189                 length = nhgets(h4->length);
2190                 v4tov6(dest, h4->tcpdst);
2191                 v4tov6(source, h4->tcpsrc);
2192
2193                 /* ttl isn't part of the xsum pseudo header, but bypass needs it. */
2194                 ttl = h4->Unused;
2195                 h4->Unused = 0;
2196                 hnputs(h4->tcplen, length - TCP4_PKT);
2197                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2198                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
2199                         tpriv->stats[CsumErrs]++;
2200                         tpriv->stats[InErrs]++;
2201                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2202                         freeblist(bp);
2203                         return;
2204                 }
2205                 h4->Unused = ttl;
2206
2207                 hdrlen = ntohtcp4(&seg, &bp);
2208                 if (hdrlen < 0) {
2209                         tpriv->stats[HlenErrs]++;
2210                         tpriv->stats[InErrs]++;
2211                         netlog(f, Logtcp, "bad tcp hdr len\n");
2212                         return;
2213                 }
2214
2215                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2216                 if (s && s->state == Bypass) {
2217                         bypass_or_drop(s, bp);
2218                         return;
2219                 }
2220
2221                 /* trim the packet to the size claimed by the datagram */
2222                 length -= hdrlen + TCP4_PKT;
2223                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
2224                 if (bp == NULL) {
2225                         tpriv->stats[LenErrs]++;
2226                         tpriv->stats[InErrs]++;
2227                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2228                         return;
2229                 }
2230         } else {
2231                 int ttl = h6->ttl;
2232                 int proto = h6->proto;
2233
2234                 version = V6;
2235                 length = nhgets(h6->ploadlen);
2236                 ipmove(dest, h6->tcpdst);
2237                 ipmove(source, h6->tcpsrc);
2238
2239                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2240                 h6->ttl = proto;
2241                 hnputl(h6->vcf, length);
2242                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2243                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
2244                         tpriv->stats[CsumErrs]++;
2245                         tpriv->stats[InErrs]++;
2246                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2247                         freeblist(bp);
2248                         return;
2249                 }
2250                 h6->ttl = ttl;
2251                 h6->proto = proto;
2252                 hnputs(h6->ploadlen, length);
2253
2254                 hdrlen = ntohtcp6(&seg, &bp);
2255                 if (hdrlen < 0) {
2256                         tpriv->stats[HlenErrs]++;
2257                         tpriv->stats[InErrs]++;
2258                         netlog(f, Logtcp, "bad tcp hdr len\n");
2259                         return;
2260                 }
2261
2262                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2263                 if (s && s->state == Bypass) {
2264                         bypass_or_drop(s, bp);
2265                         return;
2266                 }
2267
2268                 /* trim the packet to the size claimed by the datagram */
2269                 length -= hdrlen;
2270                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2271                 if (bp == NULL) {
2272                         tpriv->stats[LenErrs]++;
2273                         tpriv->stats[InErrs]++;
2274                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2275                         return;
2276                 }
2277         }
2278
2279         /* s, the conv matching the n-tuple, was set above */
2280         if (s == NULL) {
2281                 netlog(f, Logtcpreset, "iphtlook failed: src %I:%u, dst %I:%u\n",
2282                        source, seg.source, dest, seg.dest);
2283 reset:
2284                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2285                 freeblist(bp);
2286                 return;
2287         }
2288
2289         /* lock protocol for unstate Plan 9 invariants.  funcs like limbo or
2290          * incoming might rely on it. */
2291         qlock(&tcp->qlock);
2292
2293         /* if it's a listener, look for the right flags and get a new conv */
2294         tcb = (Tcpctl *) s->ptcl;
2295         if (tcb->state == Listen) {
2296                 if (seg.flags & RST) {
2297                         limborst(s, &seg, source, dest, version);
2298                         qunlock(&tcp->qlock);
2299                         freeblist(bp);
2300                         return;
2301                 }
2302
2303                 /* if this is a new SYN, put the call into limbo */
2304                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2305                         limbo(s, source, dest, &seg, version);
2306                         qunlock(&tcp->qlock);
2307                         freeblist(bp);
2308                         return;
2309                 }
2310
2311                 /* if there's a matching call in limbo, tcpincoming will return it */
2312                 s = tcpincoming(s, &seg, source, dest, version);
2313                 if (s == NULL) {
2314                         qunlock(&tcp->qlock);
2315                         goto reset;
2316                 }
2317         }
2318
2319         /* The rest of the input state machine is run with the control block
2320          * locked and implements the state machine directly out of the RFC.
2321          * Out-of-band data is ignored - it was always a bad idea.
2322          */
2323         tcb = (Tcpctl *) s->ptcl;
2324         if (waserror()) {
2325                 qunlock(&s->qlock);
2326                 nexterror();
2327         }
2328         qlock(&s->qlock);
2329         qunlock(&tcp->qlock);
2330
2331         update_tcb_ts(tcb, &seg);
2332         /* fix up window */
2333         seg.wnd <<= tcb->rcv.scale;
2334
2335         /* every input packet in puts off the keep alive time out */
2336         tcpsetkacounter(tcb);
2337
2338         switch (tcb->state) {
2339                 case Closed:
2340                         sndrst(tcp, source, dest, length, &seg, version,
2341                                    "sending to Closed");
2342                         goto raise;
2343                 case Syn_sent:
2344                         if (seg.flags & ACK) {
2345                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2346                                         sndrst(tcp, source, dest, length, &seg, version,
2347                                                    "bad seq in Syn_sent");
2348                                         goto raise;
2349                                 }
2350                         }
2351                         if (seg.flags & RST) {
2352                                 if (seg.flags & ACK)
2353                                         localclose(s, "connection refused");
2354                                 goto raise;
2355                         }
2356
2357                         if (seg.flags & SYN) {
2358                                 procsyn(s, &seg);
2359                                 if (seg.flags & ACK) {
2360                                         update(s, &seg);
2361                                         tcpsynackrtt(s);
2362                                         tcpsetstate(s, Established);
2363                                         /* Here's where we get the results of header option
2364                                          * negotiations for connections we started. (SYNACK has the
2365                                          * response) */
2366                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2367                                         tcb->sack_ok = seg.sack_ok;
2368                                 } else {
2369                                         sndrst(tcp, source, dest, length, &seg, version,
2370                                                    "Got SYN with no ACK");
2371                                         goto raise;
2372                                 }
2373
2374                                 if (length != 0 || (seg.flags & FIN))
2375                                         break;
2376
2377                                 freeblist(bp);
2378                                 goto output;
2379                         } else
2380                                 freeblist(bp);
2381
2382                         qunlock(&s->qlock);
2383                         poperror();
2384                         return;
2385         }
2386
2387         /*
2388          *  One DOS attack is to open connections to us and then forget about them,
2389          *  thereby tying up a conv at no long term cost to the attacker.
2390          *  This is an attempt to defeat these stateless DOS attacks.  See
2391          *  corresponding code in tcpsendka().
2392          */
2393         if ((seg.flags & RST) == 0) {
2394                 if (tcpporthogdefense
2395                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2396                                                   tcb->snd.una - (1 << 29))) {
2397                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2398                                    source, seg.source, dest, seg.dest, seg.flags,
2399                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2400                         localclose(s, "stateless hog");
2401                 }
2402         }
2403
2404         /* Cut the data to fit the receive window */
2405         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2406                 netlog(f, Logtcp, "%I.%d -> %I.%d: tcp len < 0, %lu %d\n",
2407                        s->raddr, s->rport, s->laddr, s->lport, seg.seq, length);
2408                 update(s, &seg);
2409                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2410                         tcphalt(tpriv, &tcb->rtt_timer);
2411                         tcphalt(tpriv, &tcb->acktimer);
2412                         tcphalt(tpriv, &tcb->katimer);
2413                         tcpsetstate(s, Time_wait);
2414                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2415                         tcpgo(tpriv, &tcb->timer);
2416                 }
2417                 if (!(seg.flags & RST)) {
2418                         tcb->flags |= FORCE;
2419                         goto output;
2420                 }
2421                 qunlock(&s->qlock);
2422                 poperror();
2423                 return;
2424         }
2425
2426         /* Cannot accept so answer with a rst */
2427         if (length && tcb->state == Closed) {
2428                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2429                 goto raise;
2430         }
2431
2432         /* The segment is beyond the current receive pointer so
2433          * queue the data in the resequence queue
2434          */
2435         if (seg.seq != tcb->rcv.nxt)
2436                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2437                         update(s, &seg);
2438                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2439                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2440                                            s->lport);
2441                         tcb->flags |= FORCE;
2442                         goto output;
2443                 }
2444
2445         /*
2446          *  keep looping till we've processed this packet plus any
2447          *  adjacent packets in the resequence queue
2448          */
2449         for (;;) {
2450                 if (seg.flags & RST) {
2451                         if (tcb->state == Established) {
2452                                 tpriv->stats[EstabResets]++;
2453                                 if (tcb->rcv.nxt != seg.seq)
2454                                         printd
2455                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2456                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2457                                                  seg.seq);
2458                         }
2459                         localclose(s, "connection refused");
2460                         goto raise;
2461                 }
2462
2463                 if ((seg.flags & ACK) == 0)
2464                         goto raise;
2465
2466                 switch (tcb->state) {
2467                         case Established:
2468                         case Close_wait:
2469                                 update(s, &seg);
2470                                 break;
2471                         case Finwait1:
2472                                 update(s, &seg);
2473                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2474                                         tcphalt(tpriv, &tcb->rtt_timer);
2475                                         tcphalt(tpriv, &tcb->acktimer);
2476                                         tcpsetkacounter(tcb);
2477                                         tcb->time = NOW;
2478                                         tcpsetstate(s, Finwait2);
2479                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2480                                         tcpgo(tpriv, &tcb->katimer);
2481                                 }
2482                                 break;
2483                         case Finwait2:
2484                                 update(s, &seg);
2485                                 break;
2486                         case Closing:
2487                                 update(s, &seg);
2488                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2489                                         tcphalt(tpriv, &tcb->rtt_timer);
2490                                         tcphalt(tpriv, &tcb->acktimer);
2491                                         tcphalt(tpriv, &tcb->katimer);
2492                                         tcpsetstate(s, Time_wait);
2493                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2494                                         tcpgo(tpriv, &tcb->timer);
2495                                 }
2496                                 break;
2497                         case Last_ack:
2498                                 update(s, &seg);
2499                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2500                                         localclose(s, NULL);
2501                                         goto raise;
2502                                 }
2503                         case Time_wait:
2504                                 tcb->flags |= FORCE;
2505                                 if (tcb->timer.state != TcptimerON)
2506                                         tcpgo(tpriv, &tcb->timer);
2507                 }
2508
2509                 if ((seg.flags & URG) && seg.urg) {
2510                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2511                                 tcb->rcv.urg = seg.urg + seg.seq;
2512                                 pullblock(&bp, seg.urg);
2513                         }
2514                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2515                         tcb->rcv.urg = tcb->rcv.nxt;
2516
2517                 if (length == 0) {
2518                         if (bp != NULL)
2519                                 freeblist(bp);
2520                 } else {
2521                         switch (tcb->state) {
2522                                 default:
2523                                         /* Ignore segment text */
2524                                         if (bp != NULL)
2525                                                 freeblist(bp);
2526                                         break;
2527
2528                                 case Established:
2529                                 case Finwait1:
2530                                         /* If we still have some data place on
2531                                          * receive queue
2532                                          */
2533                                         if (bp) {
2534                                                 bp = packblock(bp);
2535                                                 if (bp == NULL)
2536                                                         panic("tcp packblock");
2537                                                 qpassnolim(s->rq, bp);
2538                                                 bp = NULL;
2539
2540                                                 /*
2541                                                  *  Force an ack every 2 data messages.  This is
2542                                                  *  a hack for rob to make his home system run
2543                                                  *  faster.
2544                                                  *
2545                                                  *  this also keeps the standard TCP congestion
2546                                                  *  control working since it needs an ack every
2547                                                  *  2 max segs worth.  This is not quite that,
2548                                                  *  but under a real stream is equivalent since
2549                                                  *  every packet has a max seg in it.
2550                                                  */
2551                                                 if (++(tcb->rcv.una) >= 2)
2552                                                         tcb->flags |= FORCE;
2553                                         }
2554                                         tcb->rcv.nxt += length;
2555                                         drop_old_rcv_sacks(tcb);
2556
2557                                         /*
2558                                          *  update our rcv window
2559                                          */
2560                                         tcprcvwin(s);
2561
2562                                         /*
2563                                          *  turn on the acktimer if there's something
2564                                          *  to ack
2565                                          */
2566                                         if (tcb->acktimer.state != TcptimerON)
2567                                                 tcpgo(tpriv, &tcb->acktimer);
2568
2569                                         break;
2570                                 case Finwait2:
2571                                         /* no process to read the data, send a reset */
2572                                         if (bp != NULL)
2573                                                 freeblist(bp);
2574                                         sndrst(tcp, source, dest, length, &seg, version,
2575                                                    "send to Finwait2");
2576                                         qunlock(&s->qlock);
2577                                         poperror();
2578                                         return;
2579                         }
2580                 }
2581
2582                 if (seg.flags & FIN) {
2583                         tcb->flags |= FORCE;
2584
2585                         switch (tcb->state) {
2586                                 case Established:
2587                                         tcb->rcv.nxt++;
2588                                         tcpsetstate(s, Close_wait);
2589                                         break;
2590                                 case Finwait1:
2591                                         tcb->rcv.nxt++;
2592                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2593                                                 tcphalt(tpriv, &tcb->rtt_timer);
2594                                                 tcphalt(tpriv, &tcb->acktimer);
2595                                                 tcphalt(tpriv, &tcb->katimer);
2596                                                 tcpsetstate(s, Time_wait);
2597                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2598                                                 tcpgo(tpriv, &tcb->timer);
2599                                         } else
2600                                                 tcpsetstate(s, Closing);
2601                                         break;
2602                                 case Finwait2:
2603                                         tcb->rcv.nxt++;
2604                                         tcphalt(tpriv, &tcb->rtt_timer);
2605                                         tcphalt(tpriv, &tcb->acktimer);
2606                                         tcphalt(tpriv, &tcb->katimer);
2607                                         tcpsetstate(s, Time_wait);
2608                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2609                                         tcpgo(tpriv, &tcb->timer);
2610                                         break;
2611                                 case Close_wait:
2612                                 case Closing:
2613                                 case Last_ack:
2614                                         break;
2615                                 case Time_wait:
2616                                         tcpgo(tpriv, &tcb->timer);
2617                                         break;
2618                         }
2619                 }
2620
2621                 /*
2622                  *  get next adjacent segment from the resequence queue.
2623                  *  dump/trim any overlapping segments
2624                  */
2625                 for (;;) {
2626                         if (tcb->reseq == NULL)
2627                                 goto output;
2628
2629                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2630                                 goto output;
2631
2632                         getreseq(tcb, &seg, &bp, &length);
2633
2634                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2635                                 break;
2636                 }
2637         }
2638 output:
2639         tcpoutput(s);
2640         qunlock(&s->qlock);
2641         poperror();
2642         return;
2643 raise:
2644         qunlock(&s->qlock);
2645         poperror();
2646         freeblist(bp);
2647         tcpkick(s);
2648 }
2649
2650 /* The advertised mss = data + TCP headers */
2651 static uint16_t derive_payload_mss(Tcpctl *tcb)
2652 {
2653         uint16_t payload_mss = tcb->mss;
2654         uint16_t opt_size = 0;
2655
2656         if (tcb->ts_recent) {
2657                 opt_size += TS_LENGTH;
2658                 /* Note that when we're a SYN, we overestimate slightly.  This is safe,
2659                  * and not really a problem. */
2660                 opt_size += TS_SEND_PREPAD;
2661         }
2662         if (tcb->rcv.nr_sacks)
2663                 opt_size += 2 + tcb->rcv.nr_sacks * 8;
2664         opt_size = ROUNDUP(opt_size, 4);
2665         payload_mss -= opt_size;
2666         return payload_mss;
2667 }
2668
2669 /* Decreases the xmit amt, given the MSS / TSO. */
2670 static uint32_t throttle_for_mss(Tcpctl *tcb, uint32_t ssize,
2671                                  uint16_t payload_mss, bool retrans)
2672 {
2673         if (ssize > payload_mss) {
2674                 if ((tcb->flags & TSO) == 0) {
2675                         ssize = payload_mss;
2676                 } else {
2677                         /* Don't send too much.  32K is arbitrary.. */
2678                         if (ssize > 32 * 1024)
2679                                 ssize = 32 * 1024;
2680                         if (!retrans) {
2681                                 /* Clamp xmit to an integral MSS to avoid ragged tail segments
2682                                  * causing poor link utilization. */
2683                                 ssize = ROUNDDOWN(ssize, payload_mss);
2684                         }
2685                 }
2686         }
2687         return ssize;
2688 }
2689
2690 /* Reduces ssize for a variety of reasons.  Returns FALSE if we should abort
2691  * sending the packet.  o/w returns TRUE and modifies ssize by reference. */
2692 static bool throttle_ssize(struct conv *s, Tcpctl *tcb, uint32_t *ssize_p,
2693                            uint16_t payload_mss, bool retrans)
2694 {
2695         struct Fs *f = s->p->f;
2696         uint32_t usable;
2697         uint32_t ssize = *ssize_p;
2698
2699         /* Compute usable segment based on offered window and limit
2700          * window probes to one */
2701         if (tcb->snd.wnd == 0) {
2702                 if (tcb->snd.in_flight != 0) {
2703                         if ((tcb->flags & FORCE) == 0)
2704                                 return FALSE;
2705                 }
2706                 usable = 1;
2707         } else {
2708                 usable = tcb->cwind;
2709                 if (tcb->snd.wnd < usable)
2710                         usable = tcb->snd.wnd;
2711                 if (usable > tcb->snd.in_flight)
2712                         usable -= tcb->snd.in_flight;
2713                 else
2714                         usable = 0;
2715                 /* Avoid Silly Window Syndrome.  This is a little different thant RFC
2716                  * 813.  I took their additional enhancement of "< MSS" as an AND, not
2717                  * an OR.  25% of a large snd.wnd is pretty large, and our main goal is
2718                  * to avoid packets smaller than MSS.  I still use the 25% threshold,
2719                  * because it is important that there is *some* data in_flight.  If
2720                  * usable < MSS because snd.wnd is very small (but not 0), we might
2721                  * never get an ACK and would need to set up a timer.
2722                  *
2723                  * Also, I'm using 'ssize' as a proxy for a PSH point.  If there's just
2724                  * a small blob in the qio (or retrans!), then we might as well just
2725                  * send it. */
2726                 if ((usable < tcb->typical_mss) && (usable < tcb->snd.wnd >> 2)
2727                     && (usable < ssize)) {
2728                         return FALSE;
2729                 }
2730         }
2731         if (ssize && usable < 2)
2732                 netlog(s->p->f, Logtcpverbose,
2733                        "%I.%d -> %I.%d: throttled snd.wnd %lu cwind %lu\n",
2734                        s->laddr, s->lport, s->raddr, s->rport,
2735                        tcb->snd.wnd, tcb->cwind);
2736         if (usable < ssize)
2737                 ssize = usable;
2738
2739         ssize = throttle_for_mss(tcb, ssize, payload_mss, retrans);
2740
2741         *ssize_p = ssize;
2742         return TRUE;
2743 }
2744
/* Helper, picks the next segment to send, which is possibly a retransmission.
 * Returns TRUE if we have a segment, FALSE o/w.  Returns ssize, from_seq, and
 * sent by reference.
 *
 * from_seq is the seq number we are transmitting from.
 *
 * sent includes all seq from una to from_seq *including* any previously sent
 * flags (part of tcb->flgcnt), for instance an unacknowledged SYN (which counts
 * as a seq number).  Those flags are in the e.g. snd.nxt - snd.una range, and
 * they get dropped after qdiscard.
 *
 * ssize is the amount of data we are sending, starting from from_seq, and it
 * will include any *new* flags, which haven't been accounted for yet.
 *
 * tcb->flgcnt consists of the flags both in ssize and in sent.
 *
 * Note that we could be in recovery and not sack_retrans a segment. */
static bool get_xmit_segment(struct conv *s, Tcpctl *tcb, uint16_t payload_mss,
                             uint32_t *from_seq_p, uint32_t *sent_p,
                             uint32_t *ssize_p)
{
	struct Fs *f = s->p->f;
	struct tcppriv *tpriv = s->p->priv;
	uint32_t ssize, sent, from_seq;
	bool sack_retrans = FALSE;
	struct sack_block *tcb_sack = 0;

	/* First preference: retransmit into the lowest SACK hole, i.e. the gap
	 * between snd.rtx and the left edge of the first sack block above it. */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb->snd.rtx, tcb_sack->left)) {
			/* So ssize is supposed to include any *new* flags to flgcnt, which
			 * at this point would be a FIN.
			 *
			 * It might be possible that flgcnt is incremented so we send a FIN,
			 * even for an intermediate sack retrans.  Perhaps the user closed
			 * the conv.
			 *
			 * However, the way the "flgcnt for FIN" works is that it inflates
			 * the desired amount we'd like to send (qlen + flgcnt).
			 * Eventually, we reach the end of the queue and fail to extract all
			 * of dsize.  At that point, we put on the FIN, and that's where the
			 * extra 'byte' comes from.
			 *
			 * For sack retrans, since we're extracting from parts of the qio
			 * that aren't the right-most edge, we don't need to consider flgcnt
			 * when setting ssize. */
			from_seq = tcb->snd.rtx;
			sent = from_seq - tcb->snd.una;
			ssize = tcb_sack->left - from_seq;
			sack_retrans = TRUE;
			break;
		}
	}
	/* SACK holes have first dibs, but we can still opportunistically send new
	 * data.
	 *
	 * During other types of recovery, we'll just send from the retrans point.
	 * If we're in an RTO while we still have sacks, we could be resending data
	 * that wasn't lost.  Consider a sack that is still growing (usually the
	 * right-most), but we haven't received the ACK yet.  rxt may be included in
	 * that area.  Given we had two losses or otherwise timed out, I'm not too
	 * concerned.
	 *
	 * Note that Fast and RTO can send data beyond nxt.  If we change that,
	 * change the accounting below. */
	if (!sack_retrans) {
		switch (tcb->snd.recovery) {
		default:
		case SACK_RETRANS_RECOVERY:
			from_seq = tcb->snd.nxt;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			from_seq = tcb->snd.rtx;
			break;
		}
		sent = from_seq - tcb->snd.una;
		/* qlen + flgcnt is every seq we want to have sent, including unack'd
		 * data, unacked flags, and new flags. */
		ssize = qlen(s->wq) + tcb->flgcnt - sent;
	}

	/* Apply window, congestion, SWS, and MSS/TSO limits; may abort the send */
	if (!throttle_ssize(s, tcb, &ssize, payload_mss, sack_retrans))
		return FALSE;

	/* This counts flags, which is a little hokey, but it's okay since in_flight
	 * gets reset on each ACK */
	tcb->snd.in_flight += ssize;
	/* Log and track rxmit.  This covers both SACK (retrans) and fast rxmit. */
	if (ssize && seq_lt(tcb->snd.rtx, tcb->snd.nxt)) {
		netlog(f, Logtcpverbose,
		       "%I.%d -> %I.%d: rxmit: rtx %u amt %u, nxt %u\n",
		       s->laddr, s->lport, s->raddr, s->rport,
		       tcb->snd.rtx, MIN(tcb->snd.nxt - tcb->snd.rtx, ssize),
		       tcb->snd.nxt);
		tpriv->stats[RetransSegs]++;
	}
	if (sack_retrans) {
		/* If we'll send up to the left edge, advance snd.rtx to the right.
		 *
		 * This includes the largest sack.  It might get removed later, in which
		 * case we'll underestimate the amount in-flight.  The alternative is to
		 * not count the rightmost sack, but when it gets removed, we'll retrans
		 * it anyway.  No matter what, we'd count it. */
		tcb->snd.rtx += ssize;
		if (tcb->snd.rtx == tcb_sack->left)
			tcb->snd.rtx = tcb_sack->right;
		/* RFC 6675 says we MAY rearm the RTO timer on each retrans, since we
		 * might not be getting ACKs for a while. */
		tcpsettimer(tcb);
	} else {
		switch (tcb->snd.recovery) {
		default:
			/* under normal op, we drag rtx along with nxt.  this prevents us
			 * from sending sacks too early (up above), since rtx doesn't get
			 * reset to una until we have a loss (e.g. 3 dupacks/sacks). */
			tcb->snd.nxt += ssize;
			tcb->snd.rtx = tcb->snd.nxt;
			break;
		case SACK_RETRANS_RECOVERY:
			/* We explicitly do not want to increase rtx here.  We might still
			 * need it to fill in a sack gap below nxt if we get new, higher
			 * sacks. */
			tcb->snd.nxt += ssize;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			tcb->snd.rtx += ssize;
			/* Fast and RTO can send new data, advancing nxt. */
			if (seq_gt(tcb->snd.rtx, tcb->snd.nxt))
				tcb->snd.nxt = tcb->snd.rtx;
			break;
		}
	}
	*from_seq_p = from_seq;
	*sent_p = sent;
	*ssize_p = ssize;

	return TRUE;
}
2885
2886 /*
2887  *  always enters and exits with the s locked.  We drop
2888  *  the lock to ipoput the packet so some care has to be
2889  *  taken by callers.
2890  */
2891 static void tcpoutput(struct conv *s)
2892 {
2893         Tcp seg;
2894         int msgs;
2895         int next_yield = 1;
2896         Tcpctl *tcb;
2897         struct block *hbp, *bp;
2898         uint32_t ssize, dsize, sent, from_seq;
2899         struct Fs *f;
2900         struct tcppriv *tpriv;
2901         uint8_t version;
2902         uint16_t payload_mss;
2903
2904         f = s->p->f;
2905         tpriv = s->p->priv;
2906         version = s->ipversion;
2907
2908         for (msgs = 0; msgs < 100; msgs++) {
2909                 tcb = (Tcpctl *) s->ptcl;
2910
2911                 switch (tcb->state) {
2912                         case Listen:
2913                         case Closed:
2914                         case Finwait2:
2915                                 return;
2916                 }
2917
2918                 /* force an ack when a window has opened up */
2919                 if (tcb->rcv.blocked && tcb->rcv.wnd >= tcb->mss) {
2920                         tcb->rcv.blocked = 0;
2921                         tcb->flags |= FORCE;
2922                 }
2923
2924                 /* Don't send anything else until our SYN has been acked */
2925                 if (tcb->snd.nxt != tcb->iss && (tcb->flags & SYNACK) == 0)
2926                         break;
2927
2928                 /* payload_mss is the actual amount of data in the packet, which is the
2929                  * advertised (mss - header opts).  This varies from packet to packet,
2930                  * based on the options that might be present (e.g. always timestamps,
2931                  * sometimes SACKs) */
2932                 payload_mss = derive_payload_mss(tcb);
2933
2934                 if (!get_xmit_segment(s, tcb, payload_mss, &from_seq, &sent, &ssize))
2935                         break;
2936
2937                 dsize = ssize;
2938                 seg.urg = 0;
2939
2940                 if (ssize == 0)
2941                         if ((tcb->flags & FORCE) == 0)
2942                                 break;
2943
2944                 tcb->flags &= ~FORCE;
2945                 tcprcvwin(s);
2946
2947                 /* By default we will generate an ack, so we can normally turn off the
2948                  * timer.  If we're blocked, we'll want the timer so we can send a
2949                  * window update. */
2950                 if (!tcb->rcv.blocked)
2951                         tcphalt(tpriv, &tcb->acktimer);
2952                 tcb->rcv.una = 0;
2953                 seg.source = s->lport;
2954                 seg.dest = s->rport;
2955                 seg.flags = ACK;
2956                 seg.mss = 0;
2957                 seg.ws = 0;
2958                 seg.sack_ok = FALSE;
2959                 seg.nr_sacks = 0;
2960                 /* When outputting, Syn_sent means "send the Syn", for connections we
2961                  * initiate.  SYNACKs are sent from sndsynack directly. */
2962                 if (tcb->state == Syn_sent) {
2963                         seg.flags = 0;
2964                         seg.sack_ok = SACK_SUPPORTED;   /* here's where we advertise SACK */
2965                         if (tcb->snd.nxt - ssize == tcb->iss) {
2966                                 seg.flags |= SYN;
2967                                 dsize--;
2968                                 seg.mss = tcb->mss;
2969                                 seg.ws = tcb->scale;
2970                         } else {
2971                                 /* TODO: Not sure why we'd get here. */
2972                                 warn("TCP: weird Syn_sent state, tell someone you saw this");
2973                         }
2974                 }
2975                 seg.seq = from_seq;
2976                 seg.ack = tcb->rcv.nxt;
2977                 tcb->last_ack_sent = seg.ack;
2978                 seg.wnd = tcb->rcv.wnd;
2979                 seg.ts_val = tcb->ts_recent;
2980
2981                 /* Pull out data to send */
2982                 bp = NULL;
2983                 if (dsize != 0) {
2984                         bp = qcopy(s->wq, dsize, sent);
2985                         if (BLEN(bp) != dsize) {
2986                                 /* Here's where the flgcnt kicked in.  Note dsize is
2987                                  * decremented, but ssize isn't.  Not that we use ssize for much
2988                                  * anymore.  Decrementing dsize prevents us from sending a PSH
2989                                  * with the FIN. */
2990                                 seg.flags |= FIN;
2991                                 dsize--;
2992                         }
2993                         if (BLEN(bp) > payload_mss) {
2994                                 bp->flag |= Btso;
2995                                 bp->mss = payload_mss;
2996                         }
2997                 }
2998
2999                 if (sent + dsize == qlen(s->wq) + tcb->flgcnt)
3000                         seg.flags |= PSH;
3001
3002                 /* Build header, link data and compute cksum */
3003                 switch (version) {
3004                         case V4:
3005                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3006                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
3007                                 if (hbp == NULL) {
3008                                         freeblist(bp);
3009                                         return;
3010                                 }
3011                                 break;
3012                         case V6:
3013                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3014                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
3015                                 if (hbp == NULL) {
3016                                         freeblist(bp);
3017                                         return;
3018                                 }
3019                                 break;
3020                         default:
3021                                 hbp = NULL;     /* to suppress a warning */
3022                                 panic("tcpoutput: version %d", version);
3023                 }
3024
3025                 /* Start the transmission timers if there is new data and we
3026                  * expect acknowledges
3027                  */
3028                 if (ssize != 0) {
3029                         if (tcb->timer.state != TcptimerON)
3030                                 tcpgo(tpriv, &tcb->timer);
3031
3032                         if (!tcb->ts_recent && (tcb->rtt_timer.state != TcptimerON)) {
3033                                 /* If round trip timer isn't running, start it. */
3034                                 tcpgo(tpriv, &tcb->rtt_timer);
3035                                 tcb->rttseq = from_seq + ssize;
3036                         }
3037                 }
3038
3039                 tpriv->stats[OutSegs]++;
3040
3041                 /* put off the next keep alive */
3042                 tcpgo(tpriv, &tcb->katimer);
3043
3044                 switch (version) {
3045                         case V4:
3046                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3047                                         /* a negative return means no route */
3048                                         localclose(s, "no route");
3049                                 }
3050                                 break;
3051                         case V6:
3052                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3053                                         /* a negative return means no route */
3054                                         localclose(s, "no route");
3055                                 }
3056                                 break;
3057                         default:
3058                                 panic("tcpoutput2: version %d", version);
3059                 }
3060                 if (ssize) {
3061                         /* The outer loop thinks we sent one packet.  If we used TSO, we
3062                          * might have sent several.  Minus one for the loop increment. */
3063                         msgs += DIV_ROUND_UP(ssize, payload_mss) - 1;
3064                 }
3065                 /* Old Plan 9 tidbit - yield every four messages.  We want to break out
3066                  * and unlock so we can process inbound ACKs which might do things like
3067                  * say "slow down". */
3068                 if (msgs >= next_yield) {
3069                         next_yield = msgs + 4;
3070                         qunlock(&s->qlock);
3071                         kthread_yield();
3072                         qlock(&s->qlock);
3073                 }
3074         }
3075 }
3076
3077 /*
3078  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
3079  */
3080 static void tcpsendka(struct conv *s)
3081 {
3082         Tcp seg;
3083         Tcpctl *tcb;
3084         struct block *hbp, *dbp;
3085
3086         tcb = (Tcpctl *) s->ptcl;
3087
3088         dbp = NULL;
3089         seg.urg = 0;
3090         seg.source = s->lport;
3091         seg.dest = s->rport;
3092         seg.flags = ACK | PSH;
3093         seg.mss = 0;
3094         seg.ws = 0;
3095         seg.sack_ok = FALSE;
3096         seg.nr_sacks = 0;
3097         if (tcpporthogdefense)
3098                 urandom_read(&seg.seq, sizeof(seg.seq));
3099         else
3100                 seg.seq = tcb->snd.una - 1;
3101         seg.ack = tcb->rcv.nxt;
3102         tcb->last_ack_sent = seg.ack;
3103         tcb->rcv.una = 0;
3104         seg.wnd = tcb->rcv.wnd;
3105         seg.ts_val = tcb->ts_recent;
3106         if (tcb->state == Finwait2) {
3107                 seg.flags |= FIN;
3108         } else {
3109                 dbp = block_alloc(1, MEM_WAIT);
3110                 dbp->wp++;
3111         }
3112
3113         if (isv4(s->raddr)) {
3114                 /* Build header, link data and compute cksum */
3115                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3116                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
3117                 if (hbp == NULL) {
3118                         freeblist(dbp);
3119                         return;
3120                 }
3121                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
3122         } else {
3123                 /* Build header, link data and compute cksum */
3124                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3125                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
3126                 if (hbp == NULL) {
3127                         freeblist(dbp);
3128                         return;
3129                 }
3130                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
3131         }
3132 }
3133
/*
 *  Set the connection's keep-alive counter so it times out after 12 minutes.
 */
3137 static void tcpsetkacounter(Tcpctl *tcb)
3138 {