208b1ce28b2bfc7ceaa59da6d19cbda81d0004c7
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2017 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <net/ip.h>
42 #include <net/tcp.h>
43
/* Printable names for each TCP connection state, indexed by tcb->state.
 * Must correspond to the enumeration in tcp.h */
static char *tcpstates[] = {
	"Closed", "Listen", "Syn_sent",
	"Established", "Finwait1", "Finwait2", "Close_wait",
	"Closing", "Last_ack", "Time_wait"
};
50
/* Tunable defaults applied to each new connection (see inittcpctl) */
static int tcp_irtt = DEF_RTT;			/* Initial guess at round trip time */
static uint16_t tcp_mss = DEF_MSS;		/* Maximum segment size to be sent */
53
/* Printable names for the per-protocol stats[] counters, indexed by the
 * counter enum.  Must correspond to the enumeration in tcp.h */
static char *statnames[] = {
	[MaxConn] "MaxConn",
	[ActiveOpens] "ActiveOpens",
	[PassiveOpens] "PassiveOpens",
	[EstabResets] "EstabResets",
	[CurrEstab] "CurrEstab",
	[InSegs] "InSegs",
	[OutSegs] "OutSegs",
	[RetransSegs] "RetransSegs",
	[RetransTimeouts] "RetransTimeouts",
	[InErrs] "InErrs",
	[OutRsts] "OutRsts",
	[CsumErrs] "CsumErrs",
	[HlenErrs] "HlenErrs",
	[LenErrs] "LenErrs",
	[OutOfOrder] "OutOfOrder",
};
72
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
static int tcpporthogdefense = 0;
83
/* Forward declarations for the state machine, timer handlers, and the
 * segment input/output paths defined later in this file. */
static int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *,
		    uint16_t);
static void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
static void localclose(struct conv *, char *unused_char_p_t);
static void procsyn(struct conv *, Tcp *);
static void tcpiput(struct Proto *, struct Ipifc *, struct block *);
static void tcpoutput(struct conv *);
static int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
static void tcpstart(struct conv *, int);
static void tcptimeout(void *);
static void tcpsndsyn(struct conv *, Tcpctl *);
static void tcprcvwin(struct conv *);
static void tcpacktimer(void *);
static void tcpkeepalive(void *);
static void tcpsetkacounter(Tcpctl *);
static void tcprxmit(struct conv *);
static void tcpsettimer(Tcpctl *);
static void tcpsynackrtt(struct conv *);
static void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
static void tcp_loss_event(struct conv *s, Tcpctl *tcb);
static uint16_t derive_payload_mss(Tcpctl *tcb);
static void set_in_flight(Tcpctl *tcb);

static void limborexmit(struct Proto *);
static void limbo(struct conv *, uint8_t *unused_uint8_p_t, uint8_t *, Tcp *,
				  int);
110
/* Move conv s to TCP state 'newstate', maintaining the CurrEstab counter and
 * applying per-state side effects (closing or hanging up the conv's queues,
 * waking a blocked connect()or).  No-op if the state is unchanged.
 * NOTE(review): callers (e.g. tcpkick, tcpacktimer) hold s->qlock around
 * this; assumed required. */
static void tcpsetstate(struct conv *s, uint8_t newstate)
{
	Tcpctl *tcb;
	uint8_t oldstate;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;

	tcb = (Tcpctl *) s->ptcl;

	oldstate = tcb->state;
	if (oldstate == newstate)
		return;

	/* CurrEstab counts connections currently in Established */
	if (oldstate == Established)
		tpriv->stats[CurrEstab]--;
	if (newstate == Established)
		tpriv->stats[CurrEstab]++;

	/**
	print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
		tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
	**/

	switch (newstate) {
		case Closed:
			qclose(s->rq);
			qclose(s->wq);
			qclose(s->eq);
			break;

		case Close_wait:	/* Remote closes */
			qhangup(s->rq, NULL);
			break;
	}

	tcb->state = newstate;

	/* An active open just resolved; wake the connector (the Closed case is
	 * reported with a reason by localclose() instead). */
	if (oldstate == Syn_sent && newstate != Closed)
		Fsconnected(s, NULL);
}
152
/* "connect" control message handler: parse the dial arguments via the
 * generic Fsstdconnect(), then begin an active open (sends the SYN). */
static void tcpconnect(struct conv *c, char **argv, int argc)
{
	Fsstdconnect(c, argv, argc);
	tcpstart(c, TCP_CONNECT);
}
158
/* Format a one-line, human-readable summary of conv c's TCP state into
 * 'state' (at most n bytes): state name, queue lengths, RTT estimates,
 * windows and scales, and timer values.  Returns snprintf()'s result. */
static int tcpstate(struct conv *c, char *state, int n)
{
	Tcpctl *s;

	s = (Tcpctl *) (c->ptcl);

	return snprintf(state, n,
					"%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
					tcpstates[s->state],
					c->rq ? qlen(c->rq) : 0,
					c->wq ? qlen(c->wq) : 0,
					s->srtt, s->mdev,
					s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
					s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
					s->katimer.start, s->katimer.count);
}
175
176 static int tcpinuse(struct conv *c)
177 {
178         Tcpctl *s;
179
180         s = (Tcpctl *) (c->ptcl);
181         return s->state != Closed;
182 }
183
/* "announce" control message handler: bind the local address via the generic
 * Fsstdannounce(), enter the Listen state, and report success. */
static void tcpannounce(struct conv *c, char **argv, int argc)
{
	Fsstdannounce(c, argv, argc);
	tcpstart(c, TCP_LISTEN);
	Fsconnected(c, NULL);
}
190
/* "bypass" control message handler: after the generic setup, register the
 * conv in the protocol's IP hash table so inbound segments demux to it. */
static void tcpbypass(struct conv *cv, char **argv, int argc)
{
	struct tcppriv *tpriv = cv->p->priv;

	Fsstdbypass(cv, argv, argc);
	iphtadd(&tpriv->ht, cv);
}
198
199 static void tcpshutdown(struct conv *c, int how)
200 {
201         Tcpctl *tcb = (Tcpctl*)c->ptcl;
202
203         /* Do nothing for the read side */
204         if (how == SHUT_RD)
205                 return;
206         /* Sends a FIN.  If we're in another state (like Listen), we'll run into
207          * issues, since we'll never send the FIN.  We'll be shutdown on our end,
208          * but we'll never tell the distant end.  Might just be an app issue. */
209         switch (tcb->state) {
210         case Established:
211                 tcb->flgcnt++;
212                 tcpsetstate(c, Finwait1);
213                 tcpoutput(c);
214                 break;
215         }
216 }
217
/*
 *  tcpclose is always called with the q locked
 *
 *  Shut down user I/O on the conv and advance the close handshake as far as
 *  the current state allows.
 */
static void tcpclose(struct conv *c)
{
	Tcpctl *tcb;

	tcb = (Tcpctl *) c->ptcl;

	/* No further user I/O: hang up all queues and drop any received but
	 * unread data. */
	qhangup(c->rq, NULL);
	qhangup(c->wq, NULL);
	qhangup(c->eq, NULL);
	qflush(c->rq);

	switch (tcb->state) {
		case Listen:
			/*
			 *  reset any incoming calls to this listener
			 */
			Fsconnected(c, "Hangup");

			localclose(c, NULL);
			break;
		case Closed:
		case Syn_sent:
			/* Nothing committed on the wire; tear down locally. */
			localclose(c, NULL);
			break;
		case Established:
			/* Active close: the FIN counts as one flag byte. */
			tcb->flgcnt++;
			tcpsetstate(c, Finwait1);
			tcpoutput(c);
			break;
		case Close_wait:
			/* Peer closed first; send our FIN and await its last
			 * ack. */
			tcb->flgcnt++;
			tcpsetstate(c, Last_ack);
			tcpoutput(c);
			break;
	}
}
257
/* Write-queue kick (installed by tcpcreate's qopen): called when the user
 * queues new data.  Pushes output if the connection can send; otherwise the
 * conv is torn down. */
static void tcpkick(void *x)
{
	ERRSTACK(1);
	struct conv *s = x;
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	if (waserror()) {
		/* don't leave the qlock held if a callee throws */
		qunlock(&s->qlock);
		nexterror();
	}

	switch (tcb->state) {
		case Syn_sent:
		case Established:
		case Close_wait:
			/*
			 * Push data
			 */
			tcprcvwin(s);
			tcpoutput(s);
			break;
		default:
			localclose(s, "Hangup");
			break;
	}

	qunlock(&s->qlock);
	poperror();
}
290
/* Recompute the receive window we advertise, based on how much of the
 * receive qio the application has drained. */
static void tcprcvwin(struct conv *s)
{
	/* Call with tcb locked */
	int w;
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;
	/* free space remaining in the receive queue */
	w = tcb->window - qlen(s->rq);
	if (w < 0)
		w = 0;

	/* RFC 813: Avoid SWS.  We'll always reduce the window (because the qio
	 * increased - that's legit), and we'll always advertise the window
	 * increases (corresponding to qio drains) when those are greater than MSS.
	 * But we don't advertise increases less than MSS.
	 *
	 * Note we don't shrink the window at all - that'll result in tcptrim()
	 * dropping packets that were sent before the sender gets our update. */
	if ((w < tcb->rcv.wnd) || (w >= tcb->mss))
		tcb->rcv.wnd = w;
	/* We've delayed sending an update to rcv.wnd, and we might never get
	 * another ACK to drive the TCP stack after the qio is drained.  We could
	 * replace this stuff with qio kicks or callbacks, but that might be
	 * trickier with the MSS limitation.  (and 'edge' isn't empty or not). */
	if (w < tcb->mss)
		tcb->rcv.blocked = 1;
}
318
/* Delayed-ACK timer handler: when the timer fires, push out a segment with
 * our current ACK/window unless the connection has closed meanwhile. */
static void tcpacktimer(void *v)
{
	ERRSTACK(1);
	Tcpctl *tcb;
	struct conv *s;

	s = v;
	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	if (tcb->state != Closed) {
		/* NOTE(review): FORCE appears to make tcpoutput emit a segment
		 * even with no new data -- confirm in tcpoutput */
		tcb->flags |= FORCE;
		tcprcvwin(s);
		tcpoutput(s);
	}
	qunlock(&s->qlock);
	poperror();
}
341
/* Allocate the read/write queues for a newly created TCP conversation. */
static void tcpcreate(struct conv *c)
{
	/* We don't use qio limits.  Instead, TCP manages flow control on its own.
	 * We only use qpassnolim().  Note for qio that 0 doesn't mean no limit. */
	c->rq = qopen(0, Qcoalesce, 0, 0);
	c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
}
349
/* Set timer t to 'newstate' while keeping priv's doubly-linked list of
 * active timers consistent: unchain when leaving TcptimerON, chain at the
 * head when entering it.  Caller must hold priv->tl. */
static void timerstate(struct tcppriv *priv, Tcptimer *t, int newstate)
{
	if (newstate != TcptimerON) {
		if (t->state == TcptimerON) {
			// unchain
			if (priv->timers == t) {
				priv->timers = t->next;
				/* the list head must have no prev link */
				if (t->prev != NULL)
					panic("timerstate1");
			}
			if (t->next)
				t->next->prev = t->prev;
			if (t->prev)
				t->prev->next = t->next;
			t->next = t->prev = NULL;
		}
	} else {
		if (t->state != TcptimerON) {
			// chain
			/* an off-list timer must carry no stale links */
			if (t->prev != NULL || t->next != NULL)
				panic("timerstate2");
			t->prev = NULL;
			t->next = priv->timers;
			if (t->next)
				t->next->prev = t;
			priv->timers = t;
		}
	}
	t->state = newstate;
}
380
/* Per-stack timer ktask (started once by tcpstart): every MSPTICK ms, walk
 * the active timer list under priv->tl, decrement counts, and collect the
 * expired timers on a private 'ready' list; then run their handlers with the
 * timer lock dropped, since handlers may rearm/halt timers and qlock convs.
 * Also drives retransmits for connections in limbo. */
static void tcpackproc(void *a)
{
	ERRSTACK(1);
	Tcptimer *t, *tp, *timeo;
	struct Proto *tcp;
	struct tcppriv *priv;
	int loop;

	tcp = a;
	priv = tcp->priv;

	for (;;) {
		kthread_usleep(MSPTICK * 1000);

		qlock(&priv->tl);
		timeo = NULL;
		loop = 0;
		for (t = priv->timers; t != NULL; t = tp) {
			/* guard against list corruption / runaway growth */
			if (loop++ > 10000)
				panic("tcpackproc1");
			tp = t->next;
			if (t->state == TcptimerON) {
				t->count--;
				if (t->count == 0) {
					timerstate(priv, t, TcptimerDONE);
					t->readynext = timeo;
					timeo = t;
				}
			}
		}
		qunlock(&priv->tl);

		loop = 0;
		for (t = timeo; t != NULL; t = t->readynext) {
			if (loop++ > 10000)
				panic("tcpackproc2");
			/* skip timers rearmed or halted since we dropped the
			 * lock */
			if (t->state == TcptimerDONE && t->func != NULL) {
				/* discard error style */
				if (!waserror())
					(*t->func) (t->arg);
				poperror();
			}
		}

		limborexmit(tcp);
	}
}
428
/* Arm timer t: reload its countdown from t->start and chain it onto the
 * active list.  A NULL timer or a zero start value (disabled) is a no-op. */
static void tcpgo(struct tcppriv *priv, Tcptimer *t)
{
	if (t == NULL || t->start == 0)
		return;

	qlock(&priv->tl);
	t->count = t->start;
	timerstate(priv, t, TcptimerON);
	qunlock(&priv->tl);
}
439
/* Disarm timer t, unchaining it from the active list.  NULL is a no-op. */
static void tcphalt(struct tcppriv *priv, Tcptimer *t)
{
	if (t == NULL)
		return;

	qlock(&priv->tl);
	timerstate(priv, t, TcptimerOFF);
	qunlock(&priv->tl);
}
449
/* Exponential backoff multiplier for retransmit timers: 2^n.
 *
 * Clamp the shift count: `1 << n` is undefined behavior for negative n or
 * for n >= the width of int, and repeated timeouts can otherwise grow n past
 * that limit.  Clamping to 30 keeps the result positive and well-defined. */
static int backoff(int n)
{
	if (n < 0)
		n = 0;
	if (n > 30)
		n = 30;
	return 1 << n;
}
454
/* Tear down the local side of conv s: stop all timers, free the reassembly
 * queue, report 'reason' to any pending connect, hang up the user queues,
 * and drop to Closed.  'reason' may be NULL for a clean close. */
static void localclose(struct conv *s, char *reason)
{
	/* called with tcb locked */
	Tcpctl *tcb;
	Reseq *rp, *rp1;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	/* stop demuxing inbound segments to this conv */
	iphtrem(&tpriv->ht, s);

	tcphalt(tpriv, &tcb->timer);
	tcphalt(tpriv, &tcb->rtt_timer);
	tcphalt(tpriv, &tcb->acktimer);
	tcphalt(tpriv, &tcb->katimer);

	/* Flush reassembly queue; nothing more can arrive */
	for (rp = tcb->reseq; rp != NULL; rp = rp1) {
		rp1 = rp->next;
		freeblist(rp->bp);
		kfree(rp);
	}
	tcb->reseq = NULL;

	/* a failed active open: tell the connector why */
	if (tcb->state == Syn_sent)
		Fsconnected(s, reason);

	qhangup(s->rq, reason);
	qhangup(s->wq, reason);

	tcpsetstate(s, Closed);

	/* listener will check the rq state */
	if (s->state == Announced)
		rendez_wakeup(&s->listenr);
}
492
/* mtu (- TCP + IP hdr len) of 1st hop: the largest MSS usable toward 'addr'.
 * Falls back to DEF_MSS/DEF_MSS6 when no interface is found.  Also sets the
 * TSO bit in *flags from the interface's features and returns our window
 * scale offer in *scale. */
static int tcpmtu(struct Proto *tcp, uint8_t *addr, int version, int *scale,
		  uint8_t *flags)
{
	struct Ipifc *ifc;
	int mtu;

	ifc = findipifc(tcp->f, addr, 0);
	switch (version) {
		default:
		case V4:
			mtu = DEF_MSS;
			if (ifc != NULL)
				mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
			break;
		case V6:
			mtu = DEF_MSS6;
			if (ifc != NULL)
				mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
			break;
	}
	*flags &= ~TSO;
	if (ifc && (ifc->feat & NETF_TSO))
		*flags |= TSO;
	/* always offer window scaling, with a fixed scale of 7 */
	*scale = HaveWS | 7;

	return mtu;
}
521
/* Initialize a fresh Tcpctl for conv s: RTT/congestion defaults, the four
 * per-conn timers, and (for non-listeners) the prototype pseudo-header later
 * used to build outbound segments.  Picks a local IP if none is bound. */
static void inittcpctl(struct conv *s, int mode)
{
	Tcpctl *tcb;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int mss;

	tcb = (Tcpctl *) s->ptcl;

	memset(tcb, 0, sizeof(Tcpctl));

	tcb->ssthresh = UINT32_MAX;	/* no slow-start threshold yet */
	tcb->srtt = tcp_irtt;
	tcb->mdev = 0;

	/* setup timers */
	tcb->timer.start = tcp_irtt / MSPTICK;
	tcb->timer.func = tcptimeout;
	tcb->timer.arg = s;
	tcb->rtt_timer.start = MAX_TIME;
	tcb->acktimer.start = TCP_ACK / MSPTICK;
	tcb->acktimer.func = tcpacktimer;
	tcb->acktimer.arg = s;
	tcb->katimer.start = DEF_KAT / MSPTICK;
	tcb->katimer.func = tcpkeepalive;
	tcb->katimer.arg = s;

	mss = DEF_MSS;

	/* create a prototype(pseudo) header */
	if (mode != TCP_LISTEN) {
		if (ipcmp(s->laddr, IPnoaddr) == 0)
			findlocalip(s->p->f, s->laddr, s->raddr);

		switch (s->ipversion) {
			case V4:
				h4 = &tcb->protohdr.tcp4hdr;
				memset(h4, 0, sizeof(*h4));
				h4->proto = IP_TCPPROTO;
				hnputs(h4->tcpsport, s->lport);
				hnputs(h4->tcpdport, s->rport);
				v6tov4(h4->tcpsrc, s->laddr);
				v6tov4(h4->tcpdst, s->raddr);
				break;
			case V6:
				h6 = &tcb->protohdr.tcp6hdr;
				memset(h6, 0, sizeof(*h6));
				h6->proto = IP_TCPPROTO;
				hnputs(h6->tcpsport, s->lport);
				hnputs(h6->tcpdport, s->rport);
				ipmove(h6->tcpsrc, s->laddr);
				ipmove(h6->tcpdst, s->raddr);
				mss = DEF_MSS6;
				break;
			default:
				panic("inittcpctl: version %d", s->ipversion);
		}
	}

	tcb->mss = mss;
	tcb->typical_mss = mss;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* default is no window scaling */
	tcb->window = QMAX;
	tcb->rcv.wnd = QMAX;
	tcb->rcv.scale = 0;
	tcb->snd.scale = 0;
}
591
/*
 *  called with s qlocked
 *
 *  Begin a conversation in 'mode': lazily start the per-stack timer ktask
 *  (tcpackproc), initialize the tcb, register the conv for inbound demux,
 *  then either enter Listen or send the opening SYN.
 */
static void tcpstart(struct conv *s, int mode)
{
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	char *kpname;

	tpriv = s->p->priv;

	/* double-checked start of the single tcpackproc ktask per stack */
	if (tpriv->ackprocstarted == 0) {
		qlock(&tpriv->apl);
		if (tpriv->ackprocstarted == 0) {
			/* tcpackproc needs to free this if it ever exits */
			kpname = kmalloc(KNAMELEN, MEM_WAIT);
			snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
			ktask(kpname, tcpackproc, s->p);
			tpriv->ackprocstarted = 1;
		}
		qunlock(&tpriv->apl);
	}

	tcb = (Tcpctl *) s->ptcl;

	inittcpctl(s, mode);

	iphtadd(&tpriv->ht, s);
	switch (mode) {
		case TCP_LISTEN:
			tpriv->stats[PassiveOpens]++;
			tcb->flags |= CLONE;
			tcpsetstate(s, Listen);
			break;

		case TCP_CONNECT:
			tpriv->stats[ActiveOpens]++;
			tcb->flags |= ACTIVE;
			tcpsndsyn(s, tcb);
			tcpsetstate(s, Syn_sent);
			tcpoutput(s);
			break;
	}
}
636
637 static char *tcpflag(uint16_t flag)
638 {
639         static char buf[128];
640
641         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
642         if (flag & URG)
643                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
644         if (flag & ACK)
645                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
646         if (flag & PSH)
647                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
648         if (flag & RST)
649                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
650         if (flag & SYN)
651                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
652         if (flag & FIN)
653                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
654
655         return buf;
656 }
657
658 /* Helper, determine if we should send a TCP timestamp.  ts_val was the
659  * timestamp from our distant end.  We'll also send a TS on SYN (no ACK). */
660 static bool tcp_seg_has_ts(Tcp *tcph)
661 {
662         return tcph->ts_val || ((tcph->flags & SYN) && !(tcph->flags & ACK));
663 }
664
/* Given a TCP header/segment and default header size (e.g. TCP4_HDRSIZE),
 * return the actual hdr_len and opt_pad.  Accounts for SYN-only options
 * (MSS, window scale, SACK-permitted), the timestamp option, and any SACK
 * blocks about to be sent.  tcb may be NULL (e.g. when sending a RST). */
static void compute_hdrlen_optpad(Tcp *tcph, uint16_t default_hdrlen,
                                  uint16_t *ret_hdrlen, uint16_t *ret_optpad,
                                  Tcpctl *tcb)
{
	uint16_t hdrlen = default_hdrlen;
	uint16_t optpad = 0;

	if (tcph->flags & SYN) {
		if (tcph->mss)
			hdrlen += MSS_LENGTH;
		if (tcph->ws)
			hdrlen += WS_LENGTH;
		if (tcph->sack_ok)
			hdrlen += SACK_OK_LENGTH;
	}
	if (tcp_seg_has_ts(tcph)) {
		hdrlen += TS_LENGTH;
		/* SYNs have other opts, don't do the PREPAD NOOP optimization. */
		if (!(tcph->flags & SYN))
			hdrlen += TS_SEND_PREPAD;
	}
	/* SACK option: kind + length bytes plus 8 bytes per block */
	if (tcb && tcb->rcv.nr_sacks)
		hdrlen += 2 + tcb->rcv.nr_sacks * 8;
	/* pad the options out to a 4-byte boundary */
	optpad = hdrlen & 3;
	if (optpad)
		optpad = 4 - optpad;
	hdrlen += optpad;
	*ret_hdrlen = hdrlen;
	*ret_optpad = optpad;
}
697
/* Writes the TCP options for tcph to opt.  Must emit exactly the bytes that
 * compute_hdrlen_optpad() accounted for: SYN-only options, then the
 * (possibly NOOP-prepadded) timestamp, then SACK blocks, then NOOP pad. */
static void write_opts(Tcp *tcph, uint8_t *opt, uint16_t optpad, Tcpctl *tcb)
{
	if (tcph->flags & SYN) {
		if (tcph->mss != 0) {
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		if (tcph->ws != 0) {
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		if (tcph->sack_ok) {
			*opt++ = SACK_OK_OPT;
			*opt++ = SACK_OK_LENGTH;
		}
	}
	if (tcp_seg_has_ts(tcph)) {
		if (!(tcph->flags & SYN)) {
			/* two NOOPs align the 10-byte TS option (the
			 * TS_SEND_PREPAD from compute_hdrlen_optpad) */
			*opt++ = NOOPOPT;
			*opt++ = NOOPOPT;
		}
		*opt++ = TS_OPT;
		*opt++ = TS_LENGTH;
		/* Setting TSval, our time */
		hnputl(opt, milliseconds());
		opt += 4;
		/* Setting TSecr, the time we last saw from them, stored in ts_val */
		hnputl(opt, tcph->ts_val);
		opt += 4;
	}
	if (tcb && tcb->rcv.nr_sacks) {
		*opt++ = SACK_OPT;
		*opt++ = 2 + tcb->rcv.nr_sacks * 8;
		for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
			hnputl(opt, tcb->rcv.sacks[i].left);
			opt += 4;
			hnputl(opt, tcb->rcv.sacks[i].right);
			opt += 4;
		}
	}
	/* fill the remainder of the option space with NOOPs */
	while (optpad-- > 0)
		*opt++ = NOOPOPT;
}
745
746 /* Given a data block (or NULL) returns a block with enough header room that we
747  * can send out.  block->wp is set to the beginning of the payload.  Returns
748  * NULL on some sort of error. */
749 static struct block *alloc_or_pad_block(struct block *data,
750                                         uint16_t total_hdr_size)
751 {
752         if (data) {
753                 data = padblock(data, total_hdr_size);
754                 if (data == NULL)
755                         return NULL;
756         } else {
757                 /* the 64 pad is to meet mintu's */
758                 data = block_alloc(total_hdr_size + 64, MEM_WAIT);
759                 if (data == NULL)
760                         return NULL;
761                 data->wp += total_hdr_size;
762         }
763         return data;
764 }
765
/* Build an outbound IPv6 TCP segment from 'tcph' and optional payload
 * 'data', using prototype header 'ph' for addresses/ports.  The checksum is
 * computed over a temporary pseudo-header (unless tcb->nochecksum), after
 * which the real v6 header fields are restored.  Returns NULL on allocation
 * failure. */
static struct block *htontcp6(Tcp *tcph, struct block *data, Tcp6hdr *ph,
                              Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp6hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP6_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP6_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp).  Note TCP structs include IP. */
	data->network_offset = 0;
	data->transport_offset = offsetof(Tcp6hdr, tcpsport);

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp6hdr *) (data->rp);
	memmove(h, ph, TCP6_TCBPHDRSZ);

	/* compose pseudo tcp header, do cksum calculation: vcf temporarily
	 * holds the TCP length, and ttl temporarily holds the protocol */
	hnputl(h->vcf, hdrlen + dlen);
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
	h->ttl = ph->proto;

	/* copy in variable bits */
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	/* move from pseudo header back to normal ip header */
	memset(h->vcf, 0, 4);
	h->vcf[0] = IP_VER6;
	hnputs(h->ploadlen, hdrlen + dlen);
	h->proto = ph->proto;

	return data;
}
816
/* Build an outbound IPv4 TCP segment from 'tcph' and optional payload
 * 'data', using prototype header 'ph'.  Unlike htontcp6, only the inverted
 * pseudo-header sum is stored in the checksum field here, with
 * checksum_start/checksum_offset and Btcpck set on the block --
 * NOTE(review): presumably the full checksum is finished later by NIC
 * offload or the medium's software path; confirm in the drivers. */
static struct block *htontcp4(Tcp *tcph, struct block *data, Tcp4hdr *ph,
                              Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp4hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP4_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP4_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp).  Note TCP structs include IP. */
	data->network_offset = 0;
	data->transport_offset = offsetof(Tcp4hdr, tcpsport);

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp4hdr *) (data->rp);
	memmove(h, ph, TCP4_TCBPHDRSZ);

	/* copy in variable bits */
	hnputs(h->tcplen, hdrlen + dlen);
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		/* seed the field with the pseudo-header sum only */
		csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
		data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
		data->checksum_offset = ph->tcpcksum - ph->tcpsport;
		data->flag |= Btcpck;
	}

	return data;
}
860
861 static void parse_inbound_sacks(Tcp *tcph, uint8_t *opt, uint16_t optlen)
862 {
863         uint8_t nr_sacks;
864         uint32_t left, right;
865
866         nr_sacks = (optlen - 2) / 8;
867         if (nr_sacks > MAX_NR_SACKS_PER_PACKET)
868                 return;
869         opt += 2;
870         for (int i = 0; i < nr_sacks; i++, opt += 8) {
871                 left = nhgetl(opt);
872                 right = nhgetl(opt + 4);
873                 if (seq_ge(left, right)) {
874                         /* bad / malicious SACK.  Skip it, and adjust. */
875                         nr_sacks--;
876                         i--;    /* stay on this array element next loop */
877                         continue;
878                 }
879                 tcph->sacks[i].left = left;
880                 tcph->sacks[i].right = right;
881         }
882         tcph->nr_sacks = nr_sacks;
883 }
884
885 static void parse_inbound_opts(Tcp *tcph, uint8_t *opt, uint16_t optsize)
886 {
887         uint16_t optlen;
888
889         while (optsize > 0 && *opt != EOLOPT) {
890                 if (*opt == NOOPOPT) {
891                         optsize--;
892                         opt++;
893                         continue;
894                 }
895                 optlen = opt[1];
896                 if (optlen < 2 || optlen > optsize)
897                         break;
898                 switch (*opt) {
899                         case MSSOPT:
900                                 if (optlen == MSS_LENGTH)
901                                         tcph->mss = nhgets(opt + 2);
902                                 break;
903                         case WSOPT:
904                                 if (optlen == WS_LENGTH && *(opt + 2) <= MAX_WS_VALUE)
905                                         tcph->ws = HaveWS | *(opt + 2);
906                                 break;
907                         case SACK_OK_OPT:
908                                 if (optlen == SACK_OK_LENGTH)
909                                         tcph->sack_ok = TRUE;
910                                 break;
911                         case SACK_OPT:
912                                 parse_inbound_sacks(tcph, opt, optlen);
913                                 break;
914                         case TS_OPT:
915                                 if (optlen == TS_LENGTH) {
916                                         tcph->ts_val = nhgetl(opt + 2);
917                                         tcph->ts_ecr = nhgetl(opt + 6);
918                                 }
919                                 break;
920                 }
921                 optsize -= optlen;
922                 opt += optlen;
923         }
924 }
925
926 /* Helper, clears the opts.  We'll later set them with e.g. parse_inbound_opts,
927  * set them manually, or something else. */
928 static void clear_tcph_opts(Tcp *tcph)
929 {
930         tcph->mss = 0;
931         tcph->ws = 0;
932         tcph->sack_ok = FALSE;
933         tcph->nr_sacks = 0;
934         tcph->ts_val = 0;
935         tcph->ts_ecr = 0;
936 }
937
938 static int ntohtcp6(Tcp *tcph, struct block **bpp)
939 {
940         Tcp6hdr *h;
941         uint16_t hdrlen;
942
943         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
944         if (*bpp == NULL)
945                 return -1;
946
947         h = (Tcp6hdr *) ((*bpp)->rp);
948         tcph->source = nhgets(h->tcpsport);
949         tcph->dest = nhgets(h->tcpdport);
950         tcph->seq = nhgetl(h->tcpseq);
951         tcph->ack = nhgetl(h->tcpack);
952         hdrlen = (h->tcpflag[0] >> 2) & ~3;
953         if (hdrlen < TCP6_HDRSIZE) {
954                 freeblist(*bpp);
955                 return -1;
956         }
957
958         tcph->flags = h->tcpflag[1];
959         tcph->wnd = nhgets(h->tcpwin);
960         tcph->urg = nhgets(h->tcpurg);
961         clear_tcph_opts(tcph);
962         tcph->len = nhgets(h->ploadlen) - hdrlen;
963
964         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
965         if (*bpp == NULL)
966                 return -1;
967         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP6_HDRSIZE);
968         return hdrlen;
969 }
970
971 static int ntohtcp4(Tcp *tcph, struct block **bpp)
972 {
973         Tcp4hdr *h;
974         uint16_t hdrlen;
975
976         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
977         if (*bpp == NULL)
978                 return -1;
979
980         h = (Tcp4hdr *) ((*bpp)->rp);
981         tcph->source = nhgets(h->tcpsport);
982         tcph->dest = nhgets(h->tcpdport);
983         tcph->seq = nhgetl(h->tcpseq);
984         tcph->ack = nhgetl(h->tcpack);
985
986         hdrlen = (h->tcpflag[0] >> 2) & ~3;
987         if (hdrlen < TCP4_HDRSIZE) {
988                 freeblist(*bpp);
989                 return -1;
990         }
991
992         tcph->flags = h->tcpflag[1];
993         tcph->wnd = nhgets(h->tcpwin);
994         tcph->urg = nhgets(h->tcpurg);
995         clear_tcph_opts(tcph);
996         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
997
998         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
999         if (*bpp == NULL)
1000                 return -1;
1001         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP4_HDRSIZE);
1002         return hdrlen;
1003 }
1004
1005 /*
1006  *  For outgoing calls, generate an initial sequence
1007  *  number and put a SYN on the send queue
1008  */
1009 static void tcpsndsyn(struct conv *s, Tcpctl *tcb)
1010 {
1011         urandom_read(&tcb->iss, sizeof(tcb->iss));
1012         tcb->rttseq = tcb->iss;
1013         tcb->snd.wl2 = tcb->iss;
1014         tcb->snd.una = tcb->iss;
1015         tcb->snd.rtx = tcb->rttseq;
1016         tcb->snd.nxt = tcb->rttseq;
1017         tcb->flgcnt++;
1018         tcb->flags |= FORCE;
1019         tcb->sndsyntime = NOW;
1020
1021         /* set desired mss and scale */
1022         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale,
1023                           &tcb->flags);
1024 }
1025
/* Sends a RST in reply to segment @seg (with @length bytes of payload), which
 * arrived from @source for @dest but has no acceptable conversation.  @reason
 * is for the netlog only.  We never send a RST in response to a RST. */
static void sndrst(struct Proto *tcp, uint8_t *source, uint8_t *dest,
                   uint16_t length, Tcp *seg, uint8_t version, char *reason)
{
	struct block *hbp;
	uint8_t rflags;
	struct tcppriv *tpriv;
	Tcp4hdr ph4;
	Tcp6hdr ph6;

	netlog(tcp->f, Logtcpreset, "sndrst: %s\n", reason);

	tpriv = tcp->priv;

	/* Never reset a reset (avoids RST wars) */
	if (seg->flags & RST)
		return;

	/* make pseudo header - note src/dest are swapped: we're replying */
	switch (version) {
		case V4:
			memset(&ph4, 0, sizeof(ph4));
			ph4.vihl = IP_VER4;
			v6tov4(ph4.tcpsrc, dest);
			v6tov4(ph4.tcpdst, source);
			ph4.proto = IP_TCPPROTO;
			hnputs(ph4.tcplen, TCP4_HDRSIZE);
			hnputs(ph4.tcpsport, seg->dest);
			hnputs(ph4.tcpdport, seg->source);
			break;
		case V6:
			memset(&ph6, 0, sizeof(ph6));
			ph6.vcf[0] = IP_VER6;
			ipmove(ph6.tcpsrc, dest);
			ipmove(ph6.tcpdst, source);
			ph6.proto = IP_TCPPROTO;
			hnputs(ph6.ploadlen, TCP6_HDRSIZE);
			hnputs(ph6.tcpsport, seg->dest);
			hnputs(ph6.tcpdport, seg->source);
			break;
		default:
			panic("sndrst: version %d", version);
	}

	tpriv->stats[OutRsts]++;
	rflags = RST;

	/* convince the other end that this reset is in band */
	if (seg->flags & ACK) {
		/* They acked something: our RST's seq is what they expect next */
		seg->seq = seg->ack;
		seg->ack = 0;
	} else {
		/* No ack from them: ack everything they sent (SYN and FIN each
		 * consume one sequence number) so the RST is clearly in-window */
		rflags |= ACK;
		seg->ack = seg->seq;
		seg->seq = 0;
		if (seg->flags & SYN)
			seg->ack++;
		seg->ack += length;
		if (seg->flags & FIN)
			seg->ack++;
	}
	/* Reuse @seg as the outbound segment, clearing all options */
	seg->flags = rflags;
	seg->wnd = 0;
	seg->urg = 0;
	seg->mss = 0;
	seg->ws = 0;
	seg->sack_ok = FALSE;
	seg->nr_sacks = 0;
	/* seg->ts_val is already set with their timestamp */
	switch (version) {
		case V4:
			hbp = htontcp4(seg, NULL, &ph4, NULL);
			if (hbp == NULL)
				return;
			ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		case V6:
			hbp = htontcp6(seg, NULL, &ph6, NULL);
			if (hbp == NULL)
				return;
			ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		default:
			panic("sndrst2: version %d", version);
	}
}
1110
1111 /*
1112  *  send a reset to the remote side and close the conversation
1113  *  called with s qlocked
1114  */
1115 static void tcphangup(struct conv *s)
1116 {
1117         ERRSTACK(1);
1118         Tcp seg;
1119         Tcpctl *tcb;
1120         struct block *hbp;
1121
1122         tcb = (Tcpctl *) s->ptcl;
1123         if (ipcmp(s->raddr, IPnoaddr)) {
1124                 /* discard error style, poperror regardless */
1125                 if (!waserror()) {
1126                         seg.flags = RST | ACK;
1127                         seg.ack = tcb->rcv.nxt;
1128                         tcb->last_ack_sent = seg.ack;
1129                         tcb->rcv.una = 0;
1130                         seg.seq = tcb->snd.nxt;
1131                         seg.wnd = 0;
1132                         seg.urg = 0;
1133                         seg.mss = 0;
1134                         seg.ws = 0;
1135                         seg.sack_ok = FALSE;
1136                         seg.nr_sacks = 0;
1137                         seg.ts_val = tcb->ts_recent;
1138                         switch (s->ipversion) {
1139                                 case V4:
1140                                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1141                                         hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
1142                                         ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1143                                         break;
1144                                 case V6:
1145                                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1146                                         hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
1147                                         ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1148                                         break;
1149                                 default:
1150                                         panic("tcphangup: version %d", s->ipversion);
1151                         }
1152                 }
1153                 poperror();
1154         }
1155         localclose(s, NULL);
1156 }
1157
1158 /*
1159  *  (re)send a SYN ACK
1160  */
1161 static int sndsynack(struct Proto *tcp, Limbo *lp)
1162 {
1163         struct block *hbp;
1164         Tcp4hdr ph4;
1165         Tcp6hdr ph6;
1166         Tcp seg;
1167         int scale;
1168         uint8_t flag = 0;
1169
1170         /* make pseudo header */
1171         switch (lp->version) {
1172                 case V4:
1173                         memset(&ph4, 0, sizeof(ph4));
1174                         ph4.vihl = IP_VER4;
1175                         v6tov4(ph4.tcpsrc, lp->laddr);
1176                         v6tov4(ph4.tcpdst, lp->raddr);
1177                         ph4.proto = IP_TCPPROTO;
1178                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1179                         hnputs(ph4.tcpsport, lp->lport);
1180                         hnputs(ph4.tcpdport, lp->rport);
1181                         break;
1182                 case V6:
1183                         memset(&ph6, 0, sizeof(ph6));
1184                         ph6.vcf[0] = IP_VER6;
1185                         ipmove(ph6.tcpsrc, lp->laddr);
1186                         ipmove(ph6.tcpdst, lp->raddr);
1187                         ph6.proto = IP_TCPPROTO;
1188                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1189                         hnputs(ph6.tcpsport, lp->lport);
1190                         hnputs(ph6.tcpdport, lp->rport);
1191                         break;
1192                 default:
1193                         panic("sndrst: version %d", lp->version);
1194         }
1195
1196         seg.seq = lp->iss;
1197         seg.ack = lp->irs + 1;
1198         seg.flags = SYN | ACK;
1199         seg.urg = 0;
1200         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag);
1201         seg.wnd = QMAX;
1202         seg.ts_val = lp->ts_val;
1203         seg.nr_sacks = 0;
1204
1205         /* if the other side set scale, we should too */
1206         if (lp->rcvscale) {
1207                 seg.ws = scale;
1208                 lp->sndscale = scale;
1209         } else {
1210                 seg.ws = 0;
1211                 lp->sndscale = 0;
1212         }
1213         if (SACK_SUPPORTED)
1214                 seg.sack_ok = lp->sack_ok;
1215         else
1216                 seg.sack_ok = FALSE;
1217
1218         switch (lp->version) {
1219                 case V4:
1220                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1221                         if (hbp == NULL)
1222                                 return -1;
1223                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1224                         break;
1225                 case V6:
1226                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1227                         if (hbp == NULL)
1228                                 return -1;
1229                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1230                         break;
1231                 default:
1232                         panic("sndsnack: version %d", lp->version);
1233         }
1234         lp->lastsend = NOW;
1235         return 0;
1236 }
1237
1238 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
1239
1240 /*
1241  *  put a call into limbo and respond with a SYN ACK
1242  *
1243  *  called with proto locked
1244  */
1245 static void limbo(struct conv *s, uint8_t *source, uint8_t *dest, Tcp *seg,
1246                   int version)
1247 {
1248         Limbo *lp, **l;
1249         struct tcppriv *tpriv;
1250         int h;
1251
1252         tpriv = s->p->priv;
1253         h = hashipa(source, seg->source);
1254
1255         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1256                 lp = *l;
1257                 if (lp->lport != seg->dest || lp->rport != seg->source
1258                         || lp->version != version)
1259                         continue;
1260                 if (ipcmp(lp->raddr, source) != 0)
1261                         continue;
1262                 if (ipcmp(lp->laddr, dest) != 0)
1263                         continue;
1264
1265                 /* each new SYN restarts the retransmits */
1266                 lp->irs = seg->seq;
1267                 break;
1268         }
1269         lp = *l;
1270         if (lp == NULL) {
1271                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1272                         lp = tpriv->lht[h];
1273                         tpriv->lht[h] = lp->next;
1274                         lp->next = NULL;
1275                 } else {
1276                         lp = kzmalloc(sizeof(*lp), 0);
1277                         if (lp == NULL)
1278                                 return;
1279                         tpriv->nlimbo++;
1280                 }
1281                 *l = lp;
1282                 lp->version = version;
1283                 ipmove(lp->laddr, dest);
1284                 ipmove(lp->raddr, source);
1285                 lp->lport = seg->dest;
1286                 lp->rport = seg->source;
1287                 lp->mss = seg->mss;
1288                 lp->rcvscale = seg->ws;
1289                 lp->sack_ok = seg->sack_ok;
1290                 lp->irs = seg->seq;
1291                 lp->ts_val = seg->ts_val;
1292                 urandom_read(&lp->iss, sizeof(lp->iss));
1293         }
1294
1295         if (sndsynack(s->p, lp) < 0) {
1296                 *l = lp->next;
1297                 tpriv->nlimbo--;
1298                 kfree(lp);
1299         }
1300 }
1301
1302 /*
1303  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1304  */
1305 static void limborexmit(struct Proto *tcp)
1306 {
1307         struct tcppriv *tpriv;
1308         Limbo **l, *lp;
1309         int h;
1310         int seen;
1311         uint64_t now;
1312
1313         tpriv = tcp->priv;
1314
1315         if (!canqlock(&tcp->qlock))
1316                 return;
1317         seen = 0;
1318         now = NOW;
1319         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1320                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1321                         lp = *l;
1322                         seen++;
1323                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1324                                 continue;
1325
1326                         /* time it out after 1 second */
1327                         if (++(lp->rexmits) > 5) {
1328                                 tpriv->nlimbo--;
1329                                 *l = lp->next;
1330                                 kfree(lp);
1331                                 continue;
1332                         }
1333
1334                         /* if we're being attacked, don't bother resending SYN ACK's */
1335                         if (tpriv->nlimbo > 100)
1336                                 continue;
1337
1338                         if (sndsynack(tcp, lp) < 0) {
1339                                 tpriv->nlimbo--;
1340                                 *l = lp->next;
1341                                 kfree(lp);
1342                                 continue;
1343                         }
1344
1345                         l = &lp->next;
1346                 }
1347         }
1348         qunlock(&tcp->qlock);
1349 }
1350
1351 /*
1352  *  lookup call in limbo.  if found, throw it out.
1353  *
1354  *  called with proto locked
1355  */
1356 static void limborst(struct conv *s, Tcp *segp, uint8_t *src, uint8_t *dst,
1357                      uint8_t version)
1358 {
1359         Limbo *lp, **l;
1360         int h;
1361         struct tcppriv *tpriv;
1362
1363         tpriv = s->p->priv;
1364
1365         /* find a call in limbo */
1366         h = hashipa(src, segp->source);
1367         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1368                 lp = *l;
1369                 if (lp->lport != segp->dest || lp->rport != segp->source
1370                         || lp->version != version)
1371                         continue;
1372                 if (ipcmp(lp->laddr, dst) != 0)
1373                         continue;
1374                 if (ipcmp(lp->raddr, src) != 0)
1375                         continue;
1376
1377                 /* RST can only follow the SYN */
1378                 if (segp->seq == lp->irs + 1) {
1379                         tpriv->nlimbo--;
1380                         *l = lp->next;
1381                         kfree(lp);
1382                 }
1383                 break;
1384         }
1385 }
1386
1387 /* The advertised MSS (e.g. 1460) includes any per-packet TCP options, such as
1388  * TCP timestamps.  A given packet will contain mss bytes, but only typical_mss
1389  * bytes of *data*.  If we know we'll use those options, we should adjust our
1390  * typical_mss, which will affect the cwnd. */
1391 static void adjust_typical_mss_for_opts(Tcp *tcph, Tcpctl *tcb)
1392 {
1393         uint16_t opt_size = 0;
1394
1395         if (tcph->ts_val)
1396                 opt_size += TS_LENGTH + TS_SEND_PREPAD;
1397         opt_size = ROUNDUP(opt_size, 4);
1398         tcb->typical_mss -= opt_size;
1399 }
1400
1401 /*
1402  *  come here when we finally get an ACK to our SYN-ACK.
1403  *  lookup call in limbo.  if found, create a new conversation
1404  *
1405  *  called with proto locked
1406  */
1407 static struct conv *tcpincoming(struct conv *s, Tcp *segp, uint8_t *src,
1408                                                                 uint8_t *dst, uint8_t version)
1409 {
1410         struct conv *new;
1411         Tcpctl *tcb;
1412         struct tcppriv *tpriv;
1413         Tcp4hdr *h4;
1414         Tcp6hdr *h6;
1415         Limbo *lp, **l;
1416         int h;
1417
1418         /* unless it's just an ack, it can't be someone coming out of limbo */
1419         if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
1420                 return NULL;
1421
1422         tpriv = s->p->priv;
1423
1424         /* find a call in limbo */
1425         h = hashipa(src, segp->source);
1426         for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
1427                 netlog(s->p->f, Logtcp,
1428                            "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
1429                            segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
1430                            lp->lport, version, lp->version);
1431
1432                 if (lp->lport != segp->dest || lp->rport != segp->source
1433                         || lp->version != version)
1434                         continue;
1435                 if (ipcmp(lp->laddr, dst) != 0)
1436                         continue;
1437                 if (ipcmp(lp->raddr, src) != 0)
1438                         continue;
1439
1440                 /* we're assuming no data with the initial SYN */
1441                 if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
1442                         netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
1443                                    segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
1444                         lp = NULL;
1445                 } else {
1446                         tpriv->nlimbo--;
1447                         *l = lp->next;
1448                 }
1449                 break;
1450         }
1451         if (lp == NULL)
1452                 return NULL;
1453
1454         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1455         if (new == NULL)
1456                 return NULL;
1457
1458         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1459         tcb = (Tcpctl *) new->ptcl;
1460         tcb->flags &= ~CLONE;
1461         tcb->timer.arg = new;
1462         tcb->timer.state = TcptimerOFF;
1463         tcb->acktimer.arg = new;
1464         tcb->acktimer.state = TcptimerOFF;
1465         tcb->katimer.arg = new;
1466         tcb->katimer.state = TcptimerOFF;
1467         tcb->rtt_timer.arg = new;
1468         tcb->rtt_timer.state = TcptimerOFF;
1469
1470         tcb->irs = lp->irs;
1471         tcb->rcv.nxt = tcb->irs + 1;
1472         tcb->rcv.urg = tcb->rcv.nxt;
1473
1474         tcb->iss = lp->iss;
1475         tcb->rttseq = tcb->iss;
1476         tcb->snd.wl2 = tcb->iss;
1477         tcb->snd.una = tcb->iss + 1;
1478         tcb->snd.rtx = tcb->iss + 1;
1479         tcb->snd.nxt = tcb->iss + 1;
1480         tcb->flgcnt = 0;
1481         tcb->flags |= SYNACK;
1482
1483         /* our sending max segment size cannot be bigger than what he asked for */
1484         if (lp->mss != 0 && lp->mss < tcb->mss) {
1485                 tcb->mss = lp->mss;
1486                 tcb->typical_mss = tcb->mss;
1487         }
1488         adjust_typical_mss_for_opts(segp, tcb);
1489
1490         /* Here's where we record the previously-decided header options.  They were
1491          * actually decided on when we agreed to them in the SYNACK we sent.  We
1492          * didn't create an actual TCB until now, so we can copy those decisions out
1493          * of the limbo tracker and into the TCB. */
1494         tcb->sack_ok = lp->sack_ok;
1495         /* window scaling */
1496         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1497
1498         tcb->snd.wnd = segp->wnd;
1499         tcb->cwind = tcb->typical_mss * CWIND_SCALE;
1500
1501         /* set initial round trip time */
1502         tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
1503         tcpsynackrtt(new);
1504
1505         kfree(lp);
1506
1507         /* set up proto header */
1508         switch (version) {
1509                 case V4:
1510                         h4 = &tcb->protohdr.tcp4hdr;
1511                         memset(h4, 0, sizeof(*h4));
1512                         h4->proto = IP_TCPPROTO;
1513                         hnputs(h4->tcpsport, new->lport);
1514                         hnputs(h4->tcpdport, new->rport);
1515                         v6tov4(h4->tcpsrc, dst);
1516                         v6tov4(h4->tcpdst, src);
1517                         break;
1518                 case V6:
1519                         h6 = &tcb->protohdr.tcp6hdr;
1520                         memset(h6, 0, sizeof(*h6));
1521                         h6->proto = IP_TCPPROTO;
1522                         hnputs(h6->tcpsport, new->lport);
1523                         hnputs(h6->tcpdport, new->rport);
1524                         ipmove(h6->tcpsrc, dst);
1525                         ipmove(h6->tcpdst, src);
1526                         break;
1527                 default:
1528                         panic("tcpincoming: version %d", new->ipversion);
1529         }
1530
1531         tcpsetstate(new, Established);
1532
1533         iphtadd(&tpriv->ht, new);
1534
1535         return new;
1536 }
1537
1538 /*
1539  *  use the time between the first SYN and it's ack as the
1540  *  initial round trip time
1541  */
1542 static void tcpsynackrtt(struct conv *s)
1543 {
1544         Tcpctl *tcb;
1545         uint64_t delta;
1546         struct tcppriv *tpriv;
1547
1548         tcb = (Tcpctl *) s->ptcl;
1549         tpriv = s->p->priv;
1550
1551         delta = NOW - tcb->sndsyntime;
1552         tcb->srtt = delta;
1553         tcb->mdev = delta / 2;
1554
1555         /* halt round trip timer */
1556         tcphalt(tpriv, &tcb->rtt_timer);
1557 }
1558
1559 /* For LFNs (long/fat), our default tx queue doesn't hold enough data, and TCP
1560  * blocks on the application - even if the app already has the data ready to go.
1561  * We need to hold the sent, unacked data (1x cwnd), plus all the data we might
1562  * send next RTT (1x cwnd).  Note this is called after cwnd was expanded. */
1563 static void adjust_tx_qio_limit(struct conv *s)
1564 {
1565         Tcpctl *tcb = (Tcpctl *) s->ptcl;
1566         size_t ideal_limit = tcb->cwind * 2;
1567
1568         /* This is called for every ACK, and it's not entirely free to update the
1569          * limit (locks, CVs, taps).  Updating in chunks of mss seems reasonable.
1570          * During SS, we'll update this on most ACKs (given each ACK increased the
1571          * cwind by > MSS).
1572          *
1573          * We also don't want a lot of tiny blocks from the user, but the way qio
1574          * works, you can put in as much as you want (Maxatomic) and then get
1575          * flow-controlled. */
1576         if (qgetlimit(s->wq) + tcb->typical_mss < ideal_limit)
1577                 qsetlimit(s->wq, ideal_limit);
1578         /* TODO: we could shrink the qio limit too, if we had a better idea what the
1579          * actual threshold was.  We want the limit to be the 'stable' cwnd * 2. */
1580 }
1581
1582 /* Attempts to merge later sacks into sack 'into' (index in the array) */
1583 static void merge_sacks_into(Tcpctl *tcb, int into)
1584 {
1585         struct sack_block *into_sack = &tcb->snd.sacks[into];
1586         struct sack_block *tcb_sack;
1587         int shift = 0;
1588
1589         for (int i = into + 1; i < tcb->snd.nr_sacks; i++) {
1590                 tcb_sack = &tcb->snd.sacks[i];
1591                 if (seq_lt(into_sack->right, tcb_sack->left))
1592                         break;
1593                 if (seq_gt(tcb_sack->right, into_sack->right))
1594                         into_sack->right = tcb_sack->right;
1595                 shift++;
1596         }
1597         if (shift) {
1598                 memmove(tcb->snd.sacks + into + 1,
1599                         tcb->snd.sacks + into + 1 + shift,
1600                         sizeof(struct sack_block) * (tcb->snd.nr_sacks - into - 1
1601                                                              - shift));
1602                 tcb->snd.nr_sacks -= shift;
1603         }
1604 }
1605
1606 /* If we update a sack, it means they received a packet (possibly out of order),
1607  * but they have not received earlier packets.  Otherwise, they would do a full
1608  * ACK.
1609  *
1610  * The trick is in knowing whether the reception growing this sack is due to a
1611  * retrans or due to packets from before our last loss event.  The rightmost
1612  * sack tends to grow a lot with packets we sent before the loss.  However,
1613  * intermediate sacks that grow are signs of a loss, since they only grow as a
1614  * result of retrans.
1615  *
1616  * This is only true for the first time through a retrans.  After we've gone
1617  * through a full retrans blast, the sack that hinted at the retrans loss (and
1618  * there could be multiple of them!) will continue to grow.  We could come up
1619  * with some tracking for this, but instead we'll just do a one-time deal.  You
1620  * can recover from one detected sack retrans loss.  After that, you'll have to
1621  * use the RTO.
1622  *
1623  * This won't catch some things, like a sack that grew and merged with the
1624  * rightmost sack.  This also won't work if you have a single sack.  We can't
1625  * tell where the retrans ends and the sending begins. */
1626 static bool sack_hints_at_loss(Tcpctl *tcb, struct sack_block *tcb_sack)
1627 {
1628         if (tcb->snd.recovery != SACK_RETRANS_RECOVERY)
1629                 return FALSE;
1630         return &tcb->snd.sacks[tcb->snd.nr_sacks - 1] != tcb_sack;
1631 }
1632
1633 static bool sack_contains(struct sack_block *tcb_sack, uint32_t seq)
1634 {
1635         return seq_le(tcb_sack->left, seq) && seq_lt(seq, tcb_sack->right);
1636 }
1637
1638 /* Debugging helper! */
1639 static void sack_asserter(Tcpctl *tcb, char *str)
1640 {
1641         struct sack_block *tcb_sack;
1642
1643         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
1644                 tcb_sack = &tcb->snd.sacks[i];
1645                 /* Checking invariants: snd.rtx is never inside a sack, sacks are always
1646                  * mutually exclusive. */
1647                 if (sack_contains(tcb_sack, tcb->snd.rtx) ||
1648                     ((i + 1 < tcb->snd.nr_sacks) && seq_ge(tcb_sack->right,
1649                                                                (tcb_sack + 1)->left))) {
1650                         printk("SACK ASSERT ERROR at %s\n", str);
1651                         printk("rtx %u una %u nxt %u, sack [%u, %u)\n",
1652                                tcb->snd.rtx, tcb->snd.una, tcb->snd.nxt, tcb_sack->left,
1653                                    tcb_sack->right);
1654                         for (int i = 0; i < tcb->snd.nr_sacks; i++)
1655                                 printk("\t %d: [%u, %u)\n", i, tcb->snd.sacks[i].left,
1656                                        tcb->snd.sacks[i].right);
1657                         backtrace();
1658                         panic("");
1659                 }
1660         }
1661 }
1662
/* Updates bookkeeping whenever a sack is added or updated.  @tcb_sack is the
 * sack (in tcb->snd.sacks) that grew or was just inserted. */
static void sack_has_changed(struct conv *s, Tcpctl *tcb,
                             struct sack_block *tcb_sack)
{
	/* Due to the change, snd.rtx might be in the middle of this sack.  Advance
	 * it to the right edge. */
	if (sack_contains(tcb_sack, tcb->snd.rtx))
		tcb->snd.rtx = tcb_sack->right;

	/* This is a sack for something we retransed and we think it means there was
	 * another loss.  Instead of waiting for the RTO, we can take action. */
	if (sack_hints_at_loss(tcb, tcb_sack)) {
		/* Require TCPREXMTTHRESH hints before reacting, like dupack
		 * counting, to avoid overreacting to reordering */
		if (++tcb->snd.sack_loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: sack rxmit loss: snd.rtx %u, sack [%u,%u), una %u, recovery_pt %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.rtx, tcb_sack->left, tcb_sack->right, tcb->snd.una,
			       tcb->snd.recovery_pt);
			/* Redo retrans, but keep the sacks and recovery point */
			tcp_loss_event(s, tcb);
			tcb->snd.rtx = tcb->snd.una;
			tcb->snd.sack_loss_hint = 0;
			/* Act like an RTO.  We just detected it earlier.  This prevents us
			 * from getting another sack hint loss this recovery period and from
			 * advancing the opportunistic right edge. */
			tcb->snd.recovery = RTO_RETRANS_RECOVERY;
			/* We didn't actually time out yet and we expect to keep getting
			 * sacks, so we don't want to flush or worry about in_flight.  If we
			 * messed something up, the RTO will still fire. */
			set_in_flight(tcb);
		}
	}
}
1696
1697 /* Advances tcb_sack's right edge, if new_right is farther, and updates the
1698  * bookkeeping due to the change. */
1699 static void update_right_edge(struct conv *s, Tcpctl *tcb,
1700                               struct sack_block *tcb_sack, uint32_t new_right)
1701 {
1702         if (seq_le(new_right, tcb_sack->right))
1703                 return;
1704         tcb_sack->right = new_right;
1705         merge_sacks_into(tcb, tcb_sack - tcb->snd.sacks);
1706         sack_has_changed(s, tcb, tcb_sack);
1707 }
1708
/* Folds seg_sack (a sack from an incoming segment) into the tcb's sorted sack
 * array: grow an overlapping/adjacent sack, update an existing one, insert
 * into the correct sorted slot, or - when the array is full - keep the
 * rightmost sack by taking over the last slot.  Any change runs the merge and
 * bookkeeping paths (merge_sacks_into / sack_has_changed). */
static void update_or_insert_sack(struct conv *s, Tcpctl *tcb,
                                  struct sack_block *seg_sack)
{
	struct sack_block *tcb_sack;

	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb_sack->left, seg_sack->left)) {
			/* This includes adjacent (which I've seen!) and overlap. */
			if (seq_le(seg_sack->left, tcb_sack->right)) {
				update_right_edge(s, tcb, tcb_sack, seg_sack->right);
				return;
			}
			continue;
		}
		/* Update existing sack */
		if (tcb_sack->left == seg_sack->left) {
			update_right_edge(s, tcb, tcb_sack, seg_sack->right);
			return;
		}
		/* Found our slot */
		if (seq_gt(tcb_sack->left, seg_sack->left)) {
			if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
				/* Out of room, but it is possible this sack overlaps later
				 * sacks, including the max sack's right edge. */
				if (seq_ge(seg_sack->right, tcb_sack->left)) {
					/* Take over the sack */
					tcb_sack->left = seg_sack->left;
					update_right_edge(s, tcb, tcb_sack, seg_sack->right);
				}
				return;
			}
			/* O/W, it's our slot and we have room (at least one spot). */
			memmove(&tcb->snd.sacks[i + 1], &tcb->snd.sacks[i],
				sizeof(struct sack_block) * (tcb->snd.nr_sacks - i));
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			tcb->snd.nr_sacks++;
			merge_sacks_into(tcb, i);
			sack_has_changed(s, tcb, tcb_sack);
			return;
		}
	}
	/* seg_sack is to the right of every existing sack. */
	if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
		/* We didn't find space in the sack array. */
		tcb_sack = &tcb->snd.sacks[MAX_NR_SND_SACKS - 1];
		/* Need to always maintain the rightmost sack, discarding the prev */
		if (seq_gt(seg_sack->right, tcb_sack->right)) {
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			sack_has_changed(s, tcb, tcb_sack);
		}
		return;
	}
	/* Append at the end: the array stays sorted. */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks];
	tcb->snd.nr_sacks++;
	tcb_sack->left = seg_sack->left;
	tcb_sack->right = seg_sack->right;
	sack_has_changed(s, tcb, tcb_sack);
}
1769
1770 /* Given the packet seg, track the sacks in TCB.  There are a few things: if seg
1771  * acks new data, some sacks might no longer be needed.  Some sacks might grow,
1772  * we might add new sacks, either of which can cause a merger.
1773  *
1774  * The important thing is that we always have the max sack entry: it must be
1775  * inserted for sure and findable.  We need that for our measurement of what
1776  * packets are in the network.
1777  *
1778  * Note that we keep sacks that are below snd.rtx (and above
1779  * seg.ack/tcb->snd.una) as best we can - we don't prune them.  We'll need those
1780  * for the in_flight estimate.
1781  *
1782  * When we run out of room, we'll have to throw away a sack.  Anything we throw
1783  * away below snd.rtx will be counted as 'in flight', even though it isn't.  If
1784  * we throw away something greater than snd.rtx, we'll also retrans it.  For
1785  * simplicity, we throw-away / replace the rightmost sack, since we're always
1786  * maintaining a highest sack. */
1787 static void update_sacks(struct conv *s, Tcpctl *tcb, Tcp *seg)
1788 {
1789         int prune = 0;
1790         struct sack_block *tcb_sack;
1791
1792         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
1793                 tcb_sack = &tcb->snd.sacks[i];
1794                 /* For the equality case, if they acked up to, but not including an old
1795                  * sack, they must have reneged it.  Otherwise they would have acked
1796                  * beyond the sack. */
1797                 if (seq_lt(seg->ack, tcb_sack->left))
1798                         break;
1799                 prune++;
1800         }
1801         if (prune) {
1802                 memmove(tcb->snd.sacks, tcb->snd.sacks + prune,
1803                         sizeof(struct sack_block) * (tcb->snd.nr_sacks - prune));
1804                 tcb->snd.nr_sacks -= prune;
1805         }
1806         for (int i = 0; i < seg->nr_sacks; i++) {
1807                 /* old sacks */
1808                 if (seq_lt(seg->sacks[i].left, seg->ack))
1809                         continue;
1810                 /* buggy sack: out of range */
1811                 if (seq_gt(seg->sacks[i].right, tcb->snd.nxt))
1812                         continue;
1813                 update_or_insert_sack(s, tcb, &seg->sacks[i]);
1814         }
1815 }
1816
/* This is a little bit of an under estimate, since we assume a packet is lost
 * once we have any sacks above it.  Overall, it's at most 2 * MSS of an
 * overestimate.
 *
 * If we have no sacks (either reneged or never used) we'll assume all packets
 * above snd.rtx are lost.  This will be the case for sackless fast rxmit
 * (Dong's stuff) or for a timeout.  In the former case, this is probably not
 * true, and in_flight should be higher, but we have no knowledge without the
 * sacks. */
static void set_in_flight(Tcpctl *tcb)
{
	struct sack_block *tcb_sack;
	uint32_t in_flight = 0;
	uint32_t from;

	/* No sacks: everything from una to rtx is considered in flight. */
	if (!tcb->snd.nr_sacks) {
		tcb->snd.in_flight = tcb->snd.rtx - tcb->snd.una;
		return;
	}

	/* Everything to the right of the unsacked */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
	in_flight += tcb->snd.nxt - tcb_sack->right;

	/* Everything retransed (from una to snd.rtx, minus sacked regions.  Note
	 * we only retrans at most the last sack's left edge.  snd.rtx will be
	 * advanced to the right edge of some sack (possibly the last one). */
	from = tcb->snd.una;
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_ge(tcb_sack->left, tcb->snd.rtx))
			break;
		/* Invariant: rtx is never inside a sack (see sack_asserter /
		 * sack_has_changed), so the whole sack is below rtx here. */
		assert(seq_ge(tcb->snd.rtx, tcb_sack->right));
		/* Count the gap below this sack; the sacked region itself is not
		 * in flight. */
		in_flight += tcb_sack->left - from;
		from = tcb_sack->right;
	}
	/* The tail gap between the last counted sack and rtx. */
	in_flight += tcb->snd.rtx - from;

	tcb->snd.in_flight = in_flight;
}
1857
1858 static void reset_recovery(struct conv *s, Tcpctl *tcb)
1859 {
1860         netlog(s->p->f, Logtcprxmt,
1861                "%I.%d -> %I.%d: recovery complete, una %u, rtx %u, nxt %u, recovery %u\n",
1862                s->laddr, s->lport, s->raddr, s->rport,
1863                tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, tcb->snd.recovery_pt);
1864         tcb->snd.recovery = 0;
1865         tcb->snd.recovery_pt = 0;
1866         tcb->snd.loss_hint = 0;
1867         tcb->snd.flush_sacks = FALSE;
1868         tcb->snd.sack_loss_hint = 0;
1869 }
1870
1871 static bool is_dup_ack(Tcpctl *tcb, Tcp *seg)
1872 {
1873         /* this is a pure ack w/o window update */
1874         return (seg->ack == tcb->snd.una) &&
1875                (tcb->snd.una != tcb->snd.nxt) &&
1876                (seg->len == 0) &&
1877                (seg->wnd == tcb->snd.wnd);
1878 }
1879
1880 /* If we have sacks, we'll ignore dupacks and look at the sacks ahead of una
1881  * (which are managed by the TCB).  The tcb will not have old sacks (below
1882  * ack/snd.rtx).  Receivers often send sacks below their ack point when we are
1883  * coming out of a loss, and we don't want those to count.
1884  *
1885  * Note the tcb could have sacks (in the future), but the receiver stopped using
1886  * them (reneged).  We'll catch that with the RTO.  If we try to catch it here,
1887  * we could get in a state where we never allow them to renege. */
1888 static bool is_potential_loss(Tcpctl *tcb, Tcp *seg)
1889 {
1890         if (seg->nr_sacks > 0)
1891                 return tcb->snd.nr_sacks > 0;
1892         else
1893                 return is_dup_ack(tcb, seg);
1894 }
1895
1896 /* When we use timestamps for RTTM, RFC 7323 suggests scaling by
1897  * expected_samples (per cwnd).  They say:
1898  *
1899  * ExpectedSamples = ceiling(FlightSize / (SMSS * 2))
1900  *
1901  * However, SMMS * 2 is really "number of bytes expected to be acked in a
1902  * packet.".  We'll use 'acked' to approximate that.  When the receiver uses
1903  * LRO, they'll send back large ACKs, which decreases the number of samples.
1904  *
1905  * If it turns out that all the divides are bad, we can just go back to not
1906  * using expected_samples at all. */
1907 static int expected_samples_ts(Tcpctl *tcb, uint32_t acked)
1908 {
1909         assert(acked);
1910         return MAX(DIV_ROUND_UP(tcb->snd.nxt - tcb->snd.una, acked), 1);
1911 }
1912
1913 /* Updates the RTT, given the currently sampled RTT and the number samples per
1914  * cwnd.  For non-TS RTTM, that'll be 1. */
1915 static void update_rtt(Tcpctl *tcb, int rtt_sample, int expected_samples)
1916 {
1917         int delta;
1918
1919         tcb->backoff = 0;
1920         tcb->backedoff = 0;
1921         if (tcb->srtt == 0) {
1922                 tcb->srtt = rtt_sample;
1923                 tcb->mdev = rtt_sample / 2;
1924         } else {
1925                 delta = rtt_sample - tcb->srtt;
1926                 tcb->srtt += (delta >> RTTM_ALPHA_SHIFT) / expected_samples;
1927                 if (tcb->srtt <= 0)
1928                         tcb->srtt = 1;
1929                 tcb->mdev += ((abs(delta) - tcb->mdev) >> RTTM_BRAVO_SHIFT) /
1930                              expected_samples;
1931                 if (tcb->mdev <= 0)
1932                         tcb->mdev = 1;
1933         }
1934         tcpsettimer(tcb);
1935 }
1936
/* Processes the ack portion of an incoming segment for conv s: advances
 * snd.una/snd.rtx, updates sacks and the in_flight estimate, detects loss
 * (dupacks / sack hints) and enters recovery, updates the send window, grows
 * cwnd (slow start / congestion avoidance), samples the RTT, and discards
 * newly-acked data from the send queue.  Called with the conv qlock held. */
static void update(struct conv *s, Tcp *seg)
{
	int rtt;
	Tcpctl *tcb;
	uint32_t acked, expand;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	/* Ignore acks outside [una, nxt] (old or not-yet-sent). */
	if (!seq_within(seg->ack, tcb->snd.una, tcb->snd.nxt))
		return;

	acked = seg->ack - tcb->snd.una;
	tcb->snd.una = seg->ack;
	/* rtx never lags the ack point */
	if (seq_gt(seg->ack, tcb->snd.rtx))
		tcb->snd.rtx = seg->ack;

	update_sacks(s, tcb, seg);
	set_in_flight(tcb);

	/* We treat either a dupack or forward SACKs as a hint that there is a loss.
	 * The RFCs suggest three dupacks before treating it as a loss (alternative
	 * is reordered packets).  We'll treat three SACKs the same way. */
	if (is_potential_loss(tcb, seg) && !tcb->snd.recovery) {
		tcb->snd.loss_hint++;
		if (tcb->snd.loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: loss hint thresh, nr sacks %u, nxt %u, una %u, cwnd %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.nr_sacks, tcb->snd.nxt, tcb->snd.una, tcb->cwind);
			tcp_loss_event(s, tcb);
			/* Recovery lasts until this point is acked. */
			tcb->snd.recovery_pt = tcb->snd.nxt;
			if (tcb->snd.nr_sacks) {
				tcb->snd.recovery = SACK_RETRANS_RECOVERY;
				tcb->snd.flush_sacks = FALSE;
				tcb->snd.sack_loss_hint = 0;
			} else {
				tcb->snd.recovery = FAST_RETRANS_RECOVERY;
			}
			tcprxmit(s);
		}
	}

	/*
	 *  update window
	 */
	if (seq_gt(seg->ack, tcb->snd.wl2)
		|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
		tcb->snd.wnd = seg->wnd;
		tcb->snd.wl2 = seg->ack;
	}

	if (!acked) {
		/*
		 *  don't let us hangup if sending into a closed window and
		 *  we're still getting acks
		 */
		if (tcb->snd.recovery && (tcb->snd.wnd == 0))
			tcb->backedoff = MAXBACKMS / 4;
		return;
	}
	/* At this point, they have acked something new. (positive ack, ack > una).
	 *
	 * If we hadn't reached the threshold for recovery yet, the positive ACK
	 * will reset our loss_hint count. */
	if (!tcb->snd.recovery)
		tcb->snd.loss_hint = 0;
	else if (seq_ge(seg->ack, tcb->snd.recovery_pt))
		reset_recovery(s, tcb);

	/* avoid slow start and timers for SYN acks */
	if ((tcb->flags & SYNACK) == 0) {
		tcb->flags |= SYNACK;
		/* The SYN consumed one sequence number but no queued data. */
		acked--;
		tcb->flgcnt--;
		goto done;
	}

	/* slow start as long as we're not recovering from lost packets */
	if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
		if (tcb->cwind < tcb->ssthresh) {
			/* We increase the cwind by every byte we receive.  We want to
			 * increase the cwind by one MSS for every MSS that gets ACKed.
			 * Note that multiple MSSs can be ACKed in a single ACK.  If we had
			 * a remainder of acked / MSS, we'd add just that remainder - not 0
			 * or 1 MSS. */
			expand = acked;
		} else {
			/* Every RTT, which consists of CWND bytes, we're supposed to expand
			 * by MSS bytes.  The classic algorithm was
			 *		expand = (tcb->mss * tcb->mss) / tcb->cwind;
			 * which assumes the ACK was for MSS bytes.  Instead, for every
			 * 'acked' bytes, we increase the window by acked / CWND (in units
			 * of MSS). */
			expand = MAX(acked, tcb->typical_mss) * tcb->typical_mss
				 / tcb->cwind;
		}

		/* Guard against unsigned overflow of cwind + expand, then cap cwind
		 * at the peer's advertised window. */
		if (tcb->cwind + expand < tcb->cwind)
			expand = tcb->snd.wnd - tcb->cwind;
		if (tcb->cwind + expand > tcb->snd.wnd)
			expand = tcb->snd.wnd - tcb->cwind;
		tcb->cwind += expand;
	}
	adjust_tx_qio_limit(s);

	if (tcb->ts_recent) {
		/* RTT from the echoed timestamp.  NOTE(review): assumes ts_ecr is in
		 * the same milliseconds() clock domain - confirm against the sender
		 * side that fills ts_val. */
		update_rtt(tcb, abs(milliseconds() - seg->ts_ecr),
			   expected_samples_ts(tcb, acked));
	} else if (tcb->rtt_timer.state == TcptimerON &&
		   seq_ge(seg->ack, tcb->rttseq)) {
		/* Adjust the timers according to the round trip time */
		tcphalt(tpriv, &tcb->rtt_timer);
		if (!tcb->snd.recovery) {
			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
			if (rtt == 0)
				rtt = 1;	/* o/w all close systems will rexmit in 0 time */
			rtt *= MSPTICK;
			update_rtt(tcb, rtt, 1);
		}
	}

done:
	/* Drop the acked bytes from the send queue. */
	if (qdiscard(s->wq, acked) < acked) {
		tcb->flgcnt--;
		/* This happened due to another bug where acked was very large
		 * (negative), which was interpreted as "hey, one less flag, since they
		 * acked one of our flags (like a SYN).  If flgcnt goes negative,
		 * get_xmit_segment() will attempt to send out large packets. */
		assert(tcb->flgcnt >= 0);
	}

	if (seq_gt(seg->ack, tcb->snd.urg))
		tcb->snd.urg = seg->ack;

	/* Keep the rxmit timer running only while data is outstanding. */
	if (tcb->snd.una != tcb->snd.nxt)
		tcpgo(tpriv, &tcb->timer);
	else
		tcphalt(tpriv, &tcb->timer);

	tcb->backoff = 0;
	tcb->backedoff = 0;
}
2081
2082 static void update_tcb_ts(Tcpctl *tcb, Tcp *seg)
2083 {
2084         /* Get timestamp info from the tcp header.  Even though the timestamps
2085          * aren't sequence numbers, we still need to protect for wraparound.  Though
2086          * if the values were 0, assume that means we need an update.  We could have
2087          * an initial ts_val that appears negative (signed). */
2088         if (!tcb->ts_recent || !tcb->last_ack_sent ||
2089             (seq_ge(seg->ts_val, tcb->ts_recent) &&
2090              seq_le(seg->seq, tcb->last_ack_sent)))
2091                 tcb->ts_recent = seg->ts_val;
2092 }
2093
2094 /* Overlap happens when one sack's left edge is inside another sack. */
2095 static bool sacks_overlap(struct sack_block *x, struct sack_block *y)
2096 {
2097         return (seq_le(x->left, y->left) && seq_le(y->left, x->right)) ||
2098                (seq_le(y->left, x->left) && seq_le(x->left, y->right));
2099 }
2100
2101 static void make_sack_first(Tcpctl *tcb, struct sack_block *tcb_sack)
2102 {
2103         struct sack_block temp;
2104
2105         if (tcb_sack == &tcb->rcv.sacks[0])
2106                 return;
2107         temp = tcb->rcv.sacks[0];
2108         tcb->rcv.sacks[0] = *tcb_sack;
2109         *tcb_sack = temp;
2110 }
2111
2112 /* Track sack in our tcb for a block of data we received.  This handles all the
2113  * stuff: making sure sack is first (since it's the most recent sack change),
2114  * updating or merging sacks, and dropping excess sacks (we only need to
2115  * maintain 3).  Unlike on the snd side, our tcb sacks are *not* sorted. */
2116 static void track_rcv_sack(Tcpctl *tcb, uint32_t left, uint32_t right)
2117 {
2118         struct sack_block *tcb_sack;
2119         struct sack_block sack[1];
2120
2121         if (!tcb->sack_ok)
2122                 return;
2123         assert(seq_lt(left, right));
2124         sack->left = left;
2125         sack->right = right;
2126         /* We can reuse an existing sack if we're merging or overlapping. */
2127         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2128                 tcb_sack = &tcb->rcv.sacks[i];
2129                 if (sacks_overlap(tcb_sack, sack)) {
2130                         tcb_sack->left = seq_min(tcb_sack->left, sack->left);
2131                         tcb_sack->right = seq_max(tcb_sack->right, sack->right);
2132                         make_sack_first(tcb, tcb_sack);
2133                         return;
2134                 }
2135         }
2136         /* We can discard the last sack (right shift) - we should have sent it at
2137          * least once by now.  If not, oh well. */
2138         memmove(tcb->rcv.sacks + 1, tcb->rcv.sacks, sizeof(struct sack_block) *
2139                 MIN(MAX_NR_RCV_SACKS - 1, tcb->rcv.nr_sacks));
2140         tcb->rcv.sacks[0] = *sack;
2141         if (tcb->rcv.nr_sacks < MAX_NR_RCV_SACKS)
2142                 tcb->rcv.nr_sacks++;
2143 }
2144
2145 /* Once we receive everything and move rcv.nxt past a sack, we don't need to
2146  * track it.  I've seen Linux report sacks in the past, but we probably
2147  * shouldn't. */
2148 static void drop_old_rcv_sacks(Tcpctl *tcb)
2149 {
2150         struct sack_block *tcb_sack;
2151
2152         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2153                 tcb_sack = &tcb->rcv.sacks[i];
2154                 /* Moving up to or past the left is enough to drop it. */
2155                 if (seq_ge(tcb->rcv.nxt, tcb_sack->left)) {
2156                         memmove(tcb->rcv.sacks + i, tcb->rcv.sacks + i + 1,
2157                                 sizeof(struct sack_block) * (tcb->rcv.nr_sacks - i - 1));
2158                         tcb->rcv.nr_sacks--;
2159                         i--;
2160                 }
2161         }
2162 }
2163
2164 static void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
2165 {
2166         ERRSTACK(1);
2167         Tcp seg;
2168         Tcp4hdr *h4;
2169         Tcp6hdr *h6;
2170         int hdrlen;
2171         Tcpctl *tcb;
2172         uint16_t length;
2173         uint8_t source[IPaddrlen], dest[IPaddrlen];
2174         struct conv *s;
2175         struct Fs *f;
2176         struct tcppriv *tpriv;
2177         uint8_t version;
2178
2179         f = tcp->f;
2180         tpriv = tcp->priv;
2181
2182         tpriv->stats[InSegs]++;
2183
2184         h4 = (Tcp4hdr *) (bp->rp);
2185         h6 = (Tcp6hdr *) (bp->rp);
2186
2187         if ((h4->vihl & 0xF0) == IP_VER4) {
2188                 uint8_t ttl;
2189
2190                 version = V4;
2191                 length = nhgets(h4->length);
2192                 v4tov6(dest, h4->tcpdst);
2193                 v4tov6(source, h4->tcpsrc);
2194
2195                 /* ttl isn't part of the xsum pseudo header, but bypass needs it. */
2196                 ttl = h4->Unused;
2197                 h4->Unused = 0;
2198                 hnputs(h4->tcplen, length - TCP4_PKT);
2199                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2200                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
2201                         tpriv->stats[CsumErrs]++;
2202                         tpriv->stats[InErrs]++;
2203                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2204                         freeblist(bp);
2205                         return;
2206                 }
2207                 h4->Unused = ttl;
2208
2209                 hdrlen = ntohtcp4(&seg, &bp);
2210                 if (hdrlen < 0) {
2211                         tpriv->stats[HlenErrs]++;
2212                         tpriv->stats[InErrs]++;
2213                         netlog(f, Logtcp, "bad tcp hdr len\n");
2214                         return;
2215                 }
2216
2217                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2218                 if (s && s->state == Bypass) {
2219                         bypass_or_drop(s, bp);
2220                         return;
2221                 }
2222
2223                 /* trim the packet to the size claimed by the datagram */
2224                 length -= hdrlen + TCP4_PKT;
2225                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
2226                 if (bp == NULL) {
2227                         tpriv->stats[LenErrs]++;
2228                         tpriv->stats[InErrs]++;
2229                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2230                         return;
2231                 }
2232         } else {
2233                 int ttl = h6->ttl;
2234                 int proto = h6->proto;
2235
2236                 version = V6;
2237                 length = nhgets(h6->ploadlen);
2238                 ipmove(dest, h6->tcpdst);
2239                 ipmove(source, h6->tcpsrc);
2240
2241                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2242                 h6->ttl = proto;
2243                 hnputl(h6->vcf, length);
2244                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2245                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
2246                         tpriv->stats[CsumErrs]++;
2247                         tpriv->stats[InErrs]++;
2248                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2249                         freeblist(bp);
2250                         return;
2251                 }
2252                 h6->ttl = ttl;
2253                 h6->proto = proto;
2254                 hnputs(h6->ploadlen, length);
2255
2256                 hdrlen = ntohtcp6(&seg, &bp);
2257                 if (hdrlen < 0) {
2258                         tpriv->stats[HlenErrs]++;
2259                         tpriv->stats[InErrs]++;
2260                         netlog(f, Logtcp, "bad tcp hdr len\n");
2261                         return;
2262                 }
2263
2264                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2265                 if (s && s->state == Bypass) {
2266                         bypass_or_drop(s, bp);
2267                         return;
2268                 }
2269
2270                 /* trim the packet to the size claimed by the datagram */
2271                 length -= hdrlen;
2272                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2273                 if (bp == NULL) {
2274                         tpriv->stats[LenErrs]++;
2275                         tpriv->stats[InErrs]++;
2276                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2277                         return;
2278                 }
2279         }
2280
2281         /* s, the conv matching the n-tuple, was set above */
2282         if (s == NULL) {
2283                 netlog(f, Logtcpreset, "iphtlook failed: src %I:%u, dst %I:%u\n",
2284                        source, seg.source, dest, seg.dest);
2285 reset:
2286                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2287                 freeblist(bp);
2288                 return;
2289         }
2290
2291         /* lock protocol for unstate Plan 9 invariants.  funcs like limbo or
2292          * incoming might rely on it. */
2293         qlock(&tcp->qlock);
2294
2295         /* if it's a listener, look for the right flags and get a new conv */
2296         tcb = (Tcpctl *) s->ptcl;
2297         if (tcb->state == Listen) {
2298                 if (seg.flags & RST) {
2299                         limborst(s, &seg, source, dest, version);
2300                         qunlock(&tcp->qlock);
2301                         freeblist(bp);
2302                         return;
2303                 }
2304
2305                 /* if this is a new SYN, put the call into limbo */
2306                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2307                         limbo(s, source, dest, &seg, version);
2308                         qunlock(&tcp->qlock);
2309                         freeblist(bp);
2310                         return;
2311                 }
2312
2313                 /* if there's a matching call in limbo, tcpincoming will return it */
2314                 s = tcpincoming(s, &seg, source, dest, version);
2315                 if (s == NULL) {
2316                         qunlock(&tcp->qlock);
2317                         goto reset;
2318                 }
2319         }
2320
2321         /* The rest of the input state machine is run with the control block
2322          * locked and implements the state machine directly out of the RFC.
2323          * Out-of-band data is ignored - it was always a bad idea.
2324          */
2325         tcb = (Tcpctl *) s->ptcl;
2326         if (waserror()) {
2327                 qunlock(&s->qlock);
2328                 nexterror();
2329         }
2330         qlock(&s->qlock);
2331         qunlock(&tcp->qlock);
2332
2333         update_tcb_ts(tcb, &seg);
2334         /* fix up window */
2335         seg.wnd <<= tcb->rcv.scale;
2336
2337         /* every input packet in puts off the keep alive time out */
2338         tcpsetkacounter(tcb);
2339
2340         switch (tcb->state) {
2341                 case Closed:
2342                         sndrst(tcp, source, dest, length, &seg, version,
2343                                    "sending to Closed");
2344                         goto raise;
2345                 case Syn_sent:
2346                         if (seg.flags & ACK) {
2347                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2348                                         sndrst(tcp, source, dest, length, &seg, version,
2349                                                    "bad seq in Syn_sent");
2350                                         goto raise;
2351                                 }
2352                         }
2353                         if (seg.flags & RST) {
2354                                 if (seg.flags & ACK)
2355                                         localclose(s, "connection refused");
2356                                 goto raise;
2357                         }
2358
2359                         if (seg.flags & SYN) {
2360                                 procsyn(s, &seg);
2361                                 if (seg.flags & ACK) {
2362                                         update(s, &seg);
2363                                         tcpsynackrtt(s);
2364                                         tcpsetstate(s, Established);
2365                                         /* Here's where we get the results of header option
2366                                          * negotiations for connections we started. (SYNACK has the
2367                                          * response) */
2368                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2369                                         tcb->sack_ok = seg.sack_ok;
2370                                 } else {
2371                                         sndrst(tcp, source, dest, length, &seg, version,
2372                                                    "Got SYN with no ACK");
2373                                         goto raise;
2374                                 }
2375
2376                                 if (length != 0 || (seg.flags & FIN))
2377                                         break;
2378
2379                                 freeblist(bp);
2380                                 goto output;
2381                         } else
2382                                 freeblist(bp);
2383
2384                         qunlock(&s->qlock);
2385                         poperror();
2386                         return;
2387         }
2388
2389         /*
2390          *  One DOS attack is to open connections to us and then forget about them,
2391          *  thereby tying up a conv at no long term cost to the attacker.
2392          *  This is an attempt to defeat these stateless DOS attacks.  See
2393          *  corresponding code in tcpsendka().
2394          */
2395         if ((seg.flags & RST) == 0) {
2396                 if (tcpporthogdefense
2397                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2398                                                   tcb->snd.una - (1 << 29))) {
2399                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2400                                    source, seg.source, dest, seg.dest, seg.flags,
2401                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2402                         localclose(s, "stateless hog");
2403                 }
2404         }
2405
2406         /* Cut the data to fit the receive window */
2407         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2408                 netlog(f, Logtcp, "%I.%d -> %I.%d: tcp len < 0, %lu %d\n",
2409                        s->raddr, s->rport, s->laddr, s->lport, seg.seq, length);
2410                 update(s, &seg);
2411                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2412                         tcphalt(tpriv, &tcb->rtt_timer);
2413                         tcphalt(tpriv, &tcb->acktimer);
2414                         tcphalt(tpriv, &tcb->katimer);
2415                         tcpsetstate(s, Time_wait);
2416                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2417                         tcpgo(tpriv, &tcb->timer);
2418                 }
2419                 if (!(seg.flags & RST)) {
2420                         tcb->flags |= FORCE;
2421                         goto output;
2422                 }
2423                 qunlock(&s->qlock);
2424                 poperror();
2425                 return;
2426         }
2427
2428         /* Cannot accept so answer with a rst */
2429         if (length && tcb->state == Closed) {
2430                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2431                 goto raise;
2432         }
2433
2434         /* The segment is beyond the current receive pointer so
2435          * queue the data in the resequence queue
2436          */
2437         if (seg.seq != tcb->rcv.nxt)
2438                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2439                         update(s, &seg);
2440                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2441                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2442                                            s->lport);
2443                         tcb->flags |= FORCE;
2444                         goto output;
2445                 }
2446
2447         /*
2448          *  keep looping till we've processed this packet plus any
2449          *  adjacent packets in the resequence queue
2450          */
2451         for (;;) {
2452                 if (seg.flags & RST) {
2453                         if (tcb->state == Established) {
2454                                 tpriv->stats[EstabResets]++;
2455                                 if (tcb->rcv.nxt != seg.seq)
2456                                         printd
2457                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2458                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2459                                                  seg.seq);
2460                         }
2461                         localclose(s, "connection refused");
2462                         goto raise;
2463                 }
2464
2465                 if ((seg.flags & ACK) == 0)
2466                         goto raise;
2467
2468                 switch (tcb->state) {
2469                         case Established:
2470                         case Close_wait:
2471                                 update(s, &seg);
2472                                 break;
2473                         case Finwait1:
2474                                 update(s, &seg);
2475                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2476                                         tcphalt(tpriv, &tcb->rtt_timer);
2477                                         tcphalt(tpriv, &tcb->acktimer);
2478                                         tcpsetkacounter(tcb);
2479                                         tcb->time = NOW;
2480                                         tcpsetstate(s, Finwait2);
2481                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2482                                         tcpgo(tpriv, &tcb->katimer);
2483                                 }
2484                                 break;
2485                         case Finwait2:
2486                                 update(s, &seg);
2487                                 break;
2488                         case Closing:
2489                                 update(s, &seg);
2490                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2491                                         tcphalt(tpriv, &tcb->rtt_timer);
2492                                         tcphalt(tpriv, &tcb->acktimer);
2493                                         tcphalt(tpriv, &tcb->katimer);
2494                                         tcpsetstate(s, Time_wait);
2495                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2496                                         tcpgo(tpriv, &tcb->timer);
2497                                 }
2498                                 break;
2499                         case Last_ack:
2500                                 update(s, &seg);
2501                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2502                                         localclose(s, NULL);
2503                                         goto raise;
2504                                 }
2505                         case Time_wait:
2506                                 tcb->flags |= FORCE;
2507                                 if (tcb->timer.state != TcptimerON)
2508                                         tcpgo(tpriv, &tcb->timer);
2509                 }
2510
2511                 if ((seg.flags & URG) && seg.urg) {
2512                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2513                                 tcb->rcv.urg = seg.urg + seg.seq;
2514                                 pullblock(&bp, seg.urg);
2515                         }
2516                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2517                         tcb->rcv.urg = tcb->rcv.nxt;
2518
2519                 if (length == 0) {
2520                         if (bp != NULL)
2521                                 freeblist(bp);
2522                 } else {
2523                         switch (tcb->state) {
2524                                 default:
2525                                         /* Ignore segment text */
2526                                         if (bp != NULL)
2527                                                 freeblist(bp);
2528                                         break;
2529
2530                                 case Established:
2531                                 case Finwait1:
2532                                         /* If we still have some data place on
2533                                          * receive queue
2534                                          */
2535                                         if (bp) {
2536                                                 bp = packblock(bp);
2537                                                 if (bp == NULL)
2538                                                         panic("tcp packblock");
2539                                                 qpassnolim(s->rq, bp);
2540                                                 bp = NULL;
2541
2542                                                 /*
2543                                                  *  Force an ack every 2 data messages.  This is
2544                                                  *  a hack for rob to make his home system run
2545                                                  *  faster.
2546                                                  *
2547                                                  *  this also keeps the standard TCP congestion
2548                                                  *  control working since it needs an ack every
2549                                                  *  2 max segs worth.  This is not quite that,
2550                                                  *  but under a real stream is equivalent since
2551                                                  *  every packet has a max seg in it.
2552                                                  */
2553                                                 if (++(tcb->rcv.una) >= 2)
2554                                                         tcb->flags |= FORCE;
2555                                         }
2556                                         tcb->rcv.nxt += length;
2557                                         drop_old_rcv_sacks(tcb);
2558
2559                                         /*
2560                                          *  update our rcv window
2561                                          */
2562                                         tcprcvwin(s);
2563
2564                                         /*
2565                                          *  turn on the acktimer if there's something
2566                                          *  to ack
2567                                          */
2568                                         if (tcb->acktimer.state != TcptimerON)
2569                                                 tcpgo(tpriv, &tcb->acktimer);
2570
2571                                         break;
2572                                 case Finwait2:
2573                                         /* no process to read the data, send a reset */
2574                                         if (bp != NULL)
2575                                                 freeblist(bp);
2576                                         sndrst(tcp, source, dest, length, &seg, version,
2577                                                    "send to Finwait2");
2578                                         qunlock(&s->qlock);
2579                                         poperror();
2580                                         return;
2581                         }
2582                 }
2583
2584                 if (seg.flags & FIN) {
2585                         tcb->flags |= FORCE;
2586
2587                         switch (tcb->state) {
2588                                 case Established:
2589                                         tcb->rcv.nxt++;
2590                                         tcpsetstate(s, Close_wait);
2591                                         break;
2592                                 case Finwait1:
2593                                         tcb->rcv.nxt++;
2594                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2595                                                 tcphalt(tpriv, &tcb->rtt_timer);
2596                                                 tcphalt(tpriv, &tcb->acktimer);
2597                                                 tcphalt(tpriv, &tcb->katimer);
2598                                                 tcpsetstate(s, Time_wait);
2599                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2600                                                 tcpgo(tpriv, &tcb->timer);
2601                                         } else
2602                                                 tcpsetstate(s, Closing);
2603                                         break;
2604                                 case Finwait2:
2605                                         tcb->rcv.nxt++;
2606                                         tcphalt(tpriv, &tcb->rtt_timer);
2607                                         tcphalt(tpriv, &tcb->acktimer);
2608                                         tcphalt(tpriv, &tcb->katimer);
2609                                         tcpsetstate(s, Time_wait);
2610                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2611                                         tcpgo(tpriv, &tcb->timer);
2612                                         break;
2613                                 case Close_wait:
2614                                 case Closing:
2615                                 case Last_ack:
2616                                         break;
2617                                 case Time_wait:
2618                                         tcpgo(tpriv, &tcb->timer);
2619                                         break;
2620                         }
2621                 }
2622
2623                 /*
2624                  *  get next adjacent segment from the resequence queue.
2625                  *  dump/trim any overlapping segments
2626                  */
2627                 for (;;) {
2628                         if (tcb->reseq == NULL)
2629                                 goto output;
2630
2631                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2632                                 goto output;
2633
2634                         getreseq(tcb, &seg, &bp, &length);
2635
2636                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2637                                 break;
2638                 }
2639         }
2640 output:
2641         tcpoutput(s);
2642         qunlock(&s->qlock);
2643         poperror();
2644         return;
2645 raise:
2646         qunlock(&s->qlock);
2647         poperror();
2648         freeblist(bp);
2649         tcpkick(s);
2650 }
2651
2652 /* The advertised mss = data + TCP headers */
2653 static uint16_t derive_payload_mss(Tcpctl *tcb)
2654 {
2655         uint16_t payload_mss = tcb->mss;
2656         uint16_t opt_size = 0;
2657
2658         if (tcb->ts_recent) {
2659                 opt_size += TS_LENGTH;
2660                 /* Note that when we're a SYN, we overestimate slightly.  This is safe,
2661                  * and not really a problem. */
2662                 opt_size += TS_SEND_PREPAD;
2663         }
2664         if (tcb->rcv.nr_sacks)
2665                 opt_size += 2 + tcb->rcv.nr_sacks * 8;
2666         opt_size = ROUNDUP(opt_size, 4);
2667         payload_mss -= opt_size;
2668         return payload_mss;
2669 }
2670
2671 /* Decreases the xmit amt, given the MSS / TSO. */
2672 static uint32_t throttle_for_mss(Tcpctl *tcb, uint32_t ssize,
2673                                  uint16_t payload_mss, bool retrans)
2674 {
2675         if (ssize > payload_mss) {
2676                 if ((tcb->flags & TSO) == 0) {
2677                         ssize = payload_mss;
2678                 } else {
2679                         /* Don't send too much.  32K is arbitrary.. */
2680                         if (ssize > 32 * 1024)
2681                                 ssize = 32 * 1024;
2682                         if (!retrans) {
2683                                 /* Clamp xmit to an integral MSS to avoid ragged tail segments
2684                                  * causing poor link utilization. */
2685                                 ssize = ROUNDDOWN(ssize, payload_mss);
2686                         }
2687                 }
2688         }
2689         return ssize;
2690 }
2691
2692 /* Reduces ssize for a variety of reasons.  Returns FALSE if we should abort
2693  * sending the packet.  o/w returns TRUE and modifies ssize by reference. */
2694 static bool throttle_ssize(struct conv *s, Tcpctl *tcb, uint32_t *ssize_p,
2695                            uint16_t payload_mss, bool retrans)
2696 {
2697         struct Fs *f = s->p->f;
2698         uint32_t usable;
2699         uint32_t ssize = *ssize_p;
2700
2701         /* Compute usable segment based on offered window and limit
2702          * window probes to one */
2703         if (tcb->snd.wnd == 0) {
2704                 if (tcb->snd.in_flight != 0) {
2705                         if ((tcb->flags & FORCE) == 0)
2706                                 return FALSE;
2707                 }
2708                 usable = 1;
2709         } else {
2710                 usable = tcb->cwind;
2711                 if (tcb->snd.wnd < usable)
2712                         usable = tcb->snd.wnd;
2713                 if (usable > tcb->snd.in_flight)
2714                         usable -= tcb->snd.in_flight;
2715                 else
2716                         usable = 0;
2717                 /* Avoid Silly Window Syndrome.  This is a little different thant RFC
2718                  * 813.  I took their additional enhancement of "< MSS" as an AND, not
2719                  * an OR.  25% of a large snd.wnd is pretty large, and our main goal is
2720                  * to avoid packets smaller than MSS.  I still use the 25% threshold,
2721                  * because it is important that there is *some* data in_flight.  If
2722                  * usable < MSS because snd.wnd is very small (but not 0), we might
2723                  * never get an ACK and would need to set up a timer.
2724                  *
2725                  * Also, I'm using 'ssize' as a proxy for a PSH point.  If there's just
2726                  * a small blob in the qio (or retrans!), then we might as well just
2727                  * send it. */
2728                 if ((usable < tcb->typical_mss) && (usable < tcb->snd.wnd >> 2)
2729                     && (usable < ssize)) {
2730                         return FALSE;
2731                 }
2732         }
2733         if (ssize && usable < 2)
2734                 netlog(s->p->f, Logtcpverbose,
2735                        "%I.%d -> %I.%d: throttled snd.wnd %lu cwind %lu\n",
2736                        s->laddr, s->lport, s->raddr, s->rport,
2737                        tcb->snd.wnd, tcb->cwind);
2738         if (usable < ssize)
2739                 ssize = usable;
2740
2741         ssize = throttle_for_mss(tcb, ssize, payload_mss, retrans);
2742
2743         *ssize_p = ssize;
2744         return TRUE;
2745 }
2746
/* Helper, picks the next segment to send, which is possibly a retransmission.
 * Returns TRUE if we have a segment, FALSE o/w.  Returns ssize, from_seq, and
 * sent by reference.
 *
 * from_seq is the seq number we are transmitting from.
 *
 * sent includes all seq from una to from_seq *including* any previously sent
 * flags (part of tcb->flgcnt), for instance an unacknowledged SYN (which counts
 * as a seq number).  Those flags are in the e.g. snd.nxt - snd.una range, and
 * they get dropped after qdiscard.
 *
 * ssize is the amount of data we are sending, starting from from_seq, and it
 * will include any *new* flags, which haven't been accounted for yet.
 *
 * tcb->flgcnt consists of the flags both in ssize and in sent.
 *
 * Note that we could be in recovery and not sack_retrans a segment. */
static bool get_xmit_segment(struct conv *s, Tcpctl *tcb, uint16_t payload_mss,
                             uint32_t *from_seq_p, uint32_t *sent_p,
                             uint32_t *ssize_p)
{
	struct Fs *f = s->p->f;
	struct tcppriv *tpriv = s->p->priv;
	uint32_t ssize, sent, from_seq;
	bool sack_retrans = FALSE;
	struct sack_block *tcb_sack = 0;

	/* First preference: fill a hole below a SACKed block.  rtx trails the
	 * lowest unSACKed seq we still owe the peer; if it is left of some
	 * sack block, the gap [rtx, sack->left) is presumed lost. */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb->snd.rtx, tcb_sack->left)) {
			/* So ssize is supposed to include any *new* flags to flgcnt, which
			 * at this point would be a FIN.
			 *
			 * It might be possible that flgcnt is incremented so we send a FIN,
			 * even for an intermediate sack retrans.  Perhaps the user closed
			 * the conv.
			 *
			 * However, the way the "flgcnt for FIN" works is that it inflates
			 * the desired amount we'd like to send (qlen + flgcnt).
			 * Eventually, we reach the end of the queue and fail to extract all
			 * of dsize.  At that point, we put on the FIN, and that's where the
			 * extra 'byte' comes from.
			 *
			 * For sack retrans, since we're extracting from parts of the qio
			 * that aren't the right-most edge, we don't need to consider flgcnt
			 * when setting ssize. */
			from_seq = tcb->snd.rtx;
			sent = from_seq - tcb->snd.una;
			ssize = tcb_sack->left - from_seq;
			sack_retrans = TRUE;
			break;
		}
	}
	/* SACK holes have first dibs, but we can still opportunistically send new
	 * data.
	 *
	 * During other types of recovery, we'll just send from the retrans point.
	 * If we're in an RTO while we still have sacks, we could be resending data
	 * that wasn't lost.  Consider a sack that is still growing (usually the
	 * right-most), but we haven't received the ACK yet.  rxt may be included in
	 * that area.  Given we had two losses or otherwise timed out, I'm not too
	 * concerned.
	 *
	 * Note that Fast and RTO can send data beyond nxt.  If we change that,
	 * change the accounting below. */
	if (!sack_retrans) {
		switch (tcb->snd.recovery) {
		default:
		case SACK_RETRANS_RECOVERY:
			/* Normal op (and SACK recovery with no holes): send new
			 * data from the right edge. */
			from_seq = tcb->snd.nxt;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			/* Non-SACK recovery: resend from the retransmit point. */
			from_seq = tcb->snd.rtx;
			break;
		}
		sent = from_seq - tcb->snd.una;
		/* qlen + flgcnt is every seq we want to have sent, including unack'd
		 * data, unacked flags, and new flags. */
		ssize = qlen(s->wq) + tcb->flgcnt - sent;
	}

	/* May shrink ssize (window, cwnd, SWS, MSS/TSO) or abort entirely. */
	if (!throttle_ssize(s, tcb, &ssize, payload_mss, sack_retrans))
		return FALSE;

	/* This counts flags, which is a little hokey, but it's okay since in_flight
	 * gets reset on each ACK */
	tcb->snd.in_flight += ssize;
	/* Log and track rxmit.  This covers both SACK (retrans) and fast rxmit. */
	if (ssize && seq_lt(tcb->snd.rtx, tcb->snd.nxt)) {
		netlog(f, Logtcpverbose,
		       "%I.%d -> %I.%d: rxmit: rtx %u amt %u, nxt %u\n",
		       s->laddr, s->lport, s->raddr, s->rport,
		       tcb->snd.rtx, MIN(tcb->snd.nxt - tcb->snd.rtx, ssize),
		       tcb->snd.nxt);
		tpriv->stats[RetransSegs]++;
	}
	if (sack_retrans) {
		/* If we'll send up to the left edge, advance snd.rtx to the right.
		 *
		 * This includes the largest sack.  It might get removed later, in which
		 * case we'll underestimate the amount in-flight.  The alternative is to
		 * not count the rightmost sack, but when it gets removed, we'll retrans
		 * it anyway.  No matter what, we'd count it. */
		tcb->snd.rtx += ssize;
		if (tcb->snd.rtx == tcb_sack->left)
			tcb->snd.rtx = tcb_sack->right;
		/* RFC 6675 says we MAY rearm the RTO timer on each retrans, since we
		 * might not be getting ACKs for a while. */
		tcpsettimer(tcb);
	} else {
		switch (tcb->snd.recovery) {
		default:
			/* under normal op, we drag rtx along with nxt.  this prevents us
			 * from sending sacks too early (up above), since rtx doesn't get
			 * reset to una until we have a loss (e.g. 3 dupacks/sacks). */
			tcb->snd.nxt += ssize;
			tcb->snd.rtx = tcb->snd.nxt;
			break;
		case SACK_RETRANS_RECOVERY:
			/* We explicitly do not want to increase rtx here.  We might still
			 * need it to fill in a sack gap below nxt if we get new, higher
			 * sacks. */
			tcb->snd.nxt += ssize;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			tcb->snd.rtx += ssize;
			/* Fast and RTO can send new data, advancing nxt. */
			if (seq_gt(tcb->snd.rtx, tcb->snd.nxt))
				tcb->snd.nxt = tcb->snd.rtx;
			break;
		}
	}
	*from_seq_p = from_seq;
	*sent_p = sent;
	*ssize_p = ssize;

	return TRUE;
}
2887
2888 /*
2889  *  always enters and exits with the s locked.  We drop
2890  *  the lock to ipoput the packet so some care has to be
2891  *  taken by callers.
2892  */
2893 static void tcpoutput(struct conv *s)
2894 {
2895         Tcp seg;
2896         int msgs;
2897         int next_yield = 1;
2898         Tcpctl *tcb;
2899         struct block *hbp, *bp;
2900         uint32_t ssize, dsize, sent, from_seq;
2901         struct Fs *f;
2902         struct tcppriv *tpriv;
2903         uint8_t version;
2904         uint16_t payload_mss;
2905
2906         f = s->p->f;
2907         tpriv = s->p->priv;
2908         version = s->ipversion;
2909
2910         for (msgs = 0; msgs < 100; msgs++) {
2911                 tcb = (Tcpctl *) s->ptcl;
2912
2913                 switch (tcb->state) {
2914                         case Listen:
2915                         case Closed:
2916                         case Finwait2:
2917                                 return;
2918                 }
2919
2920                 /* force an ack when a window has opened up */
2921                 if (tcb->rcv.blocked && tcb->rcv.wnd >= tcb->mss) {
2922                         tcb->rcv.blocked = 0;
2923                         tcb->flags |= FORCE;
2924                 }
2925
2926                 /* Don't send anything else until our SYN has been acked */
2927                 if (tcb->snd.nxt != tcb->iss && (tcb->flags & SYNACK) == 0)
2928                         break;
2929
2930                 /* payload_mss is the actual amount of data in the packet, which is the
2931                  * advertised (mss - header opts).  This varies from packet to packet,
2932                  * based on the options that might be present (e.g. always timestamps,
2933                  * sometimes SACKs) */
2934                 payload_mss = derive_payload_mss(tcb);
2935
2936                 if (!get_xmit_segment(s, tcb, payload_mss, &from_seq, &sent, &ssize))
2937                         break;
2938
2939                 dsize = ssize;
2940                 seg.urg = 0;
2941
2942                 if (ssize == 0)
2943                         if ((tcb->flags & FORCE) == 0)
2944                                 break;
2945
2946                 tcb->flags &= ~FORCE;
2947                 tcprcvwin(s);
2948
2949                 /* By default we will generate an ack, so we can normally turn off the
2950                  * timer.  If we're blocked, we'll want the timer so we can send a
2951                  * window update. */
2952                 if (!tcb->rcv.blocked)
2953                         tcphalt(tpriv, &tcb->acktimer);
2954                 tcb->rcv.una = 0;
2955                 seg.source = s->lport;
2956                 seg.dest = s->rport;
2957                 seg.flags = ACK;
2958                 seg.mss = 0;
2959                 seg.ws = 0;
2960                 seg.sack_ok = FALSE;
2961                 seg.nr_sacks = 0;
2962                 /* When outputting, Syn_sent means "send the Syn", for connections we
2963                  * initiate.  SYNACKs are sent from sndsynack directly. */
2964                 if (tcb->state == Syn_sent) {
2965                         seg.flags = 0;
2966                         seg.sack_ok = SACK_SUPPORTED;   /* here's where we advertise SACK */
2967                         if (tcb->snd.nxt - ssize == tcb->iss) {
2968                                 seg.flags |= SYN;
2969                                 dsize--;
2970                                 seg.mss = tcb->mss;
2971                                 seg.ws = tcb->scale;
2972                         } else {
2973                                 /* TODO: Not sure why we'd get here. */
2974                                 warn("TCP: weird Syn_sent state, tell someone you saw this");
2975                         }
2976                 }
2977                 seg.seq = from_seq;
2978                 seg.ack = tcb->rcv.nxt;
2979                 tcb->last_ack_sent = seg.ack;
2980                 seg.wnd = tcb->rcv.wnd;
2981                 seg.ts_val = tcb->ts_recent;
2982
2983                 /* Pull out data to send */
2984                 bp = NULL;
2985                 if (dsize != 0) {
2986                         bp = qcopy(s->wq, dsize, sent);
2987                         if (BLEN(bp) != dsize) {
2988                                 /* Here's where the flgcnt kicked in.  Note dsize is
2989                                  * decremented, but ssize isn't.  Not that we use ssize for much
2990                                  * anymore.  Decrementing dsize prevents us from sending a PSH
2991                                  * with the FIN. */
2992                                 seg.flags |= FIN;
2993                                 dsize--;
2994                         }
2995                         if (BLEN(bp) > payload_mss) {
2996                                 bp->flag |= Btso;
2997                                 bp->mss = payload_mss;
2998                         }
2999                 }
3000
3001                 if (sent + dsize == qlen(s->wq) + tcb->flgcnt)
3002                         seg.flags |= PSH;
3003
3004                 /* Build header, link data and compute cksum */
3005                 switch (version) {
3006                         case V4:
3007                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3008                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
3009                                 if (hbp == NULL) {
3010                                         freeblist(bp);
3011                                         return;
3012                                 }
3013                                 break;
3014                         case V6:
3015                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3016                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
3017                                 if (hbp == NULL) {
3018                                         freeblist(bp);
3019                                         return;
3020                                 }
3021                                 break;
3022                         default:
3023                                 hbp = NULL;     /* to suppress a warning */
3024                                 panic("tcpoutput: version %d", version);
3025                 }
3026
3027                 /* Start the transmission timers if there is new data and we
3028                  * expect acknowledges
3029                  */
3030                 if (ssize != 0) {
3031                         if (tcb->timer.state != TcptimerON)
3032                                 tcpgo(tpriv, &tcb->timer);
3033
3034                         if (!tcb->ts_recent && (tcb->rtt_timer.state != TcptimerON)) {
3035                                 /* If round trip timer isn't running, start it. */
3036                                 tcpgo(tpriv, &tcb->rtt_timer);
3037                                 tcb->rttseq = from_seq + ssize;
3038                         }
3039                 }
3040
3041                 tpriv->stats[OutSegs]++;
3042
3043                 /* put off the next keep alive */
3044                 tcpgo(tpriv, &tcb->katimer);
3045
3046                 switch (version) {
3047                         case V4:
3048                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3049                                         /* a negative return means no route */
3050                                         localclose(s, "no route");
3051                                 }
3052                                 break;
3053                         case V6:
3054                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3055                                         /* a negative return means no route */
3056                                         localclose(s, "no route");
3057                                 }
3058                                 break;
3059                         default:
3060                                 panic("tcpoutput2: version %d", version);
3061                 }
3062                 if (ssize) {
3063                         /* The outer loop thinks we sent one packet.  If we used TSO, we
3064                          * might have sent several.  Minus one for the loop increment. */
3065                         msgs += DIV_ROUND_UP(ssize, payload_mss) - 1;
3066                 }
3067                 /* Old Plan 9 tidbit - yield every four messages.  We want to break out
3068                  * and unlock so we can process inbound ACKs which might do things like
3069                  * say "slow down". */
3070                 if (msgs >= next_yield) {
3071                         next_yield = msgs + 4;
3072                         qunlock(&s->qlock);
3073                         kthread_yield();
3074                         qlock(&s->qlock);
3075                 }
3076         }
3077 }
3078
3079 /*
3080  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
3081  */
static void tcpsendka(struct conv *s)
{
	Tcp seg;
	Tcpctl *tcb;
	struct block *hbp, *dbp;

	tcb = (Tcpctl *) s->ptcl;

	dbp = NULL;
	seg.urg = 0;
	seg.source = s->lport;
	seg.dest = s->rport;
	seg.flags = ACK | PSH;
	seg.mss = 0;
	seg.ws = 0;
	seg.sack_ok = FALSE;
	seg.nr_sacks = 0;
	if (tcpporthogdefense)
		/* Randomize the sequence so an attacker holding ports can't
		 * predict our keepalive probes. */
		urandom_read(&seg.seq, sizeof(seg.seq));
	else
		/* BSD convention: probe with the last acked sequence (una - 1)
		 * so the peer is forced to respond with an ACK. */
		seg.seq = tcb->snd.una - 1;
	seg.ack = tcb->rcv.nxt;
	tcb->last_ack_sent = seg.ack;
	tcb->rcv.una = 0;
	seg.wnd = tcb->rcv.wnd;
	seg.ts_val = tcb->ts_recent;
	if (tcb->state == Finwait2) {
		seg.flags |= FIN;
	} else {
		/* One garbage payload byte (wp++ without writing) to carry the
		 * out-of-window sequence number of the probe. */
		dbp = block_alloc(1, MEM_WAIT);
		dbp->wp++;
	}

	if (isv4(s->raddr)) {
		/* Build header, link data and compute cksum */
		tcb->protohdr.tcp4hdr.vihl = IP_VER4;
		hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
		if (hbp == NULL) {
			freeblist(dbp);
			return;
		}
		ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
	} else {
		/* Build header, link data and compute cksum */
		tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
		hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
		if (hbp == NULL) {
			freeblist(dbp);
			return;
		}
		ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
	}
}
3135
3136 /*
3137  *  set connection to time out after 12 minutes
3138  */
3139 static void tcpsetkacounter(Tcpctl *tcb)
3140 {
3141         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
3142         if (tcb->kacounter < 3)
3143                 tcb->kacounter = 3;
3144 }
3145
3146 /*
3147  *  if we've timed out, close the connection
3148  *  otherwise, send a keepalive and restart the timer
3149  */
/* Keepalive timer callback: counts down kacounter; when it hits zero the
 * connection is closed, otherwise a keepalive probe is sent and the timer is
 * restarted.  Runs with s->qlock held; the waserror/poperror pair guarantees
 * the lock is dropped even if a callee throws. */
static void tcpkeepalive(void *v)
{
	ERRSTACK(1);
	Tcpctl *tcb;
	struct conv *s;

	s = v;
	tcb = (Tcpctl *) s->ptcl;
	qlock(&s->qlock);
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	if (tcb->state != Closed) {
		if (--(tcb->kacounter) <= 0) {
			localclose(s, "connection timed out");
		} else {
			tcpsendka(s);
			tcpgo(s->p->priv, &tcb->katimer);
		}
	}
	qunlock(&s->qlock);
	poperror();
}
3174
3175 /*
3176  *  start keepalive timer
3177  */
3178 static void tcpstartka(struct conv *s, char **f, int n)
3179 {
3180         Tcpctl *tcb;
3181         int x;
3182
3183         tcb = (Tcpctl *) s->ptcl;
3184         if (tcb->state != Established)
3185                 error(ENOTCONN, "connection must be in Establised state");
3186         if (n > 1) {
3187                 x = atoi(f[1]);
3188                 if (x >= MSPTICK)
3189                         tcb->katimer.start = x / MSPTICK;
3190         }
3191         tcpsetkacounter(tcb);
3192         tcpgo(s->p->priv, &tcb->katimer);
3193 }
3194
3195 /*
3196  *  turn checksums on/off
3197  */
3198 static void tcpsetchecksum(struct conv *s, char **f, int unused)
3199 {
3200         Tcpctl *tcb;
3201
3202         tcb = (Tcpctl *) s->ptcl;
3203         tcb->nochecksum = !atoi(f[1]);
3204 }
3205
3206 static void tcp_loss_event(struct conv *s, Tcpctl *tcb)
3207 {
3208         uint32_t old_cwnd = tcb->cwind;
3209
3210         /* Reno */
3211         tcb->ssthresh = tcb->cwind / 2;
3212         tcb->cwind = tcb->ssthresh;
3213         netlog(s->p->f, Logtcprxmt,
3214                "%I.%d -> %I.%d: loss event, cwnd was %d, now %d\n",
3215                s->laddr, s->lport, s->raddr, s->rport,
3216                old_cwnd, tcb->cwind);
3217 }
3218
3219 /* Called when we need to retrans the entire outstanding window (everything
3220  * previously sent, but unacknowledged). */
/* Called when we need to retrans the entire outstanding window (everything
 * previously sent, but unacknowledged). */
static void tcprxmit(struct conv *s)
{
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;

	/* FORCE makes tcpoutput transmit even when it otherwise wouldn't. */
	tcb->flags |= FORCE;
	/* Rewind the retransmit pointer to the oldest unacked byte. */
	tcb->snd.rtx = tcb->snd.una;
	/* Must recompute in_flight after moving rtx, before outputting. */
	set_in_flight(tcb);

	tcpoutput(s);
}
3233
3234 /* The original RFC said to drop sacks on a timeout, since the receiver could
3235  * renege.  Later RFCs say we can keep them around, so long as we are careful.
3236  *
3237  * We'll go with a "flush if we have two timeouts" plan.  This doesn't have to
3238  * be perfect - there might be cases where we accidentally flush the sacks too
3239  * often.  Perhaps we never get dup_acks to start fast/sack rxmit.  The main
3240  * thing is that after multiple timeouts we flush the sacks, since the receiver
3241  * might renege.
3242  *
3243  * We also have an Akaros-specific problem.  We use the sacks to determine
3244  * in_flight.  Specifically, the (snd.nxt - upper right edge) is tracked as in
3245  * flight.  Usually the receiver will keep sacking that right edge all the way
3246  * up to snd.nxt, but they might not, and the gap might be quite large.  After a
3247  * timeout, that data is definitely not in flight.  If that block's size is
3248  * greater than cwnd, we'll never transmit.  This should be rare, and in that
3249  * case we can just dump the sacks.  The typical_mss fudge factor is so we can
3250  * send a reasonably-sized packet. */
3251 static void timeout_handle_sacks(Tcpctl *tcb)
3252 {
3253         struct sack_block *last_sack;
3254
3255         if (tcb->snd.nr_sacks) {
3256                 last_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
3257                 if (tcb->snd.flush_sacks || (tcb->snd.nxt - last_sack->right >=
3258                                              tcb->cwind - tcb->typical_mss)) {
3259                         tcb->snd.nr_sacks = 0;
3260                         tcb->snd.flush_sacks = FALSE;
3261                 } else {
3262                         tcb->snd.flush_sacks = TRUE;
3263                 }
3264         }
3265 }
3266
/* Retransmission-timer callback.  On an active connection: back off the
 * timer, give up after MAXBACKMS (half that for embryonic Syn_sent), record a
 * loss event, enter RTO recovery, and retransmit the whole outstanding
 * window.  Time_wait expiry closes the conversation; Closed is a no-op.
 * Holds s->qlock; waserror/poperror ensure the lock is released on throw. */
static void tcptimeout(void *arg)
{
	ERRSTACK(1);
	struct conv *s;
	Tcpctl *tcb;
	int maxback;
	struct tcppriv *tpriv;

	s = (struct conv *)arg;
	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	switch (tcb->state) {
		default:
			tcb->backoff++;
			if (tcb->state == Syn_sent)
				maxback = MAXBACKMS / 2;
			else
				maxback = MAXBACKMS;
			tcb->backedoff += tcb->timer.start * MSPTICK;
			if (tcb->backedoff >= maxback) {
				localclose(s, "connection timed out");
				break;
			}
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: timeout rxmit una %u, rtx %u, nxt %u, in_flight %u, timer.start %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, tcb->snd.in_flight,
			       tcb->timer.start);
			tcpsettimer(tcb);
			tcp_loss_event(s, tcb);
			/* Advance the recovery point.  Any dupacks/sacks below this won't
			 * trigger a new loss, since we won't reset_recovery() until we ack
			 * past recovery_pt. */
			tcb->snd.recovery = RTO_RETRANS_RECOVERY;
			tcb->snd.recovery_pt = tcb->snd.nxt;
			timeout_handle_sacks(tcb);
			tcprxmit(s);
			tpriv->stats[RetransTimeouts]++;
			break;
		case Time_wait:
			localclose(s, NULL);
			break;
		case Closed:
			break;
	}
	qunlock(&s->qlock);
	poperror();
}
3321
/* Returns non-zero if seq lies in the advertised receive window
 * [rcv.nxt, rcv.nxt + rcv.wnd - 1], using modular sequence comparison. */
static int inwindow(Tcpctl *tcb, int seq)
{
	return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1);
}
3326
3327 /*
3328  *  set up state for a received SYN (or SYN ACK) packet
3329  */
3330 static void procsyn(struct conv *s, Tcp *seg)
3331 {
3332         Tcpctl *tcb;
3333
3334         tcb = (Tcpctl *) s->ptcl;
3335         tcb->flags |= FORCE;
3336
3337         tcb->rcv.nxt = seg->seq + 1;
3338         tcb->rcv.urg = tcb->rcv.nxt;
3339         tcb->irs = seg->seq;
3340
3341         /* our sending max segment size cannot be bigger than what he asked for */
3342         if (seg->mss != 0 && seg->mss < tcb->mss) {
3343                 tcb->mss = seg->mss;
3344                 tcb->typical_mss = tcb->mss;
3345         }
3346         adjust_typical_mss_for_opts(seg, tcb);
3347
3348         tcb->snd.wnd = seg->wnd;
3349         tcb->cwind = tcb->typical_mss * CWIND_SCALE;
3350 }
3351
3352 static int addreseq(Tcpctl *tcb, struct tcppriv *tpriv, Tcp *seg,
3353                     struct block *bp, uint16_t length)
3354 {
3355         Reseq *rp, *rp1;
3356         int i, rqlen, qmax;
3357
3358         rp = kzmalloc(sizeof(Reseq), 0);
3359         if (rp == NULL) {
3360                 freeblist(bp);  /* bp always consumed by add_reseq */
3361                 return 0;
3362         }
3363
3364         rp->seg = *seg;
3365         rp->bp = bp;
3366         rp->length = length;
3367
3368         track_rcv_sack(tcb, seg->seq, seg->seq + length);
3369         /* Place on reassembly list sorting by starting seq number */
3370         rp1 = tcb->reseq;
3371         if (rp1 == NULL || seq_lt(seg->seq, rp1->seg.seq)) {
3372                 rp->next = rp1;
3373                 tcb->reseq = rp;
3374                 if (rp->next != NULL)
3375                         tpriv->stats[OutOfOrder]++;
3376                 return 0;
3377         }
3378
3379         rqlen = 0;
3380         for (i = 0;; i++) {
3381                 rqlen += rp1->length;
3382                 if (rp1->next == NULL || seq_lt(seg->seq, rp1->next->seg.seq)) {
3383                         rp->next = rp1->next;
3384                         rp1->next = rp;
3385                         if (rp->next != NULL)
3386                                 tpriv->stats[OutOfOrder]++;
3387                         break;
3388                 }
3389                 rp1 = rp1->next;
3390         }
3391         qmax = QMAX << tcb->rcv.scale;
3392         /* Here's where we're reneging on previously reported sacks. */
3393         if (rqlen > qmax) {
3394                 printd("resequence queue > window: %d > %d\n", rqlen, qmax);
3395                 i = 0;
3396                 for (rp1 = tcb->reseq; rp1 != NULL; rp1 = rp1->next) {
3397                         printd("0x%#lx 0x%#lx 0x%#x\n", rp1->seg.seq,
3398                                    rp1->seg.ack, rp1->seg.flags);
3399                         if (i++ > 10) {
3400                                 printd("...\n");
3401                                 break;
3402                         }
3403                 }
3404
3405                 // delete entire reassembly queue; wait for retransmit.
3406                 // - should we be smarter and only delete the tail?
3407                 for (rp = tcb->reseq; rp != NULL; rp = rp1) {
3408                         rp1 = rp->next;
3409                         freeblist(rp->bp);
3410                         kfree(rp);
3411                 }
3412                 tcb->reseq = NULL;
3413                 tcb->rcv.nr_sacks = 0;
3414
3415                 return -1;
3416         }
3417         return 0;
3418 }
3419
3420 static void getreseq(Tcpctl *tcb, Tcp *seg, struct block **bp, uint16_t *length)
3421 {
3422         Reseq *rp;
3423
3424         rp = tcb->reseq;
3425         if (rp == NULL)
3426                 return;
3427
3428         tcb->reseq = rp->next;
3429
3430         *seg = rp->seg;
3431         *bp = rp->bp;
3432         *length = rp->length;
3433
3434         kfree(rp);
3435 }
3436
/* Trim a received segment to fit the receive window: drop already-received
 * bytes from the front (adjusting SYN/URG accounting) and cut anything beyond
 * the window edge from the back (clearing FIN if it was trimmed off).
 * Returns 0 if some part of the segment is acceptable, -1 (and frees *bp) if
 * the whole segment lies outside the window. */
static int tcptrim(Tcpctl *tcb, Tcp *seg, struct block **bp, uint16_t *length)
{
	uint16_t len;
	uint8_t accept;
	int dupcnt, excess;

	accept = 0;
	len = *length;
	/* SYN and FIN each occupy one sequence number. */
	if (seg->flags & SYN)
		len++;
	if (seg->flags & FIN)
		len++;

	if (tcb->rcv.wnd == 0) {
		/* Zero window: only a bare, in-order segment is acceptable. */
		if (len == 0 && seg->seq == tcb->rcv.nxt)
			return 0;
	} else {
		/* Some part of the segment should be in the window */
		if (inwindow(tcb, seg->seq))
			accept++;
		else if (len != 0) {
			if (inwindow(tcb, seg->seq + len - 1) ||
				seq_within(tcb->rcv.nxt, seg->seq, seg->seq + len - 1))
				accept++;
		}
	}
	if (!accept) {
		freeblist(*bp);
		return -1;
	}
	/* Trim duplicate data (bytes below rcv.nxt) off the front. */
	dupcnt = tcb->rcv.nxt - seg->seq;
	if (dupcnt > 0) {
		tcb->rerecv += dupcnt;
		if (seg->flags & SYN) {
			/* The SYN accounts for the first duplicate seq. */
			seg->flags &= ~SYN;
			seg->seq++;

			if (seg->urg > 1)
				seg->urg--;
			else
				seg->flags &= ~URG;
			dupcnt--;
		}
		if (dupcnt > 0) {
			pullblock(bp, (uint16_t) dupcnt);
			seg->seq += dupcnt;
			*length -= dupcnt;

			if (seg->urg > dupcnt)
				seg->urg -= dupcnt;
			else {
				seg->flags &= ~URG;
				seg->urg = 0;
			}
		}
	}
	/* Trim anything past the right edge of the window off the back. */
	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
	if (excess > 0) {
		tcb->rerecv += excess;
		*length -= excess;
		*bp = trimblock(*bp, 0, *length);
		if (*bp == NULL)
			panic("presotto is a boofhead");
		/* The FIN was beyond the window, so it's gone too. */
		seg->flags &= ~FIN;
	}
	return 0;
}
3504
3505 static void tcpadvise(struct Proto *tcp, struct block *bp, char *msg)
3506 {
3507         Tcp4hdr *h4;
3508         Tcp6hdr *h6;
3509         Tcpctl *tcb;
3510         uint8_t source[IPaddrlen];
3511         uint8_t dest[IPaddrlen];
3512         uint16_t psource, pdest;
3513         struct conv *s, **p;
3514
3515         h4 = (Tcp4hdr *) (bp->rp);
3516         h6 = (Tcp6hdr *) (bp->rp);
3517
3518         if ((h4->vihl & 0xF0) == IP_VER4) {
3519                 v4tov6(dest, h4->tcpdst);
3520                 v4tov6(source, h4->tcpsrc);
3521                 psource = nhgets(h4->tcpsport);
3522                 pdest = nhgets(h4->tcpdport);
3523         } else {
3524                 ipmove(dest, h6->tcpdst);
3525                 ipmove(source, h6->tcpsrc);
3526                 psource = nhgets(h6->tcpsport);
3527                 pdest = nhgets(h6->tcpdport);
3528         }
3529
3530         /* Look for a connection */
3531         for (p = tcp->conv; *p; p++) {
3532                 s = *p;
3533                 tcb = (Tcpctl *) s->ptcl;
3534                 if (s->rport == pdest)
3535                         if (s->lport == psource)
3536                                 if (tcb->state != Closed)
3537                                         if (ipcmp(s->raddr, dest) == 0)
3538                                                 if (ipcmp(s->laddr, source) == 0) {
3539                                                         qlock(&s->qlock);
3540                                                         switch (tcb->state) {
3541                                                                 case Syn_sent:
3542                                                                         localclose(s, msg);
3543                                                                         break;
3544                                                         }
3545                                                         qunlock(&s->qlock);
3546                                                         freeblist(bp);
3547                                                         return;
3548                                                 }
3549         }
3550         freeblist(bp);
3551 }
3552
3553 static void tcpporthogdefensectl(char *val)
3554 {
3555         if (strcmp(val, "on") == 0)
3556                 tcpporthogdefense = 1;
3557         else if (strcmp(val, "off") == 0)
3558                 tcpporthogdefense = 0;
3559         else
3560                 error(EINVAL, "unknown value for tcpporthogdefense");
3561 }
3562
3563 /* called with c qlocked */
3564 static void tcpctl(struct conv *c, char **f, int n)
3565 {
3566         if (n == 1 && strcmp(f[0], "hangup") == 0)
3567                 tcphangup(c);
3568         else if (n >= 1 && strcmp(f[0], "keepalive") == 0)
3569                 tcpstartka(c, f, n);
3570         else if (n >= 1 && strcmp(f[0], "checksum") == 0)
3571                 tcpsetchecksum(c, f, n);
3572         else if (n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3573                 tcpporthogdefensectl(f[1]);
3574         else
3575                 error(EINVAL, "unknown command to %s", __func__);
3576 }
3577
3578 static int tcpstats(struct Proto *tcp, char *buf, int len)
3579 {
3580         struct tcppriv *priv;
3581         char *p, *e;
3582         int i;
3583
3584         priv = tcp->priv;
3585         p = buf;
3586         e = p + len;
3587         for (i = 0; i < Nstats; i++)
3588                 p = seprintf(p, e, "%s: %u\n", statnames[i], priv->stats[i]);
3589         return p - buf;
3590 }
3591
3592 /*
3593  *  garbage collect any stale conversations:
3594  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3595  *      - Finwait2 after 5 minutes
3596  *
3597  *  this is called whenever we run out of channels.  Both checks are
3598  *  of questionable validity so we try to use them only when we're
3599  *  up against the wall.
3600  */
3601 static int tcpgc(struct Proto *tcp)
3602 {
3603         struct conv *c, **pp, **ep;
3604         int n;
3605         Tcpctl *tcb;
3606
3607         n = 0;
3608         ep = &tcp->conv[tcp->nc];
3609         for (pp = tcp->conv; pp < ep; pp++) {
3610                 c = *pp;
3611                 if (c == NULL)
3612                         break;
3613                 if (!canqlock(&c->qlock))
3614                         continue;
3615                 tcb = (Tcpctl *) c->ptcl;
3616                 if (tcb->state == Finwait2) {
3617                         if (NOW - tcb->time > 5 * 60 * 1000) {
3618                                 localclose(c, "timed out");
3619                                 n++;
3620                         }
3621                 }
3622                 qunlock(&c->qlock);
3623         }
3624         return n;
3625 }
3626
3627 static void tcpsettimer(Tcpctl *tcb)
3628 {
3629         int x;
3630
3631         /* round trip dependency */
3632         x = backoff(tcb->backoff) * (tcb->srtt + MAX(4 * tcb->mdev, MSPTICK));
3633         x = DIV_ROUND_UP(x, MSPTICK);
3634
3635         /* Bounded twixt 1/2 and 64 seconds.  RFC 6298 suggested min is 1 second. */
3636         if (x < 500 / MSPTICK)
3637                 x = 500 / MSPTICK;
3638         else if (x > (64000 / MSPTICK))
3639                 x = 64000 / MSPTICK;
3640         tcb->timer.start = x;
3641 }
3642
3643 static struct tcppriv *debug_priv;
3644
3645 /* Kfunc this */
3646 int dump_tcp_ht(void)
3647 {
3648         if (!debug_priv)
3649                 return -1;