net: tcp: Fix up the receive window
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 #include <vfs.h>
44 #include <kfs.h>
45 #include <slab.h>
46 #include <kmalloc.h>
47 #include <kref.h>
48 #include <string.h>
49 #include <stdio.h>
50 #include <assert.h>
51 #include <error.h>
52 #include <cpio.h>
53 #include <pmap.h>
54 #include <smp.h>
55 #include <ip.h>
56
enum {
	QMAX = 64 * 1024 - 1,	/* max qio bytes (also used to size the write q) */
	IP_TCPPROTO = 6,	/* IP protocol number for TCP */

	/* Header/pseudo-header sizes for checksumming and MSS math */
	TCP4_IPLEN = 8,
	TCP4_PHDRSIZE = 12,
	TCP4_HDRSIZE = 20,
	TCP4_TCBPHDRSZ = 40,
	TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE,

	TCP6_IPLEN = 0,
	TCP6_PHDRSIZE = 40,
	TCP6_HDRSIZE = 20,
	TCP6_TCBPHDRSZ = 60,
	TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE,

	/* Tcptimer states */
	TcptimerOFF = 0,
	TcptimerON = 1,
	TcptimerDONE = 2,
	MAX_TIME = (1 << 20),	/* Forever */
	TCP_ACK = 50,	/* Timed ack sequence in ms */
	MAXBACKMS = 9 * 60 * 1000,	/* longest backoff time (ms) before hangup */

	/* TCP header flag bits */
	URG = 0x20,	/* Data marked urgent */
	ACK = 0x10,	/* Acknowledge is valid */
	PSH = 0x08,	/* Whole data pipe is pushed */
	RST = 0x04,	/* Reset connection */
	SYN = 0x02,	/* Pkt. is synchronise */
	FIN = 0x01,	/* Start close down */

	/* TCP option kinds and lengths */
	EOLOPT = 0,
	NOOPOPT = 1,
	MSSOPT = 2,
	MSS_LENGTH = 4,	/* max segment size header option length */
	WSOPT = 3,
	WS_LENGTH = 3,	/* WS header option length */
	MAX_WS_VALUE = 14,	/* RFC specified.  Limits available window to 2^30 */
	TS_OPT = 8,
	TS_LENGTH = 10,
	TS_SEND_PREPAD = 2,	/* For non-SYNs, pre-pad 2 nops for 32 byte alignment */
	SACK_OK_OPT = 4,
	SACK_OK_LENGTH = 2,
	SACK_OPT = 5,
	MSL2 = 10,
	MSPTICK = 50,	/* Milliseconds per timer tick */
	DEF_MSS = 1460,	/* Default mean segment */
	DEF_MSS6 = 1280,	/* Default mean segment (min) for v6 */
	SACK_SUPPORTED = TRUE,	/* SACK is on by default */
	MAX_NR_SACKS_PER_PACKET = 4,	/* limited by TCP's opts size */
	MAX_NR_SND_SACKS = 10,
	MAX_NR_RCV_SACKS = 3,	/* We could try for 4, but don't need to */
	DEF_RTT = 500,	/* Default round trip */
	DEF_KAT = 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN = 0,	/* Listen connection */
	TCP_CONNECT = 1,	/* Outgoing connection */
	SYNACK_RXTIMER = 250,	/* ms between SYNACK retransmits */

	TCPREXMTTHRESH = 3,	/* dupack threshold for recovery */
	SACK_RETRANS_RECOVERY = 1,
	FAST_RETRANS_RECOVERY = 2,
	RTO_RETRANS_RECOVERY = 3,
	CWIND_SCALE = 10,	/* initial CWIND will be MSS * this */

	/* Tcpctl->flags bits */
	FORCE = 1,
	CLONE = 2,
	RETRAN = 4,
	ACTIVE = 8,
	SYNACK = 16,
	TSO = 32,

	/* Smoothing gains (shifts) for srtt/mdev estimators */
	LOGAGAIN = 3,
	LOGDGAIN = 2,

	Closed = 0,	/* Connection states */
	Listen,
	Syn_sent,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo = 1000,	/* maximum procs waiting for response to SYN ACK */
	NLHT = 256,	/* hash table size, must be a power of 2 */
	LHTMASK = NLHT - 1,

	HaveWS = 1 << 8,	/* flag bit OR'd into the scale returned by tcpmtu() */
};
147
148 /* Must correspond to the enumeration above */
/* Printable state names; indexed by the connection-state enum, so the
 * order here must match Closed..Time_wait above. */
char *tcpstates[] = {
	"Closed",
	"Listen",
	"Syn_sent",
	"Established",
	"Finwait1",
	"Finwait2",
	"Close_wait",
	"Closing",
	"Last_ack",
	"Time_wait"
};
154
/* One TCP timer.  Active timers sit on a doubly linked list (next/prev)
 * owned by struct tcppriv and walked every MSPTICK ms by tcpackproc();
 * expired ones are chained through readynext and fired after the list
 * lock is dropped. */
typedef struct Tcptimer Tcptimer;
struct Tcptimer {
	Tcptimer *next;		/* forward link on the active timer list */
	Tcptimer *prev;		/* back link on the active timer list */
	Tcptimer *readynext;	/* chain of expired timers to fire */
	int state;		/* TcptimerOFF/ON/DONE */
	uint64_t start;		/* reload value in MSPTICK ticks; 0 = disabled */
	uint64_t count;		/* ticks remaining until expiry */
	void (*func) (void *);	/* callback run on expiry */
	void *arg;		/* argument passed to func */
};
166
/*
 *  v4 and v6 pseudo headers used for
 *  checksuming tcp
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr {
	uint8_t vihl;			/* Version and header length */
	uint8_t tos;			/* Type of service */
	uint8_t length[2];		/* packet length */
	uint8_t id[2];			/* Identification */
	uint8_t frag[2];		/* Fragment information */
	uint8_t Unused;
	uint8_t proto;			/* protocol (IP_TCPPROTO) */
	uint8_t tcplen[2];		/* TCP segment length (pseudo header) */
	uint8_t tcpsrc[4];		/* source IPv4 address */
	uint8_t tcpdst[4];		/* destination IPv4 address */
	uint8_t tcpsport[2];		/* source port */
	uint8_t tcpdport[2];		/* destination port */
	uint8_t tcpseq[4];		/* sequence number */
	uint8_t tcpack[4];		/* acknowledgment number */
	uint8_t tcpflag[2];		/* data offset + flag bits */
	uint8_t tcpwin[2];		/* advertised window */
	uint8_t tcpcksum[2];		/* checksum */
	uint8_t tcpurg[2];		/* urgent pointer */
	/* Options segment */
	uint8_t tcpopt[1];
};
194
/* IPv6 variant of the combined IP + TCP wire header; field meanings after
 * the IP portion match Tcp4hdr. */
typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr {
	uint8_t vcf[4];			/* version / class / flow label */
	uint8_t ploadlen[2];		/* payload length */
	uint8_t proto;			/* next header (IP_TCPPROTO) */
	uint8_t ttl;			/* hop limit */
	uint8_t tcpsrc[IPaddrlen];	/* source IPv6 address */
	uint8_t tcpdst[IPaddrlen];	/* destination IPv6 address */
	uint8_t tcpsport[2];		/* source port */
	uint8_t tcpdport[2];		/* destination port */
	uint8_t tcpseq[4];		/* sequence number */
	uint8_t tcpack[4];		/* acknowledgment number */
	uint8_t tcpflag[2];		/* data offset + flag bits */
	uint8_t tcpwin[2];		/* advertised window */
	uint8_t tcpcksum[2];		/* checksum */
	uint8_t tcpurg[2];		/* urgent pointer */
	/* Options segment */
	uint8_t tcpopt[1];
};
214
/* One SACK range, in sequence-number space.  NOTE(review): presumably
 * half-open [left, right) per the usual RFC 2018 edge convention — confirm
 * against the SACK build/parse code (not visible in this chunk). */
struct sack_block {
	uint32_t left;		/* left edge of the sacked range */
	uint32_t right;		/* right edge of the sacked range */
};
219
/*
 *  this represents the control info
 *  for a single packet.  It is derived from
 *  a packet in ntohtcp{4,6}() and stuck into
 *  a packet in htontcp{4,6}().
 */
typedef struct Tcp Tcp;
struct Tcp {
	uint16_t source;		/* source port */
	uint16_t dest;			/* destination port */
	uint32_t seq;			/* sequence number */
	uint32_t ack;			/* acknowledgment number */
	uint8_t flags;			/* URG/ACK/PSH/RST/SYN/FIN bits */
	uint16_t ws;			/* window scale option (if not zero) */
	uint32_t wnd;			/* advertised window */
	uint16_t urg;			/* urgent pointer */
	uint16_t mss;			/* max segment size option (if not zero) */
	uint16_t len;			/* size of data */
	uint32_t ts_val;		/* timestamp val from sender */
	uint32_t ts_ecr;		/* timestamp echo response from sender */
	bool sack_ok;			/* header had/should have SACK_PERMITTED */
	uint8_t nr_sacks;		/* valid entries in sacks[] */
	struct sack_block sacks[MAX_NR_SACKS_PER_PACKET];
};
244
/*
 *  this header is malloc'd to thread together fragments
 *  waiting to be coalesced
 */
typedef struct Reseq Reseq;
struct Reseq {
	Reseq *next;		/* next queued segment (presumably seq-ordered;
				 * confirm in addreseq(), not in this chunk) */
	Tcp seg;		/* decoded header for this segment */
	struct block *bp;	/* the segment's payload */
	uint16_t length;	/* payload length */
};
256
/*
 *  Per-connection TCP control block.
 *  The qlock in the Conv locks this structure.
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl {
	uint8_t state;			/* Connection state */
	uint8_t type;			/* Listening or active connection */
	uint8_t code;			/* Icmp code */
	struct {
		uint32_t una;		/* Left edge of unacked data region */
		uint32_t nxt;		/* Next seq to send, right edge of unacked */
		uint32_t rtx;		/* Next to send for retrans */
		uint32_t wnd;		/* Tcp send window */
		uint32_t urg;		/* Urgent data pointer */
		uint32_t wl2;		/* NOTE(review): presumably seq of last
					 * window update; confirm vs. input path */
		int scale;		/* how much to right shift window for xmit */
		uint32_t in_flight;	/* estimate of how much is in flight */
		uint8_t loss_hint;	/* number of loss hints rcvd */
		uint8_t sack_loss_hint;	/* For detecting sack rxmit losses */
		bool flush_sacks;	/* Two timeouts in a row == dump sacks */
		uint8_t recovery;	/* loss recovery flag */
		uint32_t recovery_pt;	/* right window for recovery point */
		uint8_t nr_sacks;	/* valid entries in sacks[] */
		struct sack_block sacks[MAX_NR_SND_SACKS];
	} snd;
	struct {
		uint32_t nxt;		/* Receive pointer to next uint8_t slot */
		uint32_t wnd;		/* Receive window incoming */
		uint32_t urg;		/* Urgent pointer */
		int blocked;		/* window < mss; waiting for qio drain */
		int una;		/* unacked data segs */
		int scale;		/* how much to left shift window for rx */
		uint8_t nr_sacks;	/* valid entries in sacks[] */
		struct sack_block sacks[MAX_NR_RCV_SACKS];
	} rcv;
	uint32_t iss;			/* Initial sequence number */
	int sawwsopt;			/* true if we saw a wsopt on the incoming SYN */
	uint32_t cwind;			/* Congestion window */
	int scale;			/* desired snd.scale */
	uint32_t ssthresh;		/* Slow start threshold */
	int irs;			/* Initial received sequence */
	uint16_t mss;			/* Max segment size */
	uint16_t typical_mss;		/* MSS for most packets (< MSS for some opts) */
	int rerecv;			/* Overlap of data re-received */
	uint32_t window;		/* Receive window */
	uint8_t backoff;		/* Exponential backoff counter */
	int backedoff;			/* ms we've backed off for rexmits */
	uint8_t flags;			/* State flags (FORCE, CLONE, ...) */
	Reseq *reseq;			/* Resequencing queue */
	Tcptimer timer;			/* Activity timer */
	Tcptimer acktimer;		/* Acknowledge timer */
	Tcptimer rtt_timer;		/* Round trip timer */
	Tcptimer katimer;		/* keep alive timer */
	uint32_t rttseq;		/* Round trip sequence */
	int srtt;			/* Shortened round trip */
	int mdev;			/* Mean deviation of round trip */
	int kacounter;			/* count down for keep alive */
	uint64_t sndsyntime;		/* time syn sent */
	uint64_t time;			/* time Finwait2 was sent */
	int nochecksum;			/* non-zero means don't send checksums */
	int flgcnt;			/* number of flags in the sequence (FIN,SYN) */
	uint32_t ts_recent;		/* timestamp received around last_ack_sent */
	uint32_t last_ack_sent;		/* to determine when to update timestamp */
	bool sack_ok;			/* Can use SACK for this connection */

	union {
		Tcp4hdr tcp4hdr;
		Tcp6hdr tcp6hdr;
	} protohdr;			/* prototype header */
};
327
/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear
 *  after 1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 *
 *  NOTE(review): struct tcppriv below keeps Limbo entries in lht[NLHT] hash
 *  buckets, so the "no hashing" remark appears stale — verify.
 */
typedef struct Limbo Limbo;
struct Limbo {
	Limbo *next;			/* next entry in this limbo list/bucket */

	uint8_t laddr[IPaddrlen];	/* local address */
	uint8_t raddr[IPaddrlen];	/* remote address */
	uint16_t lport;			/* local port */
	uint16_t rport;			/* remote port */
	uint32_t irs;			/* initial received sequence */
	uint32_t iss;			/* initial sent sequence */
	uint16_t mss;			/* mss from the other end */
	uint16_t rcvscale;		/* how much to scale rcvd windows */
	uint16_t sndscale;		/* how much to scale sent windows */
	uint64_t lastsend;		/* last time we sent a synack */
	uint8_t version;		/* v4 or v6 */
	uint8_t rexmits;		/* number of retransmissions */
	bool sack_ok;			/* other side said SACK_OK */
	uint32_t ts_val;		/* timestamp val from sender */
};
360
int tcp_irtt = DEF_RTT;		/* Initial guess at round trip time (ms) */
uint16_t tcp_mss = DEF_MSS;	/* Maximum segment size to be sent */
363
/* Indices into tcppriv->stats[]; names for reporting live in statnames[]. */
enum {
	/* MIB stats */
	MaxConn,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,		/* gauge: maintained by tcpsetstate() */
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats			/* count; sizes the stats[] array */
};
386
387 static char *statnames[] = {
388         [MaxConn] "MaxConn",
389         [ActiveOpens] "ActiveOpens",
390         [PassiveOpens] "PassiveOpens",
391         [EstabResets] "EstabResets",
392         [CurrEstab] "CurrEstab",
393         [InSegs] "InSegs",
394         [OutSegs] "OutSegs",
395         [RetransSegs] "RetransSegs",
396         [RetransTimeouts] "RetransTimeouts",
397         [InErrs] "InErrs",
398         [OutRsts] "OutRsts",
399         [CsumErrs] "CsumErrs",
400         [HlenErrs] "HlenErrs",
401         [LenErrs] "LenErrs",
402         [OutOfOrder] "OutOfOrder",
403 };
404
405 typedef struct Tcppriv Tcppriv;
406 struct tcppriv {
407         /* List of active timers */
408         qlock_t tl;
409         Tcptimer *timers;
410
411         /* hash table for matching conversations */
412         struct Ipht ht;
413
414         /* calls in limbo waiting for an ACK to our SYN ACK */
415         int nlimbo;
416         Limbo *lht[NLHT];
417
418         /* for keeping track of tcpackproc */
419         qlock_t apl;
420         int ackprocstarted;
421
422         uint32_t stats[Nstats];
423 };
424
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
435
/* Forward declarations; definitions appear later in this file. */
int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, uint16_t);
void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
void localclose(struct conv *, char *unused_char_p_t);
void procsyn(struct conv *, Tcp *);
void tcpiput(struct Proto *, struct Ipifc *, struct block *);
void tcpoutput(struct conv *);
int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
void tcpstart(struct conv *, int);
void tcptimeout(void *);
void tcpsndsyn(struct conv *, Tcpctl *);
void tcprcvwin(struct conv *);
void tcpacktimer(void *);
void tcpkeepalive(void *);
void tcpsetkacounter(Tcpctl *);
void tcprxmit(struct conv *);
void tcpsettimer(Tcpctl *);
void tcpsynackrtt(struct conv *);
void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
static void tcp_loss_event(struct conv *s, Tcpctl *tcb);
static uint16_t derive_payload_mss(Tcpctl *tcb);
/* Modular sequence-number comparisons (wrap-safe) */
static int seq_within(uint32_t x, uint32_t low, uint32_t high);
static int seq_lt(uint32_t x, uint32_t y);
static int seq_le(uint32_t x, uint32_t y);
static int seq_gt(uint32_t x, uint32_t y);
static int seq_ge(uint32_t x, uint32_t y);
static uint32_t seq_max(uint32_t x, uint32_t y);
static uint32_t seq_min(uint32_t x, uint32_t y);
static void set_in_flight(Tcpctl *tcb);

static void limborexmit(struct Proto *);
static void limbo(struct conv *, uint8_t * unused_uint8_p_t, uint8_t *, Tcp *,
				  int);
468
469 void tcpsetstate(struct conv *s, uint8_t newstate)
470 {
471         Tcpctl *tcb;
472         uint8_t oldstate;
473         struct tcppriv *tpriv;
474
475         tpriv = s->p->priv;
476
477         tcb = (Tcpctl *) s->ptcl;
478
479         oldstate = tcb->state;
480         if (oldstate == newstate)
481                 return;
482
483         if (oldstate == Established)
484                 tpriv->stats[CurrEstab]--;
485         if (newstate == Established)
486                 tpriv->stats[CurrEstab]++;
487
488         /**
489         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
490                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
491         **/
492
493         switch (newstate) {
494                 case Closed:
495                         qclose(s->rq);
496                         qclose(s->wq);
497                         qclose(s->eq);
498                         break;
499
500                 case Close_wait:        /* Remote closes */
501                         qhangup(s->rq, NULL);
502                         break;
503         }
504
505         tcb->state = newstate;
506
507         if (oldstate == Syn_sent && newstate != Closed)
508                 Fsconnected(s, NULL);
509 }
510
/* "connect" control operation: parse the standard dial arguments, then
 * start an active (outgoing) TCP open on this conversation. */
static void tcpconnect(struct conv *c, char **argv, int argc)
{
	Fsstdconnect(c, argv, argc);
	tcpstart(c, TCP_CONNECT);
}
516
/* Format a one-line human-readable summary of the connection into 'state'
 * (capacity n) for the status file; returns snprintf's result.
 *
 * NOTE(review): "swin" (snd.wnd) is printed with rcv.scale and "rwin"
 * (rcv.wnd) with snd.scale.  Per the Tcpctl comments, snd.scale is the
 * right-shift applied to windows we transmit and rcv.scale the left-shift
 * for windows we receive, so the cross-pairing may be deliberate — confirm
 * against htontcp/ntohtcp before changing. */
static int tcpstate(struct conv *c, char *state, int n)
{
	Tcpctl *s;

	s = (Tcpctl *) (c->ptcl);

	return snprintf(state, n,
					"%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
					tcpstates[s->state],
					c->rq ? qlen(c->rq) : 0,
					c->wq ? qlen(c->wq) : 0,
					s->srtt, s->mdev,
					s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
					s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
					s->katimer.start, s->katimer.count);
}
533
534 static int tcpinuse(struct conv *c)
535 {
536         Tcpctl *s;
537
538         s = (Tcpctl *) (c->ptcl);
539         return s->state != Closed;
540 }
541
/* "announce" control operation: set up a passive (listening) conversation
 * and report completion to the caller immediately. */
static void tcpannounce(struct conv *c, char **argv, int argc)
{
	Fsstdannounce(c, argv, argc);
	tcpstart(c, TCP_LISTEN);
	Fsconnected(c, NULL);
}
548
/* "bypass" control operation: stack-bypass conversation.  It still must be
 * entered in the protocol hash table so inbound packets match it. */
static void tcpbypass(struct conv *cv, char **argv, int argc)
{
	struct tcppriv *tpriv = cv->p->priv;

	Fsstdbypass(cv, argv, argc);
	iphtadd(&tpriv->ht, cv);
}
556
557 static void tcpshutdown(struct conv *c, int how)
558 {
559         Tcpctl *tcb = (Tcpctl*)c->ptcl;
560
561         /* Do nothing for the read side */
562         if (how == SHUT_RD)
563                 return;
564         /* Sends a FIN.  If we're in another state (like Listen), we'll run into
565          * issues, since we'll never send the FIN.  We'll be shutdown on our end,
566          * but we'll never tell the distant end.  Might just be an app issue. */
567         switch (tcb->state) {
568         case Established:
569                 tcb->flgcnt++;
570                 tcb->snd.nxt++;
571                 tcpsetstate(c, Finwait1);
572                 tcpoutput(c);
573                 break;
574         }
575 }
576
/*
 *  tcpclose is always called with the q locked
 *
 *  Hang up and flush the user-visible queues, then drive the close side of
 *  the state machine: immediate local teardown for states with no
 *  established peer, or send a FIN for an orderly close.
 */
static void tcpclose(struct conv *c)
{
	Tcpctl *tcb;

	tcb = (Tcpctl *) c->ptcl;

	qhangup(c->rq, NULL);
	qhangup(c->wq, NULL);
	qhangup(c->eq, NULL);
	qflush(c->rq);

	switch (tcb->state) {
		case Listen:
			/*
			 *  reset any incoming calls to this listener
			 */
			Fsconnected(c, "Hangup");

			localclose(c, NULL);
			break;
		case Closed:
		case Syn_sent:
			/* nothing established yet; tear down locally */
			localclose(c, NULL);
			break;
		case Established:
			/* send a FIN (consumes one sequence number) */
			tcb->flgcnt++;
			tcb->snd.nxt++;
			tcpsetstate(c, Finwait1);
			tcpoutput(c);
			break;
		case Close_wait:
			/* peer already closed; our FIN finishes the exchange */
			tcb->flgcnt++;
			tcb->snd.nxt++;
			tcpsetstate(c, Last_ack);
			tcpoutput(c);
			break;
	}
}
618
/* qio kick for the write queue: called when the user writes data, to push
 * pending output.  Takes the conv qlock; errors are propagated after
 * unlocking via the waserror/nexterror discipline. */
void tcpkick(void *x)
{
	ERRSTACK(1);
	struct conv *s = x;
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}

	switch (tcb->state) {
		case Syn_sent:
		case Established:
		case Close_wait:
			/*
			 * Push data
			 */
			tcprcvwin(s);
			tcpoutput(s);
			break;
		default:
			/* writes in any other state hang up the connection */
			localclose(s, "Hangup");
			break;
	}

	qunlock(&s->qlock);
	poperror();
}
651
/* Recompute our advertised receive window (rcv.wnd) from the free buffer
 * space: the configured window minus bytes queued for, but not yet read by,
 * the user.  Call with tcb locked. */
void tcprcvwin(struct conv *s)
{
	/* Call with tcb locked */
	int w;
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;
	w = tcb->window - qlen(s->rq);
	if (w < 0)
		w = 0;

	/* RFC 813: Avoid SWS.  We'll always reduce the window (because the qio
	 * increased - that's legit), and we'll always advertise the window
	 * increases (corresponding to qio drains) when those are greater than MSS.
	 * But we don't advertise increases less than MSS.
	 *
	 * Note we don't shrink the window at all - that'll result in tcptrim()
	 * dropping packets that were sent before the sender gets our update. */
	if ((w < tcb->rcv.wnd) || (w >= tcb->mss))
		tcb->rcv.wnd = w;
	/* We've delayed sending an update to rcv.wnd, and we might never get
	 * another ACK to drive the TCP stack after the qio is drained.  We could
	 * replace this stuff with qio kicks or callbacks, but that might be
	 * trickier with the MSS limitation.  (and 'edge' isn't empty or not). */
	if (w < tcb->mss)
		tcb->rcv.blocked = 1;
}
679
/* Delayed-ACK timer callback: force out an ACK/window update for a
 * connection that has received data but not yet acknowledged it. */
void tcpacktimer(void *v)
{
	ERRSTACK(1);
	Tcpctl *tcb;
	struct conv *s;

	s = v;
	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	if (tcb->state != Closed) {
		tcb->flags |= FORCE;	/* emit a segment even with no data */
		tcprcvwin(s);
		tcpoutput(s);
	}
	qunlock(&s->qlock);
	poperror();
}
702
/* Allocate the user-facing read/write queues for a new conversation. */
static void tcpcreate(struct conv *c)
{
	/* We don't use qio limits.  Instead, TCP manages flow control on its own.
	 * We only use qpassnolim().  Note for qio that 0 doesn't mean no limit. */
	c->rq = qopen(0, Qcoalesce, 0, 0);
	c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
}
710
711 static void timerstate(struct tcppriv *priv, Tcptimer * t, int newstate)
712 {
713         if (newstate != TcptimerON) {
714                 if (t->state == TcptimerON) {
715                         // unchain
716                         if (priv->timers == t) {
717                                 priv->timers = t->next;
718                                 if (t->prev != NULL)
719                                         panic("timerstate1");
720                         }
721                         if (t->next)
722                                 t->next->prev = t->prev;
723                         if (t->prev)
724                                 t->prev->next = t->next;
725                         t->next = t->prev = NULL;
726                 }
727         } else {
728                 if (t->state != TcptimerON) {
729                         // chain
730                         if (t->prev != NULL || t->next != NULL)
731                                 panic("timerstate2");
732                         t->prev = NULL;
733                         t->next = priv->timers;
734                         if (t->next)
735                                 t->next->prev = t;
736                         priv->timers = t;
737                 }
738         }
739         t->state = newstate;
740 }
741
/* Per-protocol timer thread.  Every MSPTICK ms it walks the active timer
 * list decrementing counts; expired timers become TcptimerDONE, get chained
 * through readynext, and are fired after the list lock is dropped (so a
 * callback may re-arm timers).  Also drives limbo SYN-ACK retransmits. */
void tcpackproc(void *a)
{
	ERRSTACK(1);
	Tcptimer *t, *tp, *timeo;
	struct Proto *tcp;
	struct tcppriv *priv;
	int loop;

	tcp = a;
	priv = tcp->priv;

	for (;;) {
		kthread_usleep(MSPTICK * 1000);

		qlock(&priv->tl);
		timeo = NULL;
		loop = 0;
		for (t = priv->timers; t != NULL; t = tp) {
			if (loop++ > 10000)
				panic("tcpackproc1");
			/* save the link now; t may be unchained below */
			tp = t->next;
			if (t->state == TcptimerON) {
				t->count--;
				if (t->count == 0) {
					timerstate(priv, t, TcptimerDONE);
					t->readynext = timeo;
					timeo = t;
				}
			}
		}
		qunlock(&priv->tl);

		/* fire expired timers without holding the list lock */
		loop = 0;
		for (t = timeo; t != NULL; t = t->readynext) {
			if (loop++ > 10000)
				panic("tcpackproc2");
			if (t->state == TcptimerDONE && t->func != NULL) {
				/* discard error style */
				if (!waserror())
					(*t->func) (t->arg);
				poperror();
			}
		}

		limborexmit(tcp);
	}
}
789
/* Arm a timer: reload its tick count from 'start' and put it on the active
 * list.  A timer with start == 0 is treated as disabled. */
void tcpgo(struct tcppriv *priv, Tcptimer * t)
{
	if (t == NULL || t->start == 0)
		return;

	qlock(&priv->tl);
	t->count = t->start;
	timerstate(priv, t, TcptimerON);
	qunlock(&priv->tl);
}
800
/* Disarm a timer, removing it from the active list if it was running. */
void tcphalt(struct tcppriv *priv, Tcptimer * t)
{
	if (t == NULL)
		return;

	qlock(&priv->tl);
	timerstate(priv, t, TcptimerOFF);
	qunlock(&priv->tl);
}
810
/* Exponential backoff multiplier for retransmit timing: returns 2^n.
 *
 * Fix: "1 << n" is undefined behavior for n < 0 or n >= 31 with a signed
 * int.  Clamp the shift so pathological backoff counters saturate at 2^30
 * instead of invoking UB. */
int backoff(int n)
{
	if (n < 0)
		n = 0;
	else if (n > 30)
		n = 30;
	return 1 << n;
}
815
/* Tear the connection down locally: unhash it, stop every timer, free any
 * reassembly state, report 'reason' to a still-connecting caller, hang up
 * the user queues, and go to Closed.  'reason' may be NULL for a normal
 * close.  Called with tcb locked. */
void localclose(struct conv *s, char *reason)
{	/* called with tcb locked */
	Tcpctl *tcb;
	Reseq *rp, *rp1;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	iphtrem(&tpriv->ht, s);

	/* nothing further will be sent or timed; stop all four timers */
	tcphalt(tpriv, &tcb->timer);
	tcphalt(tpriv, &tcb->rtt_timer);
	tcphalt(tpriv, &tcb->acktimer);
	tcphalt(tpriv, &tcb->katimer);

	/* Flush reassembly queue; nothing more can arrive */
	for (rp = tcb->reseq; rp != NULL; rp = rp1) {
		rp1 = rp->next;
		freeblist(rp->bp);
		kfree(rp);
	}
	tcb->reseq = NULL;

	/* an active open that never completed must tell its caller why */
	if (tcb->state == Syn_sent)
		Fsconnected(s, reason);

	qhangup(s->rq, reason);
	qhangup(s->wq, reason);

	tcpsetstate(s, Closed);

	/* listener will check the rq state */
	if (s->state == Announced)
		rendez_wakeup(&s->listenr);
}
852
853 /* mtu (- TCP + IP hdr len) of 1st hop */
854 int tcpmtu(struct Proto *tcp, uint8_t * addr, int version, int *scale,
855            uint8_t *flags)
856 {
857         struct Ipifc *ifc;
858         int mtu;
859
860         ifc = findipifc(tcp->f, addr, 0);
861         switch (version) {
862                 default:
863                 case V4:
864                         mtu = DEF_MSS;
865                         if (ifc != NULL)
866                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
867                         break;
868                 case V6:
869                         mtu = DEF_MSS6;
870                         if (ifc != NULL)
871                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
872                         break;
873         }
874         *flags &= ~TSO;
875         if (ifc && (ifc->feat & NETF_TSO))
876                 *flags |= TSO;
877         *scale = HaveWS | 7;
878
879         return mtu;
880 }
881
/* Initialize the Tcpctl of a conv: congestion state, timers, MSS, and
 * (for non-listeners) the prototype pseudo header used to build
 * outbound segments.  mode is TCP_LISTEN or TCP_CONNECT. */
void inittcpctl(struct conv *s, int mode)
{
	Tcpctl *tcb;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int mss;

	tcb = (Tcpctl *) s->ptcl;

	memset(tcb, 0, sizeof(Tcpctl));

	/* slow-start threshold starts effectively unlimited */
	tcb->ssthresh = UINT32_MAX;
	tcb->srtt = tcp_irtt << LOGAGAIN;
	tcb->mdev = 0;

	/* setup timers */
	tcb->timer.start = tcp_irtt / MSPTICK;
	tcb->timer.func = tcptimeout;
	tcb->timer.arg = s;
	tcb->rtt_timer.start = MAX_TIME;
	tcb->acktimer.start = TCP_ACK / MSPTICK;
	tcb->acktimer.func = tcpacktimer;
	tcb->acktimer.arg = s;
	tcb->katimer.start = DEF_KAT / MSPTICK;
	tcb->katimer.func = tcpkeepalive;
	tcb->katimer.arg = s;

	mss = DEF_MSS;

	/* create a prototype(pseudo) header */
	if (mode != TCP_LISTEN) {
		/* pick a source address if the caller did not bind one */
		if (ipcmp(s->laddr, IPnoaddr) == 0)
			findlocalip(s->p->f, s->laddr, s->raddr);

		switch (s->ipversion) {
			case V4:
				h4 = &tcb->protohdr.tcp4hdr;
				memset(h4, 0, sizeof(*h4));
				h4->proto = IP_TCPPROTO;
				hnputs(h4->tcpsport, s->lport);
				hnputs(h4->tcpdport, s->rport);
				v6tov4(h4->tcpsrc, s->laddr);
				v6tov4(h4->tcpdst, s->raddr);
				break;
			case V6:
				h6 = &tcb->protohdr.tcp6hdr;
				memset(h6, 0, sizeof(*h6));
				h6->proto = IP_TCPPROTO;
				hnputs(h6->tcpsport, s->lport);
				hnputs(h6->tcpdport, s->rport);
				ipmove(h6->tcpsrc, s->laddr);
				ipmove(h6->tcpdst, s->raddr);
				mss = DEF_MSS6;
				break;
			default:
				panic("inittcpctl: version %d", s->ipversion);
		}
	}

	tcb->mss = mss;
	tcb->typical_mss = mss;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* default is no window scaling */
	tcb->window = QMAX;
	tcb->rcv.wnd = QMAX;
	tcb->rcv.scale = 0;
	tcb->snd.scale = 0;
}
951
952 /*
953  *  called with s qlocked
954  */
/* Start a TCP conversation: lazily launch the per-protocol ack/timer
 * kproc, initialize the tcb, hash the conv for inbound demux, then
 * either enter Listen (TCP_LISTEN) or send a SYN (TCP_CONNECT). */
void tcpstart(struct conv *s, int mode)
{
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	char *kpname;

	tpriv = s->p->priv;

	/* double-checked under apl so only one tcpackproc is ever spawned
	 * per protocol instance */
	if (tpriv->ackprocstarted == 0) {
		qlock(&tpriv->apl);
		if (tpriv->ackprocstarted == 0) {
			/* tcpackproc needs to free this if it ever exits */
			kpname = kmalloc(KNAMELEN, MEM_WAIT);
			snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
			ktask(kpname, tcpackproc, s->p);
			tpriv->ackprocstarted = 1;
		}
		qunlock(&tpriv->apl);
	}

	tcb = (Tcpctl *) s->ptcl;

	inittcpctl(s, mode);

	/* make the conv findable by inbound demux */
	iphtadd(&tpriv->ht, s);
	switch (mode) {
		case TCP_LISTEN:
			tpriv->stats[PassiveOpens]++;
			tcb->flags |= CLONE;
			tcpsetstate(s, Listen);
			break;

		case TCP_CONNECT:
			tpriv->stats[ActiveOpens]++;
			tcb->flags |= ACTIVE;
			tcpsndsyn(s, tcb);
			tcpsetstate(s, Syn_sent);
			tcpoutput(s);
			break;
	}
}
996
997 static char *tcpflag(uint16_t flag)
998 {
999         static char buf[128];
1000
1001         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
1002         if (flag & URG)
1003                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
1004         if (flag & ACK)
1005                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
1006         if (flag & PSH)
1007                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
1008         if (flag & RST)
1009                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
1010         if (flag & SYN)
1011                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
1012         if (flag & FIN)
1013                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
1014
1015         return buf;
1016 }
1017
1018 /* Helper, determine if we should send a TCP timestamp.  ts_val was the
1019  * timestamp from our distant end.  We'll also send a TS on SYN (no ACK). */
1020 static bool tcp_seg_has_ts(Tcp *tcph)
1021 {
1022         return tcph->ts_val || ((tcph->flags & SYN) && !(tcph->flags & ACK));
1023 }
1024
1025 /* Given a TCP header/segment and default header size (e.g. TCP4_HDRSIZE),
1026  * return the actual hdr_len and opt_pad */
1027 static void compute_hdrlen_optpad(Tcp *tcph, uint16_t default_hdrlen,
1028                                   uint16_t *ret_hdrlen, uint16_t *ret_optpad,
1029                                   Tcpctl *tcb)
1030 {
1031         uint16_t hdrlen = default_hdrlen;
1032         uint16_t optpad = 0;
1033
1034         if (tcph->flags & SYN) {
1035                 if (tcph->mss)
1036                         hdrlen += MSS_LENGTH;
1037                 if (tcph->ws)
1038                         hdrlen += WS_LENGTH;
1039                 if (tcph->sack_ok)
1040                         hdrlen += SACK_OK_LENGTH;
1041         }
1042         if (tcp_seg_has_ts(tcph)) {
1043                 hdrlen += TS_LENGTH;
1044                 /* SYNs have other opts, don't do the PREPAD NOOP optimization. */
1045                 if (!(tcph->flags & SYN))
1046                         hdrlen += TS_SEND_PREPAD;
1047         }
1048         if (tcb && tcb->rcv.nr_sacks)
1049                 hdrlen += 2 + tcb->rcv.nr_sacks * 8;
1050         optpad = hdrlen & 3;
1051         if (optpad)
1052                 optpad = 4 - optpad;
1053         hdrlen += optpad;
1054         *ret_hdrlen = hdrlen;
1055         *ret_optpad = optpad;
1056 }
1057
/* Writes the TCP options for tcph to opt.  The layout and sizes must
 * agree exactly with what compute_hdrlen_optpad() accounted for; the
 * trailing optpad bytes are filled with NOOPs so the option area ends
 * on a 4 byte boundary. */
static void write_opts(Tcp *tcph, uint8_t *opt, uint16_t optpad, Tcpctl *tcb)
{
	/* SYN-only options: MSS, window scale, SACK-permitted */
	if (tcph->flags & SYN) {
		if (tcph->mss != 0) {
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		if (tcph->ws != 0) {
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		if (tcph->sack_ok) {
			*opt++ = SACK_OK_OPT;
			*opt++ = SACK_OK_LENGTH;
		}
	}
	if (tcp_seg_has_ts(tcph)) {
		/* two leading NOOPs align the TS option on non-SYN segments
		 * (matches TS_SEND_PREPAD in compute_hdrlen_optpad) */
		if (!(tcph->flags & SYN)) {
			*opt++ = NOOPOPT;
			*opt++ = NOOPOPT;
		}
		*opt++ = TS_OPT;
		*opt++ = TS_LENGTH;
		/* Setting TSval, our time */
		hnputl(opt, milliseconds());
		opt += 4;
		/* Setting TSecr, the time we last saw from them, stored in ts_val */
		hnputl(opt, tcph->ts_val);
		opt += 4;
	}
	/* SACK blocks we are reporting back to the peer */
	if (tcb && tcb->rcv.nr_sacks) {
		*opt++ = SACK_OPT;
		*opt++ = 2 + tcb->rcv.nr_sacks * 8;
		for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
			hnputl(opt, tcb->rcv.sacks[i].left);
			opt += 4;
			hnputl(opt, tcb->rcv.sacks[i].right);
			opt += 4;
		}
	}
	while (optpad-- > 0)
		*opt++ = NOOPOPT;
}
1105
1106 /* Given a data block (or NULL) returns a block with enough header room that we
1107  * can send out.  block->wp is set to the beginning of the payload.  Returns
1108  * NULL on some sort of error. */
1109 static struct block *alloc_or_pad_block(struct block *data,
1110                                         uint16_t total_hdr_size)
1111 {
1112         if (data) {
1113                 data = padblock(data, total_hdr_size);
1114                 if (data == NULL)
1115                         return NULL;
1116         } else {
1117                 /* the 64 pad is to meet mintu's */
1118                 data = block_alloc(total_hdr_size + 64, MEM_WAIT);
1119                 if (data == NULL)
1120                         return NULL;
1121                 data->wp += total_hdr_size;
1122         }
1123         return data;
1124 }
1125
/* Marshal tcph (plus optional payload 'data', may be NULL) into an IPv6
 * TCP segment, using the prototype pseudo header ph for the addresses
 * and ports.  Returns the finished block or NULL if a block couldn't be
 * set up.  tcb may be NULL (e.g. RSTs and SYN ACKs from limbo): then no
 * window scaling or receive-side SACKs are applied and the checksum is
 * always computed. */
struct block *htontcp6(Tcp *tcph, struct block *data, Tcp6hdr *ph,
					   Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp6hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP6_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP6_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp) */
	data->transport_header_end = hdrlen + TCP6_PKT;

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp6hdr *) (data->rp);
	memmove(h, ph, TCP6_TCBPHDRSZ);

	/* compose pseudo tcp header, do cksum calculation */
	hnputl(h->vcf, hdrlen + dlen);
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
	h->ttl = ph->proto;

	/* copy in variable bits */
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* data-offset field: (hdrlen / 4) << 12 == hdrlen << 10 */
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	/* advertise the window right-shifted by the negotiated scale */
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	/* move from pseudo header back to normal ip header */
	memset(h->vcf, 0, 4);
	h->vcf[0] = IP_VER6;
	hnputs(h->ploadlen, hdrlen + dlen);
	h->proto = ph->proto;

	return data;
}
1175
/* Marshal tcph (plus optional payload 'data', may be NULL) into an IPv4
 * TCP segment, using the prototype pseudo header ph.  Returns the
 * finished block or NULL.  tcb may be NULL: then no window scaling or
 * receive-side SACKs are applied and the checksum is always computed. */
struct block *htontcp4(Tcp *tcph, struct block *data, Tcp4hdr *ph,
					   Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp4hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP4_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP4_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp) */
	data->transport_header_end = hdrlen + TCP4_PKT;

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp4hdr *) (data->rp);
	memmove(h, ph, TCP4_TCBPHDRSZ);

	/* copy in variable bits */
	hnputs(h->tcplen, hdrlen + dlen);
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* data-offset field: (hdrlen / 4) << 12 == hdrlen << 10 */
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	/* advertise the window right-shifted by the negotiated scale */
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		/* seed with the (inverted) checksum of the pseudo header only;
		 * checksum_start/offset + Btcpck hand the rest to the NIC.
		 * NOTE(review): presumably the stack finishes this in software
		 * when the device lacks checksum offload — confirm in ipoput4. */
		csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
		data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
		data->checksum_offset = ph->tcpcksum - ph->tcpsport;
		data->flag |= Btcpck;
	}

	return data;
}
1218
1219 static void parse_inbound_sacks(Tcp *tcph, uint8_t *opt, uint16_t optlen)
1220 {
1221         uint8_t nr_sacks;
1222         uint32_t left, right;
1223
1224         nr_sacks = (optlen - 2) / 8;
1225         if (nr_sacks > MAX_NR_SACKS_PER_PACKET)
1226                 return;
1227         opt += 2;
1228         for (int i = 0; i < nr_sacks; i++, opt += 8) {
1229                 left = nhgetl(opt);
1230                 right = nhgetl(opt + 4);
1231                 if (seq_ge(left, right)) {
1232                         /* bad / malicious SACK.  Skip it, and adjust. */
1233                         nr_sacks--;
1234                         i--;    /* stay on this array element next loop */
1235                         continue;
1236                 }
1237                 tcph->sacks[i].left = left;
1238                 tcph->sacks[i].right = right;
1239         }
1240         tcph->nr_sacks = nr_sacks;
1241 }
1242
1243 static void parse_inbound_opts(Tcp *tcph, uint8_t *opt, uint16_t optsize)
1244 {
1245         uint16_t optlen;
1246
1247         while (optsize > 0 && *opt != EOLOPT) {
1248                 if (*opt == NOOPOPT) {
1249                         optsize--;
1250                         opt++;
1251                         continue;
1252                 }
1253                 optlen = opt[1];
1254                 if (optlen < 2 || optlen > optsize)
1255                         break;
1256                 switch (*opt) {
1257                         case MSSOPT:
1258                                 if (optlen == MSS_LENGTH)
1259                                         tcph->mss = nhgets(opt + 2);
1260                                 break;
1261                         case WSOPT:
1262                                 if (optlen == WS_LENGTH && *(opt + 2) <= MAX_WS_VALUE)
1263                                         tcph->ws = HaveWS | *(opt + 2);
1264                                 break;
1265                         case SACK_OK_OPT:
1266                                 if (optlen == SACK_OK_LENGTH)
1267                                         tcph->sack_ok = TRUE;
1268                                 break;
1269                         case SACK_OPT:
1270                                 parse_inbound_sacks(tcph, opt, optlen);
1271                                 break;
1272                         case TS_OPT:
1273                                 if (optlen == TS_LENGTH) {
1274                                         tcph->ts_val = nhgetl(opt + 2);
1275                                         tcph->ts_ecr = nhgetl(opt + 6);
1276                                 }
1277                                 break;
1278                 }
1279                 optsize -= optlen;
1280                 opt += optlen;
1281         }
1282 }
1283
1284 /* Helper, clears the opts.  We'll later set them with e.g. parse_inbound_opts,
1285  * set them manually, or something else. */
1286 static void clear_tcph_opts(Tcp *tcph)
1287 {
1288         tcph->mss = 0;
1289         tcph->ws = 0;
1290         tcph->sack_ok = FALSE;
1291         tcph->nr_sacks = 0;
1292         tcph->ts_val = 0;
1293         tcph->ts_ecr = 0;
1294 }
1295
1296 int ntohtcp6(Tcp * tcph, struct block **bpp)
1297 {
1298         Tcp6hdr *h;
1299         uint16_t hdrlen;
1300
1301         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
1302         if (*bpp == NULL)
1303                 return -1;
1304
1305         h = (Tcp6hdr *) ((*bpp)->rp);
1306         tcph->source = nhgets(h->tcpsport);
1307         tcph->dest = nhgets(h->tcpdport);
1308         tcph->seq = nhgetl(h->tcpseq);
1309         tcph->ack = nhgetl(h->tcpack);
1310         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1311         if (hdrlen < TCP6_HDRSIZE) {
1312                 freeblist(*bpp);
1313                 return -1;
1314         }
1315
1316         tcph->flags = h->tcpflag[1];
1317         tcph->wnd = nhgets(h->tcpwin);
1318         tcph->urg = nhgets(h->tcpurg);
1319         clear_tcph_opts(tcph);
1320         tcph->len = nhgets(h->ploadlen) - hdrlen;
1321
1322         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
1323         if (*bpp == NULL)
1324                 return -1;
1325         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP6_HDRSIZE);
1326         return hdrlen;
1327 }
1328
1329 int ntohtcp4(Tcp * tcph, struct block **bpp)
1330 {
1331         Tcp4hdr *h;
1332         uint16_t hdrlen;
1333
1334         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
1335         if (*bpp == NULL)
1336                 return -1;
1337
1338         h = (Tcp4hdr *) ((*bpp)->rp);
1339         tcph->source = nhgets(h->tcpsport);
1340         tcph->dest = nhgets(h->tcpdport);
1341         tcph->seq = nhgetl(h->tcpseq);
1342         tcph->ack = nhgetl(h->tcpack);
1343
1344         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1345         if (hdrlen < TCP4_HDRSIZE) {
1346                 freeblist(*bpp);
1347                 return -1;
1348         }
1349
1350         tcph->flags = h->tcpflag[1];
1351         tcph->wnd = nhgets(h->tcpwin);
1352         tcph->urg = nhgets(h->tcpurg);
1353         clear_tcph_opts(tcph);
1354         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1355
1356         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1357         if (*bpp == NULL)
1358                 return -1;
1359         parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP4_HDRSIZE);
1360         return hdrlen;
1361 }
1362
1363 /*
1364  *  For outgoing calls, generate an initial sequence
1365  *  number and put a SYN on the send queue
1366  */
1367 void tcpsndsyn(struct conv *s, Tcpctl * tcb)
1368 {
1369         urandom_read(&tcb->iss, sizeof(tcb->iss));
1370         tcb->rttseq = tcb->iss;
1371         tcb->snd.wl2 = tcb->iss;
1372         tcb->snd.una = tcb->iss;
1373         tcb->snd.rtx = tcb->rttseq;
1374         tcb->snd.nxt = tcb->rttseq;
1375         tcb->flgcnt++;
1376         tcb->flags |= FORCE;
1377         tcb->sndsyntime = NOW;
1378
1379         /* set desired mss and scale */
1380         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale,
1381                           &tcb->flags);
1382 }
1383
/* Send a RST in response to a segment that has no (usable) connection.
 * The addresses/ports are swapped (we answer the sender), and seg's
 * seq/ack are rewritten per RFC 793 so the reset lands in the peer's
 * window.  Note: mutates *seg in place. */
void
sndrst(struct Proto *tcp, uint8_t * source, uint8_t * dest,
	   uint16_t length, Tcp * seg, uint8_t version, char *reason)
{
	struct block *hbp;
	uint8_t rflags;
	struct tcppriv *tpriv;
	Tcp4hdr ph4;
	Tcp6hdr ph6;

	netlog(tcp->f, Logtcpreset, "sndrst: %s\n", reason);

	tpriv = tcp->priv;

	/* never answer a RST with a RST */
	if (seg->flags & RST)
		return;

	/* make pseudo header */
	switch (version) {
		case V4:
			memset(&ph4, 0, sizeof(ph4));
			ph4.vihl = IP_VER4;
			v6tov4(ph4.tcpsrc, dest);
			v6tov4(ph4.tcpdst, source);
			ph4.proto = IP_TCPPROTO;
			hnputs(ph4.tcplen, TCP4_HDRSIZE);
			hnputs(ph4.tcpsport, seg->dest);
			hnputs(ph4.tcpdport, seg->source);
			break;
		case V6:
			memset(&ph6, 0, sizeof(ph6));
			ph6.vcf[0] = IP_VER6;
			ipmove(ph6.tcpsrc, dest);
			ipmove(ph6.tcpdst, source);
			ph6.proto = IP_TCPPROTO;
			hnputs(ph6.ploadlen, TCP6_HDRSIZE);
			hnputs(ph6.tcpsport, seg->dest);
			hnputs(ph6.tcpdport, seg->source);
			break;
		default:
			panic("sndrst: version %d", version);
	}

	tpriv->stats[OutRsts]++;
	rflags = RST;

	/* convince the other end that this reset is in band */
	if (seg->flags & ACK) {
		/* they ACKed: reset at the sequence they expect next */
		seg->seq = seg->ack;
		seg->ack = 0;
	} else {
		/* no ACK: ack everything they sent (SYN/FIN count as one
		 * sequence number each) and reset from sequence 0 */
		rflags |= ACK;
		seg->ack = seg->seq;
		seg->seq = 0;
		if (seg->flags & SYN)
			seg->ack++;
		seg->ack += length;
		if (seg->flags & FIN)
			seg->ack++;
	}
	seg->flags = rflags;
	seg->wnd = 0;
	seg->urg = 0;
	seg->mss = 0;
	seg->ws = 0;
	seg->sack_ok = FALSE;
	seg->nr_sacks = 0;
	/* seg->ts_val is already set with their timestamp */
	switch (version) {
		case V4:
			hbp = htontcp4(seg, NULL, &ph4, NULL);
			if (hbp == NULL)
				return;
			ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		case V6:
			hbp = htontcp6(seg, NULL, &ph6, NULL);
			if (hbp == NULL)
				return;
			ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		default:
			panic("sndrst2: version %d", version);
	}
}
1469
1470 /*
1471  *  send a reset to the remote side and close the conversation
1472  *  called with s qlocked
1473  */
static void tcphangup(struct conv *s)
{
	ERRSTACK(1);
	Tcp seg;
	Tcpctl *tcb;
	struct block *hbp;

	tcb = (Tcpctl *) s->ptcl;
	/* only send a RST if we ever knew the remote side */
	if (ipcmp(s->raddr, IPnoaddr)) {
		/* discard error style, poperror regardless */
		if (!waserror()) {
			seg.flags = RST | ACK;
			seg.ack = tcb->rcv.nxt;
			tcb->last_ack_sent = seg.ack;
			tcb->rcv.una = 0;
			seg.seq = tcb->snd.nxt;
			seg.wnd = 0;
			seg.urg = 0;
			seg.mss = 0;
			seg.ws = 0;
			seg.sack_ok = FALSE;
			seg.nr_sacks = 0;
			seg.ts_val = tcb->ts_recent;
			/* NOTE(review): htontcp4/6 can return NULL, which is
			 * passed to ipoput4/6 unchecked here — verify those
			 * tolerate a NULL block */
			switch (s->ipversion) {
				case V4:
					tcb->protohdr.tcp4hdr.vihl = IP_VER4;
					hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
					ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
					break;
				case V6:
					tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
					hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
					ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
					break;
				default:
					panic("tcphangup: version %d", s->ipversion);
			}
		}
		poperror();
	}
	localclose(s, NULL);
}
1516
1517 /*
1518  *  (re)send a SYN ACK
1519  */
/* Build and transmit a SYN ACK for the half-open connection described
 * by limbo entry lp.  Returns 0 on success, -1 if the segment could not
 * be built.  Updates lp->lastsend for the retransmit timer. */
int sndsynack(struct Proto *tcp, Limbo * lp)
{
	struct block *hbp;
	Tcp4hdr ph4;
	Tcp6hdr ph6;
	Tcp seg;
	int scale;
	uint8_t flag = 0;

	/* make pseudo header */
	switch (lp->version) {
		case V4:
			memset(&ph4, 0, sizeof(ph4));
			ph4.vihl = IP_VER4;
			v6tov4(ph4.tcpsrc, lp->laddr);
			v6tov4(ph4.tcpdst, lp->raddr);
			ph4.proto = IP_TCPPROTO;
			hnputs(ph4.tcplen, TCP4_HDRSIZE);
			hnputs(ph4.tcpsport, lp->lport);
			hnputs(ph4.tcpdport, lp->rport);
			break;
		case V6:
			memset(&ph6, 0, sizeof(ph6));
			ph6.vcf[0] = IP_VER6;
			ipmove(ph6.tcpsrc, lp->laddr);
			ipmove(ph6.tcpdst, lp->raddr);
			ph6.proto = IP_TCPPROTO;
			hnputs(ph6.ploadlen, TCP6_HDRSIZE);
			hnputs(ph6.tcpsport, lp->lport);
			hnputs(ph6.tcpdport, lp->rport);
			break;
		default:
			panic("sndrst: version %d", lp->version);
	}

	/* SYN consumes one sequence number, so ack irs + 1 */
	seg.seq = lp->iss;
	seg.ack = lp->irs + 1;
	seg.flags = SYN | ACK;
	seg.urg = 0;
	seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag);
	seg.wnd = QMAX;
	/* echo their TSval (stored in lp->ts_val) back as TSecr */
	seg.ts_val = lp->ts_val;
	seg.nr_sacks = 0;

	/* if the other side set scale, we should too */
	if (lp->rcvscale) {
		seg.ws = scale;
		lp->sndscale = scale;
	} else {
		seg.ws = 0;
		lp->sndscale = 0;
	}
	/* only offer SACK if they offered it and we support it */
	if (SACK_SUPPORTED)
		seg.sack_ok = lp->sack_ok;
	else
		seg.sack_ok = FALSE;

	switch (lp->version) {
		case V4:
			hbp = htontcp4(&seg, NULL, &ph4, NULL);
			if (hbp == NULL)
				return -1;
			ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		case V6:
			hbp = htontcp6(&seg, NULL, &ph6, NULL);
			if (hbp == NULL)
				return -1;
			ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
			break;
		default:
			panic("sndsnack: version %d", lp->version);
	}
	/* feeds the SYNACK_RXTIMER retransmit logic in limborexmit */
	lp->lastsend = NOW;
	return 0;
}
1596
1597 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
1598
1599 /*
1600  *  put a call into limbo and respond with a SYN ACK
1601  *
1602  *  called with proto locked
1603  */
static void
limbo(struct conv *s, uint8_t * source, uint8_t * dest, Tcp * seg, int version)
{
	Limbo *lp, **l;
	struct tcppriv *tpriv;
	int h;

	tpriv = s->p->priv;
	h = hashipa(source, seg->source);

	/* look for an existing half-open entry for this 4-tuple */
	for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
		lp = *l;
		if (lp->lport != seg->dest || lp->rport != seg->source
			|| lp->version != version)
			continue;
		if (ipcmp(lp->raddr, source) != 0)
			continue;
		if (ipcmp(lp->laddr, dest) != 0)
			continue;

		/* each new SYN restarts the retransmits */
		lp->irs = seg->seq;
		break;
	}
	lp = *l;
	if (lp == NULL) {
		/* at capacity: recycle the oldest entry on this chain rather
		 * than allocating (bounds memory under SYN floods) */
		if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
			lp = tpriv->lht[h];
			tpriv->lht[h] = lp->next;
			lp->next = NULL;
		} else {
			lp = kzmalloc(sizeof(*lp), 0);
			if (lp == NULL)
				return;
			tpriv->nlimbo++;
		}
		/* record everything needed to complete the handshake later */
		*l = lp;
		lp->version = version;
		ipmove(lp->laddr, dest);
		ipmove(lp->raddr, source);
		lp->lport = seg->dest;
		lp->rport = seg->source;
		lp->mss = seg->mss;
		lp->rcvscale = seg->ws;
		lp->sack_ok = seg->sack_ok;
		lp->irs = seg->seq;
		lp->ts_val = seg->ts_val;
		urandom_read(&lp->iss, sizeof(lp->iss));
	}

	/* couldn't even send the SYN ACK: drop the entry */
	if (sndsynack(s->p, lp) < 0) {
		*l = lp->next;
		tpriv->nlimbo--;
		kfree(lp);
	}
}
1660
1661 /*
1662  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1663  */
1664 static void limborexmit(struct Proto *tcp)
1665 {
1666         struct tcppriv *tpriv;
1667         Limbo **l, *lp;
1668         int h;
1669         int seen;
1670         uint64_t now;
1671
1672         tpriv = tcp->priv;
1673
1674         if (!canqlock(&tcp->qlock))
1675                 return;
1676         seen = 0;
1677         now = NOW;
1678         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1679                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1680                         lp = *l;
1681                         seen++;
1682                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1683                                 continue;
1684
1685                         /* time it out after 1 second */
1686                         if (++(lp->rexmits) > 5) {
1687                                 tpriv->nlimbo--;
1688                                 *l = lp->next;
1689                                 kfree(lp);
1690                                 continue;
1691                         }
1692
1693                         /* if we're being attacked, don't bother resending SYN ACK's */
1694                         if (tpriv->nlimbo > 100)
1695                                 continue;
1696
1697                         if (sndsynack(tcp, lp) < 0) {
1698                                 tpriv->nlimbo--;
1699                                 *l = lp->next;
1700                                 kfree(lp);
1701                                 continue;
1702                         }
1703
1704                         l = &lp->next;
1705                 }
1706         }
1707         qunlock(&tcp->qlock);
1708 }
1709
1710 /*
1711  *  lookup call in limbo.  if found, throw it out.
1712  *
1713  *  called with proto locked
1714  */
1715 static void
1716 limborst(struct conv *s, Tcp * segp, uint8_t * src, uint8_t * dst,
1717                  uint8_t version)
1718 {
1719         Limbo *lp, **l;
1720         int h;
1721         struct tcppriv *tpriv;
1722
1723         tpriv = s->p->priv;
1724
1725         /* find a call in limbo */
1726         h = hashipa(src, segp->source);
1727         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1728                 lp = *l;
1729                 if (lp->lport != segp->dest || lp->rport != segp->source
1730                         || lp->version != version)
1731                         continue;
1732                 if (ipcmp(lp->laddr, dst) != 0)
1733                         continue;
1734                 if (ipcmp(lp->raddr, src) != 0)
1735                         continue;
1736
1737                 /* RST can only follow the SYN */
1738                 if (segp->seq == lp->irs + 1) {
1739                         tpriv->nlimbo--;
1740                         *l = lp->next;
1741                         kfree(lp);
1742                 }
1743                 break;
1744         }
1745 }
1746
1747 /* The advertised MSS (e.g. 1460) includes any per-packet TCP options, such as
1748  * TCP timestamps.  A given packet will contain mss bytes, but only typical_mss
1749  * bytes of *data*.  If we know we'll use those options, we should adjust our
1750  * typical_mss, which will affect the cwnd. */
1751 static void adjust_typical_mss_for_opts(Tcp *tcph, Tcpctl *tcb)
1752 {
1753         uint16_t opt_size = 0;
1754
1755         if (tcph->ts_val)
1756                 opt_size += TS_LENGTH + TS_SEND_PREPAD;
1757         opt_size = ROUNDUP(opt_size, 4);
1758         tcb->typical_mss -= opt_size;
1759 }
1760
1761 /*
1762  *  come here when we finally get an ACK to our SYN-ACK.
1763  *  lookup call in limbo.  if found, create a new conversation
1764  *
1765  *  called with proto locked
1766  */
static struct conv *tcpincoming(struct conv *s, Tcp * segp, uint8_t * src,
								uint8_t * dst, uint8_t version)
{
	struct conv *new;
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	Limbo *lp, **l;
	int h;

	/* unless it's just an ack, it can't be someone coming out of limbo */
	if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
		return NULL;

	tpriv = s->p->priv;

	/* find a call in limbo */
	h = hashipa(src, segp->source);
	for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
		netlog(s->p->f, Logtcp,
			   "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
			   segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
			   lp->lport, version, lp->version);

		if (lp->lport != segp->dest || lp->rport != segp->source
			|| lp->version != version)
			continue;
		if (ipcmp(lp->laddr, dst) != 0)
			continue;
		if (ipcmp(lp->raddr, src) != 0)
			continue;

		/* we're assuming no data with the initial SYN */
		if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
			/* Sequence numbers don't line up with the handshake we
			 * recorded; not our ACK.  Drop out without unlinking. */
			netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
				   segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
			lp = NULL;
		} else {
			/* Match: remove from limbo; the entry is freed below. */
			tpriv->nlimbo--;
			*l = lp->next;
		}
		break;
	}
	if (lp == NULL)
		return NULL;

	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
	if (new == NULL)
		return NULL;

	/* Clone the listener's TCB, then detach the copy: it gets its own
	 * timers, which must point back at the new conversation. */
	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
	tcb = (Tcpctl *) new->ptcl;
	tcb->flags &= ~CLONE;
	tcb->timer.arg = new;
	tcb->timer.state = TcptimerOFF;
	tcb->acktimer.arg = new;
	tcb->acktimer.state = TcptimerOFF;
	tcb->katimer.arg = new;
	tcb->katimer.state = TcptimerOFF;
	tcb->rtt_timer.arg = new;
	tcb->rtt_timer.state = TcptimerOFF;

	/* Receive state: next expected byte is just past the peer's SYN. */
	tcb->irs = lp->irs;
	tcb->rcv.nxt = tcb->irs + 1;
	tcb->rcv.urg = tcb->rcv.nxt;

	/* Send state: everything starts just past our (already-acked) SYN-ACK. */
	tcb->iss = lp->iss;
	tcb->rttseq = tcb->iss;
	tcb->snd.wl2 = tcb->iss;
	tcb->snd.una = tcb->iss + 1;
	tcb->snd.rtx = tcb->iss + 1;
	tcb->snd.nxt = tcb->iss + 1;
	tcb->flgcnt = 0;
	tcb->flags |= SYNACK;

	/* our sending max segment size cannot be bigger than what he asked for */
	if (lp->mss != 0 && lp->mss < tcb->mss) {
		tcb->mss = lp->mss;
		tcb->typical_mss = tcb->mss;
	}
	adjust_typical_mss_for_opts(segp, tcb);

	/* Here's where we record the previously-decided header options.  They were
	 * actually decided on when we agreed to them in the SYNACK we sent.  We
	 * didn't create an actual TCB until now, so we can copy those decisions out
	 * of the limbo tracker and into the TCB. */
	tcb->sack_ok = lp->sack_ok;
	/* window scaling */
	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);

	tcb->snd.wnd = segp->wnd;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* set initial round trip time */
	tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
	tcpsynackrtt(new);

	kfree(lp);

	/* set up proto header */
	switch (version) {
		case V4:
			h4 = &tcb->protohdr.tcp4hdr;
			memset(h4, 0, sizeof(*h4));
			h4->proto = IP_TCPPROTO;
			hnputs(h4->tcpsport, new->lport);
			hnputs(h4->tcpdport, new->rport);
			v6tov4(h4->tcpsrc, dst);
			v6tov4(h4->tcpdst, src);
			break;
		case V6:
			h6 = &tcb->protohdr.tcp6hdr;
			memset(h6, 0, sizeof(*h6));
			h6->proto = IP_TCPPROTO;
			hnputs(h6->tcpsport, new->lport);
			hnputs(h6->tcpdport, new->rport);
			ipmove(h6->tcpsrc, dst);
			ipmove(h6->tcpdst, src);
			break;
		default:
			panic("tcpincoming: version %d", new->ipversion);
	}

	tcpsetstate(new, Established);

	/* Make the new conversation findable by the demux hash. */
	iphtadd(&tpriv->ht, new);

	return new;
}
1897
/* Returns nonzero if sequence number x lies within [low, high], handling
 * windows that wrap around the 32-bit sequence space. */
int seq_within(uint32_t x, uint32_t low, uint32_t high)
{
	if (low <= high)
		return low <= x && x <= high;
	/* The window wraps: x is inside if it's at/above low or at/below high */
	return x >= low || x <= high;
}
1909
/* Modular sequence compare: true iff x precedes y in sequence space. */
int seq_lt(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff < 0;
}
1914
/* Modular sequence compare: true iff x is at or before y in sequence space. */
int seq_le(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff <= 0;
}
1919
/* Modular sequence compare: true iff x follows y in sequence space. */
int seq_gt(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff > 0;
}
1924
/* Modular sequence compare: true iff x is at or after y in sequence space. */
int seq_ge(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff >= 0;
}
1929
/* Larger of two sequence numbers under modular (wraparound) comparison. */
static uint32_t seq_max(uint32_t x, uint32_t y)
{
	return (int)(x - y) >= 0 ? x : y;
}
1934
/* Smaller of two sequence numbers under modular (wraparound) comparison. */
static uint32_t seq_min(uint32_t x, uint32_t y)
{
	return (int)(x - y) <= 0 ? x : y;
}
1939
1940 /*
1941  *  use the time between the first SYN and it's ack as the
1942  *  initial round trip time
1943  */
void tcpsynackrtt(struct conv *s)
{
	Tcpctl *tcb;
	uint64_t delta;
	struct tcppriv *tpriv;

	tcb = (Tcpctl *) s->ptcl;
	tpriv = s->p->priv;

	/* Seed the smoothed RTT and mean-deviation estimators from the
	 * handshake RTT; sndsyntime was stamped when our SYN[ACK] went out. */
	delta = NOW - tcb->sndsyntime;
	tcb->srtt = delta << LOGAGAIN;
	tcb->mdev = delta << LOGDGAIN;

	/* halt round trip timer */
	tcphalt(tpriv, &tcb->rtt_timer);
}
1960
1961 /* For LFNs (long/fat), our default tx queue doesn't hold enough data, and TCP
1962  * blocks on the application - even if the app already has the data ready to go.
1963  * We need to hold the sent, unacked data (1x cwnd), plus all the data we might
1964  * send next RTT (1x cwnd).  Note this is called after cwnd was expanded. */
1965 static void adjust_tx_qio_limit(struct conv *s)
1966 {
1967         Tcpctl *tcb = (Tcpctl *) s->ptcl;
1968         size_t ideal_limit = tcb->cwind * 2;
1969
1970         /* This is called for every ACK, and it's not entirely free to update the
1971          * limit (locks, CVs, taps).  Updating in chunks of mss seems reasonable.
1972          * During SS, we'll update this on most ACKs (given each ACK increased the
1973          * cwind by > MSS).
1974          *
1975          * We also don't want a lot of tiny blocks from the user, but the way qio
1976          * works, you can put in as much as you want (Maxatomic) and then get
1977          * flow-controlled. */
1978         if (qgetlimit(s->wq) + tcb->typical_mss < ideal_limit)
1979                 qsetlimit(s->wq, ideal_limit);
1980         /* TODO: we could shrink the qio limit too, if we had a better idea what the
1981          * actual threshold was.  We want the limit to be the 'stable' cwnd * 2. */
1982 }
1983
1984 /* Attempts to merge later sacks into sack 'into' (index in the array) */
static void merge_sacks_into(Tcpctl *tcb, int into)
{
	struct sack_block *into_sack = &tcb->snd.sacks[into];
	struct sack_block *tcb_sack;
	int shift = 0;

	/* Scan the sacks after 'into'; any sack that touches or overlaps it is
	 * absorbed (possibly extending into's right edge).  The array is kept
	 * sorted, so we can stop at the first sack strictly to the right. */
	for (int i = into + 1; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(into_sack->right, tcb_sack->left))
			break;
		if (seq_gt(tcb_sack->right, into_sack->right))
			into_sack->right = tcb_sack->right;
		shift++;
	}
	if (shift) {
		/* Slide the surviving sacks left over the 'shift' absorbed
		 * entries; the count is the number of survivors after 'into'. */
		memmove(tcb->snd.sacks + into + 1,
			tcb->snd.sacks + into + 1 + shift,
			sizeof(struct sack_block) * (tcb->snd.nr_sacks - into - 1
							     - shift));
		tcb->snd.nr_sacks -= shift;
	}
}
2007
2008 /* If we update a sack, it means they received a packet (possibly out of order),
2009  * but they have not received earlier packets.  Otherwise, they would do a full
2010  * ACK.
2011  *
2012  * The trick is in knowing whether the reception growing this sack is due to a
2013  * retrans or due to packets from before our last loss event.  The rightmost
2014  * sack tends to grow a lot with packets we sent before the loss.  However,
2015  * intermediate sacks that grow are signs of a loss, since they only grow as a
2016  * result of retrans.
2017  *
2018  * This is only true for the first time through a retrans.  After we've gone
2019  * through a full retrans blast, the sack that hinted at the retrans loss (and
2020  * there could be multiple of them!) will continue to grow.  We could come up
2021  * with some tracking for this, but instead we'll just do a one-time deal.  You
2022  * can recover from one detected sack retrans loss.  After that, you'll have to
2023  * use the RTO.
2024  *
2025  * This won't catch some things, like a sack that grew and merged with the
2026  * rightmost sack.  This also won't work if you have a single sack.  We can't
2027  * tell where the retrans ends and the sending begins. */
2028 static bool sack_hints_at_loss(Tcpctl *tcb, struct sack_block *tcb_sack)
2029 {
2030         if (tcb->snd.recovery != SACK_RETRANS_RECOVERY)
2031                 return FALSE;
2032         return &tcb->snd.sacks[tcb->snd.nr_sacks - 1] != tcb_sack;
2033 }
2034
2035 static bool sack_contains(struct sack_block *tcb_sack, uint32_t seq)
2036 {
2037         return seq_le(tcb_sack->left, seq) && seq_lt(seq, tcb_sack->right);
2038 }
2039
2040 /* Debugging helper! */
2041 static void sack_asserter(Tcpctl *tcb, char *str)
2042 {
2043         struct sack_block *tcb_sack;
2044
2045         for (int i = 0; i < tcb->snd.nr_sacks; i++) {
2046                 tcb_sack = &tcb->snd.sacks[i];
2047                 /* Checking invariants: snd.rtx is never inside a sack, sacks are always
2048                  * mutually exclusive. */
2049                 if (sack_contains(tcb_sack, tcb->snd.rtx) ||
2050                     ((i + 1 < tcb->snd.nr_sacks) && seq_ge(tcb_sack->right,
2051                                                                (tcb_sack + 1)->left))) {
2052                         printk("SACK ASSERT ERROR at %s\n", str);
2053                         printk("rtx %u una %u nxt %u, sack [%u, %u)\n",
2054                                tcb->snd.rtx, tcb->snd.una, tcb->snd.nxt, tcb_sack->left,
2055                                    tcb_sack->right);
2056                         for (int i = 0; i < tcb->snd.nr_sacks; i++)
2057                                 printk("\t %d: [%u, %u)\n", i, tcb->snd.sacks[i].left,
2058                                        tcb->snd.sacks[i].right);
2059                         backtrace();
2060                         panic("");
2061                 }
2062         }
2063 }
2064
2065 /* Updates bookkeeping whenever a sack is added or updated */
static void sack_has_changed(struct conv *s, Tcpctl *tcb,
                             struct sack_block *tcb_sack)
{
	/* Due to the change, snd.rtx might be in the middle of this sack.  Advance
	 * it to the right edge. */
	if (sack_contains(tcb_sack, tcb->snd.rtx))
		tcb->snd.rtx = tcb_sack->right;

	/* This is a sack for something we retransed and we think it means there was
	 * another loss.  Instead of waiting for the RTO, we can take action. */
	if (sack_hints_at_loss(tcb, tcb_sack)) {
		/* Like dupacks, require TCPREXMTTHRESH hints before declaring a
		 * loss, to avoid reacting to mere reordering. */
		if (++tcb->snd.sack_loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: sack rxmit loss: snd.rtx %u, sack [%u,%u), una %u, recovery_pt %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.rtx, tcb_sack->left, tcb_sack->right, tcb->snd.una,
			       tcb->snd.recovery_pt);
			/* Redo retrans, but keep the sacks and recovery point */
			tcp_loss_event(s, tcb);
			tcb->snd.rtx = tcb->snd.una;
			tcb->snd.sack_loss_hint = 0;
			/* Act like an RTO.  We just detected it earlier.  This prevents us
			 * from getting another sack hint loss this recovery period and from
			 * advancing the opportunistic right edge. */
			tcb->snd.recovery = RTO_RETRANS_RECOVERY;
			/* We didn't actually time out yet and we expect to keep getting
			 * sacks, so we don't want to flush or worry about in_flight.  If we
			 * messed something up, the RTO will still fire. */
			set_in_flight(tcb);
		}
	}
}
2098
2099 /* Advances tcb_sack's right edge, if new_right is farther, and updates the
2100  * bookkeeping due to the change. */
2101 static void update_right_edge(struct conv *s, Tcpctl *tcb,
2102                               struct sack_block *tcb_sack, uint32_t new_right)
2103 {
2104         if (seq_le(new_right, tcb_sack->right))
2105                 return;
2106         tcb_sack->right = new_right;
2107         merge_sacks_into(tcb, tcb_sack - tcb->snd.sacks);
2108         sack_has_changed(s, tcb, tcb_sack);
2109 }
2110
/* Folds one received sack (seg_sack) into the TCB's sorted sack array:
 * grow an existing sack, insert a new one in order, or - when the array is
 * full - take over/replace an entry, always keeping the rightmost sack. */
static void update_or_insert_sack(struct conv *s, Tcpctl *tcb,
                                  struct sack_block *seg_sack)
{
	struct sack_block *tcb_sack;

	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb_sack->left, seg_sack->left)) {
			/* This includes adjacent (which I've seen!) and overlap. */
			if (seq_le(seg_sack->left, tcb_sack->right)) {
				update_right_edge(s, tcb, tcb_sack, seg_sack->right);
				return;
			}
			continue;
		}
		/* Update existing sack */
		if (tcb_sack->left == seg_sack->left) {
			update_right_edge(s, tcb, tcb_sack, seg_sack->right);
			return;
		}
		/* Found our slot */
		if (seq_gt(tcb_sack->left, seg_sack->left)) {
			if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
				/* Out of room, but it is possible this sack overlaps later
				 * sacks, including the max sack's right edge. */
				if (seq_ge(seg_sack->right, tcb_sack->left)) {
					/* Take over the sack */
					tcb_sack->left = seg_sack->left;
					update_right_edge(s, tcb, tcb_sack, seg_sack->right);
				}
				return;
			}
			/* O/W, it's our slot and we have room (at least one spot). */
			memmove(&tcb->snd.sacks[i + 1], &tcb->snd.sacks[i],
				sizeof(struct sack_block) * (tcb->snd.nr_sacks - i));
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			tcb->snd.nr_sacks++;
			merge_sacks_into(tcb, i);
			sack_has_changed(s, tcb, tcb_sack);
			return;
		}
	}
	/* seg_sack belongs past every existing sack. */
	if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
		/* We didn't find space in the sack array. */
		tcb_sack = &tcb->snd.sacks[MAX_NR_SND_SACKS - 1];
		/* Need to always maintain the rightmost sack, discarding the prev */
		if (seq_gt(seg_sack->right, tcb_sack->right)) {
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			sack_has_changed(s, tcb, tcb_sack);
		}
		return;
	}
	/* Append to the end of the (sorted) array. */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks];
	tcb->snd.nr_sacks++;
	tcb_sack->left = seg_sack->left;
	tcb_sack->right = seg_sack->right;
	sack_has_changed(s, tcb, tcb_sack);
}
2171
2172 /* Given the packet seg, track the sacks in TCB.  There are a few things: if seg
2173  * acks new data, some sacks might no longer be needed.  Some sacks might grow,
2174  * we might add new sacks, either of which can cause a merger.
2175  *
2176  * The important thing is that we always have the max sack entry: it must be
2177  * inserted for sure and findable.  We need that for our measurement of what
2178  * packets are in the network.
2179  *
2180  * Note that we keep sacks that are below snd.rtx (and above
2181  * seg.ack/tcb->snd.una) as best we can - we don't prune them.  We'll need those
2182  * for the in_flight estimate.
2183  *
2184  * When we run out of room, we'll have to throw away a sack.  Anything we throw
2185  * away below snd.rtx will be counted as 'in flight', even though it isn't.  If
2186  * we throw away something greater than snd.rtx, we'll also retrans it.  For
2187  * simplicity, we throw-away / replace the rightmost sack, since we're always
2188  * maintaining a highest sack. */
static void update_sacks(struct conv *s, Tcpctl *tcb, Tcp *seg)
{
	int prune = 0;
	struct sack_block *tcb_sack;

	/* First, drop the leading sacks made obsolete (or reneged) by the
	 * segment's cumulative ack. */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		/* For the equality case, if they acked up to, but not including an old
		 * sack, they must have reneged it.  Otherwise they would have acked
		 * beyond the sack. */
		if (seq_lt(seg->ack, tcb_sack->left))
			break;
		prune++;
	}
	if (prune) {
		memmove(tcb->snd.sacks, tcb->snd.sacks + prune,
			sizeof(struct sack_block) * (tcb->snd.nr_sacks - prune));
		tcb->snd.nr_sacks -= prune;
	}
	/* Then fold in each sane sack from the segment. */
	for (int i = 0; i < seg->nr_sacks; i++) {
		/* old sacks */
		if (seq_lt(seg->sacks[i].left, seg->ack))
			continue;
		/* buggy sack: out of range */
		if (seq_gt(seg->sacks[i].right, tcb->snd.nxt))
			continue;
		update_or_insert_sack(s, tcb, &seg->sacks[i]);
	}
}
2218
2219 /* This is a little bit of an under estimate, since we assume a packet is lost
2220  * once we have any sacks above it.  Overall, it's at most 2 * MSS of an
2221  * overestimate.
2222  *
2223  * If we have no sacks (either reneged or never used) we'll assume all packets
2224  * above snd.rtx are lost.  This will be the case for sackless fast rxmit
2225  * (Dong's stuff) or for a timeout.  In the former case, this is probably not
2226  * true, and in_flight should be higher, but we have no knowledge without the
2227  * sacks. */
static void set_in_flight(Tcpctl *tcb)
{
	struct sack_block *tcb_sack;
	uint32_t in_flight = 0;
	uint32_t from;

	if (!tcb->snd.nr_sacks) {
		/* No sack info: count everything sent/resent up to the retrans
		 * point as in flight (see the comment above this function). */
		tcb->snd.in_flight = tcb->snd.rtx - tcb->snd.una;
		return;
	}

	/* Everything to the right of the unsacked */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
	in_flight += tcb->snd.nxt - tcb_sack->right;

	/* Everything retransed (from una to snd.rtx, minus sacked regions.  Note
	 * we only retrans at most the last sack's left edge.  snd.rtx will be
	 * advanced to the right edge of some sack (possibly the last one). */
	from = tcb->snd.una;
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_ge(tcb_sack->left, tcb->snd.rtx))
			break;
		/* rtx is never inside a sack (sack_has_changed maintains this) */
		assert(seq_ge(tcb->snd.rtx, tcb_sack->right));
		/* the unsacked gap before this sack was (re)sent */
		in_flight += tcb_sack->left - from;
		from = tcb_sack->right;
	}
	in_flight += tcb->snd.rtx - from;

	tcb->snd.in_flight = in_flight;
}
2259
2260 static void reset_recovery(struct conv *s, Tcpctl *tcb)
2261 {
2262         netlog(s->p->f, Logtcprxmt,
2263                "%I.%d -> %I.%d: recovery complete, una %u, rtx %u, nxt %u, recovery %u\n",
2264                s->laddr, s->lport, s->raddr, s->rport,
2265                tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, tcb->snd.recovery_pt);
2266         tcb->snd.recovery = 0;
2267         tcb->snd.recovery_pt = 0;
2268         tcb->snd.loss_hint = 0;
2269         tcb->snd.flush_sacks = FALSE;
2270         tcb->snd.sack_loss_hint = 0;
2271 }
2272
2273 static bool is_dup_ack(Tcpctl *tcb, Tcp *seg)
2274 {
2275         /* this is a pure ack w/o window update */
2276         return (seg->ack == tcb->snd.una) &&
2277                (tcb->snd.una != tcb->snd.nxt) &&
2278                (seg->len == 0) &&
2279                (seg->wnd == tcb->snd.wnd);
2280 }
2281
2282 /* If we have sacks, we'll ignore dupacks and look at the sacks ahead of una
2283  * (which are managed by the TCB).  The tcb will not have old sacks (below
2284  * ack/snd.rtx).  Receivers often send sacks below their ack point when we are
2285  * coming out of a loss, and we don't want those to count.
2286  *
2287  * Note the tcb could have sacks (in the future), but the receiver stopped using
2288  * them (reneged).  We'll catch that with the RTO.  If we try to catch it here,
2289  * we could get in a state where we never allow them to renege. */
2290 static bool is_potential_loss(Tcpctl *tcb, Tcp *seg)
2291 {
2292         if (seg->nr_sacks > 0)
2293                 return tcb->snd.nr_sacks > 0;
2294         else
2295                 return is_dup_ack(tcb, seg);
2296 }
2297
2298 void update(struct conv *s, Tcp * seg)
2299 {
2300         int rtt, delta;
2301         Tcpctl *tcb;
2302         uint32_t acked, expand;
2303         struct tcppriv *tpriv;
2304
2305         tpriv = s->p->priv;
2306         tcb = (Tcpctl *) s->ptcl;
2307
2308         /* if everything has been acked, force output(?) */
2309         if (seq_gt(seg->ack, tcb->snd.nxt)) {
2310                 tcb->flags |= FORCE;
2311                 return;
2312         }
2313
2314         acked = seg->ack - tcb->snd.una;
2315         tcb->snd.una = seg->ack;
2316         if (seq_gt(seg->ack, tcb->snd.rtx))
2317                 tcb->snd.rtx = seg->ack;
2318
2319         update_sacks(s, tcb, seg);
2320         set_in_flight(tcb);
2321
2322         /* We treat either a dupack or forward SACKs as a hint that there is a loss.
2323          * The RFCs suggest three dupacks before treating it as a loss (alternative
2324          * is reordered packets).  We'll treat three SACKs the same way. */
2325         if (is_potential_loss(tcb, seg) && !tcb->snd.recovery) {
2326                 tcb->snd.loss_hint++;
2327                 if (tcb->snd.loss_hint == TCPREXMTTHRESH) {
2328                         netlog(s->p->f, Logtcprxmt,
2329                                "%I.%d -> %I.%d: loss hint thresh, nr sacks %u, nxt %u, una %u, cwnd %u\n",
2330                                s->laddr, s->lport, s->raddr, s->rport,
2331                                tcb->snd.nr_sacks, tcb->snd.nxt, tcb->snd.una, tcb->cwind);
2332                         tcp_loss_event(s, tcb);
2333                         tcb->snd.recovery_pt = tcb->snd.nxt;
2334                         if (tcb->snd.nr_sacks) {
2335                                 tcb->snd.recovery = SACK_RETRANS_RECOVERY;
2336                                 tcb->snd.flush_sacks = FALSE;
2337                                 tcb->snd.sack_loss_hint = 0;
2338                         } else {
2339                                 tcb->snd.recovery = FAST_RETRANS_RECOVERY;
2340                         }
2341                         tcprxmit(s);
2342                 }
2343         }
2344
2345         /*
2346          *  update window
2347          */
2348         if (seq_gt(seg->ack, tcb->snd.wl2)
2349                 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
2350                 tcb->snd.wnd = seg->wnd;
2351                 tcb->snd.wl2 = seg->ack;
2352         }
2353
2354         if (!acked) {
2355                 /*
2356                  *  don't let us hangup if sending into a closed window and
2357                  *  we're still getting acks
2358                  */
2359                 if ((tcb->flags & RETRAN) && tcb->snd.wnd == 0) {
2360                         tcb->backedoff = MAXBACKMS / 4;
2361                 }
2362                 return;
2363         }
2364         /* At this point, they have acked something new. (positive ack, ack > una).
2365          *
2366          * If we hadn't reached the threshold for recovery yet, the positive ACK
2367          * will reset our loss_hint count. */
2368         if (!tcb->snd.recovery)
2369                 tcb->snd.loss_hint = 0;
2370         else if (seq_ge(seg->ack, tcb->snd.recovery_pt))
2371                 reset_recovery(s, tcb);
2372
2373         /* avoid slow start and timers for SYN acks */
2374         if ((tcb->flags & SYNACK) == 0) {
2375                 tcb->flags |= SYNACK;
2376                 acked--;
2377                 tcb->flgcnt--;
2378                 goto done;
2379         }
2380
2381         /* slow start as long as we're not recovering from lost packets */
2382         if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
2383                 if (tcb->cwind < tcb->ssthresh) {
2384                         /* We increase the cwind by every byte we receive.  We want to
2385                          * increase the cwind by one MSS for every MSS that gets ACKed.
2386                          * Note that multiple MSSs can be ACKed in a single ACK.  If we had
2387                          * a remainder of acked / MSS, we'd add just that remainder - not 0
2388                          * or 1 MSS. */
2389                         expand = acked;
2390                 } else {
2391                         /* Every RTT, which consists of CWND bytes, we're supposed to expand
2392                          * by MSS bytes.  The classic algorithm was
2393                          *              expand = (tcb->mss * tcb->mss) / tcb->cwind;
2394                          * which assumes the ACK was for MSS bytes.  Instead, for every
2395                          * 'acked' bytes, we increase the window by acked / CWND (in units
2396                          * of MSS). */
2397                         expand = MAX(acked, tcb->typical_mss) * tcb->typical_mss
2398                                  / tcb->cwind;
2399                 }
2400
2401                 if (tcb->cwind + expand < tcb->cwind)
2402                         expand = tcb->snd.wnd - tcb->cwind;
2403                 if (tcb->cwind + expand > tcb->snd.wnd)
2404                         expand = tcb->snd.wnd - tcb->cwind;
2405                 tcb->cwind += expand;
2406         }
2407         adjust_tx_qio_limit(s);
2408
2409         /* Adjust the timers according to the round trip time */
2410         if (tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
2411                 tcphalt(tpriv, &tcb->rtt_timer);
2412                 if ((tcb->flags & RETRAN) == 0) {
2413                         tcb->backoff = 0;
2414                         tcb->backedoff = 0;
2415                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
2416                         if (rtt == 0)
2417                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
2418                         rtt *= MSPTICK;
2419                         if (tcb->srtt == 0) {
2420                                 tcb->srtt = rtt << LOGAGAIN;
2421                                 tcb->mdev = rtt << LOGDGAIN;
2422                         } else {
2423                                 delta = rtt - (tcb->srtt >> LOGAGAIN);
2424                                 tcb->srtt += delta;
2425                                 if (tcb->srtt <= 0)
2426                                         tcb->srtt = 1;
2427
2428                                 delta = abs(delta) - (tcb->mdev >> LOGDGAIN);
2429                                 tcb->mdev += delta;
2430                                 if (tcb->mdev <= 0)
2431                                         tcb->mdev = 1;
2432                         }
2433                         tcpsettimer(tcb);
2434                 }
2435         }
2436
2437 done:
2438         if (qdiscard(s->wq, acked) < acked)
2439                 tcb->flgcnt--;
2440
2441         if (seq_gt(seg->ack, tcb->snd.urg))
2442                 tcb->snd.urg = seg->ack;
2443
2444         if (tcb->snd.una != tcb->snd.nxt)
2445                 tcpgo(tpriv, &tcb->timer);
2446         else
2447                 tcphalt(tpriv, &tcb->timer);
2448
2449         tcb->flags &= ~RETRAN;
2450         tcb->backoff = 0;
2451         tcb->backedoff = 0;
2452 }
2453
2454 static void update_tcb_ts(Tcpctl *tcb, Tcp *seg)
2455 {
2456         /* Get timestamp info from the tcp header.  Even though the timestamps
2457          * aren't sequence numbers, we still need to protect for wraparound.  Though
2458          * if the values were 0, assume that means we need an update.  We could have
2459          * an initial ts_val that appears negative (signed). */
2460         if (!tcb->ts_recent || !tcb->last_ack_sent ||
2461             (seq_ge(seg->ts_val, tcb->ts_recent) &&
2462              seq_le(seg->seq, tcb->last_ack_sent)))
2463                 tcb->ts_recent = seg->ts_val;
2464 }
2465
2466 /* Overlap happens when one sack's left edge is inside another sack. */
2467 static bool sacks_overlap(struct sack_block *x, struct sack_block *y)
2468 {
2469         return (seq_le(x->left, y->left) && seq_le(y->left, x->right)) ||
2470                (seq_le(y->left, x->left) && seq_le(x->left, y->right));
2471 }
2472
2473 static void make_sack_first(Tcpctl *tcb, struct sack_block *tcb_sack)
2474 {
2475         struct sack_block temp;
2476
2477         if (tcb_sack == &tcb->rcv.sacks[0])
2478                 return;
2479         temp = tcb->rcv.sacks[0];
2480         tcb->rcv.sacks[0] = *tcb_sack;
2481         *tcb_sack = temp;
2482 }
2483
2484 /* Track sack in our tcb for a block of data we received.  This handles all the
2485  * stuff: making sure sack is first (since it's the most recent sack change),
2486  * updating or merging sacks, and dropping excess sacks (we only need to
2487  * maintain 3).  Unlike on the snd side, our tcb sacks are *not* sorted. */
2488 static void track_rcv_sack(Tcpctl *tcb, uint32_t left, uint32_t right)
2489 {
2490         struct sack_block *tcb_sack;
2491         struct sack_block sack[1];
2492
2493         if (!tcb->sack_ok)
2494                 return;
2495         assert(seq_lt(left, right));
2496         sack->left = left;
2497         sack->right = right;
2498         /* We can reuse an existing sack if we're merging or overlapping. */
2499         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2500                 tcb_sack = &tcb->rcv.sacks[i];
2501                 if (sacks_overlap(tcb_sack, sack)) {
2502                         tcb_sack->left = seq_min(tcb_sack->left, sack->left);
2503                         tcb_sack->right = seq_max(tcb_sack->right, sack->right);
2504                         make_sack_first(tcb, tcb_sack);
2505                         return;
2506                 }
2507         }
2508         /* We can discard the last sack (right shift) - we should have sent it at
2509          * least once by now.  If not, oh well. */
2510         memmove(tcb->rcv.sacks + 1, tcb->rcv.sacks, sizeof(struct sack_block) *
2511                 MIN(MAX_NR_RCV_SACKS - 1, tcb->rcv.nr_sacks));
2512         tcb->rcv.sacks[0] = *sack;
2513         if (tcb->rcv.nr_sacks < MAX_NR_RCV_SACKS)
2514                 tcb->rcv.nr_sacks++;
2515 }
2516
2517 /* Once we receive everything and move rcv.nxt past a sack, we don't need to
2518  * track it.  I've seen Linux report sacks in the past, but we probably
2519  * shouldn't. */
2520 static void drop_old_rcv_sacks(Tcpctl *tcb)
2521 {
2522         struct sack_block *tcb_sack;
2523
2524         for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2525                 tcb_sack = &tcb->rcv.sacks[i];
2526                 /* Moving up to or past the left is enough to drop it. */
2527                 if (seq_ge(tcb->rcv.nxt, tcb_sack->left)) {
2528                         memmove(tcb->rcv.sacks + i, tcb->rcv.sacks + i + 1,
2529                                 sizeof(struct sack_block) * (tcb->rcv.nr_sacks - i - 1));
2530                         tcb->rcv.nr_sacks--;
2531                         i--;
2532                 }
2533         }
2534 }
2535
/* TCP input: handed an inbound IP datagram (bp) carrying a TCP segment.
 * Validates the checksum and header for v4 or v6, demuxes to the matching
 * conversation via the n-tuple hash, diverts Bypass/Listen/limbo traffic, and
 * then runs the RFC 793 segment-arrival state machine with the conv's qlock
 * held (the protocol-wide qlock is dropped once the conv is locked).  bp is
 * consumed on every path: passed to the read queue, stashed in the resequence
 * queue, or freed. */
void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
{
	ERRSTACK(1);
	Tcp seg;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int hdrlen;
	Tcpctl *tcb;
	uint16_t length;
	uint8_t source[IPaddrlen], dest[IPaddrlen];
	struct conv *s;
	struct Fs *f;
	struct tcppriv *tpriv;
	uint8_t version;

	f = tcp->f;
	tpriv = tcp->priv;

	tpriv->stats[InSegs]++;

	/* Both header views alias the same packet bytes; the vihl nibble below
	 * decides which one is valid. */
	h4 = (Tcp4hdr *) (bp->rp);
	h6 = (Tcp6hdr *) (bp->rp);

	if ((h4->vihl & 0xF0) == IP_VER4) {
		uint8_t ttl;

		version = V4;
		length = nhgets(h4->length);
		v4tov6(dest, h4->tcpdst);
		v4tov6(source, h4->tcpsrc);

		/* ttl isn't part of the xsum pseudo header, but bypass needs it. */
		ttl = h4->Unused;
		h4->Unused = 0;
		hnputs(h4->tcplen, length - TCP4_PKT);
		/* Skip the software checksum if the NIC verified it (Btcpck) or if
		 * the sender sent a zero checksum. */
		if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
			ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp proto cksum\n");
			freeblist(bp);
			return;
		}
		h4->Unused = ttl;

		hdrlen = ntohtcp4(&seg, &bp);
		if (hdrlen < 0) {
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp hdr len\n");
			return;
		}

		s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
		if (s && s->state == Bypass) {
			bypass_or_drop(s, bp);
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen + TCP4_PKT;
		bp = trimblock(bp, hdrlen + TCP4_PKT, length);
		if (bp == NULL) {
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcp len < 0 after trim\n");
			return;
		}
	} else {
		/* v6: the pseudo-header checksum temporarily clobbers ploadlen,
		 * proto, ttl, and vcf; all are restored after ptclcsum. */
		int ttl = h6->ttl;
		int proto = h6->proto;

		version = V6;
		length = nhgets(h6->ploadlen);
		ipmove(dest, h6->tcpdst);
		ipmove(source, h6->tcpsrc);

		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
		h6->ttl = proto;
		hnputl(h6->vcf, length);
		if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
			ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
			tpriv->stats[CsumErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp proto cksum\n");
			freeblist(bp);
			return;
		}
		h6->ttl = ttl;
		h6->proto = proto;
		hnputs(h6->ploadlen, length);

		hdrlen = ntohtcp6(&seg, &bp);
		if (hdrlen < 0) {
			tpriv->stats[HlenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "bad tcp hdr len\n");
			return;
		}

		s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
		if (s && s->state == Bypass) {
			bypass_or_drop(s, bp);
			return;
		}

		/* trim the packet to the size claimed by the datagram */
		length -= hdrlen;
		bp = trimblock(bp, hdrlen + TCP6_PKT, length);
		if (bp == NULL) {
			tpriv->stats[LenErrs]++;
			tpriv->stats[InErrs]++;
			netlog(f, Logtcp, "tcp len < 0 after trim\n");
			return;
		}
	}

	/* s, the conv matching the n-tuple, was set above */
	if (s == NULL) {
		netlog(f, Logtcpreset, "iphtlook failed: src %I:%u, dst %I:%u\n",
		       source, seg.source, dest, seg.dest);
/* 'reset' is also reached by goto from the Listen path below when
 * tcpincoming fails. */
reset:
		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
		freeblist(bp);
		return;
	}

	/* lock protocol for unstate Plan 9 invariants.  funcs like limbo or
	 * incoming might rely on it. */
	qlock(&tcp->qlock);

	/* if it's a listener, look for the right flags and get a new conv */
	tcb = (Tcpctl *) s->ptcl;
	if (tcb->state == Listen) {
		if (seg.flags & RST) {
			limborst(s, &seg, source, dest, version);
			qunlock(&tcp->qlock);
			freeblist(bp);
			return;
		}

		/* if this is a new SYN, put the call into limbo */
		if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
			limbo(s, source, dest, &seg, version);
			qunlock(&tcp->qlock);
			freeblist(bp);
			return;
		}

		/* if there's a matching call in limbo, tcpincoming will return it */
		s = tcpincoming(s, &seg, source, dest, version);
		if (s == NULL) {
			qunlock(&tcp->qlock);
			goto reset;
		}
	}

	/* The rest of the input state machine is run with the control block
	 * locked and implements the state machine directly out of the RFC.
	 * Out-of-band data is ignored - it was always a bad idea.
	 */
	tcb = (Tcpctl *) s->ptcl;
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	/* Lock ordering: take the conv lock before releasing the proto lock. */
	qlock(&s->qlock);
	qunlock(&tcp->qlock);

	update_tcb_ts(tcb, &seg);
	/* fix up window: the on-wire window is in units of 2^rcv.scale bytes */
	seg.wnd <<= tcb->rcv.scale;

	/* every input packet in puts off the keep alive time out */
	tcpsetkacounter(tcb);

	switch (tcb->state) {
		case Closed:
			sndrst(tcp, source, dest, length, &seg, version,
				   "sending to Closed");
			goto raise;
		case Syn_sent:
			if (seg.flags & ACK) {
				if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
					sndrst(tcp, source, dest, length, &seg, version,
						   "bad seq in Syn_sent");
					goto raise;
				}
			}
			if (seg.flags & RST) {
				if (seg.flags & ACK)
					localclose(s, "connection refused");
				goto raise;
			}

			if (seg.flags & SYN) {
				procsyn(s, &seg);
				if (seg.flags & ACK) {
					update(s, &seg);
					tcpsynackrtt(s);
					tcpsetstate(s, Established);
					/* Here's where we get the results of header option
					 * negotiations for connections we started. (SYNACK has the
					 * response) */
					tcpsetscale(s, tcb, seg.ws, tcb->scale);
					tcb->sack_ok = seg.sack_ok;
				} else {
					sndrst(tcp, source, dest, length, &seg, version,
						   "Got SYN with no ACK");
					goto raise;
				}

				/* SYNACK carried data or a FIN: fall through to the
				 * main processing loop below. */
				if (length != 0 || (seg.flags & FIN))
					break;

				freeblist(bp);
				goto output;
			} else
				freeblist(bp);

			qunlock(&s->qlock);
			poperror();
			return;
	}

	/*
	 *  One DOS attack is to open connections to us and then forget about them,
	 *  thereby tying up a conv at no long term cost to the attacker.
	 *  This is an attempt to defeat these stateless DOS attacks.  See
	 *  corresponding code in tcpsendka().
	 */
	if ((seg.flags & RST) == 0) {
		if (tcpporthogdefense
			&& seq_within(seg.ack, tcb->snd.una - (1 << 31),
						  tcb->snd.una - (1 << 29))) {
			printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
				   source, seg.source, dest, seg.dest, seg.flags,
				   tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
			localclose(s, "stateless hog");
		}
	}

	/* Cut the data to fit the receive window */
	if (tcptrim(tcb, &seg, &bp, &length) == -1) {
		/* Segment entirely outside the window: still process its ACK info
		 * via update(), and (unless it's a RST) force an ACK reply. */
		netlog(f, Logtcp, "%I.%d -> %I.%d: tcp len < 0, %lu %d\n",
		       s->raddr, s->rport, s->laddr, s->lport, seg.seq, length);
		update(s, &seg);
		if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
			tcphalt(tpriv, &tcb->rtt_timer);
			tcphalt(tpriv, &tcb->acktimer);
			tcphalt(tpriv, &tcb->katimer);
			tcpsetstate(s, Time_wait);
			tcb->timer.start = MSL2 * (1000 / MSPTICK);
			tcpgo(tpriv, &tcb->timer);
		}
		if (!(seg.flags & RST)) {
			tcb->flags |= FORCE;
			goto output;
		}
		qunlock(&s->qlock);
		poperror();
		return;
	}

	/* Cannot accept so answer with a rst */
	if (length && tcb->state == Closed) {
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
		goto raise;
	}

	/* The segment is beyond the current receive pointer so
	 * queue the data in the resequence queue
	 */
	if (seg.seq != tcb->rcv.nxt)
		if (length != 0 || (seg.flags & (SYN | FIN))) {
			update(s, &seg);
			if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
				printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
					   s->lport);
			tcb->flags |= FORCE;
			goto output;
		}

	/*
	 *  keep looping till we've processed this packet plus any
	 *  adjacent packets in the resequence queue
	 */
	for (;;) {
		if (seg.flags & RST) {
			if (tcb->state == Established) {
				tpriv->stats[EstabResets]++;
				if (tcb->rcv.nxt != seg.seq)
					printd
						("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
						 s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
						 seg.seq);
			}
			localclose(s, "connection refused");
			goto raise;
		}

		/* Every segment past this point must carry an ACK. */
		if ((seg.flags & ACK) == 0)
			goto raise;

		switch (tcb->state) {
			case Established:
			case Close_wait:
				update(s, &seg);
				break;
			case Finwait1:
				/* Our FIN fully ACKed: move to Finwait2 and arm the
				 * keepalive timer as a Time_wait-style backstop. */
				update(s, &seg);
				if (qlen(s->wq) + tcb->flgcnt == 0) {
					tcphalt(tpriv, &tcb->rtt_timer);
					tcphalt(tpriv, &tcb->acktimer);
					tcpsetkacounter(tcb);
					tcb->time = NOW;
					tcpsetstate(s, Finwait2);
					tcb->katimer.start = MSL2 * (1000 / MSPTICK);
					tcpgo(tpriv, &tcb->katimer);
				}
				break;
			case Finwait2:
				update(s, &seg);
				break;
			case Closing:
				update(s, &seg);
				if (qlen(s->wq) + tcb->flgcnt == 0) {
					tcphalt(tpriv, &tcb->rtt_timer);
					tcphalt(tpriv, &tcb->acktimer);
					tcphalt(tpriv, &tcb->katimer);
					tcpsetstate(s, Time_wait);
					tcb->timer.start = MSL2 * (1000 / MSPTICK);
					tcpgo(tpriv, &tcb->timer);
				}
				break;
			case Last_ack:
				update(s, &seg);
				if (qlen(s->wq) + tcb->flgcnt == 0) {
					localclose(s, NULL);
					goto raise;
				}
				/* NOTE(review): no break here - falls through into
				 * Time_wait; appears deliberate (inherited from the
				 * Plan 9 code) but worth confirming. */
			case Time_wait:
				tcb->flags |= FORCE;
				if (tcb->timer.state != TcptimerON)
					tcpgo(tpriv, &tcb->timer);
		}

		if ((seg.flags & URG) && seg.urg) {
			if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
				tcb->rcv.urg = seg.urg + seg.seq;
				pullblock(&bp, seg.urg);
			}
		} else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
			tcb->rcv.urg = tcb->rcv.nxt;

		if (length == 0) {
			if (bp != NULL)
				freeblist(bp);
		} else {
			switch (tcb->state) {
				default:
					/* Ignore segment text */
					if (bp != NULL)
						freeblist(bp);
					break;

				case Established:
				case Finwait1:
					/* If we still have some data place on
					 * receive queue
					 */
					if (bp) {
						bp = packblock(bp);
						if (bp == NULL)
							panic("tcp packblock");
						qpassnolim(s->rq, bp);
						bp = NULL;

						/*
						 *  Force an ack every 2 data messages.  This is
						 *  a hack for rob to make his home system run
						 *  faster.
						 *
						 *  this also keeps the standard TCP congestion
						 *  control working since it needs an ack every
						 *  2 max segs worth.  This is not quite that,
						 *  but under a real stream is equivalent since
						 *  every packet has a max seg in it.
						 */
						if (++(tcb->rcv.una) >= 2)
							tcb->flags |= FORCE;
					}
					tcb->rcv.nxt += length;
					drop_old_rcv_sacks(tcb);

					/*
					 *  update our rcv window
					 */
					tcprcvwin(s);

					/*
					 *  turn on the acktimer if there's something
					 *  to ack
					 */
					if (tcb->acktimer.state != TcptimerON)
						tcpgo(tpriv, &tcb->acktimer);

					break;
				case Finwait2:
					/* no process to read the data, send a reset */
					if (bp != NULL)
						freeblist(bp);
					sndrst(tcp, source, dest, length, &seg, version,
						   "send to Finwait2");
					qunlock(&s->qlock);
					poperror();
					return;
			}
		}

		if (seg.flags & FIN) {
			tcb->flags |= FORCE;

			switch (tcb->state) {
				case Established:
					tcb->rcv.nxt++;
					tcpsetstate(s, Close_wait);
					break;
				case Finwait1:
					tcb->rcv.nxt++;
					if (qlen(s->wq) + tcb->flgcnt == 0) {
						tcphalt(tpriv, &tcb->rtt_timer);
						tcphalt(tpriv, &tcb->acktimer);
						tcphalt(tpriv, &tcb->katimer);
						tcpsetstate(s, Time_wait);
						tcb->timer.start = MSL2 * (1000 / MSPTICK);
						tcpgo(tpriv, &tcb->timer);
					} else
						tcpsetstate(s, Closing);
					break;
				case Finwait2:
					tcb->rcv.nxt++;
					tcphalt(tpriv, &tcb->rtt_timer);
					tcphalt(tpriv, &tcb->acktimer);
					tcphalt(tpriv, &tcb->katimer);
					tcpsetstate(s, Time_wait);
					tcb->timer.start = MSL2 * (1000 / MSPTICK);
					tcpgo(tpriv, &tcb->timer);
					break;
				case Close_wait:
				case Closing:
				case Last_ack:
					break;
				case Time_wait:
					/* Restart the 2MSL timer on a retransmitted FIN. */
					tcpgo(tpriv, &tcb->timer);
					break;
			}
		}

		/*
		 *  get next adjacent segment from the resequence queue.
		 *  dump/trim any overlapping segments
		 */
		for (;;) {
			if (tcb->reseq == NULL)
				goto output;

			if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
				goto output;

			getreseq(tcb, &seg, &bp, &length);

			if (tcptrim(tcb, &seg, &bp, &length) == 0)
				break;
		}
	}
output:
	tcpoutput(s);
	qunlock(&s->qlock);
	poperror();
	return;
raise:
	qunlock(&s->qlock);
	poperror();
	freeblist(bp);
	tcpkick(s);
}
3023
3024 /* The advertised mss = data + TCP headers */
3025 static uint16_t derive_payload_mss(Tcpctl *tcb)
3026 {
3027         uint16_t payload_mss = tcb->mss;
3028         uint16_t opt_size = 0;
3029
3030         if (tcb->ts_recent) {
3031                 opt_size += TS_LENGTH;
3032                 /* Note that when we're a SYN, we overestimate slightly.  This is safe,
3033                  * and not really a problem. */
3034                 opt_size += TS_SEND_PREPAD;
3035         }
3036         if (tcb->rcv.nr_sacks)
3037                 opt_size += 2 + tcb->rcv.nr_sacks * 8;
3038         opt_size = ROUNDUP(opt_size, 4);
3039         payload_mss -= opt_size;
3040         return payload_mss;
3041 }
3042
3043 /* Decreases the xmit amt, given the MSS / TSO. */
3044 static uint32_t throttle_for_mss(Tcpctl *tcb, uint32_t ssize,
3045                                  uint16_t payload_mss, bool retrans)
3046 {
3047         if (ssize > payload_mss) {
3048                 if ((tcb->flags & TSO) == 0) {
3049                         ssize = payload_mss;
3050                 } else {
3051                         /* Don't send too much.  32K is arbitrary.. */
3052                         if (ssize > 32 * 1024)
3053                                 ssize = 32 * 1024;
3054                         if (!retrans) {
3055                                 /* Clamp xmit to an integral MSS to avoid ragged tail segments
3056                                  * causing poor link utilization. */
3057                                 ssize = ROUNDDOWN(ssize, payload_mss);
3058                         }
3059                 }
3060         }
3061         return ssize;
3062 }
3063
3064 /* Reduces ssize for a variety of reasons.  Returns FALSE if we should abort
3065  * sending the packet.  o/w returns TRUE and modifies ssize by reference. */
3066 static bool throttle_ssize(struct conv *s, Tcpctl *tcb, uint32_t *ssize_p,
3067                            uint16_t payload_mss, bool retrans)
3068 {
3069         struct Fs *f = s->p->f;
3070         uint32_t usable;
3071         uint32_t ssize = *ssize_p;
3072
3073         /* Compute usable segment based on offered window and limit
3074          * window probes to one */
3075         if (tcb->snd.wnd == 0) {
3076                 if (tcb->snd.in_flight != 0) {
3077                         if ((tcb->flags & FORCE) == 0)
3078                                 return FALSE;
3079                 }
3080                 usable = 1;
3081         } else {
3082                 usable = tcb->cwind;
3083                 if (tcb->snd.wnd < usable)
3084                         usable = tcb->snd.wnd;
3085                 if (usable > tcb->snd.in_flight)
3086                         usable -= tcb->snd.in_flight;
3087                 else
3088                         usable = 0;
3089                 /* Avoid Silly Window Syndrome.  This is a little different thant RFC
3090                  * 813.  I took their additional enhancement of "< MSS" as an AND, not
3091                  * an OR.  25% of a large snd.wnd is pretty large, and our main goal is
3092                  * to avoid packets smaller than MSS.  I still use the 25% threshold,
3093                  * because it is important that there is *some* data in_flight.  If
3094                  * usable < MSS because snd.wnd is very small (but not 0), we might
3095                  * never get an ACK and would need to set up a timer.
3096                  *
3097                  * Also, I'm using 'ssize' as a proxy for a PSH point.  If there's just
3098                  * a small blob in the qio (or retrans!), then we might as well just
3099                  * send it. */
3100                 if ((usable < tcb->typical_mss) && (usable < tcb->snd.wnd >> 2)
3101                     && (usable < ssize)) {
3102                         return FALSE;
3103                 }
3104         }
3105         if (ssize && usable < 2)
3106                 netlog(s->p->f, Logtcpverbose,
3107                        "%I.%d -> %I.%d: throttled snd.wnd %lu cwind %lu\n",
3108                        s->laddr, s->lport, s->raddr, s->rport,
3109                        tcb->snd.wnd, tcb->cwind);
3110         if (usable < ssize)
3111                 ssize = usable;
3112
3113         ssize = throttle_for_mss(tcb, ssize, payload_mss, retrans);
3114
3115         *ssize_p = ssize;
3116         return TRUE;
3117 }
3118
/* Helper, picks the next segment to send, which is possibly a retransmission.
 * Returns TRUE if we have a segment, FALSE o/w.  Returns ssize, from_seq, and
 * sent by reference.
 *
 * from_seq is the seq number we are transmitting from.
 *
 * sent includes all seq from una to from_seq *including* any previously sent
 * flags (part of tcb->flgcnt), for instance an unacknowledged SYN (which counts
 * as a seq number).  Those flags are in the e.g. snd.nxt - snd.una range, and
 * they get dropped after qdiscard.
 *
 * ssize is the amount of data we are sending, starting from from_seq, and it
 * will include any *new* flags, which haven't been accounted for yet.
 *
 * tcb->flgcnt consists of the flags both in ssize and in sent.
 *
 * Note that we could be in recovery and not sack_retrans a segment. */
static bool get_xmit_segment(struct conv *s, Tcpctl *tcb, uint16_t payload_mss,
                             uint32_t *from_seq_p, uint32_t *sent_p,
                             uint32_t *ssize_p)
{
	struct Fs *f = s->p->f;
	struct tcppriv *tpriv = s->p->priv;
	uint32_t ssize, sent, from_seq;
	bool sack_retrans = FALSE;
	struct sack_block *tcb_sack = 0;

	/* Scan for the first hole below a sacked block: if rtx hasn't caught
	 * up to some sack's left edge, that gap is our retrans candidate. */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb->snd.rtx, tcb_sack->left)) {
			/* So ssize is supposed to include any *new* flags to flgcnt, which
			 * at this point would be a FIN.
			 *
			 * It might be possible that flgcnt is incremented so we send a FIN,
			 * even for an intermediate sack retrans.  Perhaps the user closed
			 * the conv.
			 *
			 * However, the way the "flgcnt for FIN" works is that it inflates
			 * the desired amount we'd like to send (qlen + flgcnt).
			 * Eventually, we reach the end of the queue and fail to extract all
			 * of dsize.  At that point, we put on the FIN, and that's where the
			 * extra 'byte' comes from.
			 *
			 * For sack retrans, since we're extracting from parts of the qio
			 * that aren't the right-most edge, we don't need to consider flgcnt
			 * when setting ssize. */
			from_seq = tcb->snd.rtx;
			sent = from_seq - tcb->snd.una;
			ssize = tcb_sack->left - from_seq;
			sack_retrans = TRUE;
			break;
		}
	}
	/* SACK holes have first dibs, but we can still opportunistically send new
	 * data.
	 *
	 * During other types of recovery, we'll just send from the retrans point.
	 * If we're in an RTO while we still have sacks, we could be resending data
	 * that wasn't lost.  Consider a sack that is still growing (usually the
	 * right-most), but we haven't received the ACK yet.  rxt may be included in
	 * that area.  Given we had two losses or otherwise timed out, I'm not too
	 * concerned.
	 *
	 * Note that Fast and RTO can send data beyond nxt.  If we change that,
	 * change the accounting below. */
	if (!sack_retrans) {
		switch (tcb->snd.recovery) {
		default:
		case SACK_RETRANS_RECOVERY:
			from_seq = tcb->snd.nxt;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			from_seq = tcb->snd.rtx;
			break;
		}
		sent = from_seq - tcb->snd.una;
		/* qlen + flgcnt is every seq we want to have sent, including unack'd
		 * data, unacked flags, and new flags. */
		ssize = qlen(s->wq) + tcb->flgcnt - sent;
	}

	/* Cut ssize down per window/cwnd/MSS limits; may tell us not to send. */
	if (!throttle_ssize(s, tcb, &ssize, payload_mss, sack_retrans))
		return FALSE;

	/* This counts flags, which is a little hokey, but it's okay since in_flight
	 * gets reset on each ACK */
	tcb->snd.in_flight += ssize;
	/* Log and track rxmit.  This covers both SACK (retrans) and fast rxmit. */
	if (ssize && seq_lt(tcb->snd.rtx, tcb->snd.nxt)) {
		netlog(f, Logtcpverbose,
		       "%I.%d -> %I.%d: rxmit: rtx %u amt %u, nxt %u\n",
		       s->laddr, s->lport, s->raddr, s->rport,
		       tcb->snd.rtx, MIN(tcb->snd.nxt - tcb->snd.rtx, ssize),
		       tcb->snd.nxt);
		tpriv->stats[RetransSegs]++;
	}
	if (sack_retrans) {
		/* If we'll send up to the left edge, advance snd.rtx to the right.
		 *
		 * This includes the largest sack.  It might get removed later, in which
		 * case we'll underestimate the amount in-flight.  The alternative is to
		 * not count the rightmost sack, but when it gets removed, we'll retrans
		 * it anyway.  No matter what, we'd count it. */
		tcb->snd.rtx += ssize;
		if (tcb->snd.rtx == tcb_sack->left)
			tcb->snd.rtx = tcb_sack->right;
		/* RFC 6675 says we MAY rearm the RTO timer on each retrans, since we
		 * might not be getting ACKs for a while. */
		tcpsettimer(tcb);
	} else {
		switch (tcb->snd.recovery) {
		default:
			/* under normal op, we drag rtx along with nxt.  this prevents us
			 * from sending sacks too early (up above), since rtx doesn't get
			 * reset to una until we have a loss (e.g. 3 dupacks/sacks). */
			tcb->snd.nxt += ssize;
			tcb->snd.rtx = tcb->snd.nxt;
			break;
		case SACK_RETRANS_RECOVERY:
			/* We explicitly do not want to increase rtx here.  We might still
			 * need it to fill in a sack gap below nxt if we get new, higher
			 * sacks. */
			tcb->snd.nxt += ssize;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			tcb->snd.rtx += ssize;
			/* Fast and RTO can send new data, advancing nxt. */
			if (seq_gt(tcb->snd.rtx, tcb->snd.nxt))
				tcb->snd.nxt = tcb->snd.rtx;
			break;
		}
	}
	*from_seq_p = from_seq;
	*sent_p = sent;
	*ssize_p = ssize;

	return TRUE;
}
3259
3260 /*
3261  *  always enters and exits with the s locked.  We drop
3262  *  the lock to ipoput the packet so some care has to be
3263  *  taken by callers.
3264  */
3265 void tcpoutput(struct conv *s)
3266 {
3267         Tcp seg;
3268         int msgs;
3269         int next_yield = 1;
3270         Tcpctl *tcb;
3271         struct block *hbp, *bp;
3272         uint32_t ssize, dsize, sent, from_seq;
3273         struct Fs *f;
3274         struct tcppriv *tpriv;
3275         uint8_t version;
3276         uint16_t payload_mss;
3277
3278         f = s->p->f;
3279         tpriv = s->p->priv;
3280         version = s->ipversion;
3281
3282         for (msgs = 0; msgs < 100; msgs++) {
3283                 tcb = (Tcpctl *) s->ptcl;
3284
3285                 switch (tcb->state) {
3286                         case Listen:
3287                         case Closed:
3288                         case Finwait2:
3289                                 return;
3290                 }
3291
3292                 /* force an ack when a window has opened up */
3293                 if (tcb->rcv.blocked && tcb->rcv.wnd >= tcb->mss) {
3294                         tcb->rcv.blocked = 0;
3295                         tcb->flags |= FORCE;
3296                 }
3297
3298                 /* Don't send anything else until our SYN has been acked */
3299                 if (tcb->snd.nxt != tcb->iss && (tcb->flags & SYNACK) == 0)
3300                         break;
3301
3302                 /* payload_mss is the actual amount of data in the packet, which is the
3303                  * advertised (mss - header opts).  This varies from packet to packet,
3304                  * based on the options that might be present (e.g. always timestamps,
3305                  * sometimes SACKs) */
3306                 payload_mss = derive_payload_mss(tcb);
3307
3308                 if (!get_xmit_segment(s, tcb, payload_mss, &from_seq, &sent, &ssize))
3309                         break;
3310
3311                 dsize = ssize;
3312                 seg.urg = 0;
3313
3314                 if (ssize == 0)
3315                         if ((tcb->flags & FORCE) == 0)
3316                                 break;
3317
3318                 tcb->flags &= ~FORCE;
3319                 tcprcvwin(s);
3320
3321                 /* By default we will generate an ack, so we can normally turn off the
3322                  * timer.  If we're blocked, we'll want the timer so we can send a
3323                  * window update. */
3324                 if (!tcb->rcv.blocked)
3325                         tcphalt(tpriv, &tcb->acktimer);
3326                 tcb->rcv.una = 0;
3327                 seg.source = s->lport;
3328                 seg.dest = s->rport;
3329                 seg.flags = ACK;
3330                 seg.mss = 0;
3331                 seg.ws = 0;
3332                 seg.sack_ok = FALSE;
3333                 seg.nr_sacks = 0;
3334                 /* When outputting, Syn_sent means "send the Syn", for connections we
3335                  * initiate.  SYNACKs are sent from sndsynack directly. */
3336                 if (tcb->state == Syn_sent) {
3337                         seg.flags = 0;
3338                         seg.sack_ok = SACK_SUPPORTED;   /* here's where we advertise SACK */
3339                         if (tcb->snd.nxt - ssize == tcb->iss) {
3340                                 seg.flags |= SYN;
3341                                 dsize--;
3342                                 seg.mss = tcb->mss;
3343                                 seg.ws = tcb->scale;
3344                         } else {
3345                                 /* TODO: Not sure why we'd get here. */
3346                                 warn("TCP: weird Syn_sent state, tell someone you saw this");
3347                         }
3348                 }
3349                 seg.seq = from_seq;
3350                 seg.ack = tcb->rcv.nxt;
3351                 tcb->last_ack_sent = seg.ack;
3352                 seg.wnd = tcb->rcv.wnd;
3353                 seg.ts_val = tcb->ts_recent;
3354
3355                 /* Pull out data to send */
3356                 bp = NULL;
3357                 if (dsize != 0) {
3358                         bp = qcopy(s->wq, dsize, sent);
3359                         if (BLEN(bp) != dsize) {
3360                                 /* Here's where the flgcnt kicked in.  Note dsize is
3361                                  * decremented, but ssize isn't.  Not that we use ssize for much
3362                                  * anymore.  Decrementing dsize prevents us from sending a PSH
3363                                  * with the FIN. */
3364                                 seg.flags |= FIN;
3365                                 dsize--;
3366                         }
3367                         if (BLEN(bp) > payload_mss) {
3368                                 bp->flag |= Btso;
3369                                 bp->mss = payload_mss;
3370                         }
3371                 }
3372
3373                 if (sent + dsize == qlen(s->wq) + tcb->flgcnt)
3374                         seg.flags |= PSH;
3375
3376                 /* Build header, link data and compute cksum */
3377                 switch (version) {
3378                         case V4:
3379                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3380                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
3381                                 if (hbp == NULL) {
3382                                         freeblist(bp);
3383                                         return;
3384                                 }
3385                                 break;
3386                         case V6:
3387                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3388                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
3389                                 if (hbp == NULL) {
3390                                         freeblist(bp);
3391                                         return;
3392                                 }
3393                                 break;
3394                         default:
3395                                 hbp = NULL;     /* to suppress a warning */
3396                                 panic("tcpoutput: version %d", version);
3397                 }
3398
3399                 /* Start the transmission timers if there is new data and we
3400                  * expect acknowledges
3401                  */
3402                 if (ssize != 0) {
3403                         if (tcb->timer.state != TcptimerON)
3404                                 tcpgo(tpriv, &tcb->timer);
3405
3406                         /* If round trip timer isn't running, start it. */
3407                         if (tcb->rtt_timer.state != TcptimerON) {
3408                                 tcpgo(tpriv, &tcb->rtt_timer);
3409                                 tcb->rttseq = from_seq + ssize;
3410                         }
3411                 }
3412
3413                 tpriv->stats[OutSegs]++;
3414
3415                 /* put off the next keep alive */
3416                 tcpgo(tpriv, &tcb->katimer);
3417
3418                 switch (version) {
3419                         case V4:
3420                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3421                                         /* a negative return means no route */
3422                                         localclose(s, "no route");
3423                                 }
3424                                 break;
3425                         case V6:
3426                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
3427                                         /* a negative return means no route */
3428                                         localclose(s, "no route");
3429                                 }
3430                                 break;
3431                         default:
3432                                 panic("tcpoutput2: version %d", version);
3433                 }
3434                 if (ssize) {
3435                         /* The outer loop thinks we sent one packet.  If we used TSO, we
3436                          * might have sent several.  Minus one for the loop increment. */
3437                         msgs += DIV_ROUND_UP(ssize, payload_mss) - 1;
3438                 }
3439                 /* Old Plan 9 tidbit - yield every four messages.  We want to break out
3440                  * and unlock so we can process inbound ACKs which might do things like
3441                  * say "slow down". */
3442                 if (msgs >= next_yield) {
3443                         next_yield = msgs + 4;
3444                         qunlock(&s->qlock);
3445                         kthread_yield();
3446                         qlock(&s->qlock);
3447                 }
3448         }
3449 }
3450
3451 /*
3452  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
3453  */
3454 void tcpsendka(struct conv *s)
3455 {
3456         Tcp seg;
3457         Tcpctl *tcb;
3458         struct block *hbp, *dbp;
3459
3460         tcb = (Tcpctl *) s->ptcl;
3461
3462         dbp = NULL;
3463         seg.urg = 0;
3464         seg.source = s->lport;
3465         seg.dest = s->rport;
3466         seg.flags = ACK | PSH;
3467         seg.mss = 0;
3468         seg.ws = 0;
3469         seg.sack_ok = FALSE;
3470         seg.nr_sacks = 0;
3471         if (tcpporthogdefense)
3472                 urandom_read(&seg.seq, sizeof(seg.seq));
3473         else
3474                 seg.seq = tcb->snd.una - 1;
3475         seg.ack = tcb->rcv.nxt;
3476         tcb->last_ack_sent = seg.ack;
3477         tcb->rcv.una = 0;
3478         seg.wnd = tcb->rcv.wnd;
3479         seg.ts_val = tcb->ts_recent;
3480         if (tcb->state == Finwait2) {
3481                 seg.flags |= FIN;
3482         } else {
3483                 dbp = block_alloc(1, MEM_WAIT);
3484                 dbp->wp++;
3485         }
3486
3487         if (isv4(s->raddr)) {
3488                 /* Build header, link data and compute cksum */
3489                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3490                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
3491                 if (hbp == NULL) {
3492                         freeblist(dbp);
3493                         return;
3494                 }
3495                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
3496         } else {
3497                 /* Build header, link data and compute cksum */
3498                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3499                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
3500                 if (hbp == NULL) {
3501                         freeblist(dbp);
3502                         return;
3503                 }
3504                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
3505         }
3506 }
3507
3508 /*
3509  *  set connection to time out after 12 minutes
3510  */
3511 void tcpsetkacounter(Tcpctl * tcb)
3512 {
3513         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
3514         if (tcb->kacounter < 3)
3515                 tcb->kacounter = 3;
3516 }
3517
3518 /*
3519  *  if we've timed out, close the connection
3520  *  otherwise, send a keepalive and restart the timer
3521  */
3522 void tcpkeepalive(void *v)
3523 {
3524         ERRSTACK(1);
3525         Tcpctl *tcb;
3526         struct conv *s;
3527
3528         s = v;
3529         tcb = (Tcpctl *) s->ptcl;
3530         qlock(&s->qlock);
3531         if (waserror()) {
3532                 qunlock(&s->qlock);
3533                 nexterror();
3534         }
3535         if (tcb->state != Closed) {
3536                 if (--(tcb->kacounter) <= 0) {
3537                         localclose(s, "connection timed out");
3538                 } else {
3539                         tcpsendka(s);
3540                         tcpgo(s->p->priv, &tcb->katimer);
3541                 }
3542         }
3543         qunlock(&s->qlock);
3544         poperror();
3545 }
3546
3547 /*
3548  *  start keepalive timer
3549  */
3550 static void tcpstartka(struct conv *s, char **f, int n)
3551 {
3552         Tcpctl *tcb;
3553         int x;
3554
3555         tcb = (Tcpctl *) s->ptcl;
3556         if (tcb->state != Established)
3557                 error(ENOTCONN, "connection must be in Establised state");
3558         if (n > 1) {
3559                 x = atoi(f[1]);
3560                 if (x >= MSPTICK)
3561                         tcb->katimer.start = x / MSPTICK;
3562         }
3563         tcpsetkacounter(tcb);
3564         tcpgo(s->p->priv, &tcb->katimer);
3565 }
3566
3567 /*
3568  *  turn checksums on/off
3569  */
3570 static void tcpsetchecksum(struct conv *s, char **f, int unused)
3571 {
3572         Tcpctl *tcb;
3573
3574         tcb = (Tcpctl *) s->ptcl;
3575         tcb->nochecksum = !atoi(f[1]);
3576 }
3577
3578 static void tcp_loss_event(struct conv *s, Tcpctl *tcb)
3579 {
3580         uint32_t old_cwnd = tcb->cwind;
3581
3582         /* Reno */
3583         tcb->ssthresh = tcb->cwind / 2;
3584         tcb->cwind = tcb->ssthresh;
3585         netlog(s->p->f, Logtcprxmt,
3586                "%I.%d -> %I.%d: loss event, cwnd was %d, now %d\n",
3587                s->laddr, s->lport, s->raddr, s->rport,
3588                old_cwnd, tcb->cwind);
3589 }
3590
/* Called when we need to retrans the entire outstanding window (everything
 * previously sent, but unacknowledged). */
void tcprxmit(struct conv *s)
{
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;

	/* FORCE makes tcpoutput send even with nothing "new"; rewinding rtx
	 * to una makes the whole unacked range eligible for retransmission. */
	tcb->flags |= RETRAN | FORCE;
	tcb->snd.rtx = tcb->snd.una;
	/* Recompute in_flight now that rtx moved. */
	set_in_flight(tcb);

	tcpoutput(s);
}
3605
3606 /* The original RFC said to drop sacks on a timeout, since the receiver could
3607  * renege.  Later RFCs say we can keep them around, so long as we are careful.
3608  *
3609  * We'll go with a "flush if we have two timeouts" plan.  This doesn't have to
3610  * be perfect - there might be cases where we accidentally flush the sacks too
3611  * often.  Perhaps we never get dup_acks to start fast/sack rxmit.  The main
3612  * thing is that after multiple timeouts we flush the sacks, since the receiver
3613  * might renege.
3614  *
3615  * We also have an Akaros-specific problem.  We use the sacks to determine
3616  * in_flight.  Specifically, the (snd.nxt - upper right edge) is tracked as in
3617  * flight.  Usually the receiver will keep sacking that right edge all the way
3618  * up to snd.nxt, but they might not, and the gap might be quite large.  After a
3619  * timeout, that data is definitely not in flight.  If that block's size is
3620  * greater than cwnd, we'll never transmit.  This should be rare, and in that
3621  * case we can just dump the sacks.  The typical_mss fudge factor is so we can
3622  * send a reasonably-sized packet. */
3623 static void timeout_handle_sacks(Tcpctl *tcb)
3624 {
3625         struct sack_block *last_sack;
3626
3627         if (tcb->snd.nr_sacks) {
3628                 last_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
3629                 if (tcb->snd.flush_sacks || (tcb->snd.nxt - last_sack->right >=
3630                                              tcb->cwind - tcb->typical_mss)) {
3631                         tcb->snd.nr_sacks = 0;
3632                         tcb->snd.flush_sacks = FALSE;
3633                 } else {
3634                         tcb->snd.flush_sacks = TRUE;
3635                 }
3636         }
3637 }
3638
3639 void tcptimeout(void *arg)