net: Allow snooping on ethermedium
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 #include <vfs.h>
44 #include <kfs.h>
45 #include <slab.h>
46 #include <kmalloc.h>
47 #include <kref.h>
48 #include <string.h>
49 #include <stdio.h>
50 #include <assert.h>
51 #include <error.h>
52 #include <cpio.h>
53 #include <pmap.h>
54 #include <smp.h>
55 #include <ip.h>
56
/* Constants shared by the whole TCP implementation: queue and header sizes,
 * timer parameters, header flag bits, option codes, per-connection output
 * flags and the connection states. */
enum {
	QMAX = 0xffff,			/* 64 KiB - 1 */
	IP_TCPPROTO = 6,

	/* IPv4 header layout sizes (bytes) */
	TCP4_IPLEN = 8,
	TCP4_PHDRSIZE = 12,
	TCP4_HDRSIZE = 20,
	TCP4_TCBPHDRSZ = 40,
	TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE,

	/* IPv6 header layout sizes (bytes) */
	TCP6_IPLEN = 0,
	TCP6_PHDRSIZE = 40,
	TCP6_HDRSIZE = 20,
	TCP6_TCBPHDRSZ = 60,
	TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE,

	/* Tcptimer states */
	TcptimerOFF = 0,
	TcptimerON = 1,
	TcptimerDONE = 2,

	MAX_TIME = 1 << 20,		/* "forever" */
	TCP_ACK = 50,			/* delayed ack interval, ms */
	MAXBACKMS = 9 * 60 * 1000,	/* longest backoff (ms) before hangup */

	/* TCP header flag bits */
	URG = 1 << 5,			/* urgent data present */
	ACK = 1 << 4,			/* acknowledgment field is valid */
	PSH = 1 << 3,			/* push the whole data pipe */
	RST = 1 << 2,			/* reset the connection */
	SYN = 1 << 1,			/* synchronise sequence numbers */
	FIN = 1 << 0,			/* start closing down */

	/* TCP option kinds and lengths */
	EOLOPT = 0,
	NOOPOPT = 1,
	MSSOPT = 2,
	MSS_LENGTH = 4,			/* length of the MSS option */
	WSOPT = 3,
	WS_LENGTH = 3,			/* length of the window scale option */

	MSL2 = 10,
	MSPTICK = 50,			/* milliseconds per timer tick */
	DEF_MSS = 1460,			/* default segment size */
	DEF_MSS6 = 1280,		/* default (minimum) segment size for v6 */
	DEF_RTT = 500,			/* default round trip time */
	DEF_KAT = 120000,		/* default ms between keep alives */
	TCP_LISTEN = 0,			/* listening connection */
	TCP_CONNECT = 1,		/* outgoing connection */
	SYNACK_RXTIMER = 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH = 3,		/* dupack threshold for retransmit */

	/* Tcpctl.flags bits */
	FORCE = 1 << 0,
	CLONE = 1 << 1,
	RETRAN = 1 << 2,
	ACTIVE = 1 << 3,
	SYNACK = 1 << 4,
	TSO = 1 << 5,

	LOGAGAIN = 3,
	LOGDGAIN = 2,

	/* Connection states; tcpstates[] is indexed by these values */
	Closed = 0,
	Listen = 1,
	Syn_sent = 2,
	Syn_received = 3,
	Established = 4,
	Finwait1 = 5,
	Finwait2 = 6,
	Close_wait = 7,
	Closing = 8,
	Last_ack = 9,
	Time_wait = 10,

	Maxlimbo = 1000,		/* max calls waiting for a reply to SYN ACK */
	NLHT = 1 << 8,			/* limbo hash table size, a power of 2 */
	LHTMASK = NLHT - 1,

	HaveWS = 1 << 8,
};
133
/* Printable name of each connection state.  The table is indexed by the
 * state value, so the order must match the Closed..Time_wait enumerators. */
char *tcpstates[] = {
	"Closed",		/* 0 */
	"Listen",		/* 1 */
	"Syn_sent",		/* 2 */
	"Syn_received",		/* 3 */
	"Established",		/* 4 */
	"Finwait1",		/* 5 */
	"Finwait2",		/* 6 */
	"Close_wait",		/* 7 */
	"Closing",		/* 8 */
	"Last_ack",		/* 9 */
	"Time_wait",		/* 10 */
};
140
/* A countdown timer.  Running timers sit on the doubly linked list headed by
 * tcppriv.timers; tcpackproc() decrements them once per tick and calls func
 * when one reaches zero. */
typedef struct Tcptimer {
	struct Tcptimer *next;		/* neighbours on the active-timer list */
	struct Tcptimer *prev;
	struct Tcptimer *readynext;	/* link on tcpackproc's expired list */
	int state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	uint64_t start;			/* value count is reloaded with by tcpgo() */
	uint64_t count;			/* ticks left before expiry */
	void (*func) (void *);		/* expiry callback */
	void *arg;			/* argument handed to func */
} Tcptimer;
152
/*
 *  IPv4 pseudo header used for checksumming TCP: the leading IP fields,
 *  the pseudo-header fields, then the TCP header itself.
 */
typedef struct Tcp4hdr {
	uint8_t vihl;			/* Version and header length */
	uint8_t tos;			/* Type of service */
	uint8_t length[2];		/* packet length */
	uint8_t id[2];			/* Identification */
	uint8_t frag[2];		/* Fragment information */
	uint8_t Unused;
	uint8_t proto;
	uint8_t tcplen[2];
	uint8_t tcpsrc[4];
	uint8_t tcpdst[4];
	/* TCP header proper starts here */
	uint8_t tcpsport[2];
	uint8_t tcpdport[2];
	uint8_t tcpseq[4];
	uint8_t tcpack[4];
	uint8_t tcpflag[2];
	uint8_t tcpwin[2];
	uint8_t tcpcksum[2];
	uint8_t tcpurg[2];
	uint8_t tcpopt[1];		/* Options segment */
} Tcp4hdr;
180
181 typedef struct Tcp6hdr Tcp6hdr;
182 struct Tcp6hdr {
183         uint8_t vcf[4];
184         uint8_t ploadlen[2];
185         uint8_t proto;
186         uint8_t ttl;
187         uint8_t tcpsrc[IPaddrlen];
188         uint8_t tcpdst[IPaddrlen];
189         uint8_t tcpsport[2];
190         uint8_t tcpdport[2];
191         uint8_t tcpseq[4];
192         uint8_t tcpack[4];
193         uint8_t tcpflag[2];
194         uint8_t tcpwin[2];
195         uint8_t tcpcksum[2];
196         uint8_t tcpurg[2];
197         /* Options segment */
198         uint8_t tcpopt[1];
199 };
200
/*
 *  Control information for a single packet, in host form.  It is
 *  extracted from a packet by ntohtcp{4,6}() and written into a packet
 *  by htontcp{4,6}().
 */
typedef struct Tcp {
	uint16_t source;
	uint16_t dest;
	uint32_t seq;
	uint32_t ack;
	uint8_t flags;
	uint16_t ws;		/* window scale option (if not zero) */
	uint32_t wnd;
	uint16_t urg;
	uint16_t mss;		/* max segment size option (if not zero) */
	uint16_t len;		/* size of data */
} Tcp;
220
221 /*
222  *  this header is malloc'd to thread together fragments
223  *  waiting to be coalesced
224  */
225 typedef struct Reseq Reseq;
226 struct Reseq {
227         Reseq *next;
228         Tcp seg;
229         struct block *bp;
230         uint16_t length;
231 };
232
233 /*
234  *  the qlock in the Conv locks this structure
235  */
236 typedef struct Tcpctl Tcpctl;
237 struct Tcpctl {
238         uint8_t state;                          /* Connection state */
239         uint8_t type;                           /* Listening or active connection */
240         uint8_t code;                           /* Icmp code */
241         struct {
242                 uint32_t una;                   /* Unacked data pointer */
243                 uint32_t nxt;                   /* Next sequence expected */
244                 uint32_t ptr;                   /* Data pointer */
245                 uint32_t wnd;                   /* Tcp send window */
246                 uint32_t urg;                   /* Urgent data pointer */
247                 uint32_t wl2;
248                 int scale;                              /* how much to right shift window in xmitted packets */
249                 /* to implement tahoe and reno TCP */
250                 uint32_t dupacks;               /* number of duplicate acks rcvd */
251                 int recovery;                   /* loss recovery flag */
252                 uint32_t rxt;                   /* right window marker for recovery */
253         } snd;
254         struct {
255                 uint32_t nxt;                   /* Receive pointer to next uint8_t slot */
256                 uint32_t wnd;                   /* Receive window incoming */
257                 uint32_t urg;                   /* Urgent pointer */
258                 int blocked;
259                 int una;                                /* unacked data segs */
260                 int scale;                              /* how much to left shift window in rcved packets */
261         } rcv;
262         uint32_t iss;                           /* Initial sequence number */
263         int sawwsopt;                           /* true if we saw a wsopt on the incoming SYN */
264         uint32_t cwind;                         /* Congestion window */
265         int scale;                                      /* desired snd.scale */
266         uint16_t ssthresh;                      /* Slow start threshold */
267         int resent;                                     /* Bytes just resent */
268         int irs;                                        /* Initial received squence */
269         uint16_t mss;                           /* Mean segment size */
270         int rerecv;                                     /* Overlap of data rerecevived */
271         uint32_t window;                        /* Recevive window */
272         uint8_t backoff;                        /* Exponential backoff counter */
273         int backedoff;                          /* ms we've backed off for rexmits */
274         uint8_t flags;                          /* State flags */
275         Reseq *reseq;                           /* Resequencing queue */
276         Tcptimer timer;                         /* Activity timer */
277         Tcptimer acktimer;                      /* Acknowledge timer */
278         Tcptimer rtt_timer;                     /* Round trip timer */
279         Tcptimer katimer;                       /* keep alive timer */
280         uint32_t rttseq;                        /* Round trip sequence */
281         int srtt;                                       /* Shortened round trip */
282         int mdev;                                       /* Mean deviation of round trip */
283         int kacounter;                          /* count down for keep alive */
284         uint64_t sndsyntime;            /* time syn sent */
285         uint64_t time;                          /* time Finwait2 or Syn_received was sent */
286         int nochecksum;                         /* non-zero means don't send checksums */
287         int flgcnt;                                     /* number of flags in the sequence (FIN,SEQ) */
288
289         union {
290                 Tcp4hdr tcp4hdr;
291                 Tcp6hdr tcp6hdr;
292         } protohdr;                                     /* prototype header */
293 };
294
295 /*
296  *  New calls are put in limbo rather than having a conversation structure
297  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
298  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
299  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
300  *
301  *  In particular they aren't on a listener's queue so that they don't figure
302  *  in the input queue limit.
303  *
304  *  If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue
305  *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
306  *  there is no hashing of this list.
307  */
308 typedef struct Limbo Limbo;
309 struct Limbo {
310         Limbo *next;
311
312         uint8_t laddr[IPaddrlen];
313         uint8_t raddr[IPaddrlen];
314         uint16_t lport;
315         uint16_t rport;
316         uint32_t irs;                           /* initial received sequence */
317         uint32_t iss;                           /* initial sent sequence */
318         uint16_t mss;                           /* mss from the other end */
319         uint16_t rcvscale;                      /* how much to scale rcvd windows */
320         uint16_t sndscale;                      /* how much to scale sent windows */
321         uint64_t lastsend;                      /* last time we sent a synack */
322         uint8_t version;                        /* v4 or v6 */
323         uint8_t rexmits;                        /* number of retransmissions */
324 };
325
/* Tunable defaults.  inittcpctl() derives the initial srtt and the activity
 * timer interval from tcp_irtt (milliseconds, going by its use with MSPTICK). */
int tcp_irtt = DEF_RTT;			/* Initial guess at round trip time */
uint16_t tcp_mss = DEF_MSS;		/* Maximum segment size to be sent */
328
/* Indices into tcppriv.stats[]; Nstats is the number of counters. */
enum {
	/* MIB stats */
	MaxConn,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats
};

/* Printable name of each counter, indexed by the enumerators above.
 * Standard C99 designated initializers ("[index] = value") are used rather
 * than the obsolete GNU "[index] value" spelling. */
static char *statnames[] = {
	[MaxConn] = "MaxConn",
	[ActiveOpens] = "ActiveOpens",
	[PassiveOpens] = "PassiveOpens",
	[EstabResets] = "EstabResets",
	[CurrEstab] = "CurrEstab",
	[InSegs] = "InSegs",
	[OutSegs] = "OutSegs",
	[RetransSegs] = "RetransSegs",
	[RetransTimeouts] = "RetransTimeouts",
	[InErrs] = "InErrs",
	[OutRsts] = "OutRsts",
	[CsumErrs] = "CsumErrs",
	[HlenErrs] = "HlenErrs",
	[LenErrs] = "LenErrs",
	[OutOfOrder] = "OutOfOrder",
};
369
370 typedef struct Tcppriv Tcppriv;
371 struct tcppriv {
372         /* List of active timers */
373         qlock_t tl;
374         Tcptimer *timers;
375
376         /* hash table for matching conversations */
377         struct Ipht ht;
378
379         /* calls in limbo waiting for an ACK to our SYN ACK */
380         int nlimbo;
381         Limbo *lht[NLHT];
382
383         /* for keeping track of tcpackproc */
384         qlock_t apl;
385         int ackprocstarted;
386
387         uint32_t stats[Nstats];
388 };
389
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
400
401 int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, uint16_t);
402 void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
403 void localclose(struct conv *, char *unused_char_p_t);
404 void procsyn(struct conv *, Tcp *);
405 void tcpiput(struct Proto *, struct Ipifc *, struct block *);
406 void tcpoutput(struct conv *);
407 int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
408 void tcpstart(struct conv *, int);
409 void tcptimeout(void *);
410 void tcpsndsyn(struct conv *, Tcpctl *);
411 void tcprcvwin(struct conv *);
412 void tcpacktimer(void *);
413 void tcpkeepalive(void *);
414 void tcpsetkacounter(Tcpctl *);
415 void tcprxmit(struct conv *);
416 void tcpsettimer(Tcpctl *);
417 void tcpsynackrtt(struct conv *);
418 void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
419
420 static void limborexmit(struct Proto *);
421 static void limbo(struct conv *, uint8_t * unused_uint8_p_t, uint8_t *, Tcp *,
422                                   int);
423
424 void tcpsetstate(struct conv *s, uint8_t newstate)
425 {
426         Tcpctl *tcb;
427         uint8_t oldstate;
428         struct tcppriv *tpriv;
429
430         tpriv = s->p->priv;
431
432         tcb = (Tcpctl *) s->ptcl;
433
434         oldstate = tcb->state;
435         if (oldstate == newstate)
436                 return;
437
438         if (oldstate == Established)
439                 tpriv->stats[CurrEstab]--;
440         if (newstate == Established)
441                 tpriv->stats[CurrEstab]++;
442
443         /**
444         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
445                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
446         **/
447
448         switch (newstate) {
449                 case Closed:
450                         qclose(s->rq);
451                         qclose(s->wq);
452                         qclose(s->eq);
453                         break;
454
455                 case Close_wait:        /* Remote closes */
456                         qhangup(s->rq, NULL);
457                         break;
458         }
459
460         tcb->state = newstate;
461
462         if (oldstate == Syn_sent && newstate != Closed)
463                 Fsconnected(s, NULL);
464 }
465
/* Connect handler: passes the arguments to Fsstdconnect() and then starts
 * the conversation as an outgoing connection (tcpstart() with TCP_CONNECT,
 * which queues the SYN and enters Syn_sent). */
static void tcpconnect(struct conv *c, char **argv, int argc)
{
	Fsstdconnect(c, argv, argc);
	tcpstart(c, TCP_CONNECT);
}
471
472 static int tcpstate(struct conv *c, char *state, int n)
473 {
474         Tcpctl *s;
475
476         s = (Tcpctl *) (c->ptcl);
477
478         return snprintf(state, n,
479                                         "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
480                                         tcpstates[s->state],
481                                         c->rq ? qlen(c->rq) : 0,
482                                         c->wq ? qlen(c->wq) : 0,
483                                         s->srtt, s->mdev,
484                                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
485                                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
486                                         s->katimer.start, s->katimer.count);
487 }
488
489 static int tcpinuse(struct conv *c)
490 {
491         Tcpctl *s;
492
493         s = (Tcpctl *) (c->ptcl);
494         return s->state != Closed;
495 }
496
/* Announce handler: passes the arguments to Fsstdannounce(), starts the
 * conversation as a listener (tcpstart() with TCP_LISTEN) and then calls
 * Fsconnected() with no error message. */
static void tcpannounce(struct conv *c, char **argv, int argc)
{
	Fsstdannounce(c, argv, argc);
	tcpstart(c, TCP_LISTEN);
	Fsconnected(c, NULL);
}
503
504 static void tcpshutdown(struct conv *c, int how)
505 {
506         Tcpctl *tcb = (Tcpctl*)c->ptcl;
507
508         /* Do nothing for the read side */
509         if (how == SHUT_RD)
510                 return;
511         /* Sends a FIN.  If we're in another state (like Listen), we'll run into
512          * issues, since we'll never send the FIN.  We'll be shutdown on our end,
513          * but we'll never tell the distant end.  Might just be an app issue. */
514         switch (tcb->state) {
515         case Syn_received:
516         case Established:
517                 tcb->flgcnt++;
518                 tcb->snd.nxt++;
519                 tcpsetstate(c, Finwait1);
520                 tcpoutput(c);
521                 break;
522         }
523 }
524
525 /*
526  *  tcpclose is always called with the q locked
527  */
528 static void tcpclose(struct conv *c)
529 {
530         Tcpctl *tcb;
531
532         tcb = (Tcpctl *) c->ptcl;
533
534         qhangup(c->rq, NULL);
535         qhangup(c->wq, NULL);
536         qhangup(c->eq, NULL);
537         qflush(c->rq);
538
539         switch (tcb->state) {
540                 case Listen:
541                         /*
542                          *  reset any incoming calls to this listener
543                          */
544                         Fsconnected(c, "Hangup");
545
546                         localclose(c, NULL);
547                         break;
548                 case Closed:
549                 case Syn_sent:
550                         localclose(c, NULL);
551                         break;
552                 case Syn_received:
553                 case Established:
554                         tcb->flgcnt++;
555                         tcb->snd.nxt++;
556                         tcpsetstate(c, Finwait1);
557                         tcpoutput(c);
558                         break;
559                 case Close_wait:
560                         tcb->flgcnt++;
561                         tcb->snd.nxt++;
562                         tcpsetstate(c, Last_ack);
563                         tcpoutput(c);
564                         break;
565         }
566 }
567
568 void tcpkick(void *x)
569 {
570         ERRSTACK(1);
571         struct conv *s = x;
572         Tcpctl *tcb;
573
574         tcb = (Tcpctl *) s->ptcl;
575
576         qlock(&s->qlock);
577         if (waserror()) {
578                 qunlock(&s->qlock);
579                 nexterror();
580         }
581
582         switch (tcb->state) {
583                 case Syn_sent:
584                 case Syn_received:
585                 case Established:
586                 case Close_wait:
587                         /*
588                          * Push data
589                          */
590                         tcprcvwin(s);
591                         tcpoutput(s);
592                         break;
593                 default:
594                         localclose(s, "Hangup");
595                         break;
596         }
597
598         qunlock(&s->qlock);
599         poperror();
600 }
601
602 void tcprcvwin(struct conv *s)
603 {       /* Call with tcb locked */
604         int w;
605         Tcpctl *tcb;
606
607         tcb = (Tcpctl *) s->ptcl;
608         w = tcb->window - qlen(s->rq);
609         if (w < 0)
610                 w = 0;
611         tcb->rcv.wnd = w;
612         if (w == 0)
613                 tcb->rcv.blocked = 1;
614 }
615
616 void tcpacktimer(void *v)
617 {
618         ERRSTACK(1);
619         Tcpctl *tcb;
620         struct conv *s;
621
622         s = v;
623         tcb = (Tcpctl *) s->ptcl;
624
625         qlock(&s->qlock);
626         if (waserror()) {
627                 qunlock(&s->qlock);
628                 nexterror();
629         }
630         if (tcb->state != Closed) {
631                 tcb->flags |= FORCE;
632                 tcprcvwin(s);
633                 tcpoutput(s);
634         }
635         qunlock(&s->qlock);
636         poperror();
637 }
638
/* Create the queues of a new conversation: a coalescing read queue limited
 * to QMAX, and a write queue limited to 8 * QMAX whose kick routine is
 * tcpkick() with the conversation as its argument. */
static void tcpcreate(struct conv *c)
{
	c->rq = qopen(QMAX, Qcoalesce, 0, 0);
	c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
}
644
645 static void timerstate(struct tcppriv *priv, Tcptimer * t, int newstate)
646 {
647         if (newstate != TcptimerON) {
648                 if (t->state == TcptimerON) {
649                         // unchain
650                         if (priv->timers == t) {
651                                 priv->timers = t->next;
652                                 if (t->prev != NULL)
653                                         panic("timerstate1");
654                         }
655                         if (t->next)
656                                 t->next->prev = t->prev;
657                         if (t->prev)
658                                 t->prev->next = t->next;
659                         t->next = t->prev = NULL;
660                 }
661         } else {
662                 if (t->state != TcptimerON) {
663                         // chain
664                         if (t->prev != NULL || t->next != NULL)
665                                 panic("timerstate2");
666                         t->prev = NULL;
667                         t->next = priv->timers;
668                         if (t->next)
669                                 t->next->prev = t;
670                         priv->timers = t;
671                 }
672         }
673         t->state = newstate;
674 }
675
/* Timer task for one TCP protocol instance (a is the struct Proto); it is
 * started once by tcpstart() and never returns.
 *
 * Every MSPTICK milliseconds it:
 *   1. decrements every active timer under priv->tl and moves the ones that
 *      reach zero to TcptimerDONE, collecting them on a private list linked
 *      through readynext;
 *   2. with the lock dropped, runs the callback of each collected timer;
 *   3. calls limborexmit() for the protocol.
 * Both list walks panic after 10000 iterations as a guard against a corrupt
 * (looping) list. */
void tcpackproc(void *a)
{
	ERRSTACK(1);
	Tcptimer *t, *tp, *timeo;
	struct Proto *tcp;
	struct tcppriv *priv;
	int loop;

	tcp = a;
	priv = tcp->priv;

	for (;;) {
		kthread_usleep(MSPTICK * 1000);

		qlock(&priv->tl);
		timeo = NULL;
		loop = 0;
		for (t = priv->timers; t != NULL; t = tp) {
			if (loop++ > 10000)
				panic("tcpackproc1");
			/* fetch the successor first: timerstate() below clears
			 * t->next when it unlinks t */
			tp = t->next;
			if (t->state == TcptimerON) {
				t->count--;
				if (t->count == 0) {
					timerstate(priv, t, TcptimerDONE);
					t->readynext = timeo;
					timeo = t;
				}
			}
		}
		qunlock(&priv->tl);

		loop = 0;
		for (t = timeo; t != NULL; t = t->readynext) {
			if (loop++ > 10000)
				panic("tcpackproc2");
			/* the state is checked again here; presumably because the
			 * timer may have been restarted or halted once the lock was
			 * released -- TODO confirm */
			if (t->state == TcptimerDONE && t->func != NULL) {
				/* discard error style */
				if (!waserror())
					(*t->func) (t->arg);
				poperror();
			}
		}

		limborexmit(tcp);
	}
}
723
724 void tcpgo(struct tcppriv *priv, Tcptimer * t)
725 {
726         if (t == NULL || t->start == 0)
727                 return;
728
729         qlock(&priv->tl);
730         t->count = t->start;
731         timerstate(priv, t, TcptimerON);
732         qunlock(&priv->tl);
733 }
734
735 void tcphalt(struct tcppriv *priv, Tcptimer * t)
736 {
737         if (t == NULL)
738                 return;
739
740         qlock(&priv->tl);
741         timerstate(priv, t, TcptimerOFF);
742         qunlock(&priv->tl);
743 }
744
/* Return the exponential backoff multiplier 2^n.
 *
 * Shifting by a negative count, or far enough to overflow int, is undefined
 * behaviour.  n is therefore clamped to the range [0, bits-in-int - 2], so
 * the result is always a positive power of two.  In-range values of n give
 * exactly 1 << n as before. */
int backoff(int n)
{
	const int max_shift = (int)(sizeof(int) * 8) - 2;

	if (n < 0)
		n = 0;
	else if (n > max_shift)
		n = max_shift;

	return 1 << n;
}
749
750 void localclose(struct conv *s, char *reason)
751 {       /* called with tcb locked */
752         Tcpctl *tcb;
753         Reseq *rp, *rp1;
754         struct tcppriv *tpriv;
755
756         tpriv = s->p->priv;
757         tcb = (Tcpctl *) s->ptcl;
758
759         iphtrem(&tpriv->ht, s);
760
761         tcphalt(tpriv, &tcb->timer);
762         tcphalt(tpriv, &tcb->rtt_timer);
763         tcphalt(tpriv, &tcb->acktimer);
764         tcphalt(tpriv, &tcb->katimer);
765
766         /* Flush reassembly queue; nothing more can arrive */
767         for (rp = tcb->reseq; rp != NULL; rp = rp1) {
768                 rp1 = rp->next;
769                 freeblist(rp->bp);
770                 kfree(rp);
771         }
772         tcb->reseq = NULL;
773
774         if (tcb->state == Syn_sent)
775                 Fsconnected(s, reason);
776
777         qhangup(s->rq, reason);
778         qhangup(s->wq, reason);
779
780         tcpsetstate(s, Closed);
781
782         /* listener will check the rq state */
783         if (s->state == Announced)
784                 rendez_wakeup(&s->listenr);
785 }
786
787 /* mtu (- TCP + IP hdr len) of 1st hop */
788 int tcpmtu(struct Proto *tcp, uint8_t * addr, int version, int *scale,
789            uint8_t *flags)
790 {
791         struct Ipifc *ifc;
792         int mtu;
793
794         ifc = findipifc(tcp->f, addr, 0);
795         switch (version) {
796                 default:
797                 case V4:
798                         mtu = DEF_MSS;
799                         if (ifc != NULL)
800                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
801                         break;
802                 case V6:
803                         mtu = DEF_MSS6;
804                         if (ifc != NULL)
805                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
806                         break;
807         }
808         *flags &= ~TSO;
809
810         if (ifc != NULL) {
811                 if (ifc->mbps > 100)
812                         *scale = HaveWS | 3;
813                 else if (ifc->mbps > 10)
814                         *scale = HaveWS | 1;
815                 else
816                         *scale = HaveWS | 0;
817                 if (ifc->feat & NETF_TSO)
818                         *flags |= TSO;
819         } else
820                 *scale = HaveWS | 0;
821
822         return mtu;
823 }
824
825 void inittcpctl(struct conv *s, int mode)
826 {
827         Tcpctl *tcb;
828         Tcp4hdr *h4;
829         Tcp6hdr *h6;
830         int mss;
831
832         tcb = (Tcpctl *) s->ptcl;
833
834         memset(tcb, 0, sizeof(Tcpctl));
835
836         tcb->ssthresh = 65535;
837         tcb->srtt = tcp_irtt << LOGAGAIN;
838         tcb->mdev = 0;
839
840         /* setup timers */
841         tcb->timer.start = tcp_irtt / MSPTICK;
842         tcb->timer.func = tcptimeout;
843         tcb->timer.arg = s;
844         tcb->rtt_timer.start = MAX_TIME;
845         tcb->acktimer.start = TCP_ACK / MSPTICK;
846         tcb->acktimer.func = tcpacktimer;
847         tcb->acktimer.arg = s;
848         tcb->katimer.start = DEF_KAT / MSPTICK;
849         tcb->katimer.func = tcpkeepalive;
850         tcb->katimer.arg = s;
851
852         mss = DEF_MSS;
853
854         /* create a prototype(pseudo) header */
855         if (mode != TCP_LISTEN) {
856                 if (ipcmp(s->laddr, IPnoaddr) == 0)
857                         findlocalip(s->p->f, s->laddr, s->raddr);
858
859                 switch (s->ipversion) {
860                         case V4:
861                                 h4 = &tcb->protohdr.tcp4hdr;
862                                 memset(h4, 0, sizeof(*h4));
863                                 h4->proto = IP_TCPPROTO;
864                                 hnputs(h4->tcpsport, s->lport);
865                                 hnputs(h4->tcpdport, s->rport);
866                                 v6tov4(h4->tcpsrc, s->laddr);
867                                 v6tov4(h4->tcpdst, s->raddr);
868                                 break;
869                         case V6:
870                                 h6 = &tcb->protohdr.tcp6hdr;
871                                 memset(h6, 0, sizeof(*h6));
872                                 h6->proto = IP_TCPPROTO;
873                                 hnputs(h6->tcpsport, s->lport);
874                                 hnputs(h6->tcpdport, s->rport);
875                                 ipmove(h6->tcpsrc, s->laddr);
876                                 ipmove(h6->tcpdst, s->raddr);
877                                 mss = DEF_MSS6;
878                                 break;
879                         default:
880                                 panic("inittcpctl: version %d", s->ipversion);
881                 }
882         }
883
884         tcb->mss = tcb->cwind = mss;
885
886         /* default is no window scaling */
887         tcb->window = QMAX;
888         tcb->rcv.wnd = QMAX;
889         tcb->rcv.scale = 0;
890         tcb->snd.scale = 0;
891         qsetlimit(s->rq, QMAX);
892 }
893
894 /*
895  *  called with s qlocked
896  */
897 void tcpstart(struct conv *s, int mode)
898 {
899         Tcpctl *tcb;
900         struct tcppriv *tpriv;
901         /* tcpackproc needs to free this if it ever exits */
902         char *kpname = kmalloc(KNAMELEN, MEM_WAIT);
903
904         tpriv = s->p->priv;
905
906         if (tpriv->ackprocstarted == 0) {
907                 qlock(&tpriv->apl);
908                 if (tpriv->ackprocstarted == 0) {
909                         snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
910                         ktask(kpname, tcpackproc, s->p);
911                         tpriv->ackprocstarted = 1;
912                 }
913                 qunlock(&tpriv->apl);
914         }
915
916         tcb = (Tcpctl *) s->ptcl;
917
918         inittcpctl(s, mode);
919
920         iphtadd(&tpriv->ht, s);
921         switch (mode) {
922                 case TCP_LISTEN:
923                         tpriv->stats[PassiveOpens]++;
924                         tcb->flags |= CLONE;
925                         tcpsetstate(s, Listen);
926                         break;
927
928                 case TCP_CONNECT:
929                         tpriv->stats[ActiveOpens]++;
930                         tcb->flags |= ACTIVE;
931                         tcpsndsyn(s, tcb);
932                         tcpsetstate(s, Syn_sent);
933                         tcpoutput(s);
934                         break;
935         }
936 }
937
938 static char *tcpflag(uint16_t flag)
939 {
940         static char buf[128];
941
942         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
943         if (flag & URG)
944                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
945         if (flag & ACK)
946                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
947         if (flag & PSH)
948                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
949         if (flag & RST)
950                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
951         if (flag & SYN)
952                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
953         if (flag & FIN)
954                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
955
956         return buf;
957 }
958
959 struct block *htontcp6(Tcp * tcph, struct block *data, Tcp6hdr * ph,
960                                            Tcpctl * tcb)
961 {
962         int dlen;
963         Tcp6hdr *h;
964         uint16_t csum;
965         uint16_t hdrlen, optpad = 0;
966         uint8_t *opt;
967
968         hdrlen = TCP6_HDRSIZE;
969         if (tcph->flags & SYN) {
970                 if (tcph->mss)
971                         hdrlen += MSS_LENGTH;
972                 if (tcph->ws)
973                         hdrlen += WS_LENGTH;
974                 optpad = hdrlen & 3;
975                 if (optpad)
976                         optpad = 4 - optpad;
977                 hdrlen += optpad;
978         }
979
980         if (data) {
981                 dlen = blocklen(data);
982                 data = padblock(data, hdrlen + TCP6_PKT);
983                 if (data == NULL)
984                         return NULL;
985         } else {
986                 dlen = 0;
987                 /* the 64 pad is to meet mintu's */
988                 data = block_alloc(hdrlen + TCP6_PKT + 64, MEM_WAIT);
989                 if (data == NULL)
990                         return NULL;
991                 data->wp += hdrlen + TCP6_PKT;
992         }
993
994         /* copy in pseudo ip header plus port numbers */
995         h = (Tcp6hdr *) (data->rp);
996         memmove(h, ph, TCP6_TCBPHDRSZ);
997
998         /* compose pseudo tcp header, do cksum calculation */
999         hnputl(h->vcf, hdrlen + dlen);
1000         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
1001         h->ttl = ph->proto;
1002
1003         /* copy in variable bits */
1004         hnputl(h->tcpseq, tcph->seq);
1005         hnputl(h->tcpack, tcph->ack);
1006         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1007         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1008         hnputs(h->tcpurg, tcph->urg);
1009
1010         if (tcph->flags & SYN) {
1011                 opt = h->tcpopt;
1012                 if (tcph->mss != 0) {
1013                         *opt++ = MSSOPT;
1014                         *opt++ = MSS_LENGTH;
1015                         hnputs(opt, tcph->mss);
1016                         opt += 2;
1017                 }
1018                 if (tcph->ws != 0) {
1019                         *opt++ = WSOPT;
1020                         *opt++ = WS_LENGTH;
1021                         *opt++ = tcph->ws;
1022                 }
1023                 while (optpad-- > 0)
1024                         *opt++ = NOOPOPT;
1025         }
1026
1027         if (tcb != NULL && tcb->nochecksum) {
1028                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1029         } else {
1030                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
1031                 hnputs(h->tcpcksum, csum);
1032         }
1033
1034         /* move from pseudo header back to normal ip header */
1035         memset(h->vcf, 0, 4);
1036         h->vcf[0] = IP_VER6;
1037         hnputs(h->ploadlen, hdrlen + dlen);
1038         h->proto = ph->proto;
1039
1040         return data;
1041 }
1042
1043 struct block *htontcp4(Tcp * tcph, struct block *data, Tcp4hdr * ph,
1044                                            Tcpctl * tcb)
1045 {
1046         int dlen;
1047         Tcp4hdr *h;
1048         uint16_t csum;
1049         uint16_t hdrlen, optpad = 0;
1050         uint8_t *opt;
1051
1052         hdrlen = TCP4_HDRSIZE;
1053         if (tcph->flags & SYN) {
1054                 if (tcph->mss)
1055                         hdrlen += MSS_LENGTH;
1056                 if (tcph->ws)
1057                         hdrlen += WS_LENGTH;
1058                 optpad = hdrlen & 3;
1059                 if (optpad)
1060                         optpad = 4 - optpad;
1061                 hdrlen += optpad;
1062         }
1063
1064         if (data) {
1065                 dlen = blocklen(data);
1066                 data = padblock(data, hdrlen + TCP4_PKT);
1067                 if (data == NULL)
1068                         return NULL;
1069         } else {
1070                 dlen = 0;
1071                 /* the 64 pad is to meet mintu's */
1072                 data = block_alloc(hdrlen + TCP4_PKT + 64, MEM_WAIT);
1073                 if (data == NULL)
1074                         return NULL;
1075                 data->wp += hdrlen + TCP4_PKT;
1076         }
1077
1078         /* copy in pseudo ip header plus port numbers */
1079         h = (Tcp4hdr *) (data->rp);
1080         memmove(h, ph, TCP4_TCBPHDRSZ);
1081
1082         /* copy in variable bits */
1083         hnputs(h->tcplen, hdrlen + dlen);
1084         hnputl(h->tcpseq, tcph->seq);
1085         hnputl(h->tcpack, tcph->ack);
1086         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1087         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1088         hnputs(h->tcpurg, tcph->urg);
1089
1090         if (tcph->flags & SYN) {
1091                 opt = h->tcpopt;
1092                 if (tcph->mss != 0) {
1093                         *opt++ = MSSOPT;
1094                         *opt++ = MSS_LENGTH;
1095                         hnputs(opt, tcph->mss);
1096                         opt += 2;
1097                 }
1098                 if (tcph->ws != 0) {
1099                         *opt++ = WSOPT;
1100                         *opt++ = WS_LENGTH;
1101                         *opt++ = tcph->ws;
1102                 }
1103                 while (optpad-- > 0)
1104                         *opt++ = NOOPOPT;
1105         }
1106
1107         if (tcb != NULL && tcb->nochecksum) {
1108                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1109         } else {
1110                 csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
1111                 hnputs(h->tcpcksum, csum);
1112                 data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
1113                 data->checksum_offset = ph->tcpcksum - ph->tcpsport;
1114                 data->flag |= Btcpck;
1115         }
1116
1117         return data;
1118 }
1119
1120 int ntohtcp6(Tcp * tcph, struct block **bpp)
1121 {
1122         Tcp6hdr *h;
1123         uint8_t *optr;
1124         uint16_t hdrlen;
1125         uint16_t optlen;
1126         int n;
1127
1128         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
1129         if (*bpp == NULL)
1130                 return -1;
1131
1132         h = (Tcp6hdr *) ((*bpp)->rp);
1133         tcph->source = nhgets(h->tcpsport);
1134         tcph->dest = nhgets(h->tcpdport);
1135         tcph->seq = nhgetl(h->tcpseq);
1136         tcph->ack = nhgetl(h->tcpack);
1137         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1138         if (hdrlen < TCP6_HDRSIZE) {
1139                 freeblist(*bpp);
1140                 return -1;
1141         }
1142
1143         tcph->flags = h->tcpflag[1];
1144         tcph->wnd = nhgets(h->tcpwin);
1145         tcph->urg = nhgets(h->tcpurg);
1146         tcph->mss = 0;
1147         tcph->ws = 0;
1148         tcph->len = nhgets(h->ploadlen) - hdrlen;
1149
1150         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
1151         if (*bpp == NULL)
1152                 return -1;
1153
1154         optr = h->tcpopt;
1155         n = hdrlen - TCP6_HDRSIZE;
1156         while (n > 0 && *optr != EOLOPT) {
1157                 if (*optr == NOOPOPT) {
1158                         n--;
1159                         optr++;
1160                         continue;
1161                 }
1162                 optlen = optr[1];
1163                 if (optlen < 2 || optlen > n)
1164                         break;
1165                 switch (*optr) {
1166                         case MSSOPT:
1167                                 if (optlen == MSS_LENGTH)
1168                                         tcph->mss = nhgets(optr + 2);
1169                                 break;
1170                         case WSOPT:
1171                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1172                                         tcph->ws = HaveWS | *(optr + 2);
1173                                 break;
1174                 }
1175                 n -= optlen;
1176                 optr += optlen;
1177         }
1178         return hdrlen;
1179 }
1180
1181 int ntohtcp4(Tcp * tcph, struct block **bpp)
1182 {
1183         Tcp4hdr *h;
1184         uint8_t *optr;
1185         uint16_t hdrlen;
1186         uint16_t optlen;
1187         int n;
1188
1189         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
1190         if (*bpp == NULL)
1191                 return -1;
1192
1193         h = (Tcp4hdr *) ((*bpp)->rp);
1194         tcph->source = nhgets(h->tcpsport);
1195         tcph->dest = nhgets(h->tcpdport);
1196         tcph->seq = nhgetl(h->tcpseq);
1197         tcph->ack = nhgetl(h->tcpack);
1198
1199         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1200         if (hdrlen < TCP4_HDRSIZE) {
1201                 freeblist(*bpp);
1202                 return -1;
1203         }
1204
1205         tcph->flags = h->tcpflag[1];
1206         tcph->wnd = nhgets(h->tcpwin);
1207         tcph->urg = nhgets(h->tcpurg);
1208         tcph->mss = 0;
1209         tcph->ws = 0;
1210         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1211
1212         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1213         if (*bpp == NULL)
1214                 return -1;
1215
1216         optr = h->tcpopt;
1217         n = hdrlen - TCP4_HDRSIZE;
1218         while (n > 0 && *optr != EOLOPT) {
1219                 if (*optr == NOOPOPT) {
1220                         n--;
1221                         optr++;
1222                         continue;
1223                 }
1224                 optlen = optr[1];
1225                 if (optlen < 2 || optlen > n)
1226                         break;
1227                 switch (*optr) {
1228                         case MSSOPT:
1229                                 if (optlen == MSS_LENGTH)
1230                                         tcph->mss = nhgets(optr + 2);
1231                                 break;
1232                         case WSOPT:
1233                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1234                                         tcph->ws = HaveWS | *(optr + 2);
1235                                 break;
1236                 }
1237                 n -= optlen;
1238                 optr += optlen;
1239         }
1240         return hdrlen;
1241 }
1242
1243 /*
1244  *  For outgiing calls, generate an initial sequence
1245  *  number and put a SYN on the send queue
1246  */
1247 void tcpsndsyn(struct conv *s, Tcpctl * tcb)
1248 {
1249         urandom_read(&tcb->iss, sizeof(tcb->iss));
1250         tcb->rttseq = tcb->iss;
1251         tcb->snd.wl2 = tcb->iss;
1252         tcb->snd.una = tcb->iss;
1253         tcb->snd.ptr = tcb->rttseq;
1254         tcb->snd.nxt = tcb->rttseq;
1255         tcb->flgcnt++;
1256         tcb->flags |= FORCE;
1257         tcb->sndsyntime = NOW;
1258
1259         /* set desired mss and scale */
1260         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale,
1261                           &tcb->flags);
1262 }
1263
1264 void
1265 sndrst(struct Proto *tcp, uint8_t * source, uint8_t * dest,
1266            uint16_t length, Tcp * seg, uint8_t version, char *reason)
1267 {
1268         struct block *hbp;
1269         uint8_t rflags;
1270         struct tcppriv *tpriv;
1271         Tcp4hdr ph4;
1272         Tcp6hdr ph6;
1273
1274         netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1275
1276         tpriv = tcp->priv;
1277
1278         if (seg->flags & RST)
1279                 return;
1280
1281         /* make pseudo header */
1282         switch (version) {
1283                 case V4:
1284                         memset(&ph4, 0, sizeof(ph4));
1285                         ph4.vihl = IP_VER4;
1286                         v6tov4(ph4.tcpsrc, dest);
1287                         v6tov4(ph4.tcpdst, source);
1288                         ph4.proto = IP_TCPPROTO;
1289                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1290                         hnputs(ph4.tcpsport, seg->dest);
1291                         hnputs(ph4.tcpdport, seg->source);
1292                         break;
1293                 case V6:
1294                         memset(&ph6, 0, sizeof(ph6));
1295                         ph6.vcf[0] = IP_VER6;
1296                         ipmove(ph6.tcpsrc, dest);
1297                         ipmove(ph6.tcpdst, source);
1298                         ph6.proto = IP_TCPPROTO;
1299                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1300                         hnputs(ph6.tcpsport, seg->dest);
1301                         hnputs(ph6.tcpdport, seg->source);
1302                         break;
1303                 default:
1304                         panic("sndrst: version %d", version);
1305         }
1306
1307         tpriv->stats[OutRsts]++;
1308         rflags = RST;
1309
1310         /* convince the other end that this reset is in band */
1311         if (seg->flags & ACK) {
1312                 seg->seq = seg->ack;
1313                 seg->ack = 0;
1314         } else {
1315                 rflags |= ACK;
1316                 seg->ack = seg->seq;
1317                 seg->seq = 0;
1318                 if (seg->flags & SYN)
1319                         seg->ack++;
1320                 seg->ack += length;
1321                 if (seg->flags & FIN)
1322                         seg->ack++;
1323         }
1324         seg->flags = rflags;
1325         seg->wnd = 0;
1326         seg->urg = 0;
1327         seg->mss = 0;
1328         seg->ws = 0;
1329         switch (version) {
1330                 case V4:
1331                         hbp = htontcp4(seg, NULL, &ph4, NULL);
1332                         if (hbp == NULL)
1333                                 return;
1334                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1335                         break;
1336                 case V6:
1337                         hbp = htontcp6(seg, NULL, &ph6, NULL);
1338                         if (hbp == NULL)
1339                                 return;
1340                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1341                         break;
1342                 default:
1343                         panic("sndrst2: version %d", version);
1344         }
1345 }
1346
1347 /*
1348  *  send a reset to the remote side and close the conversation
1349  *  called with s qlocked
1350  */
1351 static void tcphangup(struct conv *s)
1352 {
1353         ERRSTACK(1);
1354         Tcp seg;
1355         Tcpctl *tcb;
1356         struct block *hbp;
1357
1358         tcb = (Tcpctl *) s->ptcl;
1359         if (ipcmp(s->raddr, IPnoaddr)) {
1360                 /* discard error style, poperror regardless */
1361                 if (!waserror()) {
1362                         seg.flags = RST | ACK;
1363                         seg.ack = tcb->rcv.nxt;
1364                         tcb->rcv.una = 0;
1365                         seg.seq = tcb->snd.ptr;
1366                         seg.wnd = 0;
1367                         seg.urg = 0;
1368                         seg.mss = 0;
1369                         seg.ws = 0;
1370                         switch (s->ipversion) {
1371                                 case V4:
1372                                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1373                                         hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
1374                                         ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1375                                         break;
1376                                 case V6:
1377                                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1378                                         hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
1379                                         ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1380                                         break;
1381                                 default:
1382                                         panic("tcphangup: version %d", s->ipversion);
1383                         }
1384                 }
1385                 poperror();
1386         }
1387         localclose(s, NULL);
1388 }
1389
1390 /*
1391  *  (re)send a SYN ACK
1392  */
1393 int sndsynack(struct Proto *tcp, Limbo * lp)
1394 {
1395         struct block *hbp;
1396         Tcp4hdr ph4;
1397         Tcp6hdr ph6;
1398         Tcp seg;
1399         int scale;
1400         uint8_t flag = 0;
1401
1402         /* make pseudo header */
1403         switch (lp->version) {
1404                 case V4:
1405                         memset(&ph4, 0, sizeof(ph4));
1406                         ph4.vihl = IP_VER4;
1407                         v6tov4(ph4.tcpsrc, lp->laddr);
1408                         v6tov4(ph4.tcpdst, lp->raddr);
1409                         ph4.proto = IP_TCPPROTO;
1410                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1411                         hnputs(ph4.tcpsport, lp->lport);
1412                         hnputs(ph4.tcpdport, lp->rport);
1413                         break;
1414                 case V6:
1415                         memset(&ph6, 0, sizeof(ph6));
1416                         ph6.vcf[0] = IP_VER6;
1417                         ipmove(ph6.tcpsrc, lp->laddr);
1418                         ipmove(ph6.tcpdst, lp->raddr);
1419                         ph6.proto = IP_TCPPROTO;
1420                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1421                         hnputs(ph6.tcpsport, lp->lport);
1422                         hnputs(ph6.tcpdport, lp->rport);
1423                         break;
1424                 default:
1425                         panic("sndrst: version %d", lp->version);
1426         }
1427
1428         seg.seq = lp->iss;
1429         seg.ack = lp->irs + 1;
1430         seg.flags = SYN | ACK;
1431         seg.urg = 0;
1432         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag);
1433         seg.wnd = QMAX;
1434
1435         /* if the other side set scale, we should too */
1436         if (lp->rcvscale) {
1437                 seg.ws = scale;
1438                 lp->sndscale = scale;
1439         } else {
1440                 seg.ws = 0;
1441                 lp->sndscale = 0;
1442         }
1443
1444         switch (lp->version) {
1445                 case V4:
1446                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1447                         if (hbp == NULL)
1448                                 return -1;
1449                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1450                         break;
1451                 case V6:
1452                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1453                         if (hbp == NULL)
1454                                 return -1;
1455                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1456                         break;
1457                 default:
1458                         panic("sndsnack: version %d", lp->version);
1459         }
1460         lp->lastsend = NOW;
1461         return 0;
1462 }
1463
/*
 *  Limbo hash bucket for address a and port p: the last two address bytes
 *  plus the port, masked with LHTMASK.  a is evaluated twice, so it must
 *  not have side effects.
 */
#define hashipa(a, p) (((a)[IPaddrlen - 2] + (a)[IPaddrlen - 1] + (p)) & LHTMASK)
1465
1466 /*
1467  *  put a call into limbo and respond with a SYN ACK
1468  *
1469  *  called with proto locked
1470  */
1471 static void
1472 limbo(struct conv *s, uint8_t * source, uint8_t * dest, Tcp * seg, int version)
1473 {
1474         Limbo *lp, **l;
1475         struct tcppriv *tpriv;
1476         int h;
1477
1478         tpriv = s->p->priv;
1479         h = hashipa(source, seg->source);
1480
1481         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1482                 lp = *l;
1483                 if (lp->lport != seg->dest || lp->rport != seg->source
1484                         || lp->version != version)
1485                         continue;
1486                 if (ipcmp(lp->raddr, source) != 0)
1487                         continue;
1488                 if (ipcmp(lp->laddr, dest) != 0)
1489                         continue;
1490
1491                 /* each new SYN restarts the retransmits */
1492                 lp->irs = seg->seq;
1493                 break;
1494         }
1495         lp = *l;
1496         if (lp == NULL) {
1497                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1498                         lp = tpriv->lht[h];
1499                         tpriv->lht[h] = lp->next;
1500                         lp->next = NULL;
1501                 } else {
1502                         lp = kzmalloc(sizeof(*lp), 0);
1503                         if (lp == NULL)
1504                                 return;
1505                         tpriv->nlimbo++;
1506                 }
1507                 *l = lp;
1508                 lp->version = version;
1509                 ipmove(lp->laddr, dest);
1510                 ipmove(lp->raddr, source);
1511                 lp->lport = seg->dest;
1512                 lp->rport = seg->source;
1513                 lp->mss = seg->mss;
1514                 lp->rcvscale = seg->ws;
1515                 lp->irs = seg->seq;
1516                 urandom_read(&lp->iss, sizeof(lp->iss));
1517         }
1518
1519         if (sndsynack(s->p, lp) < 0) {
1520                 *l = lp->next;
1521                 tpriv->nlimbo--;
1522                 kfree(lp);
1523         }
1524 }
1525
1526 /*
1527  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1528  */
1529 static void limborexmit(struct Proto *tcp)
1530 {
1531         struct tcppriv *tpriv;
1532         Limbo **l, *lp;
1533         int h;
1534         int seen;
1535         uint64_t now;
1536
1537         tpriv = tcp->priv;
1538
1539         if (!canqlock(&tcp->qlock))
1540                 return;
1541         seen = 0;
1542         now = NOW;
1543         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1544                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1545                         lp = *l;
1546                         seen++;
1547                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1548                                 continue;
1549
1550                         /* time it out after 1 second */
1551                         if (++(lp->rexmits) > 5) {
1552                                 tpriv->nlimbo--;
1553                                 *l = lp->next;
1554                                 kfree(lp);
1555                                 continue;
1556                         }
1557
1558                         /* if we're being attacked, don't bother resending SYN ACK's */
1559                         if (tpriv->nlimbo > 100)
1560                                 continue;
1561
1562                         if (sndsynack(tcp, lp) < 0) {
1563                                 tpriv->nlimbo--;
1564                                 *l = lp->next;
1565                                 kfree(lp);
1566                                 continue;
1567                         }
1568
1569                         l = &lp->next;
1570                 }
1571         }
1572         qunlock(&tcp->qlock);
1573 }
1574
1575 /*
1576  *  lookup call in limbo.  if found, throw it out.
1577  *
1578  *  called with proto locked
1579  */
1580 static void
1581 limborst(struct conv *s, Tcp * segp, uint8_t * src, uint8_t * dst,
1582                  uint8_t version)
1583 {
1584         Limbo *lp, **l;
1585         int h;
1586         struct tcppriv *tpriv;
1587
1588         tpriv = s->p->priv;
1589
1590         /* find a call in limbo */
1591         h = hashipa(src, segp->source);
1592         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1593                 lp = *l;
1594                 if (lp->lport != segp->dest || lp->rport != segp->source
1595                         || lp->version != version)
1596                         continue;
1597                 if (ipcmp(lp->laddr, dst) != 0)
1598                         continue;
1599                 if (ipcmp(lp->raddr, src) != 0)
1600                         continue;
1601
1602                 /* RST can only follow the SYN */
1603                 if (segp->seq == lp->irs + 1) {
1604                         tpriv->nlimbo--;
1605                         *l = lp->next;
1606                         kfree(lp);
1607                 }
1608                 break;
1609         }
1610 }
1611
1612 /*
1613  *  come here when we finally get an ACK to our SYN-ACK.
1614  *  lookup call in limbo.  if found, create a new conversation
1615  *
1616  *  called with proto locked
1617  */
1618 static struct conv *tcpincoming(struct conv *s, Tcp * segp, uint8_t * src,
1619                                                                 uint8_t * dst, uint8_t version)
1620 {
1621         struct conv *new;
1622         Tcpctl *tcb;
1623         struct tcppriv *tpriv;
1624         Tcp4hdr *h4;
1625         Tcp6hdr *h6;
1626         Limbo *lp, **l;
1627         int h;
1628
1629         /* unless it's just an ack, it can't be someone coming out of limbo */
1630         if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
1631                 return NULL;
1632
1633         tpriv = s->p->priv;
1634
1635         /* find a call in limbo */
1636         h = hashipa(src, segp->source);
1637         for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
1638                 netlog(s->p->f, Logtcp,
1639                            "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
1640                            segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
1641                            lp->lport, version, lp->version);
1642
1643                 if (lp->lport != segp->dest || lp->rport != segp->source
1644                         || lp->version != version)
1645                         continue;
1646                 if (ipcmp(lp->laddr, dst) != 0)
1647                         continue;
1648                 if (ipcmp(lp->raddr, src) != 0)
1649                         continue;
1650
1651                 /* we're assuming no data with the initial SYN */
1652                 if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
1653                         netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
1654                                    segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
1655                         lp = NULL;
1656                 } else {
1657                         tpriv->nlimbo--;
1658                         *l = lp->next;
1659                 }
1660                 break;
1661         }
1662         if (lp == NULL)
1663                 return NULL;
1664
1665         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1666         if (new == NULL)
1667                 return NULL;
1668
1669         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1670         tcb = (Tcpctl *) new->ptcl;
1671         tcb->flags &= ~CLONE;
1672         tcb->timer.arg = new;
1673         tcb->timer.state = TcptimerOFF;
1674         tcb->acktimer.arg = new;
1675         tcb->acktimer.state = TcptimerOFF;
1676         tcb->katimer.arg = new;
1677         tcb->katimer.state = TcptimerOFF;
1678         tcb->rtt_timer.arg = new;
1679         tcb->rtt_timer.state = TcptimerOFF;
1680
1681         tcb->irs = lp->irs;
1682         tcb->rcv.nxt = tcb->irs + 1;
1683         tcb->rcv.urg = tcb->rcv.nxt;
1684
1685         tcb->iss = lp->iss;
1686         tcb->rttseq = tcb->iss;
1687         tcb->snd.wl2 = tcb->iss;
1688         tcb->snd.una = tcb->iss + 1;
1689         tcb->snd.ptr = tcb->iss + 1;
1690         tcb->snd.nxt = tcb->iss + 1;
1691         tcb->flgcnt = 0;
1692         tcb->flags |= SYNACK;
1693
1694         /* our sending max segment size cannot be bigger than what he asked for */
1695         if (lp->mss != 0 && lp->mss < tcb->mss)
1696                 tcb->mss = lp->mss;
1697
1698         /* window scaling */
1699         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1700
1701         /* the congestion window always starts out as a single segment */
1702         tcb->snd.wnd = segp->wnd;
1703         tcb->cwind = tcb->mss;
1704
1705         /* set initial round trip time */
1706         tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
1707         tcpsynackrtt(new);
1708
1709         kfree(lp);
1710
1711         /* set up proto header */
1712         switch (version) {
1713                 case V4:
1714                         h4 = &tcb->protohdr.tcp4hdr;
1715                         memset(h4, 0, sizeof(*h4));
1716                         h4->proto = IP_TCPPROTO;
1717                         hnputs(h4->tcpsport, new->lport);
1718                         hnputs(h4->tcpdport, new->rport);
1719                         v6tov4(h4->tcpsrc, dst);
1720                         v6tov4(h4->tcpdst, src);
1721                         break;
1722                 case V6:
1723                         h6 = &tcb->protohdr.tcp6hdr;
1724                         memset(h6, 0, sizeof(*h6));
1725                         h6->proto = IP_TCPPROTO;
1726                         hnputs(h6->tcpsport, new->lport);
1727                         hnputs(h6->tcpdport, new->rport);
1728                         ipmove(h6->tcpsrc, dst);
1729                         ipmove(h6->tcpdst, src);
1730                         break;
1731                 default:
1732                         panic("tcpincoming: version %d", new->ipversion);
1733         }
1734
1735         tcpsetstate(new, Established);
1736
1737         iphtadd(&tpriv->ht, new);
1738
1739         return new;
1740 }
1741
/*
 *  Return 1 if sequence number x lies in the inclusive range [low, high],
 *  0 otherwise.  When low > high the range is taken to wrap around the
 *  32-bit sequence space, i.e. it covers [low, 2^32 - 1] and [0, high].
 */
int seq_within(uint32_t x, uint32_t low, uint32_t high)
{
	/* ordinary (non-wrapping) interval */
	if (low <= high)
		return (x >= low && x <= high) ? 1 : 0;

	/* interval wraps past 2^32 - 1 back to 0 */
	return (x >= low || x <= high) ? 1 : 0;
}
1753
/*
 *  Sequence-space "less than": true when x precedes y, judged by the sign
 *  of the 32-bit wrapped difference x - y.
 */
int seq_lt(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff < 0;
}
1758
/*
 *  Sequence-space "less than or equal": true when x precedes or equals y,
 *  judged by the sign of the 32-bit wrapped difference x - y.
 */
int seq_le(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff <= 0;
}
1763
/*
 *  Sequence-space "greater than": true when x follows y, judged by the
 *  sign of the 32-bit wrapped difference x - y.
 */
int seq_gt(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff > 0;
}
1768
/*
 *  Sequence-space "greater than or equal": true when x follows or equals
 *  y, judged by the sign of the 32-bit wrapped difference x - y.
 */
int seq_ge(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff >= 0;
}
1773
1774 /*
1775  *  use the time between the first SYN and it's ack as the
1776  *  initial round trip time
1777  */
1778 void tcpsynackrtt(struct conv *s)
1779 {
1780         Tcpctl *tcb;
1781         uint64_t delta;
1782         struct tcppriv *tpriv;
1783
1784         tcb = (Tcpctl *) s->ptcl;
1785         tpriv = s->p->priv;
1786
1787         delta = NOW - tcb->sndsyntime;
1788         tcb->srtt = delta << LOGAGAIN;
1789         tcb->mdev = delta << LOGDGAIN;
1790
1791         /* halt round trip timer */
1792         tcphalt(tpriv, &tcb->rtt_timer);
1793 }
1794
/*
 *  Process the acknowledgment carried by incoming segment seg for
 *  conversation s:
 *   - count duplicate acks and call tcprxmit() once TCPREXMTTHRESH of them
 *     have been seen (fast retransmit),
 *   - update the send window (snd.wnd / snd.wl2),
 *   - grow the congestion window (cwind),
 *   - refresh the round trip estimate (srtt / mdev) and the timers,
 *   - discard the acked bytes from the write queue and advance snd.una.
 *
 *  NOTE(review): callers in this file invoke update() with s->qlock held;
 *  confirm that this is a requirement for all callers.
 */
1795 void update(struct conv *s, Tcp * seg)
1796 {
1797         int rtt, delta;
1798         Tcpctl *tcb;
1799         uint32_t acked;
1800         uint32_t expand;
1801         struct tcppriv *tpriv;
1802
1803         tpriv = s->p->priv;
1804         tcb = (Tcpctl *) s->ptcl;
1805
1806         /* the ack is ahead of snd.nxt: just request output and ignore it */
1807         if (seq_gt(seg->ack, tcb->snd.nxt)) {
1808                 tcb->flags |= FORCE;
1809                 return;
1810         }
1811
1812         /*
              *  added by Dong Lin for fast retransmission: a duplicate ack
              *  repeats snd.una while data is outstanding, carries no data
              *  and leaves the advertised window unchanged
              */
1813         if (seg->ack == tcb->snd.una
1814                 && tcb->snd.una != tcb->snd.nxt
1815                 && seg->len == 0 && seg->wnd == tcb->snd.wnd) {
1816
1817                 /* this is a pure ack w/o window update */
1818                 netlog(s->p->f, Logtcprxmt, "dupack %lu ack %lu sndwnd %d advwin %d\n",
1819                            tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1820
1821                 if (++tcb->snd.dupacks == TCPREXMTTHRESH) {
1822                         /*
1823                          *  tahoe tcp: retransmit the packet, halve ssthresh,
1824                          *  and set cwnd to one packet.  Only the recovery
                              *  bookkeeping is done here; the window changes are
                              *  presumably made inside tcprxmit() - confirm.
1825                          */
1826                         tcb->snd.recovery = 1;
1827                         tcb->snd.rxt = tcb->snd.nxt;
1828                         netlog(s->p->f, Logtcprxmt, "fast rxt %lu, nxt %lu\n", tcb->snd.una,
1829                                    tcb->snd.nxt);
1830                         tcprxmit(s);
1831                 } else {
1832                         /* do reno tcp here. */
1833                 }
1834         }
1835
1836         /*
1837          *  update window: accept the advertised window if this ack is newer
              *  than the one that last updated it, or is the same ack with a
              *  larger window
1838          */
1839         if (seq_gt(seg->ack, tcb->snd.wl2)
1840                 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
1841                 tcb->snd.wnd = seg->wnd;
1842                 tcb->snd.wl2 = seg->ack;
1843         }
1844
             /* nothing new is acked: no queue or timer work to do */
1845         if (!seq_gt(seg->ack, tcb->snd.una)) {
1846                 /*
1847                  *  don't let us hangup if sending into a closed window and
1848                  *  we're still getting acks
1849                  */
1850                 if ((tcb->flags & RETRAN) && tcb->snd.wnd == 0) {
1851                         tcb->backedoff = MAXBACKMS / 4;
1852                 }
1853                 return;
1854         }
1855
1856         /*
1857          *  any positive ack turns off fast rxt, unless we are in recovery
              *  and the ack does not yet cover snd.rxt (a partial ack)
1858          *  (should we do new-reno on partial acks?)
1859          */
1860         if (!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1861                 tcb->snd.dupacks = 0;
1862                 tcb->snd.recovery = 0;
1863         } else
1864                 netlog(s->p->f, Logtcp, "rxt next %lu, cwin %u\n", seg->ack,
1865                            tcb->cwind);
1866
1867         /* number of sequence numbers newly acknowledged by this segment */
1868         acked = seg->ack - tcb->snd.una;
1869
1870         /*
              *  avoid slow start and timers for SYN acks.  The SYNACK flag is
              *  set here the first time through; acked and flgcnt each drop by
              *  one, presumably because the SYN takes a sequence number but no
              *  bytes in the write queue - confirm.
              */
1871         if ((tcb->flags & SYNACK) == 0) {
1872                 tcb->flags |= SYNACK;
1873                 acked--;
1874                 tcb->flgcnt--;
1875                 goto done;
1876         }
1877
1878         /* slow start as long as we're not recovering from lost packets */
1879         if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
1880                 if (tcb->cwind < tcb->ssthresh) {
                             /* below ssthresh: grow by the bytes acked, at most one mss */
1881                         expand = tcb->mss;
1882                         if (acked < expand)
1883                                 expand = acked;
1884                 } else
                             /* at or above ssthresh: grow by mss*mss/cwind per ack */
1885                         expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1886
                     /* clamp so cwind neither wraps nor exceeds the send window */
1887                 if (tcb->cwind + expand < tcb->cwind)
1888                         expand = tcb->snd.wnd - tcb->cwind;
1889                 if (tcb->cwind + expand > tcb->snd.wnd)
1890                         expand = tcb->snd.wnd - tcb->cwind;
1891                 tcb->cwind += expand;
1892         }
1893
1894         /* Adjust the timers according to the round trip time */
1895         if (tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1896                 tcphalt(tpriv, &tcb->rtt_timer);
                     /* only sample the rtt when the segment was not retransmitted */
1897                 if ((tcb->flags & RETRAN) == 0) {
1898                         tcb->backoff = 0;
1899                         tcb->backedoff = 0;
                             /* presumably timer ticks elapsed since the timer was started - confirm */
1900                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1901                         if (rtt == 0)
1902                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
1903                         rtt *= MSPTICK;
1904                         if (tcb->srtt == 0) {
                                     /* first sample: seed both estimators directly */
1905                                 tcb->srtt = rtt << LOGAGAIN;
1906                                 tcb->mdev = rtt << LOGDGAIN;
1907                         } else {
                                     /*
                                      *  srtt and mdev are kept scaled by 2^LOGAGAIN and
                                      *  2^LOGDGAIN; add the error against the unscaled
                                      *  value and keep both strictly positive
                                      */
1908                                 delta = rtt - (tcb->srtt >> LOGAGAIN);
1909                                 tcb->srtt += delta;
1910                                 if (tcb->srtt <= 0)
1911                                         tcb->srtt = 1;
1912
1913                                 delta = abs(delta) - (tcb->mdev >> LOGDGAIN);
1914                                 tcb->mdev += delta;
1915                                 if (tcb->mdev <= 0)
1916                                         tcb->mdev = 1;
1917                         }
1918                         tcpsettimer(tcb);
1919                 }
1920         }
1921
1922 done:
             /*
              *  drop the acked bytes from the write queue; if fewer than acked
              *  were discarded, one flag is counted as acked as well
              *  (presumably our FIN - confirm)
              */
1923         if (qdiscard(s->wq, acked) < acked)
1924                 tcb->flgcnt--;
1925
1926         tcb->snd.una = seg->ack;
1927         if (seq_gt(seg->ack, tcb->snd.urg))
1928                 tcb->snd.urg = seg->ack;
1929
             /* keep the timer running only while unacked data remains */
1930         if (tcb->snd.una != tcb->snd.nxt)
1931                 tcpgo(tpriv, &tcb->timer);
1932         else
1933                 tcphalt(tpriv, &tcb->timer);
1934
             /* never leave the send pointer behind the acked point */
1935         if (seq_lt(tcb->snd.ptr, tcb->snd.una))
1936                 tcb->snd.ptr = tcb->snd.una;
1937
1938         tcb->flags &= ~RETRAN;
1939         tcb->backoff = 0;
1940         tcb->backedoff = 0;
1941 }
1942
/*
 *  Input routine for a received TCP segment held in bp (IPv4 or IPv6,
 *  chosen from the version nibble at the start of the packet).
 *
 *  Steps: verify the checksum, parse the header into a local Tcp struct,
 *  trim the block to the payload, look up the conversation for the
 *  address/port pair under tcp->qlock (listeners either queue a new SYN via
 *  limbo() or obtain a new conv from tcpincoming()), then run the input
 *  state machine with s->qlock held.  The struct Ipifc argument is not used.
 *
 *  NOTE(review): the "bad tcp hdr len" and "tcp len < 0 after trim" paths
 *  return without freeing bp here; presumably ntohtcp4()/ntohtcp6() and
 *  trimblock() release the block on failure - confirm.
 */
1943 void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
1944 {
1945         ERRSTACK(1);
1946         Tcp seg;
1947         Tcp4hdr *h4;
1948         Tcp6hdr *h6;
1949         int hdrlen;
1950         Tcpctl *tcb;
1951         uint16_t length;
1952         uint8_t source[IPaddrlen], dest[IPaddrlen];
1953         struct conv *s;
1954         struct Fs *f;
1955         struct tcppriv *tpriv;
1956         uint8_t version;
1957
1958         f = tcp->f;
1959         tpriv = tcp->priv;
1960
1961         tpriv->stats[InSegs]++;
1962
             /* both views alias the start of the packet; the version check picks one */
1963         h4 = (Tcp4hdr *) (bp->rp);
1964         h6 = (Tcp6hdr *) (bp->rp);
1965
1966         if ((h4->vihl & 0xF0) == IP_VER4) {
1967                 version = V4;
1968                 length = nhgets(h4->length);
1969                 v4tov6(dest, h4->tcpdst);
1970                 v4tov6(source, h4->tcpsrc);
1971
                     /*
                      *  overwrite header fields before checksumming (presumably to
                      *  form the pseudo header in place - confirm).  The check is
                      *  skipped when Btcpck is set on the block or the checksum
                      *  field is zero.
                      */
1972                 h4->Unused = 0;
1973                 hnputs(h4->tcplen, length - TCP4_PKT);
1974                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1975                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
1976                         tpriv->stats[CsumErrs]++;
1977                         tpriv->stats[InErrs]++;
1978                         netlog(f, Logtcp, "bad tcp proto cksum\n");
1979                         freeblist(bp);
1980                         return;
1981                 }
1982
1983                 hdrlen = ntohtcp4(&seg, &bp);
1984                 if (hdrlen < 0) {
1985                         tpriv->stats[HlenErrs]++;
1986                         tpriv->stats[InErrs]++;
1987                         netlog(f, Logtcp, "bad tcp hdr len\n");
1988                         return;
1989                 }
1990
1991                 /* trim the packet to the size claimed by the datagram */
1992                 length -= hdrlen + TCP4_PKT;
1993                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
1994                 if (bp == NULL) {
1995                         tpriv->stats[LenErrs]++;
1996                         tpriv->stats[InErrs]++;
1997                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
1998                         return;
1999                 }
2000         } else {
                     /* saved so the header can be restored after the checksum */
2001                 int ttl = h6->ttl;
2002                 int proto = h6->proto;
2003
2004                 version = V6;
2005                 length = nhgets(h6->ploadlen);
2006                 ipmove(dest, h6->tcpdst);
2007                 ipmove(source, h6->tcpsrc);
2008
                     /*
                      *  temporarily rewrite the header for the checksum: clear
                      *  ploadlen and proto, store the protocol in ttl and the
                      *  length in vcf; the original values are restored below
                      */
2009                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2010                 h6->ttl = proto;
2011                 hnputl(h6->vcf, length);
2012                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2013                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
2014                         tpriv->stats[CsumErrs]++;
2015                         tpriv->stats[InErrs]++;
2016                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2017                         freeblist(bp);
2018                         return;
2019                 }
2020                 h6->ttl = ttl;
2021                 h6->proto = proto;
2022                 hnputs(h6->ploadlen, length);
2023
2024                 hdrlen = ntohtcp6(&seg, &bp);
2025                 if (hdrlen < 0) {
2026                         tpriv->stats[HlenErrs]++;
2027                         tpriv->stats[InErrs]++;
2028                         netlog(f, Logtcp, "bad tcp hdr len\n");
2029                         return;
2030                 }
2031
2032                 /* trim the packet to the size claimed by the datagram */
2033                 length -= hdrlen;
2034                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2035                 if (bp == NULL) {
2036                         tpriv->stats[LenErrs]++;
2037                         tpriv->stats[InErrs]++;
2038                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2039                         return;
2040                 }
2041         }
2042
2043         /* lock protocol while searching for a conversation */
2044         qlock(&tcp->qlock);
2045
2046         /* Look for a matching conversation */
2047         s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2048         if (s == NULL) {
2049                 netlog(f, Logtcp, "iphtlook failed\n");
             /*
              *  also reached by goto from the listener path below; in both
              *  cases tcp->qlock is held on entry and released here
              */
2050 reset:
2051                 qunlock(&tcp->qlock);
2052                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2053                 freeblist(bp);
2054                 return;
2055         }
2056
2057         /* if it's a listener, look for the right flags and get a new conv */
2058         tcb = (Tcpctl *) s->ptcl;
2059         if (tcb->state == Listen) {
2060                 if (seg.flags & RST) {
2061                         limborst(s, &seg, source, dest, version);
2062                         qunlock(&tcp->qlock);
2063                         freeblist(bp);
2064                         return;
2065                 }
2066
2067                 /* if this is a new SYN, put the call into limbo */
2068                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2069                         limbo(s, source, dest, &seg, version);
2070                         qunlock(&tcp->qlock);
2071                         freeblist(bp);
2072                         return;
2073                 }
2074
2075                 /*
2076                  *  if there's a matching call in limbo, tcpincoming will
2077                  *  return a new conv for it (tcpincoming sets that conv's
                      *  state to Established before returning); otherwise it
                      *  returns NULL and we answer with a reset
2078                  */
2079                 s = tcpincoming(s, &seg, source, dest, version);
2080                 if (s == NULL)
2081                         goto reset;
2082         }
2083
2084         /* The rest of the input state machine is run with the control block
2085          * locked and implements the state machine directly out of the RFC.
2086          * Out-of-band data is ignored - it was always a bad idea.
2087          */
2088         tcb = (Tcpctl *) s->ptcl;
             /* on an error raised below, drop the conv lock and re-raise */
2089         if (waserror()) {
2090                 qunlock(&s->qlock);
2091                 nexterror();
2092         }
             /* take the conv lock before releasing the protocol lock */
2093         qlock(&s->qlock);
2094         qunlock(&tcp->qlock);
2095
2096         /* fix up window */
2097         seg.wnd <<= tcb->rcv.scale;
2098
2099         /* every input packet puts off the keep alive time out */
2100         tcpsetkacounter(tcb);
2101
2102         switch (tcb->state) {
2103                 case Closed:
2104                         sndrst(tcp, source, dest, length, &seg, version,
2105                                    "sending to Closed");
2106                         goto raise;
2107                 case Syn_sent:
                             /* an ack must fall within [iss + 1, snd.nxt] */
2108                         if (seg.flags & ACK) {
2109                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2110                                         sndrst(tcp, source, dest, length, &seg, version,
2111                                                    "bad seq in Syn_sent");
2112                                         goto raise;
2113                                 }
2114                         }
2115                         if (seg.flags & RST) {
2116                                 if (seg.flags & ACK)
2117                                         localclose(s, "connection refused");
2118                                 goto raise;
2119                         }
2120
2121                         if (seg.flags & SYN) {
2122                                 procsyn(s, &seg);
2123                                 if (seg.flags & ACK) {
                                             /* SYN+ACK: the connection is established */
2124                                         update(s, &seg);
2125                                         tcpsynackrtt(s);
2126                                         tcpsetstate(s, Established);
2127                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2128                                 } else {
                                             /* SYN without ACK */
2129                                         tcb->time = NOW;
2130                                         tcpsetstate(s, Syn_received);   /* DLP - shouldn't this be a reset? */
2131                                 }
2132
                                     /* data or FIN present: continue with the common processing below */
2133                                 if (length != 0 || (seg.flags & FIN))
2134                                         break;
2135
2136                                 freeblist(bp);
2137                                 goto output;
2138                         } else
2139                                 freeblist(bp);
2140
                             /* neither SYN nor RST: the segment is dropped */
2141                         qunlock(&s->qlock);
2142                         poperror();
2143                         return;
2144                 case Syn_received:
2145                         /* doesn't matter if it's the correct ack, we're just trying to set timing */
2146                         if (seg.flags & ACK)
2147                                 tcpsynackrtt(s);
2148                         break;
2149         }
2150
2151         /*
2152          *  One DOS attack is to open connections to us and then forget about them,
2153          *  thereby tying up a conv at no long term cost to the attacker.
2154          *  This is an attempt to defeat these stateless DOS attacks.  See
2155          *  corresponding code in tcpsendka().
              *
              *  NOTE(review): processing continues after localclose() here;
              *  confirm that this is intended.
2156          */
2157         if (tcb->state != Syn_received && (seg.flags & RST) == 0) {
2158                 if (tcpporthogdefense
2159                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2160                                                   tcb->snd.una - (1 << 29))) {
2161                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2162                                    source, seg.source, dest, seg.dest, seg.flags,
2163                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2164                         localclose(s, "stateless hog");
2165                 }
2166         }
2167
2168         /*
              *  Cut the data to fit the receive window.  When tcptrim rejects
              *  the segment (-1) the ack is still processed, and an ack is
              *  forced out unless the segment carried RST.
              */
2169         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2170                 netlog(f, Logtcp, "tcp len < 0, %lu %d\n", seg.seq, length);
2171                 update(s, &seg);
2172                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2173                         tcphalt(tpriv, &tcb->rtt_timer);
2174                         tcphalt(tpriv, &tcb->acktimer);
2175                         tcphalt(tpriv, &tcb->katimer);
2176                         tcpsetstate(s, Time_wait);
2177                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2178                         tcpgo(tpriv, &tcb->timer);
2179                 }
2180                 if (!(seg.flags & RST)) {
2181                         tcb->flags |= FORCE;
2182                         goto output;
2183                 }
2184                 qunlock(&s->qlock);
2185                 poperror();
2186                 return;
2187         }
2188
2189         /* Cannot accept so answer with a rst */
2190         if (length && tcb->state == Closed) {
2191                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2192                 goto raise;
2193         }
2194
2195         /* The segment is not at the current receive pointer (rcv.nxt) so
2196          * queue the data in the resequence queue and force an ack
2197          */
2198         if (seg.seq != tcb->rcv.nxt)
2199                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2200                         update(s, &seg);
2201                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2202                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2203                                            s->lport);
2204                         tcb->flags |= FORCE;
2205                         goto output;
2206                 }
2207
2208         /*
2209          *  keep looping till we've processed this packet plus any
2210          *  adjacent packets in the resequence queue
2211          */
2212         for (;;) {
2213                 if (seg.flags & RST) {
2214                         if (tcb->state == Established) {
2215                                 tpriv->stats[EstabResets]++;
2216                                 if (tcb->rcv.nxt != seg.seq)
2217                                         printd
2218                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2219                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2220                                                  seg.seq);
2221                         }
2222                         localclose(s, "connection refused");
2223                         goto raise;
2224                 }
2225
                     /* segments without ACK are dropped from here on */
2226                 if ((seg.flags & ACK) == 0)
2227                         goto raise;
2228
                     /* ack processing, per connection state */
2229                 switch (tcb->state) {
2230                         case Syn_received:
2231                                 if (!seq_within(seg.ack, tcb->snd.una + 1, tcb->snd.nxt)) {
2232                                         sndrst(tcp, source, dest, length, &seg, version,
2233                                                    "bad seq in Syn_received");
2234                                         goto raise;
2235                                 }
2236                                 update(s, &seg);
2237                                 tcpsetstate(s, Established);
                                     /* falls through: update() runs again for the same segment */
2238                         case Established:
2239                         case Close_wait:
2240                                 update(s, &seg);
2241                                 break;
2242                         case Finwait1:
2243                                 update(s, &seg);
                                     /* nothing left unacked (data or flags): move to Finwait2 */
2244                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2245                                         tcphalt(tpriv, &tcb->rtt_timer);
2246                                         tcphalt(tpriv, &tcb->acktimer);
2247                                         tcpsetkacounter(tcb);
2248                                         tcb->time = NOW;
2249                                         tcpsetstate(s, Finwait2);
2250                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2251                                         tcpgo(tpriv, &tcb->katimer);
2252                                 }
2253                                 break;
2254                         case Finwait2:
2255                                 update(s, &seg);
2256                                 break;
2257                         case Closing:
2258                                 update(s, &seg);
                                     /* nothing left unacked: enter Time_wait and start its timer */
2259                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2260                                         tcphalt(tpriv, &tcb->rtt_timer);
2261                                         tcphalt(tpriv, &tcb->acktimer);
2262                                         tcphalt(tpriv, &tcb->katimer);
2263                                         tcpsetstate(s, Time_wait);
2264                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2265                                         tcpgo(tpriv, &tcb->timer);
2266                                 }
2267                                 break;
2268                         case Last_ack:
2269                                 update(s, &seg);
2270                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2271                                         localclose(s, NULL);
2272                                         goto raise;
2273                                 }
                                     /* falls through to Time_wait */
2274                         case Time_wait:
2275                                 tcb->flags |= FORCE;
2276                                 if (tcb->timer.state != TcptimerON)
2277                                         tcpgo(tpriv, &tcb->timer);
2278                 }
2279
                     /* track the urgent pointer; urgent bytes are pulled off the block */
2280                 if ((seg.flags & URG) && seg.urg) {
2281                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2282                                 tcb->rcv.urg = seg.urg + seg.seq;
2283                                 pullblock(&bp, seg.urg);
2284                         }
2285                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2286                         tcb->rcv.urg = tcb->rcv.nxt;
2287
                     /* segment text processing */
2288                 if (length == 0) {
2289                         if (bp != NULL)
2290                                 freeblist(bp);
2291                 } else {
2292                         switch (tcb->state) {
2293                                 default:
2294                                         /* Ignore segment text */
2295                                         if (bp != NULL)
2296                                                 freeblist(bp);
2297                                         break;
2298
2299                                 case Syn_received:
2300                                 case Established:
2301                                 case Finwait1:
2302                                         /* If we still have some data place on
2303                                          * receive queue
2304                                          */
2305                                         if (bp) {
2306                                                 bp = packblock(bp);
2307                                                 if (bp == NULL)
2308                                                         panic("tcp packblock");
2309                                                 qpassnolim(s->rq, bp);
2310                                                 bp = NULL;
2311
2312                                                 /*
2313                                                  *  Force an ack every 2 data messages.  This is
2314                                                  *  a hack for rob to make his home system run
2315                                                  *  faster.
2316                                                  *
2317                                                  *  this also keeps the standard TCP congestion
2318                                                  *  control working since it needs an ack every
2319                                                  *  2 max segs worth.  This is not quite that,
2320                                                  *  but under a real stream is equivalent since
2321                                                  *  every packet has a max seg in it.
2322                                                  */
2323                                                 if (++(tcb->rcv.una) >= 2)
2324                                                         tcb->flags |= FORCE;
2325                                         }
2326                                         tcb->rcv.nxt += length;
2327
2328                                         /*
2329                                          *  update our rcv window
2330                                          */
2331                                         tcprcvwin(s);
2332
2333                                         /*
2334                                          *  turn on the acktimer if there's something
2335                                          *  to ack
2336                                          */
2337                                         if (tcb->acktimer.state != TcptimerON)
2338                                                 tcpgo(tpriv, &tcb->acktimer);
2339
2340                                         break;
2341                                 case Finwait2:
2342                                         /* no process to read the data, send a reset */
2343                                         if (bp != NULL)
2344                                                 freeblist(bp);
2345                                         sndrst(tcp, source, dest, length, &seg, version,
2346                                                    "send to Finwait2");
2347                                         qunlock(&s->qlock);
2348                                         poperror();
2349                                         return;
2350                         }
2351                 }
2352
                     /* FIN processing: the FIN takes one sequence number (rcv.nxt++) */
2353                 if (seg.flags & FIN) {
2354                         tcb->flags |= FORCE;
2355
2356                         switch (tcb->state) {
2357                                 case Syn_received:
2358                                 case Established:
2359                                         tcb->rcv.nxt++;
2360                                         tcpsetstate(s, Close_wait);
2361                                         break;
2362                                 case Finwait1:
2363                                         tcb->rcv.nxt++;
                                             /* everything of ours acked: Time_wait, otherwise Closing */
2364                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2365                                                 tcphalt(tpriv, &tcb->rtt_timer);
2366                                                 tcphalt(tpriv, &tcb->acktimer);
2367                                                 tcphalt(tpriv, &tcb->katimer);
2368                                                 tcpsetstate(s, Time_wait);
2369                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2370                                                 tcpgo(tpriv, &tcb->timer);
2371                                         } else
2372                                                 tcpsetstate(s, Closing);
2373                                         break;
2374                                 case Finwait2:
2375                                         tcb->rcv.nxt++;
2376                                         tcphalt(tpriv, &tcb->rtt_timer);
2377                                         tcphalt(tpriv, &tcb->acktimer);
2378                                         tcphalt(tpriv, &tcb->katimer);
2379                                         tcpsetstate(s, Time_wait);
2380                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2381                                         tcpgo(tpriv, &tcb->timer);
2382                                         break;
2383                                 case Close_wait:
2384                                 case Closing:
2385                                 case Last_ack:
2386                                         break;
2387                                 case Time_wait:
2388                                         tcpgo(tpriv, &tcb->timer);
2389                                         break;
2390                         }
2391                 }
2392
2393                 /*
2394                  *  get next adjacent segment from the resequence queue.
2395                  *  dump/trim any overlapping segments
2396                  */
2397                 for (;;) {
2398                         if (tcb->reseq == NULL)
2399                                 goto output;
2400
                             /* the head of the queue starts beyond rcv.nxt: nothing to process yet */
2401                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2402                                 goto output;
2403
2404                         getreseq(tcb, &seg, &bp, &length);
2405
                             /* tcptrim accepted the segment: process it in the outer loop */
2406                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2407                                 break;
2408                 }
2409         }
     /* normal exit: run tcpoutput() for s, then release the conv lock */
2410 output:
2411         tcpoutput(s);
2412         qunlock(&s->qlock);
2413         poperror();
2414         return;
     /* drop exit: release the conv lock, free the segment and tcpkick(s) */
2415 raise:
2416         qunlock(&s->qlock);
2417         poperror();
2418         freeblist(bp);
2419         tcpkick(s);
2420 }
2421
/*
 *  Send whatever the conversation's send state currently allows: data
 *  queued on s->wq plus the sequence space counted by tcb->flgcnt
 *  (presumably pending SYN/FIN), and forced ACKs.  The loop stops when
 *  there is nothing (more) to send, when the msgs counter reaches 100, or
 *  when the connection is in Listen, Closed or Finwait2.
 *
 *  always enters and exits with the s locked.  In the code below the lock
 *  is released only around kthread_yield(), so some care has to be taken
 *  by callers; the tcb and its state are re-read at the top of every
 *  iteration.
 */
void tcpoutput(struct conv *s)
{
	Tcp seg;
	int msgs;
	Tcpctl *tcb;
	struct block *hbp, *bp;
	int sndcnt, n;
	uint32_t ssize, dsize, usable, sent;
	struct Fs *f;
	struct tcppriv *tpriv;
	uint8_t version;

	f = s->p->f;
	tpriv = s->p->priv;
	version = s->ipversion;

	for (msgs = 0; msgs < 100; msgs++) {
		tcb = (Tcpctl *) s->ptcl;

		/* nothing is transmitted from these states */
		switch (tcb->state) {
			case Listen:
			case Closed:
			case Finwait2:
				return;
		}

		/* force an ack when a window has opened up */
		if (tcb->rcv.blocked && tcb->rcv.wnd > 0) {
			tcb->rcv.blocked = 0;
			tcb->flags |= FORCE;
		}

		/* sndcnt: total sequence space waiting (queued bytes + flgcnt);
		 * sent: the part of it already transmitted but not yet acked */
		sndcnt = qlen(s->wq) + tcb->flgcnt;
		sent = tcb->snd.ptr - tcb->snd.una;

		/* Don't send anything else until our SYN has been acked */
		if (tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
			break;

		/* Compute usable segment based on offered window and limit
		 * window probes to one
		 */
		if (tcb->snd.wnd == 0) {
			if (sent != 0) {
				/* a probe is already outstanding; only resend if forced */
				if ((tcb->flags & FORCE) == 0)
					break;
//              tcb->snd.ptr = tcb->snd.una;
			}
			usable = 1;
		} else {
			/* min(congestion window, peer's window) minus what is in flight */
			usable = tcb->cwind;
			if (tcb->snd.wnd < usable)
				usable = tcb->snd.wnd;
			usable -= sent;
		}
		/* ssize: sequence space this segment will consume */
		ssize = sndcnt - sent;
		if (ssize && usable < 2)
			netlog(s->p->f, Logtcp, "throttled snd.wnd %lu cwind %lu\n",
			           tcb->snd.wnd, tcb->cwind);
		if (usable < ssize)
			ssize = usable;
		if (ssize > tcb->mss) {
			if ((tcb->flags & TSO) == 0) {
				ssize = tcb->mss;
			} else {
				int segs, window;

				/*  Don't send too much.  32K is arbitrary..
				 */
				if (ssize > 32 * 1024)
					ssize = 32 * 1024;

				/* Clamp xmit to an integral MSS to
				 * avoid ragged tail segments causing
				 * poor link utilization.  Also
				 * account for each segment sent in
				 * msg heuristic, and round up to the
				 * next multiple of 4, to ensure we
				 * still yield.
				 *
				 * NOTE(review): after the rounding msgs is a
				 * multiple of 4, while the yield test at the
				 * bottom of the loop fires on (msgs % 4) == 1,
				 * so this iteration itself does not yield --
				 * confirm that is the intent.
				 */
				segs = ssize / tcb->mss;
				ssize = segs * tcb->mss;
				msgs += segs;
				if (segs > 3)
					msgs = (msgs + 4) & ~3;
			}
		}

		/* dsize: number of data bytes to pull from the write queue;
		 * adjusted below when part of ssize is a SYN or FIN */
		dsize = ssize;
		seg.urg = 0;

		/* nothing to send and no forced ack: done */
		if (ssize == 0)
			if ((tcb->flags & FORCE) == 0)
				break;

		tcb->flags &= ~FORCE;
		tcprcvwin(s);

		/* By default we will generate an ack */
		tcphalt(tpriv, &tcb->acktimer);
		tcb->rcv.una = 0;
		seg.source = s->lport;
		seg.dest = s->rport;
		seg.flags = ACK;
		seg.mss = 0;
		seg.ws = 0;
		switch (tcb->state) {
			case Syn_sent:
				/* initial SYN carries no ACK; the SYN takes one unit of
				 * ssize, so one less data byte is pulled */
				seg.flags = 0;
				if (tcb->snd.ptr == tcb->iss) {
					seg.flags |= SYN;
					dsize--;
					seg.mss = tcb->mss;
					seg.ws = tcb->scale;
				}
				break;
			case Syn_received:
				/*
				 *  don't send any data with a SYN/ACK packet
				 *  because Linux rejects the packet in its
				 *  attempt to solve the SYN attack problem
				 */
				if (tcb->snd.ptr == tcb->iss) {
					seg.flags |= SYN;
					dsize = 0;
					ssize = 1;
					seg.mss = tcb->mss;
					seg.ws = tcb->scale;
				}
				break;
		}
		seg.seq = tcb->snd.ptr;
		seg.ack = tcb->rcv.nxt;
		seg.wnd = tcb->rcv.wnd;

		/* Pull out data to send */
		bp = NULL;
		if (dsize != 0) {
			bp = qcopy(s->wq, dsize, sent);
			/* fewer bytes than asked for: the remaining unit of
			 * sequence space is sent as a FIN */
			if (BLEN(bp) != dsize) {
				seg.flags |= FIN;
				dsize--;
			}
			/* more than one mss in the block: mark it with Btso and
			 * record the mss in the block */
			if (BLEN(bp) > tcb->mss) {
				bp->flag |= Btso;
				bp->mss = tcb->mss;
			}
		}

		/* this segment covers everything currently queued: set PSH */
		if (sent + dsize == sndcnt)
			seg.flags |= PSH;

		/* keep track of balance of resent data */
		if (seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
			n = tcb->snd.nxt - tcb->snd.ptr;
			if (ssize < n)
				n = ssize;
			tcb->resent += n;
			netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr 0x%lx nxt 0x%lx\n",
			           s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr,
			           tcb->snd.nxt);
			tpriv->stats[RetransSegs]++;
		}

		tcb->snd.ptr += ssize;

		/* Pull up the send pointer so we can accept acks
		 * for this window
		 */
		if (seq_gt(tcb->snd.ptr, tcb->snd.nxt))
			tcb->snd.nxt = tcb->snd.ptr;

		/* Build header, link data and compute cksum.
		 * NOTE(review): when header construction fails we return with
		 * snd.ptr/snd.nxt already advanced for this segment -- confirm
		 * the retransmit path is expected to recover from that. */
		switch (version) {
			case V4:
				tcb->protohdr.tcp4hdr.vihl = IP_VER4;
				hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
				if (hbp == NULL) {
					freeblist(bp);
					return;
				}
				break;
			case V6:
				tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
				hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
				if (hbp == NULL) {
					freeblist(bp);
					return;
				}
				break;
			default:
				hbp = NULL;     /* to suppress a warning */
				panic("tcpoutput: version %d", version);
		}

		/* Start the transmission timers if there is new data and we
		 * expect acknowledges
		 */
		if (ssize != 0) {
			if (tcb->timer.state != TcptimerON)
				tcpgo(tpriv, &tcb->timer);

			/*  If round trip timer isn't running, start it.
			 *  measure the longest packet only in case the
			 *  transmission time dominates RTT
			 */
			if (tcb->rtt_timer.state != TcptimerON)
				if (ssize == tcb->mss) {
					tcpgo(tpriv, &tcb->rtt_timer);
					tcb->rttseq = tcb->snd.ptr;
				}
		}

		tpriv->stats[OutSegs]++;

		/* put off the next keep alive */
		tcpgo(tpriv, &tcb->katimer);

		switch (version) {
			case V4:
				if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
					/* a negative return means no route */
					localclose(s, "no route");
				}
				break;
			case V6:
				if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
					/* a negative return means no route */
					localclose(s, "no route");
				}
				break;
			default:
				panic("tcpoutput2: version %d", version);
		}
		/* periodically release the conversation lock and yield */
		if ((msgs % 4) == 1) {
			qunlock(&s->qlock);
			kthread_yield();
			qlock(&s->qlock);
		}
	}
}
2668
2669 /*
2670  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
2671  */
2672 void tcpsendka(struct conv *s)
2673 {
2674         Tcp seg;
2675         Tcpctl *tcb;
2676         struct block *hbp, *dbp;
2677
2678         tcb = (Tcpctl *) s->ptcl;
2679
2680         dbp = NULL;
2681         seg.urg = 0;
2682         seg.source = s->lport;
2683         seg.dest = s->rport;
2684         seg.flags = ACK | PSH;
2685         seg.mss = 0;
2686         seg.ws = 0;
2687         if (tcpporthogdefense)
2688                 urandom_read(&seg.seq, sizeof(seg.seq));
2689         else
2690                 seg.seq = tcb->snd.una - 1;
2691         seg.ack = tcb->rcv.nxt;
2692         tcb->rcv.una = 0;
2693         seg.wnd = tcb->rcv.wnd;
2694         if (tcb->state == Finwait2) {
2695                 seg.flags |= FIN;
2696         } else {
2697                 dbp = block_alloc(1, MEM_WAIT);
2698                 dbp->wp++;
2699         }
2700
2701         if (isv4(s->raddr)) {
2702                 /* Build header, link data and compute cksum */
2703                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2704                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2705                 if (hbp == NULL) {
2706                         freeblist(dbp);
2707                         return;
2708                 }
2709                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2710         } else {
2711                 /* Build header, link data and compute cksum */
2712                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2713                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2714                 if (hbp == NULL) {
2715                         freeblist(dbp);
2716                         return;
2717                 }
2718                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2719         }
2720 }
2721
2722 /*
2723  *  set connection to time out after 12 minutes
2724  */
2725 void tcpsetkacounter(Tcpctl * tcb)
2726 {
2727         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
2728         if (tcb->kacounter < 3)
2729                 tcb->kacounter = 3;
2730 }
2731
2732 /*
2733  *  if we've timed out, close the connection
2734  *  otherwise, send a keepalive and restart the timer
2735  */
2736 void tcpkeepalive(void *v)
2737 {
2738         ERRSTACK(1);
2739         Tcpctl *tcb;
2740         struct conv *s;
2741
2742         s = v;
2743         tcb = (Tcpctl *) s->ptcl;
2744         qlock(&s->qlock);
2745         if (waserror()) {
2746                 qunlock(&s->qlock);
2747                 nexterror();
2748         }
2749         if (tcb->state != Closed) {
2750                 if (--(tcb->kacounter) <= 0) {
2751                         localclose(s, "connection timed out");
2752                 } else {
2753                         tcpsendka(s);
2754                         tcpgo(s->p->priv, &tcb->katimer);
2755                 }
2756         }
2757         qunlock(&s->qlock);
2758         poperror();
2759 }
2760
2761 /*
2762  *  start keepalive timer
2763  */
2764 static void tcpstartka(struct conv *s, char **f, int n)
2765 {
2766         Tcpctl *tcb;
2767         int x;
2768
2769         tcb = (Tcpctl *) s->ptcl;
2770         if (tcb->state != Established)
2771                 error(ENOTCONN, "connection must be in Establised state");
2772         if (n > 1) {
2773                 x = atoi(f[1]);
2774                 if (x >= MSPTICK)
2775                         tcb->katimer.start = x / MSPTICK;
2776         }
2777         tcpsetkacounter(tcb);
2778         tcpgo(s->p->priv, &tcb->katimer);
2779 }
2780
2781 /*
2782  *  turn checksums on/off
2783  */
2784 static void tcpsetchecksum(struct conv *s, char **f, int unused)
2785 {
2786         Tcpctl *tcb;
2787
2788         tcb = (Tcpctl *) s->ptcl;
2789         tcb->nochecksum = !atoi(f[1]);
2790 }
2791
2792 void tcprxmit(struct conv *s)
2793 {
2794         Tcpctl *tcb;
2795
2796         tcb = (Tcpctl *) s->ptcl;
2797
2798         tcb->flags |= RETRAN | FORCE;
2799         tcb->snd.ptr = tcb->snd.una;
2800
2801         /*
2802          *  We should be halving the slow start threshhold (down to one
2803          *  mss) but leaving it at mss seems to work well enough
2804          */
2805         tcb->ssthresh = tcb->mss;
2806
2807         /*
2808          *  pull window down to a single packet
2809          */
2810         tcb->cwind = tcb->mss;
2811         tcpoutput(s);
2812 }
2813
2814 void tcptimeout(void *arg)
2815 {
2816         ERRSTACK(1);
2817         struct conv *s;
2818         Tcpctl *tcb;
2819         int maxback;
2820         struct tcppriv *tpriv;
2821
2822         s = (struct conv *)arg;
2823         tpriv = s->p->priv;
2824         tcb = (Tcpctl *) s->ptcl;
2825
2826         qlock(&s->qlock);
2827         if (waserror()) {
2828                 qunlock(&s->qlock);
2829                 nexterror();
2830         }
2831         switch (tcb->state) {
2832                 default:
2833                         tcb->backoff++;
2834                         if (tcb->state == Syn_sent)
2835                                 maxback = MAXBACKMS / 2;
2836                         else
2837                                 maxback = MAXBACKMS;
2838                         tcb->backedoff += tcb->timer.start * MSPTICK;
2839                         if (tcb->backedoff >= maxback) {
2840                                 localclose(s, "connection timed out");
2841                                 break;
2842                         }
2843                         netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lx %llu/%llu\n",
2844                                    tcb->snd.una, tcb->timer.start, NOW);
2845                         tcpsettimer(tcb);
2846                         tcprxmit(s);
2847                         tpriv->stats[RetransTimeouts]++;
2848                         tcb->snd.dupacks = 0;
2849                         break;
2850                 case Time_wait:
2851                         localclose(s, NULL);
2852                         break;
2853                 case Closed:
2854                         break;
2855         }
2856         qunlock(&s->qlock);
2857         poperror();
2858 }
2859
/*
 *  Return the result of seq_within() for seq against the receive window,
 *  i.e. the range from rcv.nxt to rcv.nxt + rcv.wnd - 1 inclusive.
 *  NOTE(review): seq is declared int although callers in this file pass
 *  sequence-number expressions; presumably the conversion is harmless
 *  under seq_within()'s arithmetic -- confirm.
 */
int inwindow(Tcpctl * tcb, int seq)
{
	return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1);
}
2864
2865 /*
2866  *  set up state for a received SYN (or SYN ACK) packet
2867  */
2868 void procsyn(struct conv *s, Tcp * seg)
2869 {
2870         Tcpctl *tcb;
2871
2872         tcb = (Tcpctl *) s->ptcl;
2873         tcb->flags |= FORCE;
2874
2875         tcb->rcv.nxt = seg->seq + 1;
2876         tcb->rcv.urg = tcb->rcv.nxt;
2877         tcb->irs = seg->seq;
2878
2879         /* our sending max segment size cannot be bigger than what he asked for */
2880         if (seg->mss != 0 && seg->mss < tcb->mss)
2881                 tcb->mss = seg->mss;
2882
2883         /* the congestion window always starts out as a single segment */
2884         tcb->snd.wnd = seg->wnd;
2885         tcb->cwind = tcb->mss;
2886 }
2887
2888 int
2889 addreseq(Tcpctl * tcb, struct tcppriv *tpriv, Tcp * seg,
2890                  struct block *bp, uint16_t length)
2891 {
2892         Reseq *rp, *rp1;
2893         int i, rqlen, qmax;
2894
2895         rp = kzmalloc(sizeof(Reseq), 0);
2896         if (rp == NULL) {
2897                 freeblist(bp);  /* bp always consumed by add_reseq */
2898                 return 0;
2899         }
2900
2901         rp->seg = *seg;
2902         rp->bp = bp;
2903         rp->length = length;
2904
2905         /* Place on reassembly list sorting by starting seq number */
2906         rp1 = tcb->reseq;
2907         if (rp1 == NULL || seq_lt(seg->seq, rp1->seg.seq)) {
2908                 rp->next = rp1;
2909                 tcb->reseq = rp;
2910                 if (rp->next != NULL)
2911                         tpriv->stats[OutOfOrder]++;
2912                 return 0;
2913         }
2914
2915         rqlen = 0;
2916         for (i = 0;; i++) {
2917                 rqlen += rp1->length;
2918                 if (rp1->next == NULL || seq_lt(seg->seq, rp1->next->seg.seq)) {
2919                         rp->next = rp1->next;
2920                         rp1->next = rp;
2921                         if (rp->next != NULL)
2922                                 tpriv->stats[OutOfOrder]++;
2923                         break;
2924                 }
2925                 rp1 = rp1->next;
2926         }
2927         qmax = QMAX << tcb->rcv.scale;
2928         if (rqlen > qmax) {
2929                 printd("resequence queue > window: %d > %d\n", rqlen, qmax);
2930                 i = 0;
2931                 for (rp1 = tcb->reseq; rp1 != NULL; rp1 = rp1->next) {
2932                         printd("0x%#lx 0x%#lx 0x%#x\n", rp1->seg.seq,
2933                                    rp1->seg.ack, rp1->seg.flags);
2934                         if (i++ > 10) {
2935                                 printd("...\n");
2936                                 break;
2937                         }
2938                 }
2939
2940                 // delete entire reassembly queue; wait for retransmit.
2941                 // - should we be smarter and only delete the tail?
2942                 for (rp = tcb->reseq; rp != NULL; rp = rp1) {
2943                         rp1 = rp->next;
2944                         freeblist(rp->bp);
2945                         kfree(rp);
2946                 }
2947                 tcb->reseq = NULL;
2948
2949                 return -1;
2950         }
2951         return 0;
2952 }
2953
2954 void getreseq(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2955 {
2956         Reseq *rp;
2957
2958         rp = tcb->reseq;
2959         if (rp == NULL)
2960                 return;
2961
2962         tcb->reseq = rp->next;
2963
2964         *seg = rp->seg;
2965         *bp = rp->bp;
2966         *length = rp->length;
2967
2968         kfree(rp);
2969 }
2970
2971 int tcptrim(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2972 {
2973         uint16_t len;
2974         uint8_t accept;
2975         int dupcnt, excess;
2976
2977         accept = 0;
2978         len = *length;
2979         if (seg->flags & SYN)
2980                 len++;
2981         if (seg->flags & FIN)
2982                 len++;
2983
2984         if (tcb->rcv.wnd == 0) {
2985                 if (len == 0 && seg->seq == tcb->rcv.nxt)
2986                         return 0;
2987         } else {
2988                 /* Some part of the segment should be in the window */
2989                 if (inwindow(tcb, seg->seq))
2990                         accept++;
2991                 else if (len != 0) {
2992                         if (inwindow(tcb, seg->seq + len - 1) ||
2993                                 seq_within(tcb->rcv.nxt, seg->seq, seg->seq + len - 1))
2994                                 accept++;
2995                 }
2996         }
2997         if (!accept) {
2998                 freeblist(*bp);
2999                 return -1;
3000         }
3001         dupcnt = tcb->rcv.nxt - seg->seq;
3002         if (dupcnt > 0) {
3003                 tcb->rerecv += dupcnt;
3004                 if (seg->flags & SYN) {
3005                         seg->flags &= ~SYN;
3006                         seg->seq++;
3007
3008                         if (seg->urg > 1)
3009                                 seg->urg--;
3010                         else
3011                                 seg->flags &= ~URG;
3012                         dupcnt--;
3013                 }
3014                 if (dupcnt > 0) {
3015                         pullblock(bp, (uint16_t) dupcnt);
3016                         seg->seq += dupcnt;
3017                         *length -= dupcnt;
3018
3019                         if (seg->urg > dupcnt)
3020                                 seg->urg -= dupcnt;
3021                         else {
3022                                 seg->flags &= ~URG;
3023                                 seg->urg = 0;
3024                         }
3025                 }
3026         }
3027         excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3028         if (excess > 0) {
3029                 tcb->rerecv += excess;
3030                 *length -= excess;
3031                 *bp = trimblock(*bp, 0, *length);
3032                 if (*bp == NULL)
3033                         panic("presotto is a boofhead");
3034                 seg->flags &= ~FIN;
3035         }
3036         return 0;
3037 }
3038
3039 void tcpadvise(struct Proto *tcp, struct block *bp, char *msg)
3040 {
3041         Tcp4hdr *h4;
3042         Tcp6hdr *h6;
3043         Tcpctl *tcb;
3044         uint8_t source[IPaddrlen];
3045         uint8_t dest[IPaddrlen];
3046         uint16_t psource, pdest;
3047         struct conv *s, **p;
3048
3049         h4 = (Tcp4hdr *) (bp->rp);
3050         h6 = (Tcp6hdr *) (bp->rp);
3051
3052         if ((h4->vihl & 0xF0) == IP_VER4) {
3053                 v4tov6(dest, h4->tcpdst);
3054                 v4tov6(source, h4->tcpsrc);
3055                 psource = nhgets(h4->tcpsport);
3056                 pdest = nhgets(h4->tcpdport);
3057         } else {
3058                 ipmove(dest, h6->tcpdst);
3059                 ipmove(source, h6->tcpsrc);
3060                 psource = nhgets(h6->tcpsport);
3061                 pdest = nhgets(h6->tcpdport);
3062         }
3063
3064         /* Look for a connection */
3065         qlock(&tcp->qlock);
3066         for (p = tcp->conv; *p; p++) {
3067                 s = *p;
3068                 tcb = (Tcpctl *) s->ptcl;
3069                 if (s->rport == pdest)
3070                         if (s->lport == psource)
3071                                 if (tcb->state != Closed)
3072                                         if (ipcmp(s->raddr, dest) == 0)
3073                                                 if (ipcmp(s->laddr, source) == 0) {
3074                                                         qlock(&s->qlock);
3075                                                         qunlock(&tcp->qlock);
3076                                                         switch (tcb->state) {
3077                                                                 case Syn_sent:
3078                                                                         localclose(s, msg);
3079                                                                         break;
3080                                                         }
3081                                                         qunlock(&s->qlock);
3082                                                         freeblist(bp);
3083                                                         return;
3084                                                 }
3085         }
3086         qunlock(&tcp->qlock);
3087         freeblist(bp);
3088 }
3089
3090 static void tcpporthogdefensectl(char *val)
3091 {
3092         if (strcmp(val, "on") == 0)
3093                 tcpporthogdefense = 1;
3094         else if (strcmp(val, "off") == 0)
3095                 tcpporthogdefense = 0;
3096         else
3097                 error(EINVAL, "unknown value for tcpporthogdefense");
3098 }
3099
3100 /* called with c qlocked */
3101 static void tcpctl(struct conv *c, char **f, int n)
3102 {
3103         if (n == 1 && strcmp(f[0], "hangup") == 0)
3104                 tcphangup(c);
3105         else if (n >= 1 && strcmp(f[0], "keepalive") == 0)
3106                 tcpstartka(c, f, n);
3107         else if (n >= 1 && strcmp(f[0], "checksum") == 0)
3108                 tcpsetchecksum(c, f, n);
3109         else if (n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3110                 tcpporthogdefensectl(f[1]);
3111         else
3112                 error(EINVAL, "unknown command to %s", __func__);
3113 }
3114
3115 int tcpstats(struct Proto *tcp, char *buf, int len)
3116 {
3117         struct tcppriv *priv;
3118         char *p, *e;
3119         int i;
3120
3121         priv = tcp->priv;
3122         p = buf;
3123         e = p + len;
3124         for (i = 0; i < Nstats; i++)
3125                 p = seprintf(p, e, "%s: %u\n", statnames[i], priv->stats[i]);
3126         return p - buf;
3127 }
3128
3129 /*
3130  *  garbage collect any stale conversations:
3131  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3132  *      - Finwait2 after 5 minutes
3133  *
3134  *  this is called whenever we run out of channels.  Both checks are
3135  *  of questionable validity so we try to use them only when we're
3136  *  up against the wall.
3137  */
3138 int tcpgc(struct Proto *tcp)
3139 {
3140         struct conv *c, **pp, **ep;
3141         int n;
3142         Tcpctl *tcb;
3143
3144         n = 0;
3145         ep = &tcp->conv[tcp->nc];
3146         for (pp = tcp->conv; pp < ep; pp++) {
3147                 c = *pp;
3148                 if (c == NULL)
3149                         break;
3150                 if (!canqlock(&c->qlock))
3151                         continue;
3152                 tcb = (Tcpctl *) c->ptcl;
3153                 switch (tcb->state) {
3154                         case Syn_received:
3155                                 if (NOW - tcb->time > 5000) {
3156                                         localclose(c, "timed out");
3157                                         n++;
3158                                 }
3159                                 break;
3160                         case Finwait2:
3161                                 if (NOW - tcb->time > 5 * 60 * 1000) {
3162                                         localclose(c, "timed out");
3163                                         n++;
3164                                 }
3165                                 break;
3166                 }
3167                 qunlock(&c->qlock);
3168         }
3169         return n;
3170 }
3171
3172 void tcpsettimer(Tcpctl * tcb)
3173 {
3174         int x;
3175
3176         /* round trip dependency */