Rename KMALLOC_* -> MEM_* [2/2]
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 #include <vfs.h>
44 #include <kfs.h>
45 #include <slab.h>
46 #include <kmalloc.h>
47 #include <kref.h>
48 #include <string.h>
49 #include <stdio.h>
50 #include <assert.h>
51 #include <error.h>
52 #include <cpio.h>
53 #include <pmap.h>
54 #include <smp.h>
55 #include <ip.h>
56
/* Queue sizing and the IP protocol number for TCP */
enum {
	QMAX = 64 * 1024 - 1,
	IP_TCPPROTO = 6,
};

/* Byte counts describing the v4 and v6 prototype headers */
enum {
	TCP4_IPLEN = 8,
	TCP4_PHDRSIZE = 12,
	TCP4_HDRSIZE = 20,
	TCP4_TCBPHDRSZ = 40,
	TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE,

	TCP6_IPLEN = 0,
	TCP6_PHDRSIZE = 40,
	TCP6_HDRSIZE = 20,
	TCP6_TCBPHDRSZ = 60,
	TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE,
};

/* Timer states and time limits */
enum {
	TcptimerOFF = 0,
	TcptimerON = 1,
	TcptimerDONE = 2,
	MAX_TIME = (1 << 20),		/* Forever */
	TCP_ACK = 50,			/* Timed ack sequence in ms */
	MAXBACKMS = 9 * 60 * 1000,	/* longest backoff time (ms) before hangup */
};

/* TCP header flag bits */
enum {
	URG = 0x20,	/* Data marked urgent */
	ACK = 0x10,	/* Acknowledge is valid */
	PSH = 0x08,	/* Whole data pipe is pushed */
	RST = 0x04,	/* Reset connection */
	SYN = 0x02,	/* Pkt. is synchronise */
	FIN = 0x01,	/* Start close down */
};

/* TCP option codes/lengths and protocol defaults */
enum {
	EOLOPT = 0,
	NOOPOPT = 1,
	MSSOPT = 2,
	MSS_LENGTH = 4,		/* length of the maximum segment size option */
	WSOPT = 3,
	WS_LENGTH = 3,		/* length of the window scale option */
	MSL2 = 10,
	MSPTICK = 50,		/* Milliseconds per timer tick */
	DEF_MSS = 1460,		/* Default maximum segment size */
	DEF_MSS6 = 1280,	/* Default maximum segment size (min) for v6 */
	DEF_RTT = 500,		/* Default round trip */
	DEF_KAT = 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN = 0,		/* Listen connection */
	TCP_CONNECT = 1,	/* Outgoing connection */
	SYNACK_RXTIMER = 250,	/* ms between SYNACK retransmits */

	TCPREXMTTHRESH = 3,	/* dupack threshold for rxt */
};

/* Bits kept in Tcpctl.flags */
enum {
	FORCE = 1,
	CLONE = 2,
	RETRAN = 4,
	ACTIVE = 8,
	SYNACK = 16,
	TSO = 32,
};

/* Shift amounts used when scaling the srtt/mdev estimates */
enum {
	LOGAGAIN = 3,
	LOGDGAIN = 2,
};

/* Connection states; tcpstates[] must list names in this order */
enum {
	Closed = 0,
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,
};

/* Limbo bookkeeping and the "window scale present" marker */
enum {
	Maxlimbo = 1000,	/* maximum procs waiting for response to SYN ACK */
	NLHT = 256,		/* hash table size, must be a power of 2 */
	LHTMASK = NLHT - 1,

	HaveWS = 1 << 8,
};
133
134 /* Must correspond to the enumeration above */
135 char *tcpstates[] = {
136         "Closed", "Listen", "Syn_sent", "Syn_received",
137         "Established", "Finwait1", "Finwait2", "Close_wait",
138         "Closing", "Last_ack", "Time_wait"
139 };
140
/*
 *  One countdown timer.  Running timers are chained through next/prev on
 *  the protocol's timer list; tcpackproc decrements count once per pass and
 *  calls func(arg) for timers that reach zero.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer {
	Tcptimer *next, *prev;	/* links on the list of running timers */
	Tcptimer *readynext;	/* link on tcpackproc's list of expired timers */
	int state;		/* TcptimerOFF, TcptimerON or TcptimerDONE */
	uint64_t start;		/* value count is reloaded with by tcpgo */
	uint64_t count;		/* passes remaining until expiry */
	void (*func)(void *);	/* expiry handler */
	void *arg;		/* argument handed to func */
};
152
/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr {
	/* first TCP4_IPLEN (8) bytes */
	uint8_t vihl;				/* Version and header length */
	uint8_t tos;				/* Type of service */
	uint8_t length[2];			/* packet length */
	uint8_t id[2];				/* Identification */
	uint8_t frag[2];			/* Fragment information */

	/* next TCP4_PHDRSIZE (12) bytes */
	uint8_t Unused, proto;
	uint8_t tcplen[2];
	uint8_t tcpsrc[4], tcpdst[4];

	/* TCP header proper, TCP4_HDRSIZE (20) bytes */
	uint8_t tcpsport[2], tcpdport[2];
	uint8_t tcpseq[4], tcpack[4];
	uint8_t tcpflag[2];
	uint8_t tcpwin[2];
	uint8_t tcpcksum[2];
	uint8_t tcpurg[2];

	/* Options segment */
	uint8_t tcpopt[1];
};
180
181 typedef struct Tcp6hdr Tcp6hdr;
182 struct Tcp6hdr {
183         uint8_t vcf[4];
184         uint8_t ploadlen[2];
185         uint8_t proto;
186         uint8_t ttl;
187         uint8_t tcpsrc[IPaddrlen];
188         uint8_t tcpdst[IPaddrlen];
189         uint8_t tcpsport[2];
190         uint8_t tcpdport[2];
191         uint8_t tcpseq[4];
192         uint8_t tcpack[4];
193         uint8_t tcpflag[2];
194         uint8_t tcpwin[2];
195         uint8_t tcpcksum[2];
196         uint8_t tcpurg[2];
197         /* Options segment */
198         uint8_t tcpopt[1];
199 };
200
201 /*
202  *  this represents the control info
203  *  for a single packet.  It is derived from
204  *  a packet in ntohtcp{4,6}() and stuck into
205  *  a packet in htontcp{4,6}().
206  */
typedef struct Tcp Tcp;
struct Tcp {
	uint16_t source, dest;	/* port numbers */
	uint32_t seq, ack;	/* sequence and acknowledgement numbers */
	uint8_t flags;		/* URG/ACK/PSH/RST/SYN/FIN bits */
	uint16_t ws;		/* window scale option (if not zero) */
	uint32_t wnd;		/* window */
	uint16_t urg;		/* urgent pointer */
	uint16_t mss;		/* max segment size option (if not zero) */
	uint16_t len;		/* size of data */
};
220
221 /*
222  *  this header is malloc'd to thread together fragments
223  *  waiting to be coalesced
224  */
225 typedef struct Reseq Reseq;
226 struct Reseq {
227         Reseq *next;
228         Tcp seg;
229         struct block *bp;
230         uint16_t length;
231 };
232
233 /*
234  *  the qlock in the Conv locks this structure
235  */
236 typedef struct Tcpctl Tcpctl;
237 struct Tcpctl {
238         uint8_t state;                          /* Connection state */
239         uint8_t type;                           /* Listening or active connection */
240         uint8_t code;                           /* Icmp code */
241         struct {
242                 uint32_t una;                   /* Unacked data pointer */
243                 uint32_t nxt;                   /* Next sequence expected */
244                 uint32_t ptr;                   /* Data pointer */
245                 uint32_t wnd;                   /* Tcp send window */
246                 uint32_t urg;                   /* Urgent data pointer */
247                 uint32_t wl2;
248                 int scale;                              /* how much to right shift window in xmitted packets */
249                 /* to implement tahoe and reno TCP */
250                 uint32_t dupacks;               /* number of duplicate acks rcvd */
251                 int recovery;                   /* loss recovery flag */
252                 uint32_t rxt;                   /* right window marker for recovery */
253         } snd;
254         struct {
255                 uint32_t nxt;                   /* Receive pointer to next uint8_t slot */
256                 uint32_t wnd;                   /* Receive window incoming */
257                 uint32_t urg;                   /* Urgent pointer */
258                 int blocked;
259                 int una;                                /* unacked data segs */
260                 int scale;                              /* how much to left shift window in rcved packets */
261         } rcv;
262         uint32_t iss;                           /* Initial sequence number */
263         int sawwsopt;                           /* true if we saw a wsopt on the incoming SYN */
264         uint32_t cwind;                         /* Congestion window */
265         int scale;                                      /* desired snd.scale */
266         uint16_t ssthresh;                      /* Slow start threshold */
267         int resent;                                     /* Bytes just resent */
268         int irs;                                        /* Initial received squence */
269         uint16_t mss;                           /* Mean segment size */
270         int rerecv;                                     /* Overlap of data rerecevived */
271         uint32_t window;                        /* Recevive window */
272         uint8_t backoff;                        /* Exponential backoff counter */
273         int backedoff;                          /* ms we've backed off for rexmits */
274         uint8_t flags;                          /* State flags */
275         Reseq *reseq;                           /* Resequencing queue */
276         Tcptimer timer;                         /* Activity timer */
277         Tcptimer acktimer;                      /* Acknowledge timer */
278         Tcptimer rtt_timer;                     /* Round trip timer */
279         Tcptimer katimer;                       /* keep alive timer */
280         uint32_t rttseq;                        /* Round trip sequence */
281         int srtt;                                       /* Shortened round trip */
282         int mdev;                                       /* Mean deviation of round trip */
283         int kacounter;                          /* count down for keep alive */
284         uint64_t sndsyntime;            /* time syn sent */
285         uint64_t time;                          /* time Finwait2 or Syn_received was sent */
286         int nochecksum;                         /* non-zero means don't send checksums */
287         int flgcnt;                                     /* number of flags in the sequence (FIN,SEQ) */
288
289         union {
290                 Tcp4hdr tcp4hdr;
291                 Tcp6hdr tcp6hdr;
292         } protohdr;                                     /* prototype header */
293 };
294
/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after
 *  1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking with SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 */
308 typedef struct Limbo Limbo;
309 struct Limbo {
310         Limbo *next;
311
312         uint8_t laddr[IPaddrlen];
313         uint8_t raddr[IPaddrlen];
314         uint16_t lport;
315         uint16_t rport;
316         uint32_t irs;                           /* initial received sequence */
317         uint32_t iss;                           /* initial sent sequence */
318         uint16_t mss;                           /* mss from the other end */
319         uint16_t rcvscale;                      /* how much to scale rcvd windows */
320         uint16_t sndscale;                      /* how much to scale sent windows */
321         uint64_t lastsend;                      /* last time we sent a synack */
322         uint8_t version;                        /* v4 or v6 */
323         uint8_t rexmits;                        /* number of retransmissions */
324 };
325
/* Tunable defaults.  inittcpctl() seeds srtt and the retransmit timer from
 * tcp_irtt (milliseconds, converted to ticks via MSPTICK). */
int tcp_irtt = DEF_RTT;                 /* Initial guess at round trip time */
uint16_t tcp_mss = DEF_MSS;             /* Maximum segment size to be sent */
328
/* Indices into tcppriv.stats[] and statnames[] */
enum {
	/* MIB stats */
	MaxConn = 0,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	/* number of counters; keep last */
	Nstats
};
351
352 static char *statnames[] = {
353         [MaxConn] "MaxConn",
354         [ActiveOpens] "ActiveOpens",
355         [PassiveOpens] "PassiveOpens",
356         [EstabResets] "EstabResets",
357         [CurrEstab] "CurrEstab",
358         [InSegs] "InSegs",
359         [OutSegs] "OutSegs",
360         [RetransSegs] "RetransSegs",
361         [RetransTimeouts] "RetransTimeouts",
362         [InErrs] "InErrs",
363         [OutRsts] "OutRsts",
364         [CsumErrs] "CsumErrs",
365         [HlenErrs] "HlenErrs",
366         [LenErrs] "LenErrs",
367         [OutOfOrder] "OutOfOrder",
368 };
369
370 typedef struct Tcppriv Tcppriv;
371 struct tcppriv {
372         /* List of active timers */
373         qlock_t tl;
374         Tcptimer *timers;
375
376         /* hash table for matching conversations */
377         struct Ipht ht;
378
379         /* calls in limbo waiting for an ACK to our SYN ACK */
380         int nlimbo;
381         Limbo *lht[NLHT];
382
383         /* for keeping track of tcpackproc */
384         qlock_t apl;
385         int ackprocstarted;
386
387         uint32_t stats[Nstats];
388 };
389
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
400
401 int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, uint16_t);
402 void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
403 void localclose(struct conv *, char *unused_char_p_t);
404 void procsyn(struct conv *, Tcp *);
405 void tcpiput(struct Proto *, struct Ipifc *, struct block *);
406 void tcpoutput(struct conv *);
407 int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
408 void tcpstart(struct conv *, int);
409 void tcptimeout(void *);
410 void tcpsndsyn(struct conv *, Tcpctl *);
411 void tcprcvwin(struct conv *);
412 void tcpacktimer(void *);
413 void tcpkeepalive(void *);
414 void tcpsetkacounter(Tcpctl *);
415 void tcprxmit(struct conv *);
416 void tcpsettimer(Tcpctl *);
417 void tcpsynackrtt(struct conv *);
418 void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
419
420 static void limborexmit(struct Proto *);
421 static void limbo(struct conv *, uint8_t * unused_uint8_p_t, uint8_t *, Tcp *,
422                                   int);
423
424 void tcpsetstate(struct conv *s, uint8_t newstate)
425 {
426         Tcpctl *tcb;
427         uint8_t oldstate;
428         struct tcppriv *tpriv;
429
430         tpriv = s->p->priv;
431
432         tcb = (Tcpctl *) s->ptcl;
433
434         oldstate = tcb->state;
435         if (oldstate == newstate)
436                 return;
437
438         if (oldstate == Established)
439                 tpriv->stats[CurrEstab]--;
440         if (newstate == Established)
441                 tpriv->stats[CurrEstab]++;
442
443         /**
444         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
445                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
446         **/
447
448         switch (newstate) {
449                 case Closed:
450                         qclose(s->rq);
451                         qclose(s->wq);
452                         qclose(s->eq);
453                         break;
454
455                 case Close_wait:        /* Remote closes */
456                         qhangup(s->rq, NULL);
457                         break;
458         }
459
460         tcb->state = newstate;
461
462         if (oldstate == Syn_sent && newstate != Closed)
463                 Fsconnected(s, NULL);
464 }
465
/* Connect handler: let Fsstdconnect() process the arguments, then start the
 * conversation in TCP_CONNECT mode (tcpstart sends the SYN). */
static void tcpconnect(struct conv *c, char **argv, int argc)
{
        Fsstdconnect(c, argv, argc);
        tcpstart(c, TCP_CONNECT);
}
471
472 static int tcpstate(struct conv *c, char *state, int n)
473 {
474         Tcpctl *s;
475
476         s = (Tcpctl *) (c->ptcl);
477
478         return snprintf(state, n,
479                                         "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
480                                         tcpstates[s->state],
481                                         c->rq ? qlen(c->rq) : 0,
482                                         c->wq ? qlen(c->wq) : 0,
483                                         s->srtt, s->mdev,
484                                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
485                                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
486                                         s->katimer.start, s->katimer.count);
487 }
488
489 static int tcpinuse(struct conv *c)
490 {
491         Tcpctl *s;
492
493         s = (Tcpctl *) (c->ptcl);
494         return s->state != Closed;
495 }
496
/* Announce handler: let Fsstdannounce() process the arguments, start the
 * conversation in TCP_LISTEN mode, then signal completion with no error. */
static void tcpannounce(struct conv *c, char **argv, int argc)
{
        Fsstdannounce(c, argv, argc);
        tcpstart(c, TCP_LISTEN);
        Fsconnected(c, NULL);
}
503
504 static void tcpshutdown(struct conv *c, int how)
505 {
506         Tcpctl *tcb = (Tcpctl*)c->ptcl;
507
508         /* Do nothing for the read side */
509         if (how == SHUT_RD)
510                 return;
511         /* Sends a FIN.  If we're in another state (like Listen), we'll run into
512          * issues, since we'll never send the FIN.  We'll be shutdown on our end,
513          * but we'll never tell the distant end.  Might just be an app issue. */
514         switch (tcb->state) {
515         case Syn_received:
516         case Established:
517                 tcb->flgcnt++;
518                 tcb->snd.nxt++;
519                 tcpsetstate(c, Finwait1);
520                 tcpoutput(c);
521                 break;
522         }
523 }
524
525 /*
526  *  tcpclose is always called with the q locked
527  */
528 static void tcpclose(struct conv *c)
529 {
530         Tcpctl *tcb;
531
532         tcb = (Tcpctl *) c->ptcl;
533
534         qhangup(c->rq, NULL);
535         qhangup(c->wq, NULL);
536         qhangup(c->eq, NULL);
537         qflush(c->rq);
538
539         switch (tcb->state) {
540                 case Listen:
541                         /*
542                          *  reset any incoming calls to this listener
543                          */
544                         Fsconnected(c, "Hangup");
545
546                         localclose(c, NULL);
547                         break;
548                 case Closed:
549                 case Syn_sent:
550                         localclose(c, NULL);
551                         break;
552                 case Syn_received:
553                 case Established:
554                         tcb->flgcnt++;
555                         tcb->snd.nxt++;
556                         tcpsetstate(c, Finwait1);
557                         tcpoutput(c);
558                         break;
559                 case Close_wait:
560                         tcb->flgcnt++;
561                         tcb->snd.nxt++;
562                         tcpsetstate(c, Last_ack);
563                         tcpoutput(c);
564                         break;
565         }
566 }
567
568 void tcpkick(void *x)
569 {
570         ERRSTACK(1);
571         struct conv *s = x;
572         Tcpctl *tcb;
573
574         tcb = (Tcpctl *) s->ptcl;
575
576         qlock(&s->qlock);
577         if (waserror()) {
578                 qunlock(&s->qlock);
579                 nexterror();
580         }
581
582         switch (tcb->state) {
583                 case Syn_sent:
584                 case Syn_received:
585                 case Established:
586                 case Close_wait:
587                         /*
588                          * Push data
589                          */
590                         tcprcvwin(s);
591                         tcpoutput(s);
592                         break;
593                 default:
594                         localclose(s, "Hangup");
595                         break;
596         }
597
598         qunlock(&s->qlock);
599         poperror();
600 }
601
602 void tcprcvwin(struct conv *s)
603 {       /* Call with tcb locked */
604         int w;
605         Tcpctl *tcb;
606
607         tcb = (Tcpctl *) s->ptcl;
608         w = tcb->window - qlen(s->rq);
609         if (w < 0)
610                 w = 0;
611         tcb->rcv.wnd = w;
612         if (w == 0)
613                 tcb->rcv.blocked = 1;
614 }
615
616 void tcpacktimer(void *v)
617 {
618         ERRSTACK(1);
619         Tcpctl *tcb;
620         struct conv *s;
621
622         s = v;
623         tcb = (Tcpctl *) s->ptcl;
624
625         qlock(&s->qlock);
626         if (waserror()) {
627                 qunlock(&s->qlock);
628                 nexterror();
629         }
630         if (tcb->state != Closed) {
631                 tcb->flags |= FORCE;
632                 tcprcvwin(s);
633                 tcpoutput(s);
634         }
635         qunlock(&s->qlock);
636         poperror();
637 }
638
/* Create the conversation's queues: a read queue limited to QMAX with
 * tcpacktimer as its callback, and a write queue limited to 8 * QMAX with
 * tcpkick as its callback.  Both callbacks receive c. */
static void tcpcreate(struct conv *c)
{
        c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
        c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
}
644
645 static void timerstate(struct tcppriv *priv, Tcptimer * t, int newstate)
646 {
647         if (newstate != TcptimerON) {
648                 if (t->state == TcptimerON) {
649                         // unchain
650                         if (priv->timers == t) {
651                                 priv->timers = t->next;
652                                 if (t->prev != NULL)
653                                         panic("timerstate1");
654                         }
655                         if (t->next)
656                                 t->next->prev = t->prev;
657                         if (t->prev)
658                                 t->prev->next = t->next;
659                         t->next = t->prev = NULL;
660                 }
661         } else {
662                 if (t->state != TcptimerON) {
663                         // chain
664                         if (t->prev != NULL || t->next != NULL)
665                                 panic("timerstate2");
666                         t->prev = NULL;
667                         t->next = priv->timers;
668                         if (t->next)
669                                 t->next->prev = t;
670                         priv->timers = t;
671                 }
672         }
673         t->state = newstate;
674 }
675
676 void tcpackproc(void *a)
677 {
678         ERRSTACK(1);
679         Tcptimer *t, *tp, *timeo;
680         struct Proto *tcp;
681         struct tcppriv *priv;
682         int loop;
683
684         tcp = a;
685         priv = tcp->priv;
686
687         for (;;) {
688                 kthread_usleep(MSPTICK * 1000);
689
690                 qlock(&priv->tl);
691                 timeo = NULL;
692                 loop = 0;
693                 for (t = priv->timers; t != NULL; t = tp) {
694                         if (loop++ > 10000)
695                                 panic("tcpackproc1");
696                         tp = t->next;
697                         if (t->state == TcptimerON) {
698                                 t->count--;
699                                 if (t->count == 0) {
700                                         timerstate(priv, t, TcptimerDONE);
701                                         t->readynext = timeo;
702                                         timeo = t;
703                                 }
704                         }
705                 }
706                 qunlock(&priv->tl);
707
708                 loop = 0;
709                 for (t = timeo; t != NULL; t = t->readynext) {
710                         if (loop++ > 10000)
711                                 panic("tcpackproc2");
712                         if (t->state == TcptimerDONE && t->func != NULL) {
713                                 /* discard error style */
714                                 if (!waserror())
715                                         (*t->func) (t->arg);
716                                 poperror();
717                         }
718                 }
719
720                 limborexmit(tcp);
721         }
722 }
723
724 void tcpgo(struct tcppriv *priv, Tcptimer * t)
725 {
726         if (t == NULL || t->start == 0)
727                 return;
728
729         qlock(&priv->tl);
730         t->count = t->start;
731         timerstate(priv, t, TcptimerON);
732         qunlock(&priv->tl);
733 }
734
735 void tcphalt(struct tcppriv *priv, Tcptimer * t)
736 {
737         if (t == NULL)
738                 return;
739
740         qlock(&priv->tl);
741         timerstate(priv, t, TcptimerOFF);
742         qunlock(&priv->tl);
743 }
744
/*
 *  Exponential backoff multiplier: 2^n.
 *
 *  Shifting by a negative count, or far enough to reach the sign bit, is
 *  undefined behavior in C, so n is clamped: negative values yield 1 and
 *  values above the largest safe shift yield the largest positive power of
 *  two an int can hold.  Results for in-range n are unchanged.
 */
int backoff(int n)
{
	const int max_shift = (int)(sizeof(int) * 8) - 2;

	if (n < 0)
		n = 0;
	else if (n > max_shift)
		n = max_shift;
	return 1 << n;
}
749
750 void localclose(struct conv *s, char *reason)
751 {       /* called with tcb locked */
752         Tcpctl *tcb;
753         Reseq *rp, *rp1;
754         struct tcppriv *tpriv;
755
756         tpriv = s->p->priv;
757         tcb = (Tcpctl *) s->ptcl;
758
759         iphtrem(&tpriv->ht, s);
760
761         tcphalt(tpriv, &tcb->timer);
762         tcphalt(tpriv, &tcb->rtt_timer);
763         tcphalt(tpriv, &tcb->acktimer);
764         tcphalt(tpriv, &tcb->katimer);
765
766         /* Flush reassembly queue; nothing more can arrive */
767         for (rp = tcb->reseq; rp != NULL; rp = rp1) {
768                 rp1 = rp->next;
769                 freeblist(rp->bp);
770                 kfree(rp);
771         }
772         tcb->reseq = NULL;
773
774         if (tcb->state == Syn_sent)
775                 Fsconnected(s, reason);
776
777         qhangup(s->rq, reason);
778         qhangup(s->wq, reason);
779
780         tcpsetstate(s, Closed);
781
782         /* listener will check the rq state */
783         if (s->state == Announced)
784                 rendez_wakeup(&s->listenr);
785 }
786
787 /* mtu (- TCP + IP hdr len) of 1st hop */
788 int tcpmtu(struct Proto *tcp, uint8_t * addr, int version, int *scale,
789            uint8_t *flags)
790 {
791         struct Ipifc *ifc;
792         int mtu;
793
794         ifc = findipifc(tcp->f, addr, 0);
795         switch (version) {
796                 default:
797                 case V4:
798                         mtu = DEF_MSS;
799                         if (ifc != NULL)
800                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
801                         break;
802                 case V6:
803                         mtu = DEF_MSS6;
804                         if (ifc != NULL)
805                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
806                         break;
807         }
808         *flags &= ~TSO;
809
810         if (ifc != NULL) {
811                 if (ifc->mbps > 100)
812                         *scale = HaveWS | 3;
813                 else if (ifc->mbps > 10)
814                         *scale = HaveWS | 1;
815                 else
816                         *scale = HaveWS | 0;
817                 if (ifc->feat & NETF_TSO)
818                         *flags |= TSO;
819         } else
820                 *scale = HaveWS | 0;
821
822         return mtu;
823 }
824
825 void inittcpctl(struct conv *s, int mode)
826 {
827         Tcpctl *tcb;
828         Tcp4hdr *h4;
829         Tcp6hdr *h6;
830         int mss;
831
832         tcb = (Tcpctl *) s->ptcl;
833
834         memset(tcb, 0, sizeof(Tcpctl));
835
836         tcb->ssthresh = 65535;
837         tcb->srtt = tcp_irtt << LOGAGAIN;
838         tcb->mdev = 0;
839
840         /* setup timers */
841         tcb->timer.start = tcp_irtt / MSPTICK;
842         tcb->timer.func = tcptimeout;
843         tcb->timer.arg = s;
844         tcb->rtt_timer.start = MAX_TIME;
845         tcb->acktimer.start = TCP_ACK / MSPTICK;
846         tcb->acktimer.func = tcpacktimer;
847         tcb->acktimer.arg = s;
848         tcb->katimer.start = DEF_KAT / MSPTICK;
849         tcb->katimer.func = tcpkeepalive;
850         tcb->katimer.arg = s;
851
852         mss = DEF_MSS;
853
854         /* create a prototype(pseudo) header */
855         if (mode != TCP_LISTEN) {
856                 if (ipcmp(s->laddr, IPnoaddr) == 0)
857                         findlocalip(s->p->f, s->laddr, s->raddr);
858
859                 switch (s->ipversion) {
860                         case V4:
861                                 h4 = &tcb->protohdr.tcp4hdr;
862                                 memset(h4, 0, sizeof(*h4));
863                                 h4->proto = IP_TCPPROTO;
864                                 hnputs(h4->tcpsport, s->lport);
865                                 hnputs(h4->tcpdport, s->rport);
866                                 v6tov4(h4->tcpsrc, s->laddr);
867                                 v6tov4(h4->tcpdst, s->raddr);
868                                 break;
869                         case V6:
870                                 h6 = &tcb->protohdr.tcp6hdr;
871                                 memset(h6, 0, sizeof(*h6));
872                                 h6->proto = IP_TCPPROTO;
873                                 hnputs(h6->tcpsport, s->lport);
874                                 hnputs(h6->tcpdport, s->rport);
875                                 ipmove(h6->tcpsrc, s->laddr);
876                                 ipmove(h6->tcpdst, s->raddr);
877                                 mss = DEF_MSS6;
878                                 break;
879                         default:
880                                 panic("inittcpctl: version %d", s->ipversion);
881                 }
882         }
883
884         tcb->mss = tcb->cwind = mss;
885
886         /* default is no window scaling */
887         tcb->window = QMAX;
888         tcb->rcv.wnd = QMAX;
889         tcb->rcv.scale = 0;
890         tcb->snd.scale = 0;
891         qsetlimit(s->rq, QMAX);
892 }
893
894 /*
895  *  called with s qlocked
896  */
897 void tcpstart(struct conv *s, int mode)
898 {
899         Tcpctl *tcb;
900         struct tcppriv *tpriv;
901         /* tcpackproc needs to free this if it ever exits */
902         char *kpname = kmalloc(KNAMELEN, MEM_WAIT);
903
904         tpriv = s->p->priv;
905
906         if (tpriv->ackprocstarted == 0) {
907                 qlock(&tpriv->apl);
908                 if (tpriv->ackprocstarted == 0) {
909                         snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
910                         ktask(kpname, tcpackproc, s->p);
911                         tpriv->ackprocstarted = 1;
912                 }
913                 qunlock(&tpriv->apl);
914         }
915
916         tcb = (Tcpctl *) s->ptcl;
917
918         inittcpctl(s, mode);
919
920         iphtadd(&tpriv->ht, s);
921         switch (mode) {
922                 case TCP_LISTEN:
923                         tpriv->stats[PassiveOpens]++;
924                         tcb->flags |= CLONE;
925                         tcpsetstate(s, Listen);
926                         break;
927
928                 case TCP_CONNECT:
929                         tpriv->stats[ActiveOpens]++;
930                         tcb->flags |= ACTIVE;
931                         tcpsndsyn(s, tcb);
932                         tcpsetstate(s, Syn_sent);
933                         tcpoutput(s);
934                         break;
935         }
936 }
937
938 static char *tcpflag(uint16_t flag)
939 {
940         static char buf[128];
941
942         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
943         if (flag & URG)
944                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
945         if (flag & ACK)
946                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
947         if (flag & PSH)
948                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
949         if (flag & RST)
950                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
951         if (flag & SYN)
952                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
953         if (flag & FIN)
954                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
955
956         return buf;
957 }
958
959 struct block *htontcp6(Tcp * tcph, struct block *data, Tcp6hdr * ph,
960                                            Tcpctl * tcb)
961 {
962         int dlen;
963         Tcp6hdr *h;
964         uint16_t csum;
965         uint16_t hdrlen, optpad = 0;
966         uint8_t *opt;
967
968         hdrlen = TCP6_HDRSIZE;
969         if (tcph->flags & SYN) {
970                 if (tcph->mss)
971                         hdrlen += MSS_LENGTH;
972                 if (tcph->ws)
973                         hdrlen += WS_LENGTH;
974                 optpad = hdrlen & 3;
975                 if (optpad)
976                         optpad = 4 - optpad;
977                 hdrlen += optpad;
978         }
979
980         if (data) {
981                 dlen = blocklen(data);
982                 data = padblock(data, hdrlen + TCP6_PKT);
983                 if (data == NULL)
984                         return NULL;
985         } else {
986                 dlen = 0;
987                 data = allocb(hdrlen + TCP6_PKT + 64);  /* the 64 pad is to meet mintu's */
988                 if (data == NULL)
989                         return NULL;
990                 data->wp += hdrlen + TCP6_PKT;
991         }
992
993         /* copy in pseudo ip header plus port numbers */
994         h = (Tcp6hdr *) (data->rp);
995         memmove(h, ph, TCP6_TCBPHDRSZ);
996
997         /* compose pseudo tcp header, do cksum calculation */
998         hnputl(h->vcf, hdrlen + dlen);
999         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
1000         h->ttl = ph->proto;
1001
1002         /* copy in variable bits */
1003         hnputl(h->tcpseq, tcph->seq);
1004         hnputl(h->tcpack, tcph->ack);
1005         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1006         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1007         hnputs(h->tcpurg, tcph->urg);
1008
1009         if (tcph->flags & SYN) {
1010                 opt = h->tcpopt;
1011                 if (tcph->mss != 0) {
1012                         *opt++ = MSSOPT;
1013                         *opt++ = MSS_LENGTH;
1014                         hnputs(opt, tcph->mss);
1015                         opt += 2;
1016                 }
1017                 if (tcph->ws != 0) {
1018                         *opt++ = WSOPT;
1019                         *opt++ = WS_LENGTH;
1020                         *opt++ = tcph->ws;
1021                 }
1022                 while (optpad-- > 0)
1023                         *opt++ = NOOPOPT;
1024         }
1025
1026         if (tcb != NULL && tcb->nochecksum) {
1027                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1028         } else {
1029                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
1030                 hnputs(h->tcpcksum, csum);
1031         }
1032
1033         /* move from pseudo header back to normal ip header */
1034         memset(h->vcf, 0, 4);
1035         h->vcf[0] = IP_VER6;
1036         hnputs(h->ploadlen, hdrlen + dlen);
1037         h->proto = ph->proto;
1038
1039         return data;
1040 }
1041
1042 struct block *htontcp4(Tcp * tcph, struct block *data, Tcp4hdr * ph,
1043                                            Tcpctl * tcb)
1044 {
1045         int dlen;
1046         Tcp4hdr *h;
1047         uint16_t csum;
1048         uint16_t hdrlen, optpad = 0;
1049         uint8_t *opt;
1050
1051         hdrlen = TCP4_HDRSIZE;
1052         if (tcph->flags & SYN) {
1053                 if (tcph->mss)
1054                         hdrlen += MSS_LENGTH;
1055                 if (tcph->ws)
1056                         hdrlen += WS_LENGTH;
1057                 optpad = hdrlen & 3;
1058                 if (optpad)
1059                         optpad = 4 - optpad;
1060                 hdrlen += optpad;
1061         }
1062
1063         if (data) {
1064                 dlen = blocklen(data);
1065                 data = padblock(data, hdrlen + TCP4_PKT);
1066                 if (data == NULL)
1067                         return NULL;
1068         } else {
1069                 dlen = 0;
1070                 data = allocb(hdrlen + TCP4_PKT + 64);  /* the 64 pad is to meet mintu's */
1071                 if (data == NULL)
1072                         return NULL;
1073                 data->wp += hdrlen + TCP4_PKT;
1074         }
1075
1076         /* copy in pseudo ip header plus port numbers */
1077         h = (Tcp4hdr *) (data->rp);
1078         memmove(h, ph, TCP4_TCBPHDRSZ);
1079
1080         /* copy in variable bits */
1081         hnputs(h->tcplen, hdrlen + dlen);
1082         hnputl(h->tcpseq, tcph->seq);
1083         hnputl(h->tcpack, tcph->ack);
1084         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1085         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1086         hnputs(h->tcpurg, tcph->urg);
1087
1088         if (tcph->flags & SYN) {
1089                 opt = h->tcpopt;
1090                 if (tcph->mss != 0) {
1091                         *opt++ = MSSOPT;
1092                         *opt++ = MSS_LENGTH;
1093                         hnputs(opt, tcph->mss);
1094                         opt += 2;
1095                 }
1096                 if (tcph->ws != 0) {
1097                         *opt++ = WSOPT;
1098                         *opt++ = WS_LENGTH;
1099                         *opt++ = tcph->ws;
1100                 }
1101                 while (optpad-- > 0)
1102                         *opt++ = NOOPOPT;
1103         }
1104
1105         if (tcb != NULL && tcb->nochecksum) {
1106                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1107         } else {
1108                 csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
1109                 hnputs(h->tcpcksum, csum);
1110                 data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
1111                 data->checksum_offset = ph->tcpcksum - ph->tcpsport;
1112                 data->flag |= Btcpck;
1113         }
1114
1115         return data;
1116 }
1117
1118 int ntohtcp6(Tcp * tcph, struct block **bpp)
1119 {
1120         Tcp6hdr *h;
1121         uint8_t *optr;
1122         uint16_t hdrlen;
1123         uint16_t optlen;
1124         int n;
1125
1126         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
1127         if (*bpp == NULL)
1128                 return -1;
1129
1130         h = (Tcp6hdr *) ((*bpp)->rp);
1131         tcph->source = nhgets(h->tcpsport);
1132         tcph->dest = nhgets(h->tcpdport);
1133         tcph->seq = nhgetl(h->tcpseq);
1134         tcph->ack = nhgetl(h->tcpack);
1135         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1136         if (hdrlen < TCP6_HDRSIZE) {
1137                 freeblist(*bpp);
1138                 return -1;
1139         }
1140
1141         tcph->flags = h->tcpflag[1];
1142         tcph->wnd = nhgets(h->tcpwin);
1143         tcph->urg = nhgets(h->tcpurg);
1144         tcph->mss = 0;
1145         tcph->ws = 0;
1146         tcph->len = nhgets(h->ploadlen) - hdrlen;
1147
1148         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
1149         if (*bpp == NULL)
1150                 return -1;
1151
1152         optr = h->tcpopt;
1153         n = hdrlen - TCP6_HDRSIZE;
1154         while (n > 0 && *optr != EOLOPT) {
1155                 if (*optr == NOOPOPT) {
1156                         n--;
1157                         optr++;
1158                         continue;
1159                 }
1160                 optlen = optr[1];
1161                 if (optlen < 2 || optlen > n)
1162                         break;
1163                 switch (*optr) {
1164                         case MSSOPT:
1165                                 if (optlen == MSS_LENGTH)
1166                                         tcph->mss = nhgets(optr + 2);
1167                                 break;
1168                         case WSOPT:
1169                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1170                                         tcph->ws = HaveWS | *(optr + 2);
1171                                 break;
1172                 }
1173                 n -= optlen;
1174                 optr += optlen;
1175         }
1176         return hdrlen;
1177 }
1178
1179 int ntohtcp4(Tcp * tcph, struct block **bpp)
1180 {
1181         Tcp4hdr *h;
1182         uint8_t *optr;
1183         uint16_t hdrlen;
1184         uint16_t optlen;
1185         int n;
1186
1187         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
1188         if (*bpp == NULL)
1189                 return -1;
1190
1191         h = (Tcp4hdr *) ((*bpp)->rp);
1192         tcph->source = nhgets(h->tcpsport);
1193         tcph->dest = nhgets(h->tcpdport);
1194         tcph->seq = nhgetl(h->tcpseq);
1195         tcph->ack = nhgetl(h->tcpack);
1196
1197         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1198         if (hdrlen < TCP4_HDRSIZE) {
1199                 freeblist(*bpp);
1200                 return -1;
1201         }
1202
1203         tcph->flags = h->tcpflag[1];
1204         tcph->wnd = nhgets(h->tcpwin);
1205         tcph->urg = nhgets(h->tcpurg);
1206         tcph->mss = 0;
1207         tcph->ws = 0;
1208         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1209
1210         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1211         if (*bpp == NULL)
1212                 return -1;
1213
1214         optr = h->tcpopt;
1215         n = hdrlen - TCP4_HDRSIZE;
1216         while (n > 0 && *optr != EOLOPT) {
1217                 if (*optr == NOOPOPT) {
1218                         n--;
1219                         optr++;
1220                         continue;
1221                 }
1222                 optlen = optr[1];
1223                 if (optlen < 2 || optlen > n)
1224                         break;
1225                 switch (*optr) {
1226                         case MSSOPT:
1227                                 if (optlen == MSS_LENGTH)
1228                                         tcph->mss = nhgets(optr + 2);
1229                                 break;
1230                         case WSOPT:
1231                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1232                                         tcph->ws = HaveWS | *(optr + 2);
1233                                 break;
1234                 }
1235                 n -= optlen;
1236                 optr += optlen;
1237         }
1238         return hdrlen;
1239 }
1240
1241 /*
1242  *  For outgiing calls, generate an initial sequence
1243  *  number and put a SYN on the send queue
1244  */
1245 void tcpsndsyn(struct conv *s, Tcpctl * tcb)
1246 {
1247         urandom_read(&tcb->iss, sizeof(tcb->iss));
1248         tcb->rttseq = tcb->iss;
1249         tcb->snd.wl2 = tcb->iss;
1250         tcb->snd.una = tcb->iss;
1251         tcb->snd.ptr = tcb->rttseq;
1252         tcb->snd.nxt = tcb->rttseq;
1253         tcb->flgcnt++;
1254         tcb->flags |= FORCE;
1255         tcb->sndsyntime = NOW;
1256
1257         /* set desired mss and scale */
1258         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale,
1259                           &tcb->flags);
1260 }
1261
1262 void
1263 sndrst(struct Proto *tcp, uint8_t * source, uint8_t * dest,
1264            uint16_t length, Tcp * seg, uint8_t version, char *reason)
1265 {
1266         struct block *hbp;
1267         uint8_t rflags;
1268         struct tcppriv *tpriv;
1269         Tcp4hdr ph4;
1270         Tcp6hdr ph6;
1271
1272         netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1273
1274         tpriv = tcp->priv;
1275
1276         if (seg->flags & RST)
1277                 return;
1278
1279         /* make pseudo header */
1280         switch (version) {
1281                 case V4:
1282                         memset(&ph4, 0, sizeof(ph4));
1283                         ph4.vihl = IP_VER4;
1284                         v6tov4(ph4.tcpsrc, dest);
1285                         v6tov4(ph4.tcpdst, source);
1286                         ph4.proto = IP_TCPPROTO;
1287                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1288                         hnputs(ph4.tcpsport, seg->dest);
1289                         hnputs(ph4.tcpdport, seg->source);
1290                         break;
1291                 case V6:
1292                         memset(&ph6, 0, sizeof(ph6));
1293                         ph6.vcf[0] = IP_VER6;
1294                         ipmove(ph6.tcpsrc, dest);
1295                         ipmove(ph6.tcpdst, source);
1296                         ph6.proto = IP_TCPPROTO;
1297                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1298                         hnputs(ph6.tcpsport, seg->dest);
1299                         hnputs(ph6.tcpdport, seg->source);
1300                         break;
1301                 default:
1302                         panic("sndrst: version %d", version);
1303         }
1304
1305         tpriv->stats[OutRsts]++;
1306         rflags = RST;
1307
1308         /* convince the other end that this reset is in band */
1309         if (seg->flags & ACK) {
1310                 seg->seq = seg->ack;
1311                 seg->ack = 0;
1312         } else {
1313                 rflags |= ACK;
1314                 seg->ack = seg->seq;
1315                 seg->seq = 0;
1316                 if (seg->flags & SYN)
1317                         seg->ack++;
1318                 seg->ack += length;
1319                 if (seg->flags & FIN)
1320                         seg->ack++;
1321         }
1322         seg->flags = rflags;
1323         seg->wnd = 0;
1324         seg->urg = 0;
1325         seg->mss = 0;
1326         seg->ws = 0;
1327         switch (version) {
1328                 case V4:
1329                         hbp = htontcp4(seg, NULL, &ph4, NULL);
1330                         if (hbp == NULL)
1331                                 return;
1332                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1333                         break;
1334                 case V6:
1335                         hbp = htontcp6(seg, NULL, &ph6, NULL);
1336                         if (hbp == NULL)
1337                                 return;
1338                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1339                         break;
1340                 default:
1341                         panic("sndrst2: version %d", version);
1342         }
1343 }
1344
1345 /*
1346  *  send a reset to the remote side and close the conversation
1347  *  called with s qlocked
1348  */
1349 static void tcphangup(struct conv *s)
1350 {
1351         ERRSTACK(1);
1352         Tcp seg;
1353         Tcpctl *tcb;
1354         struct block *hbp;
1355
1356         tcb = (Tcpctl *) s->ptcl;
1357         if (ipcmp(s->raddr, IPnoaddr)) {
1358                 /* discard error style, poperror regardless */
1359                 if (!waserror()) {
1360                         seg.flags = RST | ACK;
1361                         seg.ack = tcb->rcv.nxt;
1362                         tcb->rcv.una = 0;
1363                         seg.seq = tcb->snd.ptr;
1364                         seg.wnd = 0;
1365                         seg.urg = 0;
1366                         seg.mss = 0;
1367                         seg.ws = 0;
1368                         switch (s->ipversion) {
1369                                 case V4:
1370                                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1371                                         hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
1372                                         ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1373                                         break;
1374                                 case V6:
1375                                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1376                                         hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
1377                                         ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1378                                         break;
1379                                 default:
1380                                         panic("tcphangup: version %d", s->ipversion);
1381                         }
1382                 }
1383                 poperror();
1384         }
1385         localclose(s, NULL);
1386 }
1387
1388 /*
1389  *  (re)send a SYN ACK
1390  */
1391 int sndsynack(struct Proto *tcp, Limbo * lp)
1392 {
1393         struct block *hbp;
1394         Tcp4hdr ph4;
1395         Tcp6hdr ph6;
1396         Tcp seg;
1397         int scale;
1398         uint8_t flag = 0;
1399
1400         /* make pseudo header */
1401         switch (lp->version) {
1402                 case V4:
1403                         memset(&ph4, 0, sizeof(ph4));
1404                         ph4.vihl = IP_VER4;
1405                         v6tov4(ph4.tcpsrc, lp->laddr);
1406                         v6tov4(ph4.tcpdst, lp->raddr);
1407                         ph4.proto = IP_TCPPROTO;
1408                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1409                         hnputs(ph4.tcpsport, lp->lport);
1410                         hnputs(ph4.tcpdport, lp->rport);
1411                         break;
1412                 case V6:
1413                         memset(&ph6, 0, sizeof(ph6));
1414                         ph6.vcf[0] = IP_VER6;
1415                         ipmove(ph6.tcpsrc, lp->laddr);
1416                         ipmove(ph6.tcpdst, lp->raddr);
1417                         ph6.proto = IP_TCPPROTO;
1418                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1419                         hnputs(ph6.tcpsport, lp->lport);
1420                         hnputs(ph6.tcpdport, lp->rport);
1421                         break;
1422                 default:
1423                         panic("sndrst: version %d", lp->version);
1424         }
1425
1426         seg.seq = lp->iss;
1427         seg.ack = lp->irs + 1;
1428         seg.flags = SYN | ACK;
1429         seg.urg = 0;
1430         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag);
1431         seg.wnd = QMAX;
1432
1433         /* if the other side set scale, we should too */
1434         if (lp->rcvscale) {
1435                 seg.ws = scale;
1436                 lp->sndscale = scale;
1437         } else {
1438                 seg.ws = 0;
1439                 lp->sndscale = 0;
1440         }
1441
1442         switch (lp->version) {
1443                 case V4:
1444                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1445                         if (hbp == NULL)
1446                                 return -1;
1447                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1448                         break;
1449                 case V6:
1450                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1451                         if (hbp == NULL)
1452                                 return -1;
1453                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1454                         break;
1455                 default:
1456                         panic("sndsnack: version %d", lp->version);
1457         }
1458         lp->lastsend = NOW;
1459         return 0;
1460 }
1461
/*
 * Hash the last two bytes of IP address a plus port p into an index for the
 * limbo hash table.  Both arguments are parenthesized so that expressions
 * such as hashipa(addr, x | y) evaluate as intended.
 */
#define hashipa(a, p) (((a)[IPaddrlen - 2] + (a)[IPaddrlen - 1] + (p)) & LHTMASK)
1463
1464 /*
1465  *  put a call into limbo and respond with a SYN ACK
1466  *
1467  *  called with proto locked
1468  */
1469 static void
1470 limbo(struct conv *s, uint8_t * source, uint8_t * dest, Tcp * seg, int version)
1471 {
1472         Limbo *lp, **l;
1473         struct tcppriv *tpriv;
1474         int h;
1475
1476         tpriv = s->p->priv;
1477         h = hashipa(source, seg->source);
1478
1479         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1480                 lp = *l;
1481                 if (lp->lport != seg->dest || lp->rport != seg->source
1482                         || lp->version != version)
1483                         continue;
1484                 if (ipcmp(lp->raddr, source) != 0)
1485                         continue;
1486                 if (ipcmp(lp->laddr, dest) != 0)
1487                         continue;
1488
1489                 /* each new SYN restarts the retransmits */
1490                 lp->irs = seg->seq;
1491                 break;
1492         }
1493         lp = *l;
1494         if (lp == NULL) {
1495                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1496                         lp = tpriv->lht[h];
1497                         tpriv->lht[h] = lp->next;
1498                         lp->next = NULL;
1499                 } else {
1500                         lp = kzmalloc(sizeof(*lp), 0);
1501                         if (lp == NULL)
1502                                 return;
1503                         tpriv->nlimbo++;
1504                 }
1505                 *l = lp;
1506                 lp->version = version;
1507                 ipmove(lp->laddr, dest);
1508                 ipmove(lp->raddr, source);
1509                 lp->lport = seg->dest;
1510                 lp->rport = seg->source;
1511                 lp->mss = seg->mss;
1512                 lp->rcvscale = seg->ws;
1513                 lp->irs = seg->seq;
1514                 urandom_read(&lp->iss, sizeof(lp->iss));
1515         }
1516
1517         if (sndsynack(s->p, lp) < 0) {
1518                 *l = lp->next;
1519                 tpriv->nlimbo--;
1520                 kfree(lp);
1521         }
1522 }
1523
1524 /*
1525  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1526  */
1527 static void limborexmit(struct Proto *tcp)
1528 {
1529         struct tcppriv *tpriv;
1530         Limbo **l, *lp;
1531         int h;
1532         int seen;
1533         uint64_t now;
1534
1535         tpriv = tcp->priv;
1536
1537         if (!canqlock(&tcp->qlock))
1538                 return;
1539         seen = 0;
1540         now = NOW;
1541         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1542                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1543                         lp = *l;
1544                         seen++;
1545                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1546                                 continue;
1547
1548                         /* time it out after 1 second */
1549                         if (++(lp->rexmits) > 5) {
1550                                 tpriv->nlimbo--;
1551                                 *l = lp->next;
1552                                 kfree(lp);
1553                                 continue;
1554                         }
1555
1556                         /* if we're being attacked, don't bother resending SYN ACK's */
1557                         if (tpriv->nlimbo > 100)
1558                                 continue;
1559
1560                         if (sndsynack(tcp, lp) < 0) {
1561                                 tpriv->nlimbo--;
1562                                 *l = lp->next;
1563                                 kfree(lp);
1564                                 continue;
1565                         }
1566
1567                         l = &lp->next;
1568                 }
1569         }
1570         qunlock(&tcp->qlock);
1571 }
1572
1573 /*
1574  *  lookup call in limbo.  if found, throw it out.
1575  *
1576  *  called with proto locked
1577  */
1578 static void
1579 limborst(struct conv *s, Tcp * segp, uint8_t * src, uint8_t * dst,
1580                  uint8_t version)
1581 {
1582         Limbo *lp, **l;
1583         int h;
1584         struct tcppriv *tpriv;
1585
1586         tpriv = s->p->priv;
1587
1588         /* find a call in limbo */
1589         h = hashipa(src, segp->source);
1590         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1591                 lp = *l;
1592                 if (lp->lport != segp->dest || lp->rport != segp->source
1593                         || lp->version != version)
1594                         continue;
1595                 if (ipcmp(lp->laddr, dst) != 0)
1596                         continue;
1597                 if (ipcmp(lp->raddr, src) != 0)
1598                         continue;
1599
1600                 /* RST can only follow the SYN */
1601                 if (segp->seq == lp->irs + 1) {
1602                         tpriv->nlimbo--;
1603                         *l = lp->next;
1604                         kfree(lp);
1605                 }
1606                 break;
1607         }
1608 }
1609
1610 /*
1611  *  come here when we finally get an ACK to our SYN-ACK.
1612  *  lookup call in limbo.  if found, create a new conversation
1613  *
1614  *  called with proto locked
1615  */
1616 static struct conv *tcpincoming(struct conv *s, Tcp * segp, uint8_t * src,
1617                                                                 uint8_t * dst, uint8_t version)
1618 {
1619         struct conv *new;
1620         Tcpctl *tcb;
1621         struct tcppriv *tpriv;
1622         Tcp4hdr *h4;
1623         Tcp6hdr *h6;
1624         Limbo *lp, **l;
1625         int h;
1626
1627         /* unless it's just an ack, it can't be someone coming out of limbo */
1628         if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
1629                 return NULL;
1630
1631         tpriv = s->p->priv;
1632
1633         /* find a call in limbo */
1634         h = hashipa(src, segp->source);
1635         for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
1636                 netlog(s->p->f, Logtcp,
1637                            "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
1638                            segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
1639                            lp->lport, version, lp->version);
1640
1641                 if (lp->lport != segp->dest || lp->rport != segp->source
1642                         || lp->version != version)
1643                         continue;
1644                 if (ipcmp(lp->laddr, dst) != 0)
1645                         continue;
1646                 if (ipcmp(lp->raddr, src) != 0)
1647                         continue;
1648
1649                 /* we're assuming no data with the initial SYN */
1650                 if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
1651                         netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
1652                                    segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
1653                         lp = NULL;
1654                 } else {
1655                         tpriv->nlimbo--;
1656                         *l = lp->next;
1657                 }
1658                 break;
1659         }
1660         if (lp == NULL)
1661                 return NULL;
1662
1663         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1664         if (new == NULL)
1665                 return NULL;
1666
1667         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1668         tcb = (Tcpctl *) new->ptcl;
1669         tcb->flags &= ~CLONE;
1670         tcb->timer.arg = new;
1671         tcb->timer.state = TcptimerOFF;
1672         tcb->acktimer.arg = new;
1673         tcb->acktimer.state = TcptimerOFF;
1674         tcb->katimer.arg = new;
1675         tcb->katimer.state = TcptimerOFF;
1676         tcb->rtt_timer.arg = new;
1677         tcb->rtt_timer.state = TcptimerOFF;
1678
1679         tcb->irs = lp->irs;
1680         tcb->rcv.nxt = tcb->irs + 1;
1681         tcb->rcv.urg = tcb->rcv.nxt;
1682
1683         tcb->iss = lp->iss;
1684         tcb->rttseq = tcb->iss;
1685         tcb->snd.wl2 = tcb->iss;
1686         tcb->snd.una = tcb->iss + 1;
1687         tcb->snd.ptr = tcb->iss + 1;
1688         tcb->snd.nxt = tcb->iss + 1;
1689         tcb->flgcnt = 0;
1690         tcb->flags |= SYNACK;
1691
1692         /* our sending max segment size cannot be bigger than what he asked for */
1693         if (lp->mss != 0 && lp->mss < tcb->mss)
1694                 tcb->mss = lp->mss;
1695
1696         /* window scaling */
1697         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1698
1699         /* the congestion window always starts out as a single segment */
1700         tcb->snd.wnd = segp->wnd;
1701         tcb->cwind = tcb->mss;
1702
1703         /* set initial round trip time */
1704         tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
1705         tcpsynackrtt(new);
1706
1707         kfree(lp);
1708
1709         /* set up proto header */
1710         switch (version) {
1711                 case V4:
1712                         h4 = &tcb->protohdr.tcp4hdr;
1713                         memset(h4, 0, sizeof(*h4));
1714                         h4->proto = IP_TCPPROTO;
1715                         hnputs(h4->tcpsport, new->lport);
1716                         hnputs(h4->tcpdport, new->rport);
1717                         v6tov4(h4->tcpsrc, dst);
1718                         v6tov4(h4->tcpdst, src);
1719                         break;
1720                 case V6:
1721                         h6 = &tcb->protohdr.tcp6hdr;
1722                         memset(h6, 0, sizeof(*h6));
1723                         h6->proto = IP_TCPPROTO;
1724                         hnputs(h6->tcpsport, new->lport);
1725                         hnputs(h6->tcpdport, new->rport);
1726                         ipmove(h6->tcpsrc, dst);
1727                         ipmove(h6->tcpdst, src);
1728                         break;
1729                 default:
1730                         panic("tcpincoming: version %d", new->ipversion);
1731         }
1732
1733         tcpsetstate(new, Established);
1734
1735         iphtadd(&tpriv->ht, new);
1736
1737         return new;
1738 }
1739
/*
 * Returns 1 when x lies in the inclusive sequence-number interval
 * [low, high], 0 otherwise.  When low > high the interval is treated as
 * wrapping around the top of the 32-bit sequence space.
 */
int seq_within(uint32_t x, uint32_t low, uint32_t high)
{
	/* ordinary, non-wrapping interval */
	if (low <= high)
		return low <= x && x <= high;

	/* wrapped interval: [low, 2^32) joined with [0, high] */
	return x >= low || x <= high;
}
1751
/* Wrap-aware "x precedes y": true when the signed 32-bit distance is < 0. */
int seq_lt(uint32_t x, uint32_t y)
{
	int dist = (int)(x - y);

	return dist < 0;
}
1756
/* Wrap-aware "x precedes or equals y": signed 32-bit distance is <= 0. */
int seq_le(uint32_t x, uint32_t y)
{
	int dist = (int)(x - y);

	return dist <= 0;
}
1761
/* Wrap-aware "x follows y": true when the signed 32-bit distance is > 0. */
int seq_gt(uint32_t x, uint32_t y)
{
	int dist = (int)(x - y);

	return dist > 0;
}
1766
/* Wrap-aware "x follows or equals y": signed 32-bit distance is >= 0. */
int seq_ge(uint32_t x, uint32_t y)
{
	int dist = (int)(x - y);

	return dist >= 0;
}
1771
1772 /*
1773  *  use the time between the first SYN and it's ack as the
1774  *  initial round trip time
1775  */
1776 void tcpsynackrtt(struct conv *s)
1777 {
1778         Tcpctl *tcb;
1779         uint64_t delta;
1780         struct tcppriv *tpriv;
1781
1782         tcb = (Tcpctl *) s->ptcl;
1783         tpriv = s->p->priv;
1784
1785         delta = NOW - tcb->sndsyntime;
1786         tcb->srtt = delta << LOGAGAIN;
1787         tcb->mdev = delta << LOGDGAIN;
1788
1789         /* halt round trip timer */
1790         tcphalt(tpriv, &tcb->rtt_timer);
1791 }
1792
1793 void update(struct conv *s, Tcp * seg)
1794 {
1795         int rtt, delta;
1796         Tcpctl *tcb;
1797         uint32_t acked;
1798         uint32_t expand;
1799         struct tcppriv *tpriv;
1800
1801         tpriv = s->p->priv;
1802         tcb = (Tcpctl *) s->ptcl;
1803
1804         /* if everything has been acked, force output(?) */
1805         if (seq_gt(seg->ack, tcb->snd.nxt)) {
1806                 tcb->flags |= FORCE;
1807                 return;
1808         }
1809
1810         /* added by Dong Lin for fast retransmission */
1811         if (seg->ack == tcb->snd.una
1812                 && tcb->snd.una != tcb->snd.nxt
1813                 && seg->len == 0 && seg->wnd == tcb->snd.wnd) {
1814
1815                 /* this is a pure ack w/o window update */
1816                 netlog(s->p->f, Logtcprxmt, "dupack %lu ack %lu sndwnd %d advwin %d\n",
1817                            tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1818
1819                 if (++tcb->snd.dupacks == TCPREXMTTHRESH) {
1820                         /*
1821                          *  tahoe tcp rxt the packet, half sshthresh,
1822                          *  and set cwnd to one packet
1823                          */
1824                         tcb->snd.recovery = 1;
1825                         tcb->snd.rxt = tcb->snd.nxt;
1826                         netlog(s->p->f, Logtcprxmt, "fast rxt %lu, nxt %lu\n", tcb->snd.una,
1827                                    tcb->snd.nxt);
1828                         tcprxmit(s);
1829                 } else {
1830                         /* do reno tcp here. */
1831                 }
1832         }
1833
1834         /*
1835          *  update window
1836          */
1837         if (seq_gt(seg->ack, tcb->snd.wl2)
1838                 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
1839                 tcb->snd.wnd = seg->wnd;
1840                 tcb->snd.wl2 = seg->ack;
1841         }
1842
1843         if (!seq_gt(seg->ack, tcb->snd.una)) {
1844                 /*
1845                  *  don't let us hangup if sending into a closed window and
1846                  *  we're still getting acks
1847                  */
1848                 if ((tcb->flags & RETRAN) && tcb->snd.wnd == 0) {
1849                         tcb->backedoff = MAXBACKMS / 4;
1850                 }
1851                 return;
1852         }
1853
1854         /*
1855          *  any positive ack turns off fast rxt,
1856          *  (should we do new-reno on partial acks?)
1857          */
1858         if (!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1859                 tcb->snd.dupacks = 0;
1860                 tcb->snd.recovery = 0;
1861         } else
1862                 netlog(s->p->f, Logtcp, "rxt next %lu, cwin %u\n", seg->ack,
1863                            tcb->cwind);
1864
1865         /* Compute the new send window size */
1866         acked = seg->ack - tcb->snd.una;
1867
1868         /* avoid slow start and timers for SYN acks */
1869         if ((tcb->flags & SYNACK) == 0) {
1870                 tcb->flags |= SYNACK;
1871                 acked--;
1872                 tcb->flgcnt--;
1873                 goto done;
1874         }
1875
1876         /* slow start as long as we're not recovering from lost packets */
1877         if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
1878                 if (tcb->cwind < tcb->ssthresh) {
1879                         expand = tcb->mss;
1880                         if (acked < expand)
1881                                 expand = acked;
1882                 } else
1883                         expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1884
1885                 if (tcb->cwind + expand < tcb->cwind)
1886                         expand = tcb->snd.wnd - tcb->cwind;
1887                 if (tcb->cwind + expand > tcb->snd.wnd)
1888                         expand = tcb->snd.wnd - tcb->cwind;
1889                 tcb->cwind += expand;
1890         }
1891
1892         /* Adjust the timers according to the round trip time */
1893         if (tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1894                 tcphalt(tpriv, &tcb->rtt_timer);
1895                 if ((tcb->flags & RETRAN) == 0) {
1896                         tcb->backoff = 0;
1897                         tcb->backedoff = 0;
1898                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1899                         if (rtt == 0)
1900                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
1901                         rtt *= MSPTICK;
1902                         if (tcb->srtt == 0) {
1903                                 tcb->srtt = rtt << LOGAGAIN;
1904                                 tcb->mdev = rtt << LOGDGAIN;
1905                         } else {
1906                                 delta = rtt - (tcb->srtt >> LOGAGAIN);
1907                                 tcb->srtt += delta;
1908                                 if (tcb->srtt <= 0)
1909                                         tcb->srtt = 1;
1910
1911                                 delta = abs(delta) - (tcb->mdev >> LOGDGAIN);
1912                                 tcb->mdev += delta;
1913                                 if (tcb->mdev <= 0)
1914                                         tcb->mdev = 1;
1915                         }
1916                         tcpsettimer(tcb);
1917                 }
1918         }
1919
1920 done:
1921         if (qdiscard(s->wq, acked) < acked)
1922                 tcb->flgcnt--;
1923
1924         tcb->snd.una = seg->ack;
1925         if (seq_gt(seg->ack, tcb->snd.urg))
1926                 tcb->snd.urg = seg->ack;
1927
1928         if (tcb->snd.una != tcb->snd.nxt)
1929                 tcpgo(tpriv, &tcb->timer);
1930         else
1931                 tcphalt(tpriv, &tcb->timer);
1932
1933         if (seq_lt(tcb->snd.ptr, tcb->snd.una))
1934                 tcb->snd.ptr = tcb->snd.una;
1935
1936         tcb->flags &= ~RETRAN;
1937         tcb->backoff = 0;
1938         tcb->backedoff = 0;
1939 }
1940
1941 void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
1942 {
1943         ERRSTACK(1);
1944         Tcp seg;
1945         Tcp4hdr *h4;
1946         Tcp6hdr *h6;
1947         int hdrlen;
1948         Tcpctl *tcb;
1949         uint16_t length;
1950         uint8_t source[IPaddrlen], dest[IPaddrlen];
1951         struct conv *s;
1952         struct Fs *f;
1953         struct tcppriv *tpriv;
1954         uint8_t version;
1955
1956         f = tcp->f;
1957         tpriv = tcp->priv;
1958
1959         tpriv->stats[InSegs]++;
1960
1961         h4 = (Tcp4hdr *) (bp->rp);
1962         h6 = (Tcp6hdr *) (bp->rp);
1963
1964         if ((h4->vihl & 0xF0) == IP_VER4) {
1965                 version = V4;
1966                 length = nhgets(h4->length);
1967                 v4tov6(dest, h4->tcpdst);
1968                 v4tov6(source, h4->tcpsrc);
1969
1970                 h4->Unused = 0;
1971                 hnputs(h4->tcplen, length - TCP4_PKT);
1972                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1973                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
1974                         tpriv->stats[CsumErrs]++;
1975                         tpriv->stats[InErrs]++;
1976                         netlog(f, Logtcp, "bad tcp proto cksum\n");
1977                         freeblist(bp);
1978                         return;
1979                 }
1980
1981                 hdrlen = ntohtcp4(&seg, &bp);
1982                 if (hdrlen < 0) {
1983                         tpriv->stats[HlenErrs]++;
1984                         tpriv->stats[InErrs]++;
1985                         netlog(f, Logtcp, "bad tcp hdr len\n");
1986                         return;
1987                 }
1988
1989                 /* trim the packet to the size claimed by the datagram */
1990                 length -= hdrlen + TCP4_PKT;
1991                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
1992                 if (bp == NULL) {
1993                         tpriv->stats[LenErrs]++;
1994                         tpriv->stats[InErrs]++;
1995                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
1996                         return;
1997                 }
1998         } else {
1999                 int ttl = h6->ttl;
2000                 int proto = h6->proto;
2001
2002                 version = V6;
2003                 length = nhgets(h6->ploadlen);
2004                 ipmove(dest, h6->tcpdst);
2005                 ipmove(source, h6->tcpsrc);
2006
2007                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2008                 h6->ttl = proto;
2009                 hnputl(h6->vcf, length);
2010                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2011                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
2012                         tpriv->stats[CsumErrs]++;
2013                         tpriv->stats[InErrs]++;
2014                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2015                         freeblist(bp);
2016                         return;
2017                 }
2018                 h6->ttl = ttl;
2019                 h6->proto = proto;
2020                 hnputs(h6->ploadlen, length);
2021
2022                 hdrlen = ntohtcp6(&seg, &bp);
2023                 if (hdrlen < 0) {
2024                         tpriv->stats[HlenErrs]++;
2025                         tpriv->stats[InErrs]++;
2026                         netlog(f, Logtcp, "bad tcp hdr len\n");
2027                         return;
2028                 }
2029
2030                 /* trim the packet to the size claimed by the datagram */
2031                 length -= hdrlen;
2032                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2033                 if (bp == NULL) {
2034                         tpriv->stats[LenErrs]++;
2035                         tpriv->stats[InErrs]++;
2036                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2037                         return;
2038                 }
2039         }
2040
2041         /* lock protocol while searching for a conversation */
2042         qlock(&tcp->qlock);
2043
2044         /* Look for a matching conversation */
2045         s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2046         if (s == NULL) {
2047                 netlog(f, Logtcp, "iphtlook failed\n");
2048 reset:
2049                 qunlock(&tcp->qlock);
2050                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2051                 freeblist(bp);
2052                 return;
2053         }
2054
2055         /* if it's a listener, look for the right flags and get a new conv */
2056         tcb = (Tcpctl *) s->ptcl;
2057         if (tcb->state == Listen) {
2058                 if (seg.flags & RST) {
2059                         limborst(s, &seg, source, dest, version);
2060                         qunlock(&tcp->qlock);
2061                         freeblist(bp);
2062                         return;
2063                 }
2064
2065                 /* if this is a new SYN, put the call into limbo */
2066                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2067                         limbo(s, source, dest, &seg, version);
2068                         qunlock(&tcp->qlock);
2069                         freeblist(bp);
2070                         return;
2071                 }
2072
2073                 /*
2074                  *  if there's a matching call in limbo, tcpincoming will
2075                  *  return it in state Syn_received
2076                  */
2077                 s = tcpincoming(s, &seg, source, dest, version);
2078                 if (s == NULL)
2079                         goto reset;
2080         }
2081
2082         /* The rest of the input state machine is run with the control block
2083          * locked and implements the state machine directly out of the RFC.
2084          * Out-of-band data is ignored - it was always a bad idea.
2085          */
2086         tcb = (Tcpctl *) s->ptcl;
2087         if (waserror()) {
2088                 qunlock(&s->qlock);
2089                 nexterror();
2090         }
2091         qlock(&s->qlock);
2092         qunlock(&tcp->qlock);
2093
2094         /* fix up window */
2095         seg.wnd <<= tcb->rcv.scale;
2096
2097         /* every input packet in puts off the keep alive time out */
2098         tcpsetkacounter(tcb);
2099
2100         switch (tcb->state) {
2101                 case Closed:
2102                         sndrst(tcp, source, dest, length, &seg, version,
2103                                    "sending to Closed");
2104                         goto raise;
2105                 case Syn_sent:
2106                         if (seg.flags & ACK) {
2107                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2108                                         sndrst(tcp, source, dest, length, &seg, version,
2109                                                    "bad seq in Syn_sent");
2110                                         goto raise;
2111                                 }
2112                         }
2113                         if (seg.flags & RST) {
2114                                 if (seg.flags & ACK)
2115                                         localclose(s, "connection refused");
2116                                 goto raise;
2117                         }
2118
2119                         if (seg.flags & SYN) {
2120                                 procsyn(s, &seg);
2121                                 if (seg.flags & ACK) {
2122                                         update(s, &seg);
2123                                         tcpsynackrtt(s);
2124                                         tcpsetstate(s, Established);
2125                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2126                                 } else {
2127                                         tcb->time = NOW;
2128                                         tcpsetstate(s, Syn_received);   /* DLP - shouldn't this be a reset? */
2129                                 }
2130
2131                                 if (length != 0 || (seg.flags & FIN))
2132                                         break;
2133
2134                                 freeblist(bp);
2135                                 goto output;
2136                         } else
2137                                 freeblist(bp);
2138
2139                         qunlock(&s->qlock);
2140                         poperror();
2141                         return;
2142                 case Syn_received:
2143                         /* doesn't matter if it's the correct ack, we're just trying to set timing */
2144                         if (seg.flags & ACK)
2145                                 tcpsynackrtt(s);
2146                         break;
2147         }
2148
2149         /*
2150          *  One DOS attack is to open connections to us and then forget about them,
2151          *  thereby tying up a conv at no long term cost to the attacker.
2152          *  This is an attempt to defeat these stateless DOS attacks.  See
2153          *  corresponding code in tcpsendka().
2154          */
2155         if (tcb->state != Syn_received && (seg.flags & RST) == 0) {
2156                 if (tcpporthogdefense
2157                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2158                                                   tcb->snd.una - (1 << 29))) {
2159                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2160                                    source, seg.source, dest, seg.dest, seg.flags,
2161                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2162                         localclose(s, "stateless hog");
2163                 }
2164         }
2165
2166         /* Cut the data to fit the receive window */
2167         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2168                 netlog(f, Logtcp, "tcp len < 0, %lu %d\n", seg.seq, length);
2169                 update(s, &seg);
2170                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2171                         tcphalt(tpriv, &tcb->rtt_timer);
2172                         tcphalt(tpriv, &tcb->acktimer);
2173                         tcphalt(tpriv, &tcb->katimer);
2174                         tcpsetstate(s, Time_wait);
2175                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2176                         tcpgo(tpriv, &tcb->timer);
2177                 }
2178                 if (!(seg.flags & RST)) {
2179                         tcb->flags |= FORCE;
2180                         goto output;
2181                 }
2182                 qunlock(&s->qlock);
2183                 poperror();
2184                 return;
2185         }
2186
2187         /* Cannot accept so answer with a rst */
2188         if (length && tcb->state == Closed) {
2189                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2190                 goto raise;
2191         }
2192
2193         /* The segment is beyond the current receive pointer so
2194          * queue the data in the resequence queue
2195          */
2196         if (seg.seq != tcb->rcv.nxt)
2197                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2198                         update(s, &seg);
2199                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2200                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2201                                            s->lport);
2202                         tcb->flags |= FORCE;
2203                         goto output;
2204                 }
2205
2206         /*
2207          *  keep looping till we've processed this packet plus any
2208          *  adjacent packets in the resequence queue
2209          */
2210         for (;;) {
2211                 if (seg.flags & RST) {
2212                         if (tcb->state == Established) {
2213                                 tpriv->stats[EstabResets]++;
2214                                 if (tcb->rcv.nxt != seg.seq)
2215                                         printd
2216                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2217                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2218                                                  seg.seq);
2219                         }
2220                         localclose(s, "connection refused");
2221                         goto raise;
2222                 }
2223
2224                 if ((seg.flags & ACK) == 0)
2225                         goto raise;
2226
2227                 switch (tcb->state) {
2228                         case Syn_received:
2229                                 if (!seq_within(seg.ack, tcb->snd.una + 1, tcb->snd.nxt)) {
2230                                         sndrst(tcp, source, dest, length, &seg, version,
2231                                                    "bad seq in Syn_received");
2232                                         goto raise;
2233                                 }
2234                                 update(s, &seg);
2235                                 tcpsetstate(s, Established);
2236                         case Established:
2237                         case Close_wait:
2238                                 update(s, &seg);
2239                                 break;
2240                         case Finwait1:
2241                                 update(s, &seg);
2242                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2243                                         tcphalt(tpriv, &tcb->rtt_timer);
2244                                         tcphalt(tpriv, &tcb->acktimer);
2245                                         tcpsetkacounter(tcb);
2246                                         tcb->time = NOW;
2247                                         tcpsetstate(s, Finwait2);
2248                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2249                                         tcpgo(tpriv, &tcb->katimer);
2250                                 }
2251                                 break;
2252                         case Finwait2:
2253                                 update(s, &seg);
2254                                 break;
2255                         case Closing:
2256                                 update(s, &seg);
2257                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2258                                         tcphalt(tpriv, &tcb->rtt_timer);
2259                                         tcphalt(tpriv, &tcb->acktimer);
2260                                         tcphalt(tpriv, &tcb->katimer);
2261                                         tcpsetstate(s, Time_wait);
2262                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2263                                         tcpgo(tpriv, &tcb->timer);
2264                                 }
2265                                 break;
2266                         case Last_ack:
2267                                 update(s, &seg);
2268                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2269                                         localclose(s, NULL);
2270                                         goto raise;
2271                                 }
2272                         case Time_wait:
2273                                 tcb->flags |= FORCE;
2274                                 if (tcb->timer.state != TcptimerON)
2275                                         tcpgo(tpriv, &tcb->timer);
2276                 }
2277
2278                 if ((seg.flags & URG) && seg.urg) {
2279                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2280                                 tcb->rcv.urg = seg.urg + seg.seq;
2281                                 pullblock(&bp, seg.urg);
2282                         }
2283                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2284                         tcb->rcv.urg = tcb->rcv.nxt;
2285
2286                 if (length == 0) {
2287                         if (bp != NULL)
2288                                 freeblist(bp);
2289                 } else {
2290                         switch (tcb->state) {
2291                                 default:
2292                                         /* Ignore segment text */
2293                                         if (bp != NULL)
2294                                                 freeblist(bp);
2295                                         break;
2296
2297                                 case Syn_received:
2298                                 case Established:
2299                                 case Finwait1:
2300                                         /* If we still have some data place on
2301                                          * receive queue
2302                                          */
2303                                         if (bp) {
2304                                                 bp = packblock(bp);
2305                                                 if (bp == NULL)
2306                                                         panic("tcp packblock");
2307                                                 qpassnolim(s->rq, bp);
2308                                                 bp = NULL;
2309
2310                                                 /*
2311                                                  *  Force an ack every 2 data messages.  This is
2312                                                  *  a hack for rob to make his home system run
2313                                                  *  faster.
2314                                                  *
2315                                                  *  this also keeps the standard TCP congestion
2316                                                  *  control working since it needs an ack every
2317                                                  *  2 max segs worth.  This is not quite that,
2318                                                  *  but under a real stream is equivalent since
2319                                                  *  every packet has a max seg in it.
2320                                                  */
2321                                                 if (++(tcb->rcv.una) >= 2)
2322                                                         tcb->flags |= FORCE;
2323                                         }
2324                                         tcb->rcv.nxt += length;
2325
2326                                         /*
2327                                          *  update our rcv window
2328                                          */
2329                                         tcprcvwin(s);
2330
2331                                         /*
2332                                          *  turn on the acktimer if there's something
2333                                          *  to ack
2334                                          */
2335                                         if (tcb->acktimer.state != TcptimerON)
2336                                                 tcpgo(tpriv, &tcb->acktimer);
2337
2338                                         break;
2339                                 case Finwait2:
2340                                         /* no process to read the data, send a reset */
2341                                         if (bp != NULL)
2342                                                 freeblist(bp);
2343                                         sndrst(tcp, source, dest, length, &seg, version,
2344                                                    "send to Finwait2");
2345                                         qunlock(&s->qlock);
2346                                         poperror();
2347                                         return;
2348                         }
2349                 }
2350
2351                 if (seg.flags & FIN) {
2352                         tcb->flags |= FORCE;
2353
2354                         switch (tcb->state) {
2355                                 case Syn_received:
2356                                 case Established:
2357                                         tcb->rcv.nxt++;
2358                                         tcpsetstate(s, Close_wait);
2359                                         break;
2360                                 case Finwait1:
2361                                         tcb->rcv.nxt++;
2362                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2363                                                 tcphalt(tpriv, &tcb->rtt_timer);
2364                                                 tcphalt(tpriv, &tcb->acktimer);
2365                                                 tcphalt(tpriv, &tcb->katimer);
2366                                                 tcpsetstate(s, Time_wait);
2367                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2368                                                 tcpgo(tpriv, &tcb->timer);
2369                                         } else
2370                                                 tcpsetstate(s, Closing);
2371                                         break;
2372                                 case Finwait2:
2373                                         tcb->rcv.nxt++;
2374                                         tcphalt(tpriv, &tcb->rtt_timer);
2375                                         tcphalt(tpriv, &tcb->acktimer);
2376                                         tcphalt(tpriv, &tcb->katimer);
2377                                         tcpsetstate(s, Time_wait);
2378                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2379                                         tcpgo(tpriv, &tcb->timer);
2380                                         break;
2381                                 case Close_wait:
2382                                 case Closing:
2383                                 case Last_ack:
2384                                         break;
2385                                 case Time_wait:
2386                                         tcpgo(tpriv, &tcb->timer);
2387                                         break;
2388                         }
2389                 }
2390
2391                 /*
2392                  *  get next adjacent segment from the resequence queue.
2393                  *  dump/trim any overlapping segments
2394                  */
2395                 for (;;) {
2396                         if (tcb->reseq == NULL)
2397                                 goto output;
2398
2399                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2400                                 goto output;
2401
2402                         getreseq(tcb, &seg, &bp, &length);
2403
2404                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2405                                 break;
2406                 }
2407         }
2408 output:
2409         tcpoutput(s);
2410         qunlock(&s->qlock);
2411         poperror();
2412         return;
2413 raise:
2414         qunlock(&s->qlock);
2415         poperror();
2416         freeblist(bp);
2417         tcpkick(s);
2418 }
2419
2420 /*
2421  *  always enters and exits with the s locked.  We drop
2422  *  the lock to ipoput the packet so some care has to be
2423  *  taken by callers.
2424  */
2425 void tcpoutput(struct conv *s)
2426 {
2427         Tcp seg;
2428         int msgs;
2429         Tcpctl *tcb;
2430         struct block *hbp, *bp;
2431         int sndcnt, n;
2432         uint32_t ssize, dsize, usable, sent;
2433         struct Fs *f;
2434         struct tcppriv *tpriv;
2435         uint8_t version;
2436
2437         f = s->p->f;
2438         tpriv = s->p->priv;
2439         version = s->ipversion;
2440
2441         for (msgs = 0; msgs < 100; msgs++) {
2442                 tcb = (Tcpctl *) s->ptcl;
2443
2444                 switch (tcb->state) {
2445                         case Listen:
2446                         case Closed:
2447                         case Finwait2:
2448                                 return;
2449                 }
2450
2451                 /* force an ack when a window has opened up */
2452                 if (tcb->rcv.blocked && tcb->rcv.wnd > 0) {
2453                         tcb->rcv.blocked = 0;
2454                         tcb->flags |= FORCE;
2455                 }
2456
2457                 sndcnt = qlen(s->wq) + tcb->flgcnt;
2458                 sent = tcb->snd.ptr - tcb->snd.una;
2459
2460                 /* Don't send anything else until our SYN has been acked */
2461                 if (tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2462                         break;
2463
2464                 /* Compute usable segment based on offered window and limit
2465                  * window probes to one
2466                  */
2467                 if (tcb->snd.wnd == 0) {
2468                         if (sent != 0) {
2469                                 if ((tcb->flags & FORCE) == 0)
2470                                         break;
2471 //              tcb->snd.ptr = tcb->snd.una;
2472                         }
2473                         usable = 1;
2474                 } else {
2475                         usable = tcb->cwind;
2476                         if (tcb->snd.wnd < usable)
2477                                 usable = tcb->snd.wnd;
2478                         usable -= sent;
2479                 }
2480                 ssize = sndcnt - sent;
2481                 if (ssize && usable < 2)
2482                         netlog(s->p->f, Logtcp, "throttled snd.wnd %lu cwind %lu\n",
2483                                    tcb->snd.wnd, tcb->cwind);
2484                 if (usable < ssize)
2485                         ssize = usable;
2486                 if (ssize > tcb->mss) {
2487                         if ((tcb->flags & TSO) == 0) {
2488                                 ssize = tcb->mss;
2489                         } else {
2490                                 int segs, window;
2491
2492                                 /*  Don't send too much.  32K is arbitrary..
2493                                  */
2494                                 if (ssize > 32 * 1024)
2495                                         ssize = 32 * 1024;
2496
2497                                 /* Clamp xmit to an integral MSS to
2498                                  * avoid ragged tail segments causing
2499                                  * poor link utilization.  Also
2500                                  * account for each segment sent in
2501                                  * msg heuristic, and round up to the
2502                                  * next multiple of 4, to ensure we
2503                                  * still yeild.
2504                                  */
2505                                 segs = ssize / tcb->mss;
2506                                 ssize = segs * tcb->mss;
2507                                 msgs += segs;
2508                                 if (segs > 3)
2509                                         msgs = (msgs + 4) & ~3;
2510                         }
2511                 }
2512
2513                 dsize = ssize;
2514                 seg.urg = 0;
2515
2516                 if (ssize == 0)
2517                         if ((tcb->flags & FORCE) == 0)
2518                                 break;
2519
2520                 tcb->flags &= ~FORCE;
2521                 tcprcvwin(s);
2522
2523                 /* By default we will generate an ack */
2524                 tcphalt(tpriv, &tcb->acktimer);
2525                 tcb->rcv.una = 0;
2526                 seg.source = s->lport;
2527                 seg.dest = s->rport;
2528                 seg.flags = ACK;
2529                 seg.mss = 0;
2530                 seg.ws = 0;
2531                 switch (tcb->state) {
2532                         case Syn_sent:
2533                                 seg.flags = 0;
2534                                 if (tcb->snd.ptr == tcb->iss) {
2535                                         seg.flags |= SYN;
2536                                         dsize--;
2537                                         seg.mss = tcb->mss;
2538                                         seg.ws = tcb->scale;
2539                                 }
2540                                 break;
2541                         case Syn_received:
2542                                 /*
2543                                  *  don't send any data with a SYN/ACK packet
2544                                  *  because Linux rejects the packet in its
2545                                  *  attempt to solve the SYN attack problem
2546                                  */
2547                                 if (tcb->snd.ptr == tcb->iss) {
2548                                         seg.flags |= SYN;
2549                                         dsize = 0;
2550                                         ssize = 1;
2551                                         seg.mss = tcb->mss;
2552                                         seg.ws = tcb->scale;
2553                                 }
2554                                 break;
2555                 }
2556                 seg.seq = tcb->snd.ptr;
2557                 seg.ack = tcb->rcv.nxt;
2558                 seg.wnd = tcb->rcv.wnd;
2559
2560                 /* Pull out data to send */
2561                 bp = NULL;
2562                 if (dsize != 0) {
2563                         bp = qcopy(s->wq, dsize, sent);
2564                         if (BLEN(bp) != dsize) {
2565                                 seg.flags |= FIN;
2566                                 dsize--;
2567                         }
2568                         if (BLEN(bp) > tcb->mss) {
2569                                 bp->flag |= Btso;
2570                                 bp->mss = tcb->mss;
2571                         }
2572                 }
2573
2574                 if (sent + dsize == sndcnt)
2575                         seg.flags |= PSH;
2576
2577                 /* keep track of balance of resent data */
2578                 if (seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2579                         n = tcb->snd.nxt - tcb->snd.ptr;
2580                         if (ssize < n)
2581                                 n = ssize;
2582                         tcb->resent += n;
2583                         netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr 0x%lx nxt 0x%lx\n",
2584                                    s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr,
2585                                    tcb->snd.nxt);
2586                         tpriv->stats[RetransSegs]++;
2587                 }
2588
2589                 tcb->snd.ptr += ssize;
2590
2591                 /* Pull up the send pointer so we can accept acks
2592                  * for this window
2593                  */
2594                 if (seq_gt(tcb->snd.ptr, tcb->snd.nxt))
2595                         tcb->snd.nxt = tcb->snd.ptr;
2596
2597                 /* Build header, link data and compute cksum */
2598                 switch (version) {
2599                         case V4:
2600                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2601                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2602                                 if (hbp == NULL) {
2603                                         freeblist(bp);
2604                                         return;
2605                                 }
2606                                 break;
2607                         case V6:
2608                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2609                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2610                                 if (hbp == NULL) {
2611                                         freeblist(bp);
2612                                         return;
2613                                 }
2614                                 break;
2615                         default:
2616                                 hbp = NULL;     /* to suppress a warning */
2617                                 panic("tcpoutput: version %d", version);
2618                 }
2619
2620                 /* Start the transmission timers if there is new data and we
2621                  * expect acknowledges
2622                  */
2623                 if (ssize != 0) {
2624                         if (tcb->timer.state != TcptimerON)
2625                                 tcpgo(tpriv, &tcb->timer);
2626
2627                         /*  If round trip timer isn't running, start it.
2628                          *  measure the longest packet only in case the
2629                          *  transmission time dominates RTT
2630                          */
2631                         if (tcb->rtt_timer.state != TcptimerON)
2632                                 if (ssize == tcb->mss) {
2633                                         tcpgo(tpriv, &tcb->rtt_timer);
2634                                         tcb->rttseq = tcb->snd.ptr;
2635                                 }
2636                 }
2637
2638                 tpriv->stats[OutSegs]++;
2639
2640                 /* put off the next keep alive */
2641                 tcpgo(tpriv, &tcb->katimer);
2642
2643                 switch (version) {
2644                         case V4:
2645                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2646                                         /* a negative return means no route */
2647                                         localclose(s, "no route");
2648                                 }
2649                                 break;
2650                         case V6:
2651                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2652                                         /* a negative return means no route */
2653                                         localclose(s, "no route");
2654                                 }
2655                                 break;
2656                         default:
2657                                 panic("tcpoutput2: version %d", version);
2658                 }
2659                 if ((msgs % 4) == 1) {
2660                         qunlock(&s->qlock);
2661                         kthread_yield();
2662                         qlock(&s->qlock);
2663                 }
2664         }
2665 }
2666
2667 /*
2668  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
2669  */
2670 void tcpsendka(struct conv *s)
2671 {
2672         Tcp seg;
2673         Tcpctl *tcb;
2674         struct block *hbp, *dbp;
2675
2676         tcb = (Tcpctl *) s->ptcl;
2677
2678         dbp = NULL;
2679         seg.urg = 0;
2680         seg.source = s->lport;
2681         seg.dest = s->rport;
2682         seg.flags = ACK | PSH;
2683         seg.mss = 0;
2684         seg.ws = 0;
2685         if (tcpporthogdefense)
2686                 urandom_read(&seg.seq, sizeof(seg.seq));
2687         else
2688                 seg.seq = tcb->snd.una - 1;
2689         seg.ack = tcb->rcv.nxt;
2690         tcb->rcv.una = 0;
2691         seg.wnd = tcb->rcv.wnd;
2692         if (tcb->state == Finwait2) {
2693                 seg.flags |= FIN;
2694         } else {
2695                 dbp = allocb(1);
2696                 dbp->wp++;
2697         }
2698
2699         if (isv4(s->raddr)) {
2700                 /* Build header, link data and compute cksum */
2701                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2702                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2703                 if (hbp == NULL) {
2704                         freeblist(dbp);
2705                         return;
2706                 }
2707                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2708         } else {
2709                 /* Build header, link data and compute cksum */
2710                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2711                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2712                 if (hbp == NULL) {
2713                         freeblist(dbp);
2714                         return;
2715                 }
2716                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2717         }
2718 }
2719
2720 /*
2721  *  set connection to time out after 12 minutes
2722  */
2723 void tcpsetkacounter(Tcpctl * tcb)
2724 {
2725         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
2726         if (tcb->kacounter < 3)
2727                 tcb->kacounter = 3;
2728 }
2729
2730 /*
2731  *  if we've timed out, close the connection
2732  *  otherwise, send a keepalive and restart the timer
2733  */
2734 void tcpkeepalive(void *v)
2735 {
2736         ERRSTACK(1);
2737         Tcpctl *tcb;
2738         struct conv *s;
2739
2740         s = v;
2741         tcb = (Tcpctl *) s->ptcl;
2742         qlock(&s->qlock);
2743         if (waserror()) {
2744                 qunlock(&s->qlock);
2745                 nexterror();
2746         }
2747         if (tcb->state != Closed) {
2748                 if (--(tcb->kacounter) <= 0) {
2749                         localclose(s, "connection timed out");
2750                 } else {
2751                         tcpsendka(s);
2752                         tcpgo(s->p->priv, &tcb->katimer);
2753                 }
2754         }
2755         qunlock(&s->qlock);
2756         poperror();
2757 }
2758
2759 /*
2760  *  start keepalive timer
2761  */
2762 static void tcpstartka(struct conv *s, char **f, int n)
2763 {
2764         Tcpctl *tcb;
2765         int x;
2766
2767         tcb = (Tcpctl *) s->ptcl;
2768         if (tcb->state != Established)
2769                 error(ENOTCONN, "connection must be in Establised state");
2770         if (n > 1) {
2771                 x = atoi(f[1]);
2772                 if (x >= MSPTICK)
2773                         tcb->katimer.start = x / MSPTICK;
2774         }
2775         tcpsetkacounter(tcb);
2776         tcpgo(s->p->priv, &tcb->katimer);
2777 }
2778
2779 /*
2780  *  turn checksums on/off
2781  */
2782 static void tcpsetchecksum(struct conv *s, char **f, int unused)
2783 {
2784         Tcpctl *tcb;
2785
2786         tcb = (Tcpctl *) s->ptcl;
2787         tcb->nochecksum = !atoi(f[1]);
2788 }
2789
2790 void tcprxmit(struct conv *s)
2791 {
2792         Tcpctl *tcb;
2793
2794         tcb = (Tcpctl *) s->ptcl;
2795
2796         tcb->flags |= RETRAN | FORCE;
2797         tcb->snd.ptr = tcb->snd.una;
2798
2799         /*
2800          *  We should be halving the slow start threshhold (down to one
2801          *  mss) but leaving it at mss seems to work well enough
2802          */
2803         tcb->ssthresh = tcb->mss;
2804
2805         /*
2806          *  pull window down to a single packet
2807          */
2808         tcb->cwind = tcb->mss;
2809         tcpoutput(s);
2810 }
2811
2812 void tcptimeout(void *arg)
2813 {
2814         ERRSTACK(1);
2815         struct conv *s;
2816         Tcpctl *tcb;
2817         int maxback;
2818         struct tcppriv *tpriv;
2819
2820         s = (struct conv *)arg;
2821         tpriv = s->p->priv;
2822         tcb = (Tcpctl *) s->ptcl;
2823
2824         qlock(&s->qlock);
2825         if (waserror()) {
2826                 qunlock(&s->qlock);
2827                 nexterror();
2828         }
2829         switch (tcb->state) {
2830                 default:
2831                         tcb->backoff++;
2832                         if (tcb->state == Syn_sent)
2833                                 maxback = MAXBACKMS / 2;
2834                         else
2835                                 maxback = MAXBACKMS;
2836                         tcb->backedoff += tcb->timer.start * MSPTICK;
2837                         if (tcb->backedoff >= maxback) {
2838                                 localclose(s, "connection timed out");
2839                                 break;
2840                         }
2841                         netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lx %llu/%llu\n",
2842                                    tcb->snd.una, tcb->timer.start, NOW);
2843                         tcpsettimer(tcb);
2844                         tcprxmit(s);
2845                         tpriv->stats[RetransTimeouts]++;
2846                         tcb->snd.dupacks = 0;
2847                         break;
2848                 case Time_wait:
2849                         localclose(s, NULL);
2850                         break;
2851                 case Closed:
2852                         break;
2853         }
2854         qunlock(&s->qlock);
2855         poperror();
2856 }
2857
2858 int inwindow(Tcpctl * tcb, int seq)
2859 {
2860         return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1);
2861 }
2862
2863 /*
2864  *  set up state for a received SYN (or SYN ACK) packet
2865  */
2866 void procsyn(struct conv *s, Tcp * seg)
2867 {
2868         Tcpctl *tcb;
2869
2870         tcb = (Tcpctl *) s->ptcl;
2871         tcb->flags |= FORCE;
2872
2873         tcb->rcv.nxt = seg->seq + 1;
2874         tcb->rcv.urg = tcb->rcv.nxt;
2875         tcb->irs = seg->seq;
2876
2877         /* our sending max segment size cannot be bigger than what he asked for */
2878         if (seg->mss != 0 && seg->mss < tcb->mss)
2879                 tcb->mss = seg->mss;
2880
2881         /* the congestion window always starts out as a single segment */
2882         tcb->snd.wnd = seg->wnd;
2883         tcb->cwind = tcb->mss;
2884 }
2885
2886 int
2887 addreseq(Tcpctl * tcb, struct tcppriv *tpriv, Tcp * seg,
2888                  struct block *bp, uint16_t length)
2889 {
2890         Reseq *rp, *rp1;
2891         int i, rqlen, qmax;
2892
2893         rp = kzmalloc(sizeof(Reseq), 0);
2894         if (rp == NULL) {
2895                 freeblist(bp);  /* bp always consumed by add_reseq */
2896                 return 0;
2897         }
2898
2899         rp->seg = *seg;
2900         rp->bp = bp;
2901         rp->length = length;
2902
2903         /* Place on reassembly list sorting by starting seq number */
2904         rp1 = tcb->reseq;
2905         if (rp1 == NULL || seq_lt(seg->seq, rp1->seg.seq)) {
2906                 rp->next = rp1;
2907                 tcb->reseq = rp;
2908                 if (rp->next != NULL)
2909                         tpriv->stats[OutOfOrder]++;
2910                 return 0;
2911         }
2912
2913         rqlen = 0;
2914         for (i = 0;; i++) {
2915                 rqlen += rp1->length;
2916                 if (rp1->next == NULL || seq_lt(seg->seq, rp1->next->seg.seq)) {
2917                         rp->next = rp1->next;
2918                         rp1->next = rp;
2919                         if (rp->next != NULL)
2920                                 tpriv->stats[OutOfOrder]++;
2921                         break;
2922                 }
2923                 rp1 = rp1->next;
2924         }
2925         qmax = QMAX << tcb->rcv.scale;
2926         if (rqlen > qmax) {
2927                 printd("resequence queue > window: %d > %d\n", rqlen, qmax);
2928                 i = 0;
2929                 for (rp1 = tcb->reseq; rp1 != NULL; rp1 = rp1->next) {
2930                         printd("0x%#lx 0x%#lx 0x%#x\n", rp1->seg.seq,
2931                                    rp1->seg.ack, rp1->seg.flags);
2932                         if (i++ > 10) {
2933                                 printd("...\n");
2934                                 break;
2935                         }
2936                 }
2937
2938                 // delete entire reassembly queue; wait for retransmit.
2939                 // - should we be smarter and only delete the tail?
2940                 for (rp = tcb->reseq; rp != NULL; rp = rp1) {
2941                         rp1 = rp->next;
2942                         freeblist(rp->bp);
2943                         kfree(rp);
2944                 }
2945                 tcb->reseq = NULL;
2946
2947                 return -1;
2948         }
2949         return 0;
2950 }
2951
2952 void getreseq(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2953 {
2954         Reseq *rp;
2955
2956         rp = tcb->reseq;
2957         if (rp == NULL)
2958                 return;
2959
2960         tcb->reseq = rp->next;
2961
2962         *seg = rp->seg;
2963         *bp = rp->bp;
2964         *length = rp->length;
2965
2966         kfree(rp);
2967 }
2968
2969 int tcptrim(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2970 {
2971         uint16_t len;
2972         uint8_t accept;
2973         int dupcnt, excess;
2974
2975         accept = 0;
2976         len = *length;
2977         if (seg->flags & SYN)
2978                 len++;
2979         if (seg->flags & FIN)
2980                 len++;
2981
2982         if (tcb->rcv.wnd == 0) {
2983                 if (len == 0 && seg->seq == tcb->rcv.nxt)
2984                         return 0;
2985         } else {
2986                 /* Some part of the segment should be in the window */
2987                 if (inwindow(tcb, seg->seq))
2988                         accept++;
2989                 else if (len != 0) {
2990                         if (inwindow(tcb, seg->seq + len - 1) ||
2991                                 seq_within(tcb->rcv.nxt, seg->seq, seg->seq + len - 1))
2992                                 accept++;
2993                 }
2994         }
2995         if (!accept) {
2996                 freeblist(*bp);
2997                 return -1;
2998         }
2999         dupcnt = tcb->rcv.nxt - seg->seq;
3000         if (dupcnt > 0) {
3001                 tcb->rerecv += dupcnt;
3002                 if (seg->flags & SYN) {
3003                         seg->flags &= ~SYN;
3004                         seg->seq++;
3005
3006                         if (seg->urg > 1)
3007                                 seg->urg--;
3008                         else
3009                                 seg->flags &= ~URG;
3010                         dupcnt--;
3011                 }
3012                 if (dupcnt > 0) {
3013                         pullblock(bp, (uint16_t) dupcnt);
3014                         seg->seq += dupcnt;
3015                         *length -= dupcnt;
3016
3017                         if (seg->urg > dupcnt)
3018                                 seg->urg -= dupcnt;
3019                         else {
3020                                 seg->flags &= ~URG;
3021                                 seg->urg = 0;
3022                         }
3023                 }
3024         }
3025         excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3026         if (excess > 0) {
3027                 tcb->rerecv += excess;
3028                 *length -= excess;
3029                 *bp = trimblock(*bp, 0, *length);
3030                 if (*bp == NULL)
3031                         panic("presotto is a boofhead");
3032                 seg->flags &= ~FIN;
3033         }
3034         return 0;
3035 }
3036
3037 void tcpadvise(struct Proto *tcp, struct block *bp, char *msg)
3038 {
3039         Tcp4hdr *h4;
3040         Tcp6hdr *h6;
3041         Tcpctl *tcb;
3042         uint8_t source[IPaddrlen];
3043         uint8_t dest[IPaddrlen];
3044         uint16_t psource, pdest;
3045         struct conv *s, **p;
3046
3047         h4 = (Tcp4hdr *) (bp->rp);
3048         h6 = (Tcp6hdr *) (bp->rp);
3049
3050         if ((h4->vihl & 0xF0) == IP_VER4) {
3051                 v4tov6(dest, h4->tcpdst);
3052                 v4tov6(source, h4->tcpsrc);
3053                 psource = nhgets(h4->tcpsport);
3054                 pdest = nhgets(h4->tcpdport);
3055         } else {
3056                 ipmove(dest, h6->tcpdst);
3057                 ipmove(source, h6->tcpsrc);
3058                 psource = nhgets(h6->tcpsport);
3059                 pdest = nhgets(h6->tcpdport);
3060         }
3061
3062         /* Look for a connection */
3063         qlock(&tcp->qlock);
3064         for (p = tcp->conv; *p; p++) {
3065                 s = *p;
3066                 tcb = (Tcpctl *) s->ptcl;
3067                 if (s->rport == pdest)
3068                         if (s->lport == psource)
3069                                 if (tcb->state != Closed)
3070                                         if (ipcmp(s->raddr, dest) == 0)
3071                                                 if (ipcmp(s->laddr, source) == 0) {
3072                                                         qlock(&s->qlock);
3073                                                         qunlock(&tcp->qlock);
3074                                                         switch (tcb->state) {
3075                                                                 case Syn_sent:
3076                                                                         localclose(s, msg);
3077                                                                         break;
3078                                                         }
3079                                                         qunlock(&s->qlock);
3080                                                         freeblist(bp);
3081                                                         return;
3082                                                 }
3083         }
3084         qunlock(&tcp->qlock);
3085         freeblist(bp);
3086 }
3087
3088 static void tcpporthogdefensectl(char *val)
3089 {
3090         if (strcmp(val, "on") == 0)
3091                 tcpporthogdefense = 1;
3092         else if (strcmp(val, "off") == 0)
3093                 tcpporthogdefense = 0;
3094         else
3095                 error(EINVAL, "unknown value for tcpporthogdefense");
3096 }
3097
3098 /* called with c qlocked */
3099 static void tcpctl(struct conv *c, char **f, int n)
3100 {
3101         if (n == 1 && strcmp(f[0], "hangup") == 0)
3102                 tcphangup(c);
3103         else if (n >= 1 && strcmp(f[0], "keepalive") == 0)
3104                 tcpstartka(c, f, n);
3105         else if (n >= 1 && strcmp(f[0], "checksum") == 0)
3106                 tcpsetchecksum(c, f, n);
3107         else if (n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3108                 tcpporthogdefensectl(f[1]);
3109         else
3110                 error(EINVAL, "unknown command to %s", __func__);
3111 }
3112
3113 int tcpstats(struct Proto *tcp, char *buf, int len)
3114 {
3115         struct tcppriv *priv;
3116         char *p, *e;
3117         int i;
3118
3119         priv = tcp->priv;
3120         p = buf;
3121         e = p + len;
3122         for (i = 0; i < Nstats; i++)
3123                 p = seprintf(p, e, "%s: %u\n", statnames[i], priv->stats[i]);
3124         return p - buf;
3125 }
3126
3127 /*
3128  *  garbage collect any stale conversations:
3129  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3130  *      - Finwait2 after 5 minutes
3131  *
3132  *  this is called whenever we run out of channels.  Both checks are
3133  *  of questionable validity so we try to use them only when we're
3134  *  up against the wall.
3135  */
3136 int tcpgc(struct Proto *tcp)
3137 {
3138         struct conv *c, **pp, **ep;
3139         int n;
3140         Tcpctl *tcb;
3141
3142         n = 0;
3143         ep = &tcp->conv[tcp->nc];
3144         for (pp = tcp->conv; pp < ep; pp++) {
3145                 c = *pp;
3146                 if (c == NULL)
3147                         break;
3148                 if (!canqlock(&c->qlock))
3149                         continue;
3150                 tcb = (Tcpctl *) c->ptcl;
3151                 switch (tcb->state) {
3152                         case Syn_received:
3153                                 if (NOW - tcb->time > 5000) {
3154                                         localclose(c, "timed out");
3155                                         n++;
3156                                 }
3157                                 break;
3158                         case Finwait2:
3159                                 if (NOW - tcb->time > 5 * 60 * 1000) {
3160                                         localclose(c, "timed out");
3161                                         n++;
3162                                 }
3163                                 break;
3164                 }
3165                 qunlock(&c->qlock);
3166         }
3167         return n;
3168 }
3169
3170 void tcpsettimer(Tcpctl * tcb)
3171 {
3172         int x;
3173
3174         /* round trip dependency */
3175         x = backoff(tcb->backoff) *
3176                 (tcb->mdev + (tcb->srtt >> LOGAGAIN) + MSPTICK) / MSPTICK;
3177
3178         /* bounded twixt 1/2 and 64 seconds */
3179         if (x < 500 / MSPTICK)
3180                 x = 500 / MSPTICK;
3181         else if (x > (64000 / MSPTICK))
3182                 x = 64000 / MSPTICK;
3183         tcb->timer.start = x;
3184 }
3185
3186 void tcpinit(struct Fs *fs)
3187 {
3188         struct Proto *tcp;
3189         struct tcppriv *tpriv;
3190
3191         tcp = kzmalloc(sizeof(struct Proto), 0);
3192         tpriv = tcp->priv = kzmalloc(sizeof(struct tcppriv), 0);
3193         qlock_init(&tpriv->tl);
3194         qlock_init(&tpriv->apl);
3195         tcp->name = "tcp";
3196         tcp->connect = tcpconnect;
3197         tcp->announce = tcpannounce;
3198         tcp->ctl = tcpctl;
3199         tcp->state = tcpstate;
3200         tcp->create = tcpcreate;
3201         tcp->close = tcpclose;
3202         tcp->shutdown = tcpshutdown;
3203         tcp->rcv = tcpiput;
3204         tcp->advise = tcpadvise;
3205         tcp->stats = tcpstats;
3206         tcp->inuse = tcpinuse;
3207         tcp->gc = tcpgc;
3208         tcp->ipproto = IP_TCPPROTO;
3209         tcp->nc = scalednconv();
3210         tcp->ptclsize = sizeof(Tcpctl);
3211         tpriv->stats[MaxConn] = tcp->nc;
3212
3213         Fsproto(fs, tcp);
3214 }
3215
3216 void
3217 tcpsetscale(struct conv *s, Tcpctl * tcb, uint16_t rcvscale, uint16_t sndscale)
3218 {
3219         if (rcvscale) {
3220                 tcb->rcv.scale = rcvscale & 0xff;
3221                 tcb->snd.scale = sndscale & 0xff;
3222                 tcb->window = QMAX << tcb->snd.scale;
3223                 qsetlimit(s->rq, tcb->window);
3224         } else {
3225                 tcb->rcv.scale = 0;
3226                 tcb->snd.scale = 0;
3227                 tcb->window = QMAX;
3228                 qsetlimit(s->rq, tcb->window);
3229         }
3230 }