Fix waserror/lock order
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 #include <vfs.h>
44 #include <kfs.h>
45 #include <slab.h>
46 #include <kmalloc.h>
47 #include <kref.h>
48 #include <string.h>
49 #include <stdio.h>
50 #include <assert.h>
51 #include <error.h>
52 #include <cpio.h>
53 #include <pmap.h>
54 #include <smp.h>
55 #include <ip.h>
56
/*
 * Constants used throughout the TCP implementation: queue limits, header
 * geometry, timer parameters, header flag bits, option codes, control-block
 * flags and connection states.
 */
enum {
	QMAX = 64 * 1024 - 1,	/* read-queue limit and default receive window */
	IP_TCPPROTO = 6,	/* stored in the prototype header's proto field */

	/* IPv4 header sizes */
	TCP4_IPLEN = 8,
	TCP4_PHDRSIZE = 12,
	TCP4_HDRSIZE = 20,
	TCP4_TCBPHDRSZ = 40,
	TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE,

	/* IPv6 header sizes */
	TCP6_IPLEN = 0,
	TCP6_PHDRSIZE = 40,
	TCP6_HDRSIZE = 20,
	TCP6_TCBPHDRSZ = 60,
	TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE,

	/* Tcptimer states */
	TcptimerOFF = 0,
	TcptimerON = 1,
	TcptimerDONE = 2,

	MAX_TIME = 1 << 20,		/* Forever */
	TCP_ACK = 50,			/* Timed ack sequence in ms */
	MAXBACKMS = 9 * 60 * 1000,	/* longest backoff time (ms) before hangup */

	/* TCP header flag bits */
	URG = 1 << 5,	/* 0x20: Data marked urgent */
	ACK = 1 << 4,	/* 0x10: Acknowledge is valid */
	PSH = 1 << 3,	/* 0x08: Whole data pipe is pushed */
	RST = 1 << 2,	/* 0x04: Reset connection */
	SYN = 1 << 1,	/* 0x02: Pkt. is synchronise */
	FIN = 1 << 0,	/* 0x01: Start close down */

	/* TCP option kinds and lengths */
	EOLOPT = 0,
	NOOPOPT = 1,
	MSSOPT = 2,
	MSS_LENGTH = 4,	/* length of the maximum segment size option */
	WSOPT = 3,
	WS_LENGTH = 3,	/* length of the window scale option */

	MSL2 = 10,
	MSPTICK = 50,		/* Milliseconds per timer tick */
	DEF_MSS = 1460,		/* Default maximum segment size */
	DEF_MSS6 = 1280,	/* Default maximum segment size (min) for v6 */
	DEF_RTT = 500,		/* Default round trip */
	DEF_KAT = 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN = 0,		/* Listen connection */
	TCP_CONNECT = 1,	/* Outgoing connection */
	SYNACK_RXTIMER = 250,	/* ms between SYNACK retransmits */

	TCPREXMTTHRESH = 3,	/* dupack threshold for rxt */

	/* Tcpctl.flags bits */
	FORCE = 1 << 0,
	CLONE = 1 << 1,
	RETRAN = 1 << 2,
	ACTIVE = 1 << 3,
	SYNACK = 1 << 4,
	TSO = 1 << 5,

	LOGAGAIN = 3,
	LOGDGAIN = 2,

	/* Connection states; tcpstates[] must list them in this order */
	Closed = 0,
	Listen = 1,
	Syn_sent = 2,
	Syn_received = 3,
	Established = 4,
	Finwait1 = 5,
	Finwait2 = 6,
	Close_wait = 7,
	Closing = 8,
	Last_ack = 9,
	Time_wait = 10,

	Maxlimbo = 1000,	/* maximum procs waiting for response to SYN ACK */
	NLHT = 256,		/* hash table size, must be a power of 2 */
	LHTMASK = NLHT - 1,

	HaveWS = 1 << 8,
};
133
/*
 * Printable names of the connection states, indexed by state value.
 * Must correspond to the connection-state enumeration above.
 */
char *tcpstates[] = {
	"Closed",
	"Listen",
	"Syn_sent",
	"Syn_received",
	"Established",
	"Finwait1",
	"Finwait2",
	"Close_wait",
	"Closing",
	"Last_ack",
	"Time_wait"
};
140
/*
 *  A countdown timer driven by tcpackproc.  While a timer is ON it is
 *  linked on the per-protocol timer list and its count is decremented
 *  once per tick; when the count reaches zero func(arg) is called.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer {
	Tcptimer *next;		/* links on the active timer list */
	Tcptimer *prev;
	Tcptimer *readynext;	/* links timers that expired during one tick */
	int state;		/* TcptimerOFF, TcptimerON or TcptimerDONE */
	uint64_t start;		/* ticks loaded into count when started */
	uint64_t count;		/* ticks left until expiry */
	void (*func) (void *);	/* called on expiry, if not NULL */
	void *arg;		/* argument passed to func */
};
152
/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr {
	/* leading IP header bytes */
	uint8_t vihl;		/* Version and header length */
	uint8_t tos;		/* Type of service */
	uint8_t length[2];	/* packet length */
	uint8_t id[2];		/* Identification */
	uint8_t frag[2];	/* Fragment information */

	/* pseudo header */
	uint8_t Unused;
	uint8_t proto;
	uint8_t tcplen[2];
	uint8_t tcpsrc[4];
	uint8_t tcpdst[4];

	/* TCP header proper */
	uint8_t tcpsport[2];
	uint8_t tcpdport[2];
	uint8_t tcpseq[4];
	uint8_t tcpack[4];
	uint8_t tcpflag[2];
	uint8_t tcpwin[2];
	uint8_t tcpcksum[2];
	uint8_t tcpurg[2];

	/* Options segment */
	uint8_t tcpopt[1];
};
180
181 typedef struct Tcp6hdr Tcp6hdr;
182 struct Tcp6hdr {
183         uint8_t vcf[4];
184         uint8_t ploadlen[2];
185         uint8_t proto;
186         uint8_t ttl;
187         uint8_t tcpsrc[IPaddrlen];
188         uint8_t tcpdst[IPaddrlen];
189         uint8_t tcpsport[2];
190         uint8_t tcpdport[2];
191         uint8_t tcpseq[4];
192         uint8_t tcpack[4];
193         uint8_t tcpflag[2];
194         uint8_t tcpwin[2];
195         uint8_t tcpcksum[2];
196         uint8_t tcpurg[2];
197         /* Options segment */
198         uint8_t tcpopt[1];
199 };
200
/*
 *  Control information for a single packet, held in host form.
 *  It is derived from a packet in ntohtcp{4,6}() and stuck into
 *  a packet in htontcp{4,6}().
 */
typedef struct Tcp Tcp;
struct Tcp {
	uint16_t source;
	uint16_t dest;
	uint32_t seq;
	uint32_t ack;
	uint8_t flags;
	uint16_t ws;	/* window scale option (if not zero) */
	uint32_t wnd;
	uint16_t urg;
	uint16_t mss;	/* max segment size option (if not zero) */
	uint16_t len;	/* size of data */
};
220
221 /*
222  *  this header is malloc'd to thread together fragments
223  *  waiting to be coalesced
224  */
225 typedef struct Reseq Reseq;
226 struct Reseq {
227         Reseq *next;
228         Tcp seg;
229         struct block *bp;
230         uint16_t length;
231 };
232
233 /*
234  *  the qlock in the Conv locks this structure
235  */
236 typedef struct Tcpctl Tcpctl;
237 struct Tcpctl {
238         uint8_t state;                          /* Connection state */
239         uint8_t type;                           /* Listening or active connection */
240         uint8_t code;                           /* Icmp code */
241         struct {
242                 uint32_t una;                   /* Unacked data pointer */
243                 uint32_t nxt;                   /* Next sequence expected */
244                 uint32_t ptr;                   /* Data pointer */
245                 uint32_t wnd;                   /* Tcp send window */
246                 uint32_t urg;                   /* Urgent data pointer */
247                 uint32_t wl2;
248                 int scale;                              /* how much to right shift window in xmitted packets */
249                 /* to implement tahoe and reno TCP */
250                 uint32_t dupacks;               /* number of duplicate acks rcvd */
251                 int recovery;                   /* loss recovery flag */
252                 uint32_t rxt;                   /* right window marker for recovery */
253         } snd;
254         struct {
255                 uint32_t nxt;                   /* Receive pointer to next uint8_t slot */
256                 uint32_t wnd;                   /* Receive window incoming */
257                 uint32_t urg;                   /* Urgent pointer */
258                 int blocked;
259                 int una;                                /* unacked data segs */
260                 int scale;                              /* how much to left shift window in rcved packets */
261         } rcv;
262         uint32_t iss;                           /* Initial sequence number */
263         int sawwsopt;                           /* true if we saw a wsopt on the incoming SYN */
264         uint32_t cwind;                         /* Congestion window */
265         int scale;                                      /* desired snd.scale */
266         uint16_t ssthresh;                      /* Slow start threshold */
267         int resent;                                     /* Bytes just resent */
268         int irs;                                        /* Initial received squence */
269         uint16_t mss;                           /* Mean segment size */
270         int rerecv;                                     /* Overlap of data rerecevived */
271         uint32_t window;                        /* Recevive window */
272         uint8_t backoff;                        /* Exponential backoff counter */
273         int backedoff;                          /* ms we've backed off for rexmits */
274         uint8_t flags;                          /* State flags */
275         Reseq *reseq;                           /* Resequencing queue */
276         Tcptimer timer;                         /* Activity timer */
277         Tcptimer acktimer;                      /* Acknowledge timer */
278         Tcptimer rtt_timer;                     /* Round trip timer */
279         Tcptimer katimer;                       /* keep alive timer */
280         uint32_t rttseq;                        /* Round trip sequence */
281         int srtt;                                       /* Shortened round trip */
282         int mdev;                                       /* Mean deviation of round trip */
283         int kacounter;                          /* count down for keep alive */
284         uint64_t sndsyntime;            /* time syn sent */
285         uint64_t time;                          /* time Finwait2 or Syn_received was sent */
286         int nochecksum;                         /* non-zero means don't send checksums */
287         int flgcnt;                                     /* number of flags in the sequence (FIN,SEQ) */
288
289         union {
290                 Tcp4hdr tcp4hdr;
291                 Tcp6hdr tcp6hdr;
292         } protohdr;                                     /* prototype header */
293 };
294
295 /*
296  *  New calls are put in limbo rather than having a conversation structure
297  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
298  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
299  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
300  *
301  *  In particular they aren't on a listener's queue so that they don't figure
302  *  in the input queue limit.
303  *
304  *  If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue
305  *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
306  *  there is no hashing of this list.
307  */
308 typedef struct Limbo Limbo;
309 struct Limbo {
310         Limbo *next;
311
312         uint8_t laddr[IPaddrlen];
313         uint8_t raddr[IPaddrlen];
314         uint16_t lport;
315         uint16_t rport;
316         uint32_t irs;                           /* initial received sequence */
317         uint32_t iss;                           /* initial sent sequence */
318         uint16_t mss;                           /* mss from the other end */
319         uint16_t rcvscale;                      /* how much to scale rcvd windows */
320         uint16_t sndscale;                      /* how much to scale sent windows */
321         uint64_t lastsend;                      /* last time we sent a synack */
322         uint8_t version;                        /* v4 or v6 */
323         uint8_t rexmits;                        /* number of retransmissions */
324 };
325
326 int tcp_irtt = DEF_RTT;                 /* Initial guess at round trip time */
327 uint16_t tcp_mss = DEF_MSS;             /* Maximum segment size to be sent */
328
/* Indices into the per-protocol stats[] counter array. */
enum {
	/* MIB stats */
	MaxConn,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats
};

/*
 * Printable name of each counter, indexed by the enumeration above.
 * Uses standard C99 designated initializers ("[index] = value") rather
 * than the obsolete GNU form without '='.
 */
static char *statnames[] = {
	[MaxConn] = "MaxConn",
	[ActiveOpens] = "ActiveOpens",
	[PassiveOpens] = "PassiveOpens",
	[EstabResets] = "EstabResets",
	[CurrEstab] = "CurrEstab",
	[InSegs] = "InSegs",
	[OutSegs] = "OutSegs",
	[RetransSegs] = "RetransSegs",
	[RetransTimeouts] = "RetransTimeouts",
	[InErrs] = "InErrs",
	[OutRsts] = "OutRsts",
	[CsumErrs] = "CsumErrs",
	[HlenErrs] = "HlenErrs",
	[LenErrs] = "LenErrs",
	[OutOfOrder] = "OutOfOrder",
};
369
370 typedef struct Tcppriv Tcppriv;
371 struct tcppriv {
372         /* List of active timers */
373         qlock_t tl;
374         Tcptimer *timers;
375
376         /* hash table for matching conversations */
377         struct Ipht ht;
378
379         /* calls in limbo waiting for an ACK to our SYN ACK */
380         int nlimbo;
381         Limbo *lht[NLHT];
382
383         /* for keeping track of tcpackproc */
384         qlock_t apl;
385         int ackprocstarted;
386
387         uint32_t stats[Nstats];
388 };
389
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
400
401 int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, uint16_t);
402 void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
403 void localclose(struct conv *, char *unused_char_p_t);
404 void procsyn(struct conv *, Tcp *);
405 void tcpiput(struct Proto *, struct Ipifc *, struct block *);
406 void tcpoutput(struct conv *);
407 int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
408 void tcpstart(struct conv *, int);
409 void tcptimeout(void *);
410 void tcpsndsyn(struct conv *, Tcpctl *);
411 void tcprcvwin(struct conv *);
412 void tcpacktimer(void *);
413 void tcpkeepalive(void *);
414 void tcpsetkacounter(Tcpctl *);
415 void tcprxmit(struct conv *);
416 void tcpsettimer(Tcpctl *);
417 void tcpsynackrtt(struct conv *);
418 void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
419
420 static void limborexmit(struct Proto *);
421 static void limbo(struct conv *, uint8_t * unused_uint8_p_t, uint8_t *, Tcp *,
422                                   int);
423
424 void tcpsetstate(struct conv *s, uint8_t newstate)
425 {
426         Tcpctl *tcb;
427         uint8_t oldstate;
428         struct tcppriv *tpriv;
429
430         tpriv = s->p->priv;
431
432         tcb = (Tcpctl *) s->ptcl;
433
434         oldstate = tcb->state;
435         if (oldstate == newstate)
436                 return;
437
438         if (oldstate == Established)
439                 tpriv->stats[CurrEstab]--;
440         if (newstate == Established)
441                 tpriv->stats[CurrEstab]++;
442
443         /**
444         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
445                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
446         **/
447
448         switch (newstate) {
449                 case Closed:
450                         qclose(s->rq);
451                         qclose(s->wq);
452                         qclose(s->eq);
453                         break;
454
455                 case Close_wait:        /* Remote closes */
456                         qhangup(s->rq, NULL);
457                         break;
458         }
459
460         tcb->state = newstate;
461
462         if (oldstate == Syn_sent && newstate != Closed)
463                 Fsconnected(s, NULL);
464 }
465
466 static char *tcpconnect(struct conv *c, char **argv, int argc)
467 {
468         char *e;
469
470         e = Fsstdconnect(c, argv, argc);
471         if (e != NULL)
472                 return e;
473         tcpstart(c, TCP_CONNECT);
474
475         return NULL;
476 }
477
478 static int tcpstate(struct conv *c, char *state, int n)
479 {
480         Tcpctl *s;
481
482         s = (Tcpctl *) (c->ptcl);
483
484         return snprintf(state, n,
485                                         "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
486                                         tcpstates[s->state],
487                                         c->rq ? qlen(c->rq) : 0,
488                                         c->wq ? qlen(c->wq) : 0,
489                                         s->srtt, s->mdev,
490                                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
491                                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
492                                         s->katimer.start, s->katimer.count);
493 }
494
495 static int tcpinuse(struct conv *c)
496 {
497         Tcpctl *s;
498
499         s = (Tcpctl *) (c->ptcl);
500         return s->state != Closed;
501 }
502
503 static char *tcpannounce(struct conv *c, char **argv, int argc)
504 {
505         char *e;
506
507         e = Fsstdannounce(c, argv, argc);
508         if (e != NULL)
509                 return e;
510         tcpstart(c, TCP_LISTEN);
511         Fsconnected(c, NULL);
512
513         return NULL;
514 }
515
516 /*
517  *  tcpclose is always called with the q locked
518  */
519 static void tcpclose(struct conv *c)
520 {
521         Tcpctl *tcb;
522
523         tcb = (Tcpctl *) c->ptcl;
524
525         qhangup(c->rq, NULL);
526         qhangup(c->wq, NULL);
527         qhangup(c->eq, NULL);
528         qflush(c->rq);
529
530         switch (tcb->state) {
531                 case Listen:
532                         /*
533                          *  reset any incoming calls to this listener
534                          */
535                         Fsconnected(c, "Hangup");
536
537                         localclose(c, NULL);
538                         break;
539                 case Closed:
540                 case Syn_sent:
541                         localclose(c, NULL);
542                         break;
543                 case Syn_received:
544                 case Established:
545                         tcb->flgcnt++;
546                         tcb->snd.nxt++;
547                         tcpsetstate(c, Finwait1);
548                         tcpoutput(c);
549                         break;
550                 case Close_wait:
551                         tcb->flgcnt++;
552                         tcb->snd.nxt++;
553                         tcpsetstate(c, Last_ack);
554                         tcpoutput(c);
555                         break;
556         }
557 }
558
559 void tcpkick(void *x)
560 {
561         ERRSTACK(1);
562         struct conv *s = x;
563         Tcpctl *tcb;
564
565         tcb = (Tcpctl *) s->ptcl;
566
567         qlock(&s->qlock);
568         if (waserror()) {
569                 qunlock(&s->qlock);
570                 nexterror();
571         }
572
573         switch (tcb->state) {
574                 case Syn_sent:
575                 case Syn_received:
576                 case Established:
577                 case Close_wait:
578                         /*
579                          * Push data
580                          */
581                         tcprcvwin(s);
582                         tcpoutput(s);
583                         break;
584                 default:
585                         localclose(s, "Hangup");
586                         break;
587         }
588
589         qunlock(&s->qlock);
590         poperror();
591 }
592
593 void tcprcvwin(struct conv *s)
594 {       /* Call with tcb locked */
595         int w;
596         Tcpctl *tcb;
597
598         tcb = (Tcpctl *) s->ptcl;
599         w = tcb->window - qlen(s->rq);
600         if (w < 0)
601                 w = 0;
602         tcb->rcv.wnd = w;
603         if (w == 0)
604                 tcb->rcv.blocked = 1;
605 }
606
607 void tcpacktimer(void *v)
608 {
609         ERRSTACK(1);
610         Tcpctl *tcb;
611         struct conv *s;
612
613         s = v;
614         tcb = (Tcpctl *) s->ptcl;
615
616         qlock(&s->qlock);
617         if (waserror()) {
618                 qunlock(&s->qlock);
619                 nexterror();
620         }
621         if (tcb->state != Closed) {
622                 tcb->flags |= FORCE;
623                 tcprcvwin(s);
624                 tcpoutput(s);
625         }
626         qunlock(&s->qlock);
627         poperror();
628 }
629
630 static void tcpcreate(struct conv *c)
631 {
632         c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
633         c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
634 }
635
636 static void timerstate(struct tcppriv *priv, Tcptimer * t, int newstate)
637 {
638         if (newstate != TcptimerON) {
639                 if (t->state == TcptimerON) {
640                         // unchain
641                         if (priv->timers == t) {
642                                 priv->timers = t->next;
643                                 if (t->prev != NULL)
644                                         panic("timerstate1");
645                         }
646                         if (t->next)
647                                 t->next->prev = t->prev;
648                         if (t->prev)
649                                 t->prev->next = t->next;
650                         t->next = t->prev = NULL;
651                 }
652         } else {
653                 if (t->state != TcptimerON) {
654                         // chain
655                         if (t->prev != NULL || t->next != NULL)
656                                 panic("timerstate2");
657                         t->prev = NULL;
658                         t->next = priv->timers;
659                         if (t->next)
660                                 t->next->prev = t;
661                         priv->timers = t;
662                 }
663         }
664         t->state = newstate;
665 }
666
667 void tcpackproc(void *a)
668 {
669         ERRSTACK(1);
670         Tcptimer *t, *tp, *timeo;
671         struct Proto *tcp;
672         struct tcppriv *priv;
673         int loop;
674
675         tcp = a;
676         priv = tcp->priv;
677
678         for (;;) {
679                 kthread_usleep(MSPTICK * 1000);
680
681                 qlock(&priv->tl);
682                 timeo = NULL;
683                 loop = 0;
684                 for (t = priv->timers; t != NULL; t = tp) {
685                         if (loop++ > 10000)
686                                 panic("tcpackproc1");
687                         tp = t->next;
688                         if (t->state == TcptimerON) {
689                                 t->count--;
690                                 if (t->count == 0) {
691                                         timerstate(priv, t, TcptimerDONE);
692                                         t->readynext = timeo;
693                                         timeo = t;
694                                 }
695                         }
696                 }
697                 qunlock(&priv->tl);
698
699                 loop = 0;
700                 for (t = timeo; t != NULL; t = t->readynext) {
701                         if (loop++ > 10000)
702                                 panic("tcpackproc2");
703                         if (t->state == TcptimerDONE && t->func != NULL) {
704                                 /* discard error style */
705                                 if (!waserror())
706                                         (*t->func) (t->arg);
707                                 poperror();
708                         }
709                 }
710
711                 limborexmit(tcp);
712         }
713 }
714
715 void tcpgo(struct tcppriv *priv, Tcptimer * t)
716 {
717         if (t == NULL || t->start == 0)
718                 return;
719
720         qlock(&priv->tl);
721         t->count = t->start;
722         timerstate(priv, t, TcptimerON);
723         qunlock(&priv->tl);
724 }
725
726 void tcphalt(struct tcppriv *priv, Tcptimer * t)
727 {
728         if (t == NULL)
729                 return;
730
731         qlock(&priv->tl);
732         timerstate(priv, t, TcptimerOFF);
733         qunlock(&priv->tl);
734 }
735
/*
 * Return the backoff multiplier 2^n.
 *
 * The shift count is clamped to [0, bits-in-int - 2] because shifting a
 * signed int by a negative amount, or far enough to reach the sign bit,
 * is undefined behavior in C.  Results for counts inside that range are
 * unchanged; larger counts return the largest power of two an int holds
 * and negative counts return 1.
 */
int backoff(int n)
{
	const int maxshift = (int) (sizeof(int) * 8) - 2;

	if (n < 0)
		n = 0;
	else if (n > maxshift)
		n = maxshift;

	return 1 << n;
}
740
741 void localclose(struct conv *s, char *reason)
742 {       /* called with tcb locked */
743         Tcpctl *tcb;
744         Reseq *rp, *rp1;
745         struct tcppriv *tpriv;
746
747         tpriv = s->p->priv;
748         tcb = (Tcpctl *) s->ptcl;
749
750         iphtrem(&tpriv->ht, s);
751
752         tcphalt(tpriv, &tcb->timer);
753         tcphalt(tpriv, &tcb->rtt_timer);
754         tcphalt(tpriv, &tcb->acktimer);
755         tcphalt(tpriv, &tcb->katimer);
756
757         /* Flush reassembly queue; nothing more can arrive */
758         for (rp = tcb->reseq; rp != NULL; rp = rp1) {
759                 rp1 = rp->next;
760                 freeblist(rp->bp);
761                 kfree(rp);
762         }
763         tcb->reseq = NULL;
764
765         if (tcb->state == Syn_sent)
766                 Fsconnected(s, reason);
767
768         qhangup(s->rq, reason);
769         qhangup(s->wq, reason);
770
771         tcpsetstate(s, Closed);
772
773         /* listener will check the rq state */
774         if (s->state == Announced)
775                 rendez_wakeup(&s->listenr);
776 }
777
778 /* mtu (- TCP + IP hdr len) of 1st hop */
779 int tcpmtu(struct Proto *tcp, uint8_t * addr, int version, int *scale,
780            uint8_t *flags)
781 {
782         struct Ipifc *ifc;
783         int mtu;
784
785         ifc = findipifc(tcp->f, addr, 0);
786         switch (version) {
787                 default:
788                 case V4:
789                         mtu = DEF_MSS;
790                         if (ifc != NULL)
791                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
792                         break;
793                 case V6:
794                         mtu = DEF_MSS6;
795                         if (ifc != NULL)
796                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
797                         break;
798         }
799         *flags &= ~TSO;
800
801         if (ifc != NULL) {
802                 if (ifc->mbps > 100)
803                         *scale = HaveWS | 3;
804                 else if (ifc->mbps > 10)
805                         *scale = HaveWS | 1;
806                 else
807                         *scale = HaveWS | 0;
808                 if (ifc->feat & NETF_TSO)
809                         *flags |= TSO;
810         } else
811                 *scale = HaveWS | 0;
812
813         return mtu;
814 }
815
816 void inittcpctl(struct conv *s, int mode)
817 {
818         Tcpctl *tcb;
819         Tcp4hdr *h4;
820         Tcp6hdr *h6;
821         int mss;
822
823         tcb = (Tcpctl *) s->ptcl;
824
825         memset(tcb, 0, sizeof(Tcpctl));
826
827         tcb->ssthresh = 65535;
828         tcb->srtt = tcp_irtt << LOGAGAIN;
829         tcb->mdev = 0;
830
831         /* setup timers */
832         tcb->timer.start = tcp_irtt / MSPTICK;
833         tcb->timer.func = tcptimeout;
834         tcb->timer.arg = s;
835         tcb->rtt_timer.start = MAX_TIME;
836         tcb->acktimer.start = TCP_ACK / MSPTICK;
837         tcb->acktimer.func = tcpacktimer;
838         tcb->acktimer.arg = s;
839         tcb->katimer.start = DEF_KAT / MSPTICK;
840         tcb->katimer.func = tcpkeepalive;
841         tcb->katimer.arg = s;
842
843         mss = DEF_MSS;
844
845         /* create a prototype(pseudo) header */
846         if (mode != TCP_LISTEN) {
847                 if (ipcmp(s->laddr, IPnoaddr) == 0)
848                         findlocalip(s->p->f, s->laddr, s->raddr);
849
850                 switch (s->ipversion) {
851                         case V4:
852                                 h4 = &tcb->protohdr.tcp4hdr;
853                                 memset(h4, 0, sizeof(*h4));
854                                 h4->proto = IP_TCPPROTO;
855                                 hnputs(h4->tcpsport, s->lport);
856                                 hnputs(h4->tcpdport, s->rport);
857                                 v6tov4(h4->tcpsrc, s->laddr);
858                                 v6tov4(h4->tcpdst, s->raddr);
859                                 break;
860                         case V6:
861                                 h6 = &tcb->protohdr.tcp6hdr;
862                                 memset(h6, 0, sizeof(*h6));
863                                 h6->proto = IP_TCPPROTO;
864                                 hnputs(h6->tcpsport, s->lport);
865                                 hnputs(h6->tcpdport, s->rport);
866                                 ipmove(h6->tcpsrc, s->laddr);
867                                 ipmove(h6->tcpdst, s->raddr);
868                                 mss = DEF_MSS6;
869                                 break;
870                         default:
871                                 panic("inittcpctl: version %d", s->ipversion);
872                 }
873         }
874
875         tcb->mss = tcb->cwind = mss;
876
877         /* default is no window scaling */
878         tcb->window = QMAX;
879         tcb->rcv.wnd = QMAX;
880         tcb->rcv.scale = 0;
881         tcb->snd.scale = 0;
882         qsetlimit(s->rq, QMAX);
883 }
884
885 /*
886  *  called with s qlocked
887  */
888 void tcpstart(struct conv *s, int mode)
889 {
890         Tcpctl *tcb;
891         struct tcppriv *tpriv;
892         /* tcpackproc needs to free this if it ever exits */
893         char *kpname = kmalloc(KNAMELEN, KMALLOC_WAIT);
894
895         tpriv = s->p->priv;
896
897         if (tpriv->ackprocstarted == 0) {
898                 qlock(&tpriv->apl);
899                 if (tpriv->ackprocstarted == 0) {
900                         snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
901                         ktask(kpname, tcpackproc, s->p);
902                         tpriv->ackprocstarted = 1;
903                 }
904                 qunlock(&tpriv->apl);
905         }
906
907         tcb = (Tcpctl *) s->ptcl;
908
909         inittcpctl(s, mode);
910
911         iphtadd(&tpriv->ht, s);
912         switch (mode) {
913                 case TCP_LISTEN:
914                         tpriv->stats[PassiveOpens]++;
915                         tcb->flags |= CLONE;
916                         tcpsetstate(s, Listen);
917                         break;
918
919                 case TCP_CONNECT:
920                         tpriv->stats[ActiveOpens]++;
921                         tcb->flags |= ACTIVE;
922                         tcpsndsyn(s, tcb);
923                         tcpsetstate(s, Syn_sent);
924                         tcpoutput(s);
925                         break;
926         }
927 }
928
929 static char *tcpflag(uint16_t flag)
930 {
931         static char buf[128];
932
933         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
934         if (flag & URG)
935                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
936         if (flag & ACK)
937                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
938         if (flag & PSH)
939                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
940         if (flag & RST)
941                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
942         if (flag & SYN)
943                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
944         if (flag & FIN)
945                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
946
947         return buf;
948 }
949
950 struct block *htontcp6(Tcp * tcph, struct block *data, Tcp6hdr * ph,
951                                            Tcpctl * tcb)
952 {
953         int dlen;
954         Tcp6hdr *h;
955         uint16_t csum;
956         uint16_t hdrlen, optpad = 0;
957         uint8_t *opt;
958
959         hdrlen = TCP6_HDRSIZE;
960         if (tcph->flags & SYN) {
961                 if (tcph->mss)
962                         hdrlen += MSS_LENGTH;
963                 if (tcph->ws)
964                         hdrlen += WS_LENGTH;
965                 optpad = hdrlen & 3;
966                 if (optpad)
967                         optpad = 4 - optpad;
968                 hdrlen += optpad;
969         }
970
971         if (data) {
972                 dlen = blocklen(data);
973                 data = padblock(data, hdrlen + TCP6_PKT);
974                 if (data == NULL)
975                         return NULL;
976         } else {
977                 dlen = 0;
978                 data = allocb(hdrlen + TCP6_PKT + 64);  /* the 64 pad is to meet mintu's */
979                 if (data == NULL)
980                         return NULL;
981                 data->wp += hdrlen + TCP6_PKT;
982         }
983
984         /* copy in pseudo ip header plus port numbers */
985         h = (Tcp6hdr *) (data->rp);
986         memmove(h, ph, TCP6_TCBPHDRSZ);
987
988         /* compose pseudo tcp header, do cksum calculation */
989         hnputl(h->vcf, hdrlen + dlen);
990         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
991         h->ttl = ph->proto;
992
993         /* copy in variable bits */
994         hnputl(h->tcpseq, tcph->seq);
995         hnputl(h->tcpack, tcph->ack);
996         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
997         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
998         hnputs(h->tcpurg, tcph->urg);
999
1000         if (tcph->flags & SYN) {
1001                 opt = h->tcpopt;
1002                 if (tcph->mss != 0) {
1003                         *opt++ = MSSOPT;
1004                         *opt++ = MSS_LENGTH;
1005                         hnputs(opt, tcph->mss);
1006                         opt += 2;
1007                 }
1008                 if (tcph->ws != 0) {
1009                         *opt++ = WSOPT;
1010                         *opt++ = WS_LENGTH;
1011                         *opt++ = tcph->ws;
1012                 }
1013                 while (optpad-- > 0)
1014                         *opt++ = NOOPOPT;
1015         }
1016
1017         if (tcb != NULL && tcb->nochecksum) {
1018                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1019         } else {
1020                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
1021                 hnputs(h->tcpcksum, csum);
1022         }
1023
1024         /* move from pseudo header back to normal ip header */
1025         memset(h->vcf, 0, 4);
1026         h->vcf[0] = IP_VER6;
1027         hnputs(h->ploadlen, hdrlen + dlen);
1028         h->proto = ph->proto;
1029
1030         return data;
1031 }
1032
1033 struct block *htontcp4(Tcp * tcph, struct block *data, Tcp4hdr * ph,
1034                                            Tcpctl * tcb)
1035 {
1036         int dlen;
1037         Tcp4hdr *h;
1038         uint16_t csum;
1039         uint16_t hdrlen, optpad = 0;
1040         uint8_t *opt;
1041
1042         hdrlen = TCP4_HDRSIZE;
1043         if (tcph->flags & SYN) {
1044                 if (tcph->mss)
1045                         hdrlen += MSS_LENGTH;
1046                 if (tcph->ws)
1047                         hdrlen += WS_LENGTH;
1048                 optpad = hdrlen & 3;
1049                 if (optpad)
1050                         optpad = 4 - optpad;
1051                 hdrlen += optpad;
1052         }
1053
1054         if (data) {
1055                 dlen = blocklen(data);
1056                 data = padblock(data, hdrlen + TCP4_PKT);
1057                 if (data == NULL)
1058                         return NULL;
1059         } else {
1060                 dlen = 0;
1061                 data = allocb(hdrlen + TCP4_PKT + 64);  /* the 64 pad is to meet mintu's */
1062                 if (data == NULL)
1063                         return NULL;
1064                 data->wp += hdrlen + TCP4_PKT;
1065         }
1066
1067         /* copy in pseudo ip header plus port numbers */
1068         h = (Tcp4hdr *) (data->rp);
1069         memmove(h, ph, TCP4_TCBPHDRSZ);
1070
1071         /* copy in variable bits */
1072         hnputs(h->tcplen, hdrlen + dlen);
1073         hnputl(h->tcpseq, tcph->seq);
1074         hnputl(h->tcpack, tcph->ack);
1075         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1076         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1077         hnputs(h->tcpurg, tcph->urg);
1078
1079         if (tcph->flags & SYN) {
1080                 opt = h->tcpopt;
1081                 if (tcph->mss != 0) {
1082                         *opt++ = MSSOPT;
1083                         *opt++ = MSS_LENGTH;
1084                         hnputs(opt, tcph->mss);
1085                         opt += 2;
1086                 }
1087                 if (tcph->ws != 0) {
1088                         *opt++ = WSOPT;
1089                         *opt++ = WS_LENGTH;
1090                         *opt++ = tcph->ws;
1091                 }
1092                 while (optpad-- > 0)
1093                         *opt++ = NOOPOPT;
1094         }
1095
1096         if (tcb != NULL && tcb->nochecksum) {
1097                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1098         } else {
1099                 csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
1100                 hnputs(h->tcpcksum, csum);
1101                 data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
1102                 data->checksum_offset = ph->tcpcksum - ph->tcpsport;
1103                 data->flag |= Btcpck;
1104         }
1105
1106         return data;
1107 }
1108
1109 int ntohtcp6(Tcp * tcph, struct block **bpp)
1110 {
1111         Tcp6hdr *h;
1112         uint8_t *optr;
1113         uint16_t hdrlen;
1114         uint16_t optlen;
1115         int n;
1116
1117         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
1118         if (*bpp == NULL)
1119                 return -1;
1120
1121         h = (Tcp6hdr *) ((*bpp)->rp);
1122         tcph->source = nhgets(h->tcpsport);
1123         tcph->dest = nhgets(h->tcpdport);
1124         tcph->seq = nhgetl(h->tcpseq);
1125         tcph->ack = nhgetl(h->tcpack);
1126         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1127         if (hdrlen < TCP6_HDRSIZE) {
1128                 freeblist(*bpp);
1129                 return -1;
1130         }
1131
1132         tcph->flags = h->tcpflag[1];
1133         tcph->wnd = nhgets(h->tcpwin);
1134         tcph->urg = nhgets(h->tcpurg);
1135         tcph->mss = 0;
1136         tcph->ws = 0;
1137         tcph->len = nhgets(h->ploadlen) - hdrlen;
1138
1139         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
1140         if (*bpp == NULL)
1141                 return -1;
1142
1143         optr = h->tcpopt;
1144         n = hdrlen - TCP6_HDRSIZE;
1145         while (n > 0 && *optr != EOLOPT) {
1146                 if (*optr == NOOPOPT) {
1147                         n--;
1148                         optr++;
1149                         continue;
1150                 }
1151                 optlen = optr[1];
1152                 if (optlen < 2 || optlen > n)
1153                         break;
1154                 switch (*optr) {
1155                         case MSSOPT:
1156                                 if (optlen == MSS_LENGTH)
1157                                         tcph->mss = nhgets(optr + 2);
1158                                 break;
1159                         case WSOPT:
1160                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1161                                         tcph->ws = HaveWS | *(optr + 2);
1162                                 break;
1163                 }
1164                 n -= optlen;
1165                 optr += optlen;
1166         }
1167         return hdrlen;
1168 }
1169
1170 int ntohtcp4(Tcp * tcph, struct block **bpp)
1171 {
1172         Tcp4hdr *h;
1173         uint8_t *optr;
1174         uint16_t hdrlen;
1175         uint16_t optlen;
1176         int n;
1177
1178         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
1179         if (*bpp == NULL)
1180                 return -1;
1181
1182         h = (Tcp4hdr *) ((*bpp)->rp);
1183         tcph->source = nhgets(h->tcpsport);
1184         tcph->dest = nhgets(h->tcpdport);
1185         tcph->seq = nhgetl(h->tcpseq);
1186         tcph->ack = nhgetl(h->tcpack);
1187
1188         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1189         if (hdrlen < TCP4_HDRSIZE) {
1190                 freeblist(*bpp);
1191                 return -1;
1192         }
1193
1194         tcph->flags = h->tcpflag[1];
1195         tcph->wnd = nhgets(h->tcpwin);
1196         tcph->urg = nhgets(h->tcpurg);
1197         tcph->mss = 0;
1198         tcph->ws = 0;
1199         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1200
1201         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1202         if (*bpp == NULL)
1203                 return -1;
1204
1205         optr = h->tcpopt;
1206         n = hdrlen - TCP4_HDRSIZE;
1207         while (n > 0 && *optr != EOLOPT) {
1208                 if (*optr == NOOPOPT) {
1209                         n--;
1210                         optr++;
1211                         continue;
1212                 }
1213                 optlen = optr[1];
1214                 if (optlen < 2 || optlen > n)
1215                         break;
1216                 switch (*optr) {
1217                         case MSSOPT:
1218                                 if (optlen == MSS_LENGTH)
1219                                         tcph->mss = nhgets(optr + 2);
1220                                 break;
1221                         case WSOPT:
1222                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1223                                         tcph->ws = HaveWS | *(optr + 2);
1224                                 break;
1225                 }
1226                 n -= optlen;
1227                 optr += optlen;
1228         }
1229         return hdrlen;
1230 }
1231
1232 /*
1233  *  For outgiing calls, generate an initial sequence
1234  *  number and put a SYN on the send queue
1235  */
1236 void tcpsndsyn(struct conv *s, Tcpctl * tcb)
1237 {
1238         tcb->iss = (nrand(1 << 16) << 16) | nrand(1 << 16);
1239         tcb->rttseq = tcb->iss;
1240         tcb->snd.wl2 = tcb->iss;
1241         tcb->snd.una = tcb->iss;
1242         tcb->snd.ptr = tcb->rttseq;
1243         tcb->snd.nxt = tcb->rttseq;
1244         tcb->flgcnt++;
1245         tcb->flags |= FORCE;
1246         tcb->sndsyntime = NOW;
1247
1248         /* set desired mss and scale */
1249         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale,
1250                           &tcb->flags);
1251 }
1252
1253 void
1254 sndrst(struct Proto *tcp, uint8_t * source, uint8_t * dest,
1255            uint16_t length, Tcp * seg, uint8_t version, char *reason)
1256 {
1257         struct block *hbp;
1258         uint8_t rflags;
1259         struct tcppriv *tpriv;
1260         Tcp4hdr ph4;
1261         Tcp6hdr ph6;
1262
1263         netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1264
1265         tpriv = tcp->priv;
1266
1267         if (seg->flags & RST)
1268                 return;
1269
1270         /* make pseudo header */
1271         switch (version) {
1272                 case V4:
1273                         memset(&ph4, 0, sizeof(ph4));
1274                         ph4.vihl = IP_VER4;
1275                         v6tov4(ph4.tcpsrc, dest);
1276                         v6tov4(ph4.tcpdst, source);
1277                         ph4.proto = IP_TCPPROTO;
1278                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1279                         hnputs(ph4.tcpsport, seg->dest);
1280                         hnputs(ph4.tcpdport, seg->source);
1281                         break;
1282                 case V6:
1283                         memset(&ph6, 0, sizeof(ph6));
1284                         ph6.vcf[0] = IP_VER6;
1285                         ipmove(ph6.tcpsrc, dest);
1286                         ipmove(ph6.tcpdst, source);
1287                         ph6.proto = IP_TCPPROTO;
1288                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1289                         hnputs(ph6.tcpsport, seg->dest);
1290                         hnputs(ph6.tcpdport, seg->source);
1291                         break;
1292                 default:
1293                         panic("sndrst: version %d", version);
1294         }
1295
1296         tpriv->stats[OutRsts]++;
1297         rflags = RST;
1298
1299         /* convince the other end that this reset is in band */
1300         if (seg->flags & ACK) {
1301                 seg->seq = seg->ack;
1302                 seg->ack = 0;
1303         } else {
1304                 rflags |= ACK;
1305                 seg->ack = seg->seq;
1306                 seg->seq = 0;
1307                 if (seg->flags & SYN)
1308                         seg->ack++;
1309                 seg->ack += length;
1310                 if (seg->flags & FIN)
1311                         seg->ack++;
1312         }
1313         seg->flags = rflags;
1314         seg->wnd = 0;
1315         seg->urg = 0;
1316         seg->mss = 0;
1317         seg->ws = 0;
1318         switch (version) {
1319                 case V4:
1320                         hbp = htontcp4(seg, NULL, &ph4, NULL);
1321                         if (hbp == NULL)
1322                                 return;
1323                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1324                         break;
1325                 case V6:
1326                         hbp = htontcp6(seg, NULL, &ph6, NULL);
1327                         if (hbp == NULL)
1328                                 return;
1329                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1330                         break;
1331                 default:
1332                         panic("sndrst2: version %d", version);
1333         }
1334 }
1335
1336 /*
1337  *  send a reset to the remote side and close the conversation
1338  *  called with s qlocked
1339  */
1340 char *tcphangup(struct conv *s)
1341 {
1342         ERRSTACK(2);
1343         Tcp seg;
1344         Tcpctl *tcb;
1345         struct block *hbp;
1346
1347         tcb = (Tcpctl *) s->ptcl;
1348         if (waserror()) {
1349                 poperror();
1350                 return commonerror();
1351         }
1352         if (ipcmp(s->raddr, IPnoaddr)) {
1353                 /* discard error style, poperror regardless */
1354                 if (!waserror()) {
1355                         seg.flags = RST | ACK;
1356                         seg.ack = tcb->rcv.nxt;
1357                         tcb->rcv.una = 0;
1358                         seg.seq = tcb->snd.ptr;
1359                         seg.wnd = 0;
1360                         seg.urg = 0;
1361                         seg.mss = 0;
1362                         seg.ws = 0;
1363                         switch (s->ipversion) {
1364                                 case V4:
1365                                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1366                                         hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
1367                                         ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1368                                         break;
1369                                 case V6:
1370                                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1371                                         hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
1372                                         ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1373                                         break;
1374                                 default:
1375                                         panic("tcphangup: version %d", s->ipversion);
1376                         }
1377                 }
1378                 poperror();
1379         }
1380         localclose(s, NULL);
1381         poperror();
1382         return NULL;
1383 }
1384
1385 /*
1386  *  (re)send a SYN ACK
1387  */
1388 int sndsynack(struct Proto *tcp, Limbo * lp)
1389 {
1390         struct block *hbp;
1391         Tcp4hdr ph4;
1392         Tcp6hdr ph6;
1393         Tcp seg;
1394         int scale;
1395         uint8_t flag = 0;
1396
1397         /* make pseudo header */
1398         switch (lp->version) {
1399                 case V4:
1400                         memset(&ph4, 0, sizeof(ph4));
1401                         ph4.vihl = IP_VER4;
1402                         v6tov4(ph4.tcpsrc, lp->laddr);
1403                         v6tov4(ph4.tcpdst, lp->raddr);
1404                         ph4.proto = IP_TCPPROTO;
1405                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1406                         hnputs(ph4.tcpsport, lp->lport);
1407                         hnputs(ph4.tcpdport, lp->rport);
1408                         break;
1409                 case V6:
1410                         memset(&ph6, 0, sizeof(ph6));
1411                         ph6.vcf[0] = IP_VER6;
1412                         ipmove(ph6.tcpsrc, lp->laddr);
1413                         ipmove(ph6.tcpdst, lp->raddr);
1414                         ph6.proto = IP_TCPPROTO;
1415                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1416                         hnputs(ph6.tcpsport, lp->lport);
1417                         hnputs(ph6.tcpdport, lp->rport);
1418                         break;
1419                 default:
1420                         panic("sndrst: version %d", lp->version);
1421         }
1422
1423         seg.seq = lp->iss;
1424         seg.ack = lp->irs + 1;
1425         seg.flags = SYN | ACK;
1426         seg.urg = 0;
1427         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag);
1428         seg.wnd = QMAX;
1429
1430         /* if the other side set scale, we should too */
1431         if (lp->rcvscale) {
1432                 seg.ws = scale;
1433                 lp->sndscale = scale;
1434         } else {
1435                 seg.ws = 0;
1436                 lp->sndscale = 0;
1437         }
1438
1439         switch (lp->version) {
1440                 case V4:
1441                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1442                         if (hbp == NULL)
1443                                 return -1;
1444                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1445                         break;
1446                 case V6:
1447                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1448                         if (hbp == NULL)
1449                                 return -1;
1450                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1451                         break;
1452                 default:
1453                         panic("sndsnack: version %d", lp->version);
1454         }
1455         lp->lastsend = NOW;
1456         return 0;
1457 }
1458
/* limbo hash: last two address bytes plus the port, masked to the table
 * size.  Both arguments are parenthesized so any expression may be passed. */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1460
1461 /*
1462  *  put a call into limbo and respond with a SYN ACK
1463  *
1464  *  called with proto locked
1465  */
1466 static void
1467 limbo(struct conv *s, uint8_t * source, uint8_t * dest, Tcp * seg, int version)
1468 {
1469         Limbo *lp, **l;
1470         struct tcppriv *tpriv;
1471         int h;
1472
1473         tpriv = s->p->priv;
1474         h = hashipa(source, seg->source);
1475
1476         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1477                 lp = *l;
1478                 if (lp->lport != seg->dest || lp->rport != seg->source
1479                         || lp->version != version)
1480                         continue;
1481                 if (ipcmp(lp->raddr, source) != 0)
1482                         continue;
1483                 if (ipcmp(lp->laddr, dest) != 0)
1484                         continue;
1485
1486                 /* each new SYN restarts the retransmits */
1487                 lp->irs = seg->seq;
1488                 break;
1489         }
1490         lp = *l;
1491         if (lp == NULL) {
1492                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1493                         lp = tpriv->lht[h];
1494                         tpriv->lht[h] = lp->next;
1495                         lp->next = NULL;
1496                 } else {
1497                         lp = kzmalloc(sizeof(*lp), 0);
1498                         if (lp == NULL)
1499                                 return;
1500                         tpriv->nlimbo++;
1501                 }
1502                 *l = lp;
1503                 lp->version = version;
1504                 ipmove(lp->laddr, dest);
1505                 ipmove(lp->raddr, source);
1506                 lp->lport = seg->dest;
1507                 lp->rport = seg->source;
1508                 lp->mss = seg->mss;
1509                 lp->rcvscale = seg->ws;
1510                 lp->irs = seg->seq;
1511                 lp->iss = (nrand(1 << 16) << 16) | nrand(1 << 16);
1512         }
1513
1514         if (sndsynack(s->p, lp) < 0) {
1515                 *l = lp->next;
1516                 tpriv->nlimbo--;
1517                 kfree(lp);
1518         }
1519 }
1520
1521 /*
1522  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1523  */
1524 static void limborexmit(struct Proto *tcp)
1525 {
1526         struct tcppriv *tpriv;
1527         Limbo **l, *lp;
1528         int h;
1529         int seen;
1530         uint64_t now;
1531
1532         tpriv = tcp->priv;
1533
1534         if (!canqlock(&tcp->qlock))
1535                 return;
1536         seen = 0;
1537         now = NOW;
1538         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1539                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1540                         lp = *l;
1541                         seen++;
1542                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1543                                 continue;
1544
1545                         /* time it out after 1 second */
1546                         if (++(lp->rexmits) > 5) {
1547                                 tpriv->nlimbo--;
1548                                 *l = lp->next;
1549                                 kfree(lp);
1550                                 continue;
1551                         }
1552
1553                         /* if we're being attacked, don't bother resending SYN ACK's */
1554                         if (tpriv->nlimbo > 100)
1555                                 continue;
1556
1557                         if (sndsynack(tcp, lp) < 0) {
1558                                 tpriv->nlimbo--;
1559                                 *l = lp->next;
1560                                 kfree(lp);
1561                                 continue;
1562                         }
1563
1564                         l = &lp->next;
1565                 }
1566         }
1567         qunlock(&tcp->qlock);
1568 }
1569
1570 /*
1571  *  lookup call in limbo.  if found, throw it out.
1572  *
1573  *  called with proto locked
1574  */
1575 static void
1576 limborst(struct conv *s, Tcp * segp, uint8_t * src, uint8_t * dst,
1577                  uint8_t version)
1578 {
1579         Limbo *lp, **l;
1580         int h;
1581         struct tcppriv *tpriv;
1582
1583         tpriv = s->p->priv;
1584
1585         /* find a call in limbo */
1586         h = hashipa(src, segp->source);
1587         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1588                 lp = *l;
1589                 if (lp->lport != segp->dest || lp->rport != segp->source
1590                         || lp->version != version)
1591                         continue;
1592                 if (ipcmp(lp->laddr, dst) != 0)
1593                         continue;
1594                 if (ipcmp(lp->raddr, src) != 0)
1595                         continue;
1596
1597                 /* RST can only follow the SYN */
1598                 if (segp->seq == lp->irs + 1) {
1599                         tpriv->nlimbo--;
1600                         *l = lp->next;
1601                         kfree(lp);
1602                 }
1603                 break;
1604         }
1605 }
1606
1607 /*
1608  *  come here when we finally get an ACK to our SYN-ACK.
1609  *  lookup call in limbo.  if found, create a new conversation
1610  *
1611  *  called with proto locked
1612  */
1613 static struct conv *tcpincoming(struct conv *s, Tcp * segp, uint8_t * src,
1614                                                                 uint8_t * dst, uint8_t version)
1615 {
1616         struct conv *new;
1617         Tcpctl *tcb;
1618         struct tcppriv *tpriv;
1619         Tcp4hdr *h4;
1620         Tcp6hdr *h6;
1621         Limbo *lp, **l;
1622         int h;
1623
1624         /* unless it's just an ack, it can't be someone coming out of limbo */
1625         if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
1626                 return NULL;
1627
1628         tpriv = s->p->priv;
1629
1630         /* find a call in limbo */
1631         h = hashipa(src, segp->source);
1632         for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
1633                 netlog(s->p->f, Logtcp,
1634                            "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
1635                            segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
1636                            lp->lport, version, lp->version);
1637
1638                 if (lp->lport != segp->dest || lp->rport != segp->source
1639                         || lp->version != version)
1640                         continue;
1641                 if (ipcmp(lp->laddr, dst) != 0)
1642                         continue;
1643                 if (ipcmp(lp->raddr, src) != 0)
1644                         continue;
1645
1646                 /* we're assuming no data with the initial SYN */
1647                 if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
1648                         netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
1649                                    segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
1650                         lp = NULL;
1651                 } else {
1652                         tpriv->nlimbo--;
1653                         *l = lp->next;
1654                 }
1655                 break;
1656         }
1657         if (lp == NULL)
1658                 return NULL;
1659
1660         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1661         if (new == NULL)
1662                 return NULL;
1663
1664         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1665         tcb = (Tcpctl *) new->ptcl;
1666         tcb->flags &= ~CLONE;
1667         tcb->timer.arg = new;
1668         tcb->timer.state = TcptimerOFF;
1669         tcb->acktimer.arg = new;
1670         tcb->acktimer.state = TcptimerOFF;
1671         tcb->katimer.arg = new;
1672         tcb->katimer.state = TcptimerOFF;
1673         tcb->rtt_timer.arg = new;
1674         tcb->rtt_timer.state = TcptimerOFF;
1675
1676         tcb->irs = lp->irs;
1677         tcb->rcv.nxt = tcb->irs + 1;
1678         tcb->rcv.urg = tcb->rcv.nxt;
1679
1680         tcb->iss = lp->iss;
1681         tcb->rttseq = tcb->iss;
1682         tcb->snd.wl2 = tcb->iss;
1683         tcb->snd.una = tcb->iss + 1;
1684         tcb->snd.ptr = tcb->iss + 1;
1685         tcb->snd.nxt = tcb->iss + 1;
1686         tcb->flgcnt = 0;
1687         tcb->flags |= SYNACK;
1688
1689         /* our sending max segment size cannot be bigger than what he asked for */
1690         if (lp->mss != 0 && lp->mss < tcb->mss)
1691                 tcb->mss = lp->mss;
1692
1693         /* window scaling */
1694         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1695
1696         /* the congestion window always starts out as a single segment */
1697         tcb->snd.wnd = segp->wnd;
1698         tcb->cwind = tcb->mss;
1699
1700         /* set initial round trip time */
1701         tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
1702         tcpsynackrtt(new);
1703
1704         kfree(lp);
1705
1706         /* set up proto header */
1707         switch (version) {
1708                 case V4:
1709                         h4 = &tcb->protohdr.tcp4hdr;
1710                         memset(h4, 0, sizeof(*h4));
1711                         h4->proto = IP_TCPPROTO;
1712                         hnputs(h4->tcpsport, new->lport);
1713                         hnputs(h4->tcpdport, new->rport);
1714                         v6tov4(h4->tcpsrc, dst);
1715                         v6tov4(h4->tcpdst, src);
1716                         break;
1717                 case V6:
1718                         h6 = &tcb->protohdr.tcp6hdr;
1719                         memset(h6, 0, sizeof(*h6));
1720                         h6->proto = IP_TCPPROTO;
1721                         hnputs(h6->tcpsport, new->lport);
1722                         hnputs(h6->tcpdport, new->rport);
1723                         ipmove(h6->tcpsrc, dst);
1724                         ipmove(h6->tcpdst, src);
1725                         break;
1726                 default:
1727                         panic("tcpincoming: version %d", new->ipversion);
1728         }
1729
1730         tcpsetstate(new, Established);
1731
1732         iphtadd(&tpriv->ht, new);
1733
1734         return new;
1735 }
1736
/*
 *  Return 1 if x lies in the inclusive range [low, high], else 0.
 *  When low > high the range is taken to wrap around the top of the
 *  32-bit space: x is inside if it is at or above low, or at or below high.
 */
int seq_within(uint32_t x, uint32_t low, uint32_t high)
{
	/* wrapped range */
	if (low > high)
		return x >= low || x <= high;

	/* ordinary range */
	return low <= x && x <= high;
}
1748
/* Nonzero when the difference x - y, cast to int, is negative. */
int seq_lt(uint32_t x, uint32_t y)
{
	int dist = (int)(x - y);

	return dist < 0;
}
1753
/* Nonzero when the difference x - y, cast to int, is zero or negative. */
int seq_le(uint32_t x, uint32_t y)
{
	int dist = (int)(x - y);

	return dist <= 0;
}
1758
/* Nonzero when the difference x - y, cast to int, is positive. */
int seq_gt(uint32_t x, uint32_t y)
{
	int dist = (int)(x - y);

	return dist > 0;
}
1763
/* Nonzero when the difference x - y, cast to int, is zero or positive. */
int seq_ge(uint32_t x, uint32_t y)
{
	int dist = (int)(x - y);

	return dist >= 0;
}
1768
1769 /*
1770  *  use the time between the first SYN and it's ack as the
1771  *  initial round trip time
1772  */
1773 void tcpsynackrtt(struct conv *s)
1774 {
1775         Tcpctl *tcb;
1776         uint64_t delta;
1777         struct tcppriv *tpriv;
1778
1779         tcb = (Tcpctl *) s->ptcl;
1780         tpriv = s->p->priv;
1781
1782         delta = NOW - tcb->sndsyntime;
1783         tcb->srtt = delta << LOGAGAIN;
1784         tcb->mdev = delta << LOGDGAIN;
1785
1786         /* halt round trip timer */
1787         tcphalt(tpriv, &tcb->rtt_timer);
1788 }
1789
1790 void update(struct conv *s, Tcp * seg)
1791 {
1792         int rtt, delta;
1793         Tcpctl *tcb;
1794         uint32_t acked;
1795         uint32_t expand;
1796         struct tcppriv *tpriv;
1797
1798         tpriv = s->p->priv;
1799         tcb = (Tcpctl *) s->ptcl;
1800
1801         /* if everything has been acked, force output(?) */
1802         if (seq_gt(seg->ack, tcb->snd.nxt)) {
1803                 tcb->flags |= FORCE;
1804                 return;
1805         }
1806
1807         /* added by Dong Lin for fast retransmission */
1808         if (seg->ack == tcb->snd.una
1809                 && tcb->snd.una != tcb->snd.nxt
1810                 && seg->len == 0 && seg->wnd == tcb->snd.wnd) {
1811
1812                 /* this is a pure ack w/o window update */
1813                 netlog(s->p->f, Logtcprxmt, "dupack %lu ack %lu sndwnd %d advwin %d\n",
1814                            tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1815
1816                 if (++tcb->snd.dupacks == TCPREXMTTHRESH) {
1817                         /*
1818                          *  tahoe tcp rxt the packet, half sshthresh,
1819                          *  and set cwnd to one packet
1820                          */
1821                         tcb->snd.recovery = 1;
1822                         tcb->snd.rxt = tcb->snd.nxt;
1823                         netlog(s->p->f, Logtcprxmt, "fast rxt %lu, nxt %lu\n", tcb->snd.una,
1824                                    tcb->snd.nxt);
1825                         tcprxmit(s);
1826                 } else {
1827                         /* do reno tcp here. */
1828                 }
1829         }
1830
1831         /*
1832          *  update window
1833          */
1834         if (seq_gt(seg->ack, tcb->snd.wl2)
1835                 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
1836                 tcb->snd.wnd = seg->wnd;
1837                 tcb->snd.wl2 = seg->ack;
1838         }
1839
1840         if (!seq_gt(seg->ack, tcb->snd.una)) {
1841                 /*
1842                  *  don't let us hangup if sending into a closed window and
1843                  *  we're still getting acks
1844                  */
1845                 if ((tcb->flags & RETRAN) && tcb->snd.wnd == 0) {
1846                         tcb->backedoff = MAXBACKMS / 4;
1847                 }
1848                 return;
1849         }
1850
1851         /*
1852          *  any positive ack turns off fast rxt,
1853          *  (should we do new-reno on partial acks?)
1854          */
1855         if (!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1856                 tcb->snd.dupacks = 0;
1857                 tcb->snd.recovery = 0;
1858         } else
1859                 netlog(s->p->f, Logtcp, "rxt next %lu, cwin %u\n", seg->ack,
1860                            tcb->cwind);
1861
1862         /* Compute the new send window size */
1863         acked = seg->ack - tcb->snd.una;
1864
1865         /* avoid slow start and timers for SYN acks */
1866         if ((tcb->flags & SYNACK) == 0) {
1867                 tcb->flags |= SYNACK;
1868                 acked--;
1869                 tcb->flgcnt--;
1870                 goto done;
1871         }
1872
1873         /* slow start as long as we're not recovering from lost packets */
1874         if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
1875                 if (tcb->cwind < tcb->ssthresh) {
1876                         expand = tcb->mss;
1877                         if (acked < expand)
1878                                 expand = acked;
1879                 } else
1880                         expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1881
1882                 if (tcb->cwind + expand < tcb->cwind)
1883                         expand = tcb->snd.wnd - tcb->cwind;
1884                 if (tcb->cwind + expand > tcb->snd.wnd)
1885                         expand = tcb->snd.wnd - tcb->cwind;
1886                 tcb->cwind += expand;
1887         }
1888
1889         /* Adjust the timers according to the round trip time */
1890         if (tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1891                 tcphalt(tpriv, &tcb->rtt_timer);
1892                 if ((tcb->flags & RETRAN) == 0) {
1893                         tcb->backoff = 0;
1894                         tcb->backedoff = 0;
1895                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1896                         if (rtt == 0)
1897                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
1898                         rtt *= MSPTICK;
1899                         if (tcb->srtt == 0) {
1900                                 tcb->srtt = rtt << LOGAGAIN;
1901                                 tcb->mdev = rtt << LOGDGAIN;
1902                         } else {
1903                                 delta = rtt - (tcb->srtt >> LOGAGAIN);
1904                                 tcb->srtt += delta;
1905                                 if (tcb->srtt <= 0)
1906                                         tcb->srtt = 1;
1907
1908                                 delta = abs(delta) - (tcb->mdev >> LOGDGAIN);
1909                                 tcb->mdev += delta;
1910                                 if (tcb->mdev <= 0)
1911                                         tcb->mdev = 1;
1912                         }
1913                         tcpsettimer(tcb);
1914                 }
1915         }
1916
1917 done:
1918         if (qdiscard(s->wq, acked) < acked)
1919                 tcb->flgcnt--;
1920
1921         tcb->snd.una = seg->ack;
1922         if (seq_gt(seg->ack, tcb->snd.urg))
1923                 tcb->snd.urg = seg->ack;
1924
1925         if (tcb->snd.una != tcb->snd.nxt)
1926                 tcpgo(tpriv, &tcb->timer);
1927         else
1928                 tcphalt(tpriv, &tcb->timer);
1929
1930         if (seq_lt(tcb->snd.ptr, tcb->snd.una))
1931                 tcb->snd.ptr = tcb->snd.una;
1932
1933         tcb->flags &= ~RETRAN;
1934         tcb->backoff = 0;
1935         tcb->backedoff = 0;
1936 }
1937
1938 void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
1939 {
1940         ERRSTACK(1);
1941         Tcp seg;
1942         Tcp4hdr *h4;
1943         Tcp6hdr *h6;
1944         int hdrlen;
1945         Tcpctl *tcb;
1946         uint16_t length;
1947         uint8_t source[IPaddrlen], dest[IPaddrlen];
1948         struct conv *s;
1949         struct Fs *f;
1950         struct tcppriv *tpriv;
1951         uint8_t version;
1952
1953         f = tcp->f;
1954         tpriv = tcp->priv;
1955
1956         tpriv->stats[InSegs]++;
1957
1958         h4 = (Tcp4hdr *) (bp->rp);
1959         h6 = (Tcp6hdr *) (bp->rp);
1960
1961         if ((h4->vihl & 0xF0) == IP_VER4) {
1962                 version = V4;
1963                 length = nhgets(h4->length);
1964                 v4tov6(dest, h4->tcpdst);
1965                 v4tov6(source, h4->tcpsrc);
1966
1967                 h4->Unused = 0;
1968                 hnputs(h4->tcplen, length - TCP4_PKT);
1969                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1970                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
1971                         tpriv->stats[CsumErrs]++;
1972                         tpriv->stats[InErrs]++;
1973                         netlog(f, Logtcp, "bad tcp proto cksum\n");
1974                         freeblist(bp);
1975                         return;
1976                 }
1977
1978                 hdrlen = ntohtcp4(&seg, &bp);
1979                 if (hdrlen < 0) {
1980                         tpriv->stats[HlenErrs]++;
1981                         tpriv->stats[InErrs]++;
1982                         netlog(f, Logtcp, "bad tcp hdr len\n");
1983                         return;
1984                 }
1985
1986                 /* trim the packet to the size claimed by the datagram */
1987                 length -= hdrlen + TCP4_PKT;
1988                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
1989                 if (bp == NULL) {
1990                         tpriv->stats[LenErrs]++;
1991                         tpriv->stats[InErrs]++;
1992                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
1993                         return;
1994                 }
1995         } else {
1996                 int ttl = h6->ttl;
1997                 int proto = h6->proto;
1998
1999                 version = V6;
2000                 length = nhgets(h6->ploadlen);
2001                 ipmove(dest, h6->tcpdst);
2002                 ipmove(source, h6->tcpsrc);
2003
2004                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2005                 h6->ttl = proto;
2006                 hnputl(h6->vcf, length);
2007                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2008                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
2009                         tpriv->stats[CsumErrs]++;
2010                         tpriv->stats[InErrs]++;
2011                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2012                         freeblist(bp);
2013                         return;
2014                 }
2015                 h6->ttl = ttl;
2016                 h6->proto = proto;
2017                 hnputs(h6->ploadlen, length);
2018
2019                 hdrlen = ntohtcp6(&seg, &bp);
2020                 if (hdrlen < 0) {
2021                         tpriv->stats[HlenErrs]++;
2022                         tpriv->stats[InErrs]++;
2023                         netlog(f, Logtcp, "bad tcp hdr len\n");
2024                         return;
2025                 }
2026
2027                 /* trim the packet to the size claimed by the datagram */
2028                 length -= hdrlen;
2029                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2030                 if (bp == NULL) {
2031                         tpriv->stats[LenErrs]++;
2032                         tpriv->stats[InErrs]++;
2033                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2034                         return;
2035                 }
2036         }
2037
2038         /* lock protocol while searching for a conversation */
2039         qlock(&tcp->qlock);
2040
2041         /* Look for a matching conversation */
2042         s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2043         if (s == NULL) {
2044                 netlog(f, Logtcp, "iphtlook failed\n");
2045 reset:
2046                 qunlock(&tcp->qlock);
2047                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2048                 freeblist(bp);
2049                 return;
2050         }
2051
2052         /* if it's a listener, look for the right flags and get a new conv */
2053         tcb = (Tcpctl *) s->ptcl;
2054         if (tcb->state == Listen) {
2055                 if (seg.flags & RST) {
2056                         limborst(s, &seg, source, dest, version);
2057                         qunlock(&tcp->qlock);
2058                         freeblist(bp);
2059                         return;
2060                 }
2061
2062                 /* if this is a new SYN, put the call into limbo */
2063                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2064                         limbo(s, source, dest, &seg, version);
2065                         qunlock(&tcp->qlock);
2066                         freeblist(bp);
2067                         return;
2068                 }
2069
2070                 /*
2071                  *  if there's a matching call in limbo, tcpincoming will
2072                  *  return it in state Syn_received
2073                  */
2074                 s = tcpincoming(s, &seg, source, dest, version);
2075                 if (s == NULL)
2076                         goto reset;
2077         }
2078
2079         /* The rest of the input state machine is run with the control block
2080          * locked and implements the state machine directly out of the RFC.
2081          * Out-of-band data is ignored - it was always a bad idea.
2082          */
2083         tcb = (Tcpctl *) s->ptcl;
2084         if (waserror()) {
2085                 qunlock(&s->qlock);
2086                 nexterror();
2087         }
2088         qlock(&s->qlock);
2089         qunlock(&tcp->qlock);
2090
2091         /* fix up window */
2092         seg.wnd <<= tcb->rcv.scale;
2093
2094         /* every input packet in puts off the keep alive time out */
2095         tcpsetkacounter(tcb);
2096
2097         switch (tcb->state) {
2098                 case Closed:
2099                         sndrst(tcp, source, dest, length, &seg, version,
2100                                    "sending to Closed");
2101                         goto raise;
2102                 case Syn_sent:
2103                         if (seg.flags & ACK) {
2104                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2105                                         sndrst(tcp, source, dest, length, &seg, version,
2106                                                    "bad seq in Syn_sent");
2107                                         goto raise;
2108                                 }
2109                         }
2110                         if (seg.flags & RST) {
2111                                 if (seg.flags & ACK)
2112                                         localclose(s, errno_to_string(ECONNREFUSED));
2113                                 goto raise;
2114                         }
2115
2116                         if (seg.flags & SYN) {
2117                                 procsyn(s, &seg);
2118                                 if (seg.flags & ACK) {
2119                                         update(s, &seg);
2120                                         tcpsynackrtt(s);
2121                                         tcpsetstate(s, Established);
2122                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2123                                 } else {
2124                                         tcb->time = NOW;
2125                                         tcpsetstate(s, Syn_received);   /* DLP - shouldn't this be a reset? */
2126                                 }
2127
2128                                 if (length != 0 || (seg.flags & FIN))
2129                                         break;
2130
2131                                 freeblist(bp);
2132                                 goto output;
2133                         } else
2134                                 freeblist(bp);
2135
2136                         qunlock(&s->qlock);
2137                         poperror();
2138                         return;
2139                 case Syn_received:
2140                         /* doesn't matter if it's the correct ack, we're just trying to set timing */
2141                         if (seg.flags & ACK)
2142                                 tcpsynackrtt(s);
2143                         break;
2144         }
2145
2146         /*
2147          *  One DOS attack is to open connections to us and then forget about them,
2148          *  thereby tying up a conv at no long term cost to the attacker.
2149          *  This is an attempt to defeat these stateless DOS attacks.  See
2150          *  corresponding code in tcpsendka().
2151          */
2152         if (tcb->state != Syn_received && (seg.flags & RST) == 0) {
2153                 if (tcpporthogdefense
2154                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2155                                                   tcb->snd.una - (1 << 29))) {
2156                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2157                                    source, seg.source, dest, seg.dest, seg.flags,
2158                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2159                         localclose(s, "stateless hog");
2160                 }
2161         }
2162
2163         /* Cut the data to fit the receive window */
2164         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2165                 netlog(f, Logtcp, "tcp len < 0, %lu %d\n", seg.seq, length);
2166                 update(s, &seg);
2167                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2168                         tcphalt(tpriv, &tcb->rtt_timer);
2169                         tcphalt(tpriv, &tcb->acktimer);
2170                         tcphalt(tpriv, &tcb->katimer);
2171                         tcpsetstate(s, Time_wait);
2172                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2173                         tcpgo(tpriv, &tcb->timer);
2174                 }
2175                 if (!(seg.flags & RST)) {
2176                         tcb->flags |= FORCE;
2177                         goto output;
2178                 }
2179                 qunlock(&s->qlock);
2180                 poperror();
2181                 return;
2182         }
2183
2184         /* Cannot accept so answer with a rst */
2185         if (length && tcb->state == Closed) {
2186                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2187                 goto raise;
2188         }
2189
2190         /* The segment is beyond the current receive pointer so
2191          * queue the data in the resequence queue
2192          */
2193         if (seg.seq != tcb->rcv.nxt)
2194                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2195                         update(s, &seg);
2196                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2197                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2198                                            s->lport);
2199                         tcb->flags |= FORCE;
2200                         goto output;
2201                 }
2202
2203         /*
2204          *  keep looping till we've processed this packet plus any
2205          *  adjacent packets in the resequence queue
2206          */
2207         for (;;) {
2208                 if (seg.flags & RST) {
2209                         if (tcb->state == Established) {
2210                                 tpriv->stats[EstabResets]++;
2211                                 if (tcb->rcv.nxt != seg.seq)
2212                                         printd
2213                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2214                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2215                                                  seg.seq);
2216                         }
2217                         localclose(s, errno_to_string(ECONNREFUSED));
2218                         goto raise;
2219                 }
2220
2221                 if ((seg.flags & ACK) == 0)
2222                         goto raise;
2223
2224                 switch (tcb->state) {
2225                         case Syn_received:
2226                                 if (!seq_within(seg.ack, tcb->snd.una + 1, tcb->snd.nxt)) {
2227                                         sndrst(tcp, source, dest, length, &seg, version,
2228                                                    "bad seq in Syn_received");
2229                                         goto raise;
2230                                 }
2231                                 update(s, &seg);
2232                                 tcpsetstate(s, Established);
2233                         case Established:
2234                         case Close_wait:
2235                                 update(s, &seg);
2236                                 break;
2237                         case Finwait1:
2238                                 update(s, &seg);
2239                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2240                                         tcphalt(tpriv, &tcb->rtt_timer);
2241                                         tcphalt(tpriv, &tcb->acktimer);
2242                                         tcpsetkacounter(tcb);
2243                                         tcb->time = NOW;
2244                                         tcpsetstate(s, Finwait2);
2245                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2246                                         tcpgo(tpriv, &tcb->katimer);
2247                                 }
2248                                 break;
2249                         case Finwait2:
2250                                 update(s, &seg);
2251                                 break;
2252                         case Closing:
2253                                 update(s, &seg);
2254                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2255                                         tcphalt(tpriv, &tcb->rtt_timer);
2256                                         tcphalt(tpriv, &tcb->acktimer);
2257                                         tcphalt(tpriv, &tcb->katimer);
2258                                         tcpsetstate(s, Time_wait);
2259                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2260                                         tcpgo(tpriv, &tcb->timer);
2261                                 }
2262                                 break;
2263                         case Last_ack:
2264                                 update(s, &seg);
2265                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2266                                         localclose(s, NULL);
2267                                         goto raise;
2268                                 }
2269                         case Time_wait:
2270                                 tcb->flags |= FORCE;
2271                                 if (tcb->timer.state != TcptimerON)
2272                                         tcpgo(tpriv, &tcb->timer);
2273                 }
2274
2275                 if ((seg.flags & URG) && seg.urg) {
2276                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2277                                 tcb->rcv.urg = seg.urg + seg.seq;
2278                                 pullblock(&bp, seg.urg);
2279                         }
2280                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2281                         tcb->rcv.urg = tcb->rcv.nxt;
2282
2283                 if (length == 0) {
2284                         if (bp != NULL)
2285                                 freeblist(bp);
2286                 } else {
2287                         switch (tcb->state) {
2288                                 default:
2289                                         /* Ignore segment text */
2290                                         if (bp != NULL)
2291                                                 freeblist(bp);
2292                                         break;
2293
2294                                 case Syn_received:
2295                                 case Established:
2296                                 case Finwait1:
2297                                         /* If we still have some data place on
2298                                          * receive queue
2299                                          */
2300                                         if (bp) {
2301                                                 bp = packblock(bp);
2302                                                 if (bp == NULL)
2303                                                         panic("tcp packblock");
2304                                                 qpassnolim(s->rq, bp);
2305                                                 bp = NULL;
2306
2307                                                 /*
2308                                                  *  Force an ack every 2 data messages.  This is
2309                                                  *  a hack for rob to make his home system run
2310                                                  *  faster.
2311                                                  *
2312                                                  *  this also keeps the standard TCP congestion
2313                                                  *  control working since it needs an ack every
2314                                                  *  2 max segs worth.  This is not quite that,
2315                                                  *  but under a real stream is equivalent since
2316                                                  *  every packet has a max seg in it.
2317                                                  */
2318                                                 if (++(tcb->rcv.una) >= 2)
2319                                                         tcb->flags |= FORCE;
2320                                         }
2321                                         tcb->rcv.nxt += length;
2322
2323                                         /*
2324                                          *  update our rcv window
2325                                          */
2326                                         tcprcvwin(s);
2327
2328                                         /*
2329                                          *  turn on the acktimer if there's something
2330                                          *  to ack
2331                                          */
2332                                         if (tcb->acktimer.state != TcptimerON)
2333                                                 tcpgo(tpriv, &tcb->acktimer);
2334
2335                                         break;
2336                                 case Finwait2:
2337                                         /* no process to read the data, send a reset */
2338                                         if (bp != NULL)
2339                                                 freeblist(bp);
2340                                         sndrst(tcp, source, dest, length, &seg, version,
2341                                                    "send to Finwait2");
2342                                         qunlock(&s->qlock);
2343                                         poperror();
2344                                         return;
2345                         }
2346                 }
2347
2348                 if (seg.flags & FIN) {
2349                         tcb->flags |= FORCE;
2350
2351                         switch (tcb->state) {
2352                                 case Syn_received:
2353                                 case Established:
2354                                         tcb->rcv.nxt++;
2355                                         tcpsetstate(s, Close_wait);
2356                                         break;
2357                                 case Finwait1:
2358                                         tcb->rcv.nxt++;
2359                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2360                                                 tcphalt(tpriv, &tcb->rtt_timer);
2361                                                 tcphalt(tpriv, &tcb->acktimer);
2362                                                 tcphalt(tpriv, &tcb->katimer);
2363                                                 tcpsetstate(s, Time_wait);
2364                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2365                                                 tcpgo(tpriv, &tcb->timer);
2366                                         } else
2367                                                 tcpsetstate(s, Closing);
2368                                         break;
2369                                 case Finwait2:
2370                                         tcb->rcv.nxt++;
2371                                         tcphalt(tpriv, &tcb->rtt_timer);
2372                                         tcphalt(tpriv, &tcb->acktimer);
2373                                         tcphalt(tpriv, &tcb->katimer);
2374                                         tcpsetstate(s, Time_wait);
2375                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2376                                         tcpgo(tpriv, &tcb->timer);
2377                                         break;
2378                                 case Close_wait:
2379                                 case Closing:
2380                                 case Last_ack:
2381                                         break;
2382                                 case Time_wait:
2383                                         tcpgo(tpriv, &tcb->timer);
2384                                         break;
2385                         }
2386                 }
2387
2388                 /*
2389                  *  get next adjacent segment from the resequence queue.
2390                  *  dump/trim any overlapping segments
2391                  */
2392                 for (;;) {
2393                         if (tcb->reseq == NULL)
2394                                 goto output;
2395
2396                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2397                                 goto output;
2398
2399                         getreseq(tcb, &seg, &bp, &length);
2400
2401                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2402                                 break;
2403                 }
2404         }
2405 output:
2406         tcpoutput(s);
2407         qunlock(&s->qlock);
2408         poperror();
2409         return;
2410 raise:
2411         qunlock(&s->qlock);
2412         poperror();
2413         freeblist(bp);
2414         tcpkick(s);
2415 }
2416
2417 /*
2418  *  always enters and exits with the s locked.  We drop
2419  *  the lock to ipoput the packet so some care has to be
2420  *  taken by callers.
2421  */
2422 void tcpoutput(struct conv *s)
2423 {
2424         Tcp seg;
2425         int msgs;
2426         Tcpctl *tcb;
2427         struct block *hbp, *bp;
2428         int sndcnt, n;
2429         uint32_t ssize, dsize, usable, sent;
2430         struct Fs *f;
2431         struct tcppriv *tpriv;
2432         uint8_t version;
2433
2434         f = s->p->f;
2435         tpriv = s->p->priv;
2436         version = s->ipversion;
2437
2438         for (msgs = 0; msgs < 100; msgs++) {
2439                 tcb = (Tcpctl *) s->ptcl;
2440
2441                 switch (tcb->state) {
2442                         case Listen:
2443                         case Closed:
2444                         case Finwait2:
2445                                 return;
2446                 }
2447
2448                 /* force an ack when a window has opened up */
2449                 if (tcb->rcv.blocked && tcb->rcv.wnd > 0) {
2450                         tcb->rcv.blocked = 0;
2451                         tcb->flags |= FORCE;
2452                 }
2453
2454                 sndcnt = qlen(s->wq) + tcb->flgcnt;
2455                 sent = tcb->snd.ptr - tcb->snd.una;
2456
2457                 /* Don't send anything else until our SYN has been acked */
2458                 if (tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2459                         break;
2460
2461                 /* Compute usable segment based on offered window and limit
2462                  * window probes to one
2463                  */
2464                 if (tcb->snd.wnd == 0) {
2465                         if (sent != 0) {
2466                                 if ((tcb->flags & FORCE) == 0)
2467                                         break;
2468 //              tcb->snd.ptr = tcb->snd.una;
2469                         }
2470                         usable = 1;
2471                 } else {
2472                         usable = tcb->cwind;
2473                         if (tcb->snd.wnd < usable)
2474                                 usable = tcb->snd.wnd;
2475                         usable -= sent;
2476                 }
2477                 ssize = sndcnt - sent;
2478                 if (ssize && usable < 2)
2479                         netlog(s->p->f, Logtcp, "throttled snd.wnd %lu cwind %lu\n",
2480                                    tcb->snd.wnd, tcb->cwind);
2481                 if (usable < ssize)
2482                         ssize = usable;
2483                 if (ssize > tcb->mss) {
2484                         if ((tcb->flags & TSO) == 0) {
2485                                 ssize = tcb->mss;
2486                         } else {
2487                                 int segs, window;
2488
2489                                 /*  Don't send too much.  32K is arbitrary..
2490                                  */
2491                                 if (ssize > 32 * 1024)
2492                                         ssize = 32 * 1024;
2493
2494                                 /* Clamp xmit to an integral MSS to
2495                                  * avoid ragged tail segments causing
2496                                  * poor link utilization.  Also
2497                                  * account for each segment sent in
2498                                  * msg heuristic, and round up to the
2499                                  * next multiple of 4, to ensure we
2500                                  * still yeild.
2501                                  */
2502                                 segs = ssize / tcb->mss;
2503                                 ssize = segs * tcb->mss;
2504                                 msgs += segs;
2505                                 if (segs > 3)
2506                                         msgs = (msgs + 4) & ~3;
2507                         }
2508                 }
2509
2510                 dsize = ssize;
2511                 seg.urg = 0;
2512
2513                 if (ssize == 0)
2514                         if ((tcb->flags & FORCE) == 0)
2515                                 break;
2516
2517                 tcb->flags &= ~FORCE;
2518                 tcprcvwin(s);
2519
2520                 /* By default we will generate an ack */
2521                 tcphalt(tpriv, &tcb->acktimer);
2522                 tcb->rcv.una = 0;
2523                 seg.source = s->lport;
2524                 seg.dest = s->rport;
2525                 seg.flags = ACK;
2526                 seg.mss = 0;
2527                 seg.ws = 0;
2528                 switch (tcb->state) {
2529                         case Syn_sent:
2530                                 seg.flags = 0;
2531                                 if (tcb->snd.ptr == tcb->iss) {
2532                                         seg.flags |= SYN;
2533                                         dsize--;
2534                                         seg.mss = tcb->mss;
2535                                         seg.ws = tcb->scale;
2536                                 }
2537                                 break;
2538                         case Syn_received:
2539                                 /*
2540                                  *  don't send any data with a SYN/ACK packet
2541                                  *  because Linux rejects the packet in its
2542                                  *  attempt to solve the SYN attack problem
2543                                  */
2544                                 if (tcb->snd.ptr == tcb->iss) {
2545                                         seg.flags |= SYN;
2546                                         dsize = 0;
2547                                         ssize = 1;
2548                                         seg.mss = tcb->mss;
2549                                         seg.ws = tcb->scale;
2550                                 }
2551                                 break;
2552                 }
2553                 seg.seq = tcb->snd.ptr;
2554                 seg.ack = tcb->rcv.nxt;
2555                 seg.wnd = tcb->rcv.wnd;
2556
2557                 /* Pull out data to send */
2558                 bp = NULL;
2559                 if (dsize != 0) {
2560                         bp = qcopy(s->wq, dsize, sent);
2561                         if (BLEN(bp) != dsize) {
2562                                 seg.flags |= FIN;
2563                                 dsize--;
2564                         }
2565                         if (BLEN(bp) > tcb->mss) {
2566                                 bp->flag |= Btso;
2567                                 bp->mss = tcb->mss;
2568                         }
2569                 }
2570
2571                 if (sent + dsize == sndcnt)
2572                         seg.flags |= PSH;
2573
2574                 /* keep track of balance of resent data */
2575                 if (seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2576                         n = tcb->snd.nxt - tcb->snd.ptr;
2577                         if (ssize < n)
2578                                 n = ssize;
2579                         tcb->resent += n;
2580                         netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr 0x%lx nxt 0x%lx\n",
2581                                    s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr,
2582                                    tcb->snd.nxt);
2583                         tpriv->stats[RetransSegs]++;
2584                 }
2585
2586                 tcb->snd.ptr += ssize;
2587
2588                 /* Pull up the send pointer so we can accept acks
2589                  * for this window
2590                  */
2591                 if (seq_gt(tcb->snd.ptr, tcb->snd.nxt))
2592                         tcb->snd.nxt = tcb->snd.ptr;
2593
2594                 /* Build header, link data and compute cksum */
2595                 switch (version) {
2596                         case V4:
2597                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2598                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2599                                 if (hbp == NULL) {
2600                                         freeblist(bp);
2601                                         return;
2602                                 }
2603                                 break;
2604                         case V6:
2605                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2606                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2607                                 if (hbp == NULL) {
2608                                         freeblist(bp);
2609                                         return;
2610                                 }
2611                                 break;
2612                         default:
2613                                 hbp = NULL;     /* to suppress a warning */
2614                                 panic("tcpoutput: version %d", version);
2615                 }
2616
2617                 /* Start the transmission timers if there is new data and we
2618                  * expect acknowledges
2619                  */
2620                 if (ssize != 0) {
2621                         if (tcb->timer.state != TcptimerON)
2622                                 tcpgo(tpriv, &tcb->timer);
2623
2624                         /*  If round trip timer isn't running, start it.
2625                          *  measure the longest packet only in case the
2626                          *  transmission time dominates RTT
2627                          */
2628                         if (tcb->rtt_timer.state != TcptimerON)
2629                                 if (ssize == tcb->mss) {
2630                                         tcpgo(tpriv, &tcb->rtt_timer);
2631                                         tcb->rttseq = tcb->snd.ptr;
2632                                 }
2633                 }
2634
2635                 tpriv->stats[OutSegs]++;
2636
2637                 /* put off the next keep alive */
2638                 tcpgo(tpriv, &tcb->katimer);
2639
2640                 switch (version) {
2641                         case V4:
2642                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2643                                         /* a negative return means no route */
2644                                         localclose(s, "no route");
2645                                 }
2646                                 break;
2647                         case V6:
2648                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2649                                         /* a negative return means no route */
2650                                         localclose(s, "no route");
2651                                 }
2652                                 break;
2653                         default:
2654                                 panic("tcpoutput2: version %d", version);
2655                 }
2656                 if ((msgs % 4) == 1) {
2657                         qunlock(&s->qlock);
2658                         kthread_yield();
2659                         qlock(&s->qlock);
2660                 }
2661         }
2662 }
2663
2664 /*
2665  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
2666  */
2667 void tcpsendka(struct conv *s)
2668 {
2669         Tcp seg;
2670         Tcpctl *tcb;
2671         struct block *hbp, *dbp;
2672
2673         tcb = (Tcpctl *) s->ptcl;
2674
2675         dbp = NULL;
2676         seg.urg = 0;
2677         seg.source = s->lport;
2678         seg.dest = s->rport;
2679         seg.flags = ACK | PSH;
2680         seg.mss = 0;
2681         seg.ws = 0;
2682         if (tcpporthogdefense)
2683                 seg.seq = tcb->snd.una - (1 << 30) - nrand(1 << 20);
2684         else
2685                 seg.seq = tcb->snd.una - 1;
2686         seg.ack = tcb->rcv.nxt;
2687         tcb->rcv.una = 0;
2688         seg.wnd = tcb->rcv.wnd;
2689         if (tcb->state == Finwait2) {
2690                 seg.flags |= FIN;
2691         } else {
2692                 dbp = allocb(1);
2693                 dbp->wp++;
2694         }
2695
2696         if (isv4(s->raddr)) {
2697                 /* Build header, link data and compute cksum */
2698                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2699                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2700                 if (hbp == NULL) {
2701                         freeblist(dbp);
2702                         return;
2703                 }
2704                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2705         } else {
2706                 /* Build header, link data and compute cksum */
2707                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2708                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2709                 if (hbp == NULL) {
2710                         freeblist(dbp);
2711                         return;
2712                 }
2713                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2714         }
2715 }
2716
2717 /*
2718  *  set connection to time out after 12 minutes
2719  */
2720 void tcpsetkacounter(Tcpctl * tcb)
2721 {
2722         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
2723         if (tcb->kacounter < 3)
2724                 tcb->kacounter = 3;
2725 }
2726
2727 /*
2728  *  if we've timed out, close the connection
2729  *  otherwise, send a keepalive and restart the timer
2730  */
2731 void tcpkeepalive(void *v)
2732 {
2733         ERRSTACK(1);
2734         Tcpctl *tcb;
2735         struct conv *s;
2736
2737         s = v;
2738         tcb = (Tcpctl *) s->ptcl;
2739         qlock(&s->qlock);
2740         if (waserror()) {
2741                 qunlock(&s->qlock);
2742                 nexterror();
2743         }
2744         if (tcb->state != Closed) {
2745                 if (--(tcb->kacounter) <= 0) {
2746                         localclose(s, errno_to_string(ETIMEDOUT));
2747                 } else {
2748                         tcpsendka(s);
2749                         tcpgo(s->p->priv, &tcb->katimer);
2750                 }
2751         }
2752         qunlock(&s->qlock);
2753         poperror();
2754 }
2755
2756 /*
2757  *  start keepalive timer
2758  */
2759 char *tcpstartka(struct conv *s, char **f, int n)
2760 {
2761         Tcpctl *tcb;
2762         int x;
2763
2764         tcb = (Tcpctl *) s->ptcl;
2765         if (tcb->state != Established)
2766                 return "connection must be in Establised state";
2767         if (n > 1) {
2768                 x = atoi(f[1]);
2769                 if (x >= MSPTICK)
2770                         tcb->katimer.start = x / MSPTICK;
2771         }
2772         tcpsetkacounter(tcb);
2773         tcpgo(s->p->priv, &tcb->katimer);
2774
2775         return NULL;
2776 }
2777
2778 /*
2779  *  turn checksums on/off
2780  */
2781 char *tcpsetchecksum(struct conv *s, char **f, int unused)
2782 {
2783         Tcpctl *tcb;
2784
2785         tcb = (Tcpctl *) s->ptcl;
2786         tcb->nochecksum = !atoi(f[1]);
2787
2788         return NULL;
2789 }
2790
2791 void tcprxmit(struct conv *s)
2792 {
2793         Tcpctl *tcb;
2794
2795         tcb = (Tcpctl *) s->ptcl;
2796
2797         tcb->flags |= RETRAN | FORCE;
2798         tcb->snd.ptr = tcb->snd.una;
2799
2800         /*
2801          *  We should be halving the slow start threshhold (down to one
2802          *  mss) but leaving it at mss seems to work well enough
2803          */
2804         tcb->ssthresh = tcb->mss;
2805
2806         /*
2807          *  pull window down to a single packet
2808          */
2809         tcb->cwind = tcb->mss;
2810         tcpoutput(s);
2811 }
2812
2813 void tcptimeout(void *arg)
2814 {
2815         ERRSTACK(1);
2816         struct conv *s;
2817         Tcpctl *tcb;
2818         int maxback;
2819         struct tcppriv *tpriv;
2820
2821         s = (struct conv *)arg;
2822         tpriv = s->p->priv;
2823         tcb = (Tcpctl *) s->ptcl;
2824
2825         qlock(&s->qlock);
2826         if (waserror()) {
2827                 qunlock(&s->qlock);
2828                 nexterror();
2829         }
2830         switch (tcb->state) {
2831                 default:
2832                         tcb->backoff++;
2833                         if (tcb->state == Syn_sent)
2834                                 maxback = MAXBACKMS / 2;
2835                         else
2836                                 maxback = MAXBACKMS;
2837                         tcb->backedoff += tcb->timer.start * MSPTICK;
2838                         if (tcb->backedoff >= maxback) {
2839                                 localclose(s, errno_to_string(ETIMEDOUT));
2840                                 break;
2841                         }
2842                         netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lx %llu/%llu\n",
2843                                    tcb->snd.una, tcb->timer.start, NOW);
2844                         tcpsettimer(tcb);
2845                         tcprxmit(s);
2846                         tpriv->stats[RetransTimeouts]++;
2847                         tcb->snd.dupacks = 0;
2848                         break;
2849                 case Time_wait:
2850                         localclose(s, NULL);
2851                         break;
2852                 case Closed:
2853                         break;
2854         }
2855         qunlock(&s->qlock);
2856         poperror();
2857 }
2858
2859 int inwindow(Tcpctl * tcb, int seq)
2860 {
2861         return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1);
2862 }
2863
2864 /*
2865  *  set up state for a received SYN (or SYN ACK) packet
2866  */
2867 void procsyn(struct conv *s, Tcp * seg)
2868 {
2869         Tcpctl *tcb;
2870
2871         tcb = (Tcpctl *) s->ptcl;
2872         tcb->flags |= FORCE;
2873
2874         tcb->rcv.nxt = seg->seq + 1;
2875         tcb->rcv.urg = tcb->rcv.nxt;
2876         tcb->irs = seg->seq;
2877
2878         /* our sending max segment size cannot be bigger than what he asked for */
2879         if (seg->mss != 0 && seg->mss < tcb->mss)
2880                 tcb->mss = seg->mss;
2881
2882         /* the congestion window always starts out as a single segment */
2883         tcb->snd.wnd = seg->wnd;
2884         tcb->cwind = tcb->mss;
2885 }
2886
2887 int
2888 addreseq(Tcpctl * tcb, struct tcppriv *tpriv, Tcp * seg,
2889                  struct block *bp, uint16_t length)
2890 {
2891         Reseq *rp, *rp1;
2892         int i, rqlen, qmax;
2893
2894         rp = kzmalloc(sizeof(Reseq), 0);
2895         if (rp == NULL) {
2896                 freeblist(bp);  /* bp always consumed by add_reseq */
2897                 return 0;
2898         }
2899
2900         rp->seg = *seg;
2901         rp->bp = bp;
2902         rp->length = length;
2903
2904         /* Place on reassembly list sorting by starting seq number */
2905         rp1 = tcb->reseq;
2906         if (rp1 == NULL || seq_lt(seg->seq, rp1->seg.seq)) {
2907                 rp->next = rp1;
2908                 tcb->reseq = rp;
2909                 if (rp->next != NULL)
2910                         tpriv->stats[OutOfOrder]++;
2911                 return 0;
2912         }
2913
2914         rqlen = 0;
2915         for (i = 0;; i++) {
2916                 rqlen += rp1->length;
2917                 if (rp1->next == NULL || seq_lt(seg->seq, rp1->next->seg.seq)) {
2918                         rp->next = rp1->next;
2919                         rp1->next = rp;
2920                         if (rp->next != NULL)
2921                                 tpriv->stats[OutOfOrder]++;
2922                         break;
2923                 }
2924                 rp1 = rp1->next;
2925         }
2926         qmax = QMAX << tcb->rcv.scale;
2927         if (rqlen > qmax) {
2928                 printd("resequence queue > window: %d > %d\n", rqlen, qmax);
2929                 i = 0;
2930                 for (rp1 = tcb->reseq; rp1 != NULL; rp1 = rp1->next) {
2931                         printd("0x%#lx 0x%#lx 0x%#x\n", rp1->seg.seq,
2932                                    rp1->seg.ack, rp1->seg.flags);
2933                         if (i++ > 10) {
2934                                 printd("...\n");
2935                                 break;
2936                         }
2937                 }
2938
2939                 // delete entire reassembly queue; wait for retransmit.
2940                 // - should we be smarter and only delete the tail?
2941                 for (rp = tcb->reseq; rp != NULL; rp = rp1) {
2942                         rp1 = rp->next;
2943                         freeblist(rp->bp);
2944                         kfree(rp);
2945                 }
2946                 tcb->reseq = NULL;
2947
2948                 return -1;
2949         }
2950         return 0;
2951 }
2952
2953 void getreseq(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2954 {
2955         Reseq *rp;
2956
2957         rp = tcb->reseq;
2958         if (rp == NULL)
2959                 return;
2960
2961         tcb->reseq = rp->next;
2962
2963         *seg = rp->seg;
2964         *bp = rp->bp;
2965         *length = rp->length;
2966
2967         kfree(rp);
2968 }
2969
2970 int tcptrim(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2971 {
2972         uint16_t len;
2973         uint8_t accept;
2974         int dupcnt, excess;
2975
2976         accept = 0;
2977         len = *length;
2978         if (seg->flags & SYN)
2979                 len++;
2980         if (seg->flags & FIN)
2981                 len++;
2982
2983         if (tcb->rcv.wnd == 0) {
2984                 if (len == 0 && seg->seq == tcb->rcv.nxt)
2985                         return 0;
2986         } else {
2987                 /* Some part of the segment should be in the window */
2988                 if (inwindow(tcb, seg->seq))
2989                         accept++;
2990                 else if (len != 0) {
2991                         if (inwindow(tcb, seg->seq + len - 1) ||
2992                                 seq_within(tcb->rcv.nxt, seg->seq, seg->seq + len - 1))
2993                                 accept++;
2994                 }
2995         }
2996         if (!accept) {
2997                 freeblist(*bp);
2998                 return -1;
2999         }
3000         dupcnt = tcb->rcv.nxt - seg->seq;
3001         if (dupcnt > 0) {
3002                 tcb->rerecv += dupcnt;
3003                 if (seg->flags & SYN) {
3004                         seg->flags &= ~SYN;
3005                         seg->seq++;
3006
3007                         if (seg->urg > 1)
3008                                 seg->urg--;
3009                         else
3010                                 seg->flags &= ~URG;
3011                         dupcnt--;
3012                 }
3013                 if (dupcnt > 0) {
3014                         pullblock(bp, (uint16_t) dupcnt);
3015                         seg->seq += dupcnt;
3016                         *length -= dupcnt;
3017
3018                         if (seg->urg > dupcnt)
3019                                 seg->urg -= dupcnt;
3020                         else {
3021                                 seg->flags &= ~URG;
3022                                 seg->urg = 0;
3023                         }
3024                 }
3025         }
3026         excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3027         if (excess > 0) {
3028                 tcb->rerecv += excess;
3029                 *length -= excess;
3030                 *bp = trimblock(*bp, 0, *length);
3031                 if (*bp == NULL)
3032                         panic("presotto is a boofhead");
3033                 seg->flags &= ~FIN;
3034         }
3035         return 0;
3036 }
3037
3038 void tcpadvise(struct Proto *tcp, struct block *bp, char *msg)
3039 {
3040         Tcp4hdr *h4;
3041         Tcp6hdr *h6;
3042         Tcpctl *tcb;
3043         uint8_t source[IPaddrlen];
3044         uint8_t dest[IPaddrlen];
3045         uint16_t psource, pdest;
3046         struct conv *s, **p;
3047
3048         h4 = (Tcp4hdr *) (bp->rp);
3049         h6 = (Tcp6hdr *) (bp->rp);
3050
3051         if ((h4->vihl & 0xF0) == IP_VER4) {
3052                 v4tov6(dest, h4->tcpdst);
3053                 v4tov6(source, h4->tcpsrc);
3054                 psource = nhgets(h4->tcpsport);
3055                 pdest = nhgets(h4->tcpdport);
3056         } else {
3057                 ipmove(dest, h6->tcpdst);
3058                 ipmove(source, h6->tcpsrc);
3059                 psource = nhgets(h6->tcpsport);
3060                 pdest = nhgets(h6->tcpdport);
3061         }
3062
3063         /* Look for a connection */
3064         qlock(&tcp->qlock);
3065         for (p = tcp->conv; *p; p++) {
3066                 s = *p;
3067                 tcb = (Tcpctl *) s->ptcl;
3068                 if (s->rport == pdest)
3069                         if (s->lport == psource)
3070                                 if (tcb->state != Closed)
3071                                         if (ipcmp(s->raddr, dest) == 0)
3072                                                 if (ipcmp(s->laddr, source) == 0) {
3073                                                         qlock(&s->qlock);
3074                                                         qunlock(&tcp->qlock);
3075                                                         switch (tcb->state) {
3076                                                                 case Syn_sent:
3077                                                                         localclose(s, msg);
3078                                                                         break;
3079                                                         }
3080                                                         qunlock(&s->qlock);
3081                                                         freeblist(bp);
3082                                                         return;
3083                                                 }
3084         }
3085         qunlock(&tcp->qlock);
3086         freeblist(bp);
3087 }
3088
3089 static char *tcpporthogdefensectl(char *val)
3090 {
3091         if (strcmp(val, "on") == 0)
3092                 tcpporthogdefense = 1;
3093         else if (strcmp(val, "off") == 0)
3094                 tcpporthogdefense = 0;
3095         else
3096                 return "unknown value for tcpporthogdefense";
3097         return NULL;
3098 }
3099
/*
 *  Dispatch a TCP control request.  f holds the n fields of the request,
 *  f[0] being the verb.  Called with c qlocked.
 *
 *  Returns NULL on success or an error string.  "checksum" and
 *  "tcpporthogdefense" both consume f[1], so they are rejected here when
 *  that field is absent instead of passing a missing argument down.
 */
char *tcpctl(struct conv *c, char **f, int n)
{
	if (n == 1 && strcmp(f[0], "hangup") == 0)
		return tcphangup(c);
	if (n >= 1 && strcmp(f[0], "keepalive") == 0)
		return tcpstartka(c, f, n);
	if (n >= 1 && strcmp(f[0], "checksum") == 0) {
		if (n < 2)
			return "checksum requires an argument";
		return tcpsetchecksum(c, f, n);
	}
	if (n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0) {
		if (n < 2)
			return "tcpporthogdefense requires an argument";
		return tcpporthogdefensectl(f[1]);
	}
	return "unknown control request";
}
3113
3114 int tcpstats(struct Proto *tcp, char *buf, int len)
3115 {
3116         struct tcppriv *priv;
3117         char *p, *e;
3118         int i;
3119
3120         priv = tcp->priv;
3121         p = buf;
3122         e = p + len;
3123         for (i = 0; i < Nstats; i++)
3124                 p = seprintf(p, e, "%s: %u\n", statnames[i], priv->stats[i]);
3125         return p - buf;
3126 }
3127
3128 /*
3129  *  garbage collect any stale conversations:
3130  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3131  *      - Finwait2 after 5 minutes
3132  *
3133  *  this is called whenever we run out of channels.  Both checks are
3134  *  of questionable validity so we try to use them only when we're
3135  *  up against the wall.
3136  */
3137 int tcpgc(struct Proto *tcp)
3138 {
3139         struct conv *c, **pp, **ep;
3140         int n;
3141         Tcpctl *tcb;
3142
3143         n = 0;
3144         ep = &tcp->conv[tcp->nc];
3145         for (pp = tcp->conv; pp < ep; pp++) {
3146                 c = *pp;
3147                 if (c == NULL)
3148                         break;
3149                 if (!canqlock(&c->qlock))
3150                         continue;
3151                 tcb = (Tcpctl *) c->ptcl;
3152                 switch (tcb->state) {
3153                         case Syn_received:
3154                                 if (NOW - tcb->time > 5000) {
3155                                         localclose(c, "timed out");
3156                                         n++;
3157                                 }
3158                                 break;
3159                         case Finwait2:
3160                                 if (NOW - tcb->time > 5 * 60 * 1000) {
3161                                         localclose(c, "timed out");
3162                                         n++;
3163                                 }
3164                                 break;
3165                 }
3166                 qunlock(&c->qlock);
3167         }
3168         return n;
3169 }
3170
3171 void tcpsettimer(Tcpctl * tcb)
3172 {
3173         int x;
3174
3175         /* round trip dependency */
3176         x = backoff(tcb->backoff) *
3177                 (tcb->mdev + (tcb->srtt >> LOGAGAIN) + MSPTICK) / MSPTICK;
3178
3179         /* bounded twixt 1/2 and 64 seconds */
3180         if (x < 500 / MSPTICK)
3181                 x = 500 / MSPTICK;
3182         else if (x > (64000 / MSPTICK))
3183                 x = 64000 / MSPTICK;
3184         tcb->timer.start = x;
3185 }
3186
3187 void tcpinit(struct Fs *fs)
3188 {
3189         struct Proto *tcp;
3190         struct tcppriv *tpriv;
3191
3192         tcp = kzmalloc(sizeof(struct Proto), 0);
3193         tpriv = tcp->priv = kzmalloc(sizeof(struct tcppriv), 0);
3194         qlock_init(&tpriv->tl);
3195         qlock_init(&tpriv->apl);
3196         tcp->name = "tcp";
3197         tcp->connect = tcpconnect;
3198         tcp->announce = tcpannounce;
3199         tcp->ctl = tcpctl;
3200         tcp->state = tcpstate;
3201         tcp->create = tcpcreate;
3202         tcp->close = tcpclose;
3203         tcp->rcv = tcpiput;
3204         tcp->advise = tcpadvise;
3205         tcp->stats = tcpstats;
3206         tcp->inuse = tcpinuse;
3207         tcp->gc = tcpgc;
3208         tcp->ipproto = IP_TCPPROTO;
3209         tcp->nc = scalednconv();
3210         tcp->ptclsize = sizeof(Tcpctl);
3211         tpriv->stats[MaxConn] = tcp->nc;
3212
3213         Fsproto(fs, tcp);
3214 }
3215
3216 void
3217 tcpsetscale(struct conv *s, Tcpctl * tcb, uint16_t rcvscale, uint16_t sndscale)
3218 {
3219         if (rcvscale) {
3220                 tcb->rcv.scale = rcvscale & 0xff;
3221                 tcb->snd.scale = sndscale & 0xff;
3222                 tcb->window = QMAX << tcb->snd.scale;
3223                 qsetlimit(s->rq, tcb->window);
3224         } else {
3225                 tcb->rcv.scale = 0;
3226                 tcb->snd.scale = 0;
3227                 tcb->window = QMAX;
3228                 qsetlimit(s->rq, tcb->window);
3229         }
3230 }