342afa95b03e52f9eec16e4624230a39bbd8be71
[akaros.git] / kern / src / net / tcp.c
1 // INFERNO
2 #include <vfs.h>
3 #include <kfs.h>
4 #include <slab.h>
5 #include <kmalloc.h>
6 #include <kref.h>
7 #include <string.h>
8 #include <stdio.h>
9 #include <assert.h>
10 #include <error.h>
11 #include <cpio.h>
12 #include <pmap.h>
13 #include <smp.h>
14 #include <ip.h>
15
16 #include <vfs.h>
17 #include <kfs.h>
18 #include <slab.h>
19 #include <kmalloc.h>
20 #include <kref.h>
21 #include <string.h>
22 #include <stdio.h>
23 #include <assert.h>
24 #include <error.h>
25 #include <cpio.h>
26 #include <pmap.h>
27 #include <smp.h>
28 #include <ip.h>
29
/*
 *  Protocol-wide constants: header sizes, timer parameters, TCP header
 *  flag bits, option codes, defaults, Tcpctl flag bits, connection states
 *  and limbo-table sizing.
 */
enum {
        QMAX = 64 * 1024 - 1,   /* queue limit and default receive window (see tcpcreate, inittcpctl) */
        IP_TCPPROTO = 6,        /* stored in the proto field of the prototype headers */

        /* sizes used when composing v4 packets */
        TCP4_IPLEN = 8,
        TCP4_PHDRSIZE = 12,
        TCP4_HDRSIZE = 20,
        TCP4_TCBPHDRSZ = 40,
        TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE,

        /* sizes used when composing v6 packets */
        TCP6_IPLEN = 0,
        TCP6_PHDRSIZE = 40,
        TCP6_HDRSIZE = 20,
        TCP6_TCBPHDRSZ = 60,
        TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE,

        /* Tcptimer.state values */
        TcptimerOFF = 0,
        TcptimerON = 1,
        TcptimerDONE = 2,
        MAX_TIME = (1 << 20),   /* Forever */
        TCP_ACK = 50,   /* Timed ack sequence in ms */
        MAXBACKMS = 9 * 60 * 1000,      /* longest backoff time (ms) before hangup */

        /* TCP header flag bits */
        URG = 0x20,     /* Data marked urgent */
        ACK = 0x10,     /* Acknowledge is valid */
        PSH = 0x08,     /* Whole data pipe is pushed */
        RST = 0x04,     /* Reset connection */
        SYN = 0x02,     /* Pkt. is synchronise */
        FIN = 0x01,     /* Start close down */

        /* TCP option kinds and lengths */
        EOLOPT = 0,
        NOOPOPT = 1,
        MSSOPT = 2,
        MSS_LENGTH = 4, /* bytes added to the header by the MSS option */
        WSOPT = 3,
        WS_LENGTH = 3,  /* bytes added to the header by the window scale option */
        MSL2 = 10,
        MSPTICK = 50,   /* Milliseconds per timer tick */
        DEF_MSS = 1460, /* Default maximum segment size */
        DEF_MSS6 = 1280,        /* Default maximum segment size (min) for v6 */
        DEF_RTT = 500,  /* Default round trip */
        DEF_KAT = 120000,       /* Default time (ms) between keep alives */
        TCP_LISTEN = 0, /* Listen connection */
        TCP_CONNECT = 1,        /* Outgoing connection */
        SYNACK_RXTIMER = 250,   /* ms between SYNACK retransmits */

        TCPREXMTTHRESH = 3,     /* dupack threshold for rxt */

        /* bits or'ed into Tcpctl.flags (FORCE, CLONE and ACTIVE are set that way below) */
        FORCE = 1,
        CLONE = 2,
        RETRAN = 4,
        ACTIVE = 8,
        SYNACK = 16,

        LOGAGAIN = 3,   /* srtt is stored shifted left by this much (see inittcpctl) */
        LOGDGAIN = 2,

        Closed = 0,     /* Connection states */
        Listen,
        Syn_sent,
        Syn_received,
        Established,
        Finwait1,
        Finwait2,
        Close_wait,
        Closing,
        Last_ack,
        Time_wait,

        Maxlimbo = 1000,        /* maximum procs waiting for response to SYN ACK */
        NLHT = 256,     /* hash table size, must be a power of 2 */
        LHTMASK = NLHT - 1,

        HaveWS = 1 << 8,        /* or'ed into the scale value produced by tcpmtu() */
};
105
106 /* Must correspond to the enumeration above */
107 char *tcpstates[] = {
108         "Closed", "Listen", "Syn_sent", "Syn_received",
109         "Established", "Finwait1", "Finwait2", "Close_wait",
110         "Closing", "Last_ack", "Time_wait"
111 };
112
/*
 *  A countdown timer.  Running timers are chained on tcppriv.timers by
 *  timerstate(); tcpackproc() decrements count on each pass and calls
 *  func(arg) for timers that reach zero.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer {
        Tcptimer *next;         /* links on the tcppriv.timers list */
        Tcptimer *prev;
        Tcptimer *readynext;    /* links expired timers collected by tcpackproc() */
        int state;              /* TcptimerOFF, TcptimerON or TcptimerDONE */
        int start;              /* value count is reloaded with by tcpgo(); 0 means never started */
        int count;              /* passes of tcpackproc() left before expiry */
        void (*func) (void *);  /* called on expiry, may be NULL */
        void *arg;              /* argument handed to func */
};
124
/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr {
        uint8_t vihl;                           /* Version and header length */
        uint8_t tos;                            /* Type of service */
        uint8_t length[2];                      /* packet length */
        uint8_t id[2];                          /* Identification */
        uint8_t frag[2];                        /* Fragment information */
        uint8_t Unused;
        uint8_t proto;                          /* set to IP_TCPPROTO in the prototype header */
        uint8_t tcplen[2];
        uint8_t tcpsrc[4];                      /* local v4 address (filled by inittcpctl) */
        uint8_t tcpdst[4];                      /* remote v4 address (filled by inittcpctl) */
        uint8_t tcpsport[2];                    /* local port */
        uint8_t tcpdport[2];                    /* remote port */
        uint8_t tcpseq[4];
        uint8_t tcpack[4];
        uint8_t tcpflag[2];
        uint8_t tcpwin[2];
        uint8_t tcpcksum[2];
        uint8_t tcpurg[2];
        /* Options segment */
        uint8_t tcpopt[1];
};
152
/*
 *  v6 counterpart of Tcp4hdr: the IPv6 header fields followed by the TCP
 *  header.  htontcp6() overwrites vcf, ploadlen, proto and ttl while it
 *  composes the pseudo header for the checksum.
 */
typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr {
        uint8_t vcf[4];
        uint8_t ploadlen[2];
        uint8_t proto;                          /* set to IP_TCPPROTO in the prototype header */
        uint8_t ttl;
        uint8_t tcpsrc[IPaddrlen];              /* local address (filled by inittcpctl) */
        uint8_t tcpdst[IPaddrlen];              /* remote address (filled by inittcpctl) */
        uint8_t tcpsport[2];                    /* local port */
        uint8_t tcpdport[2];                    /* remote port */
        uint8_t tcpseq[4];
        uint8_t tcpack[4];
        uint8_t tcpflag[2];                     /* header length in the top bits, flag bits below */
        uint8_t tcpwin[2];
        uint8_t tcpcksum[2];
        uint8_t tcpurg[2];
        /* Options segment */
        uint8_t tcpopt[1];
};
172
/*
 *  This represents the control info
 *  for a single packet.  It is derived from
 *  a packet in ntohtcp{4,6}() and stuck into
 *  a packet in htontcp{4,6}().  Values are kept in host form here;
 *  htontcp6() converts them with hnputs()/hnputl().
 */
typedef struct Tcp Tcp;
struct Tcp {
        uint16_t source;                        /* source port */
        uint16_t dest;                          /* destination port */
        uint32_t seq;                           /* sequence number */
        uint32_t ack;                           /* acknowledgement number */
        uint8_t flags;                          /* URG/ACK/PSH/RST/SYN/FIN bits */
        uint16_t ws;                            /* window scale option (if not zero) */
        uint32_t wnd;
        uint16_t urg;
        uint16_t mss;                           /* max segment size option (if not zero) */
        uint16_t len;                           /* size of data */
};
192
/*
 *  This header is malloc'd to thread together fragments
 *  waiting to be coalesced.  Entries hang off Tcpctl.reseq and are
 *  released, together with their blocks, in localclose().
 */
typedef struct Reseq Reseq;
struct Reseq {
        Reseq *next;                            /* next queued fragment */
        Tcp seg;                                /* control info of the fragment */
        struct block *bp;                       /* fragment data, freed with freeblist() */
        uint16_t length;
};
204
/*
 *  Per-conversation TCP control block, stored in conv->ptcl.
 *  The qlock in the Conv locks this structure.
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl {
        uint8_t state;                          /* Connection state */
        uint8_t type;                           /* Listening or active connection */
        uint8_t code;                           /* Icmp code */
        struct {
                uint32_t una;                   /* Unacked data pointer */
                uint32_t nxt;                   /* Next sequence expected */
                uint32_t ptr;                   /* Data pointer */
                uint32_t wnd;                   /* Tcp send window */
                uint32_t urg;                   /* Urgent data pointer */
                uint32_t wl2;
                int scale;                              /* how much to right shift window in xmitted packets */
                /* to implement tahoe and reno TCP */
                uint32_t dupacks;               /* number of duplicate acks rcvd */
                int recovery;                   /* loss recovery flag */
                uint32_t rxt;                   /* right window marker for recovery */
        } snd;
        struct {
                uint32_t nxt;                   /* Receive pointer to next uint8_t slot */
                uint32_t wnd;                   /* Receive window incoming */
                uint32_t urg;                   /* Urgent pointer */
                int blocked;                    /* set by tcprcvwin() when the window closes to 0 */
                int una;                                /* unacked data segs */
                int scale;                              /* how much to left shift window in rcved packets */
        } rcv;
        uint32_t iss;                           /* Initial sequence number */
        int sawwsopt;                           /* true if we saw a wsopt on the incoming SYN */
        uint32_t cwind;                         /* Congestion window */
        int scale;                                      /* desired snd.scale */
        uint16_t ssthresh;                      /* Slow start threshold */
        int resent;                                     /* Bytes just resent */
        int irs;                                        /* Initial received sequence */
        uint16_t mss;                           /* Maximum segment size */
        int rerecv;                                     /* Overlap of data re-received */
        uint32_t window;                        /* Receive window */
        uint8_t backoff;                        /* Exponential backoff counter */
        int backedoff;                          /* ms we've backed off for rexmits */
        uint8_t flags;                          /* State flags */
        Reseq *reseq;                           /* Resequencing queue */
        Tcptimer timer;                         /* Activity timer */
        Tcptimer acktimer;                      /* Acknowledge timer */
        Tcptimer rtt_timer;                     /* Round trip timer */
        Tcptimer katimer;                       /* keep alive timer */
        uint32_t rttseq;                        /* Round trip sequence */
        int srtt;                                       /* Smoothed round trip, kept shifted by LOGAGAIN */
        int mdev;                                       /* Mean deviation of round trip */
        int kacounter;                          /* count down for keep alive */
        unsigned int sndsyntime;        /* time syn sent */
        uint32_t time;                          /* time Finwait2 or Syn_received was sent */
        int nochecksum;                         /* non-zero means don't send checksums */
        int flgcnt;                                     /* number of flags in the sequence (FIN,SEQ) */

        union {
                Tcp4hdr tcp4hdr;
                Tcp6hdr tcp6hdr;
        } protohdr;                                     /* prototype header */
};
266
/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 *
 *  NOTE(review): struct tcppriv stores limbo entries in lht[NLHT], described
 *  by NLHT as a hash table, so the "no hashing" remark looks stale -- confirm.
 */
typedef struct Limbo Limbo;
struct Limbo {
        Limbo *next;

        uint8_t laddr[IPaddrlen];
        uint8_t raddr[IPaddrlen];
        uint16_t lport;
        uint16_t rport;
        uint32_t irs;                           /* initial received sequence */
        uint32_t iss;                           /* initial sent sequence */
        uint16_t mss;                           /* mss from the other end */
        uint16_t rcvscale;                      /* how much to scale rcvd windows */
        uint16_t sndscale;                      /* how much to scale sent windows */
        uint32_t lastsend;                      /* last time we sent a synack */
        uint8_t version;                        /* v4 or v6 */
        uint8_t rexmits;                        /* number of retransmissions */
};
297
/* Tunables; inittcpctl() seeds srtt and the activity timer from tcp_irtt. */
int tcp_irtt = DEF_RTT;                 /* Initial guess at round trip time */
uint16_t tcp_mss = DEF_MSS;             /* Maximum segment size to be sent */
300
/*
 *  Indices into tcppriv.stats[].  statnames[] is indexed by the same
 *  values, so the two must be kept in step.
 */
enum {
        /* MIB stats */
        MaxConn,
        ActiveOpens,            /* bumped by tcpstart() for TCP_CONNECT */
        PassiveOpens,           /* bumped by tcpstart() for TCP_LISTEN */
        EstabResets,
        CurrEstab,              /* maintained by tcpsetstate() */
        InSegs,
        OutSegs,
        RetransSegs,
        RetransTimeouts,
        InErrs,
        OutRsts,

        /* non-MIB stats */
        CsumErrs,
        HlenErrs,
        LenErrs,
        OutOfOrder,

        Nstats                  /* number of counters; sizes tcppriv.stats[] */
};
323
324 static char *statnames[] = {
325         [MaxConn] "MaxConn",
326         [ActiveOpens] "ActiveOpens",
327         [PassiveOpens] "PassiveOpens",
328         [EstabResets] "EstabResets",
329         [CurrEstab] "CurrEstab",
330         [InSegs] "InSegs",
331         [OutSegs] "OutSegs",
332         [RetransSegs] "RetransSegs",
333         [RetransTimeouts] "RetransTimeouts",
334         [InErrs] "InErrs",
335         [OutRsts] "OutRsts",
336         [CsumErrs] "CsumErrs",
337         [HlenErrs] "HlenErrs",
338         [LenErrs] "LenErrs",
339         [OutOfOrder] "OutOfOrder",
340 };
341
/*
 *  Per-protocol private state, reached through Proto->priv.
 *
 *  NOTE(review): the typedef below names the tag "Tcppriv" while the
 *  structure is declared as "struct tcppriv"; the typedef therefore refers
 *  to a different, incomplete type.  The code in this file uses
 *  "struct tcppriv" directly -- confirm whether the typedef is needed.
 */
typedef struct Tcppriv Tcppriv;
struct tcppriv {
        /* List of active timers */
        qlock_t tl;                     /* protects timers and the Tcptimer links */
        Tcptimer *timers;

        /* hash table for matching conversations */
        struct Ipht ht;

        /* calls in limbo waiting for an ACK to our SYN ACK */
        int nlimbo;
        Limbo *lht[NLHT];

        /* for keeping track of tcpackproc */
        qlock_t apl;                    /* serializes starting tcpackproc in tcpstart() */
        int ackprocstarted;

        uint32_t stats[Nstats];         /* indexed by the stats enumeration */
};
361
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
372
/* Forward declarations for routines used before their definitions. */
int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, uint16_t);
void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
void localclose(struct conv *, char *unused_char_p_t);
void procsyn(struct conv *, Tcp *);
void tcpiput(struct Proto *, struct Ipifc *, struct block *);
void tcpoutput(struct conv *);
int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
void tcpstart(struct conv *, int);
void tcptimeout(void *);
void tcpsndsyn(struct conv *, Tcpctl *);
void tcprcvwin(struct conv *);
void tcpacktimer(void *);
void tcpkeepalive(void *);
void tcpsetkacounter(Tcpctl *);
void tcprxmit(struct conv *);
void tcpsettimer(Tcpctl *);
void tcpsynackrtt(struct conv *);
void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);

/* file-local helpers for calls held in limbo */
static void limborexmit(struct Proto *);
static void limbo(struct conv *, uint8_t * unused_uint8_p_t, uint8_t *, Tcp *,
                                  int);
395
396 void tcpsetstate(struct conv *s, uint8_t newstate)
397 {
398         Tcpctl *tcb;
399         uint8_t oldstate;
400         struct tcppriv *tpriv;
401
402         tpriv = s->p->priv;
403
404         tcb = (Tcpctl *) s->ptcl;
405
406         oldstate = tcb->state;
407         if (oldstate == newstate)
408                 return;
409
410         if (oldstate == Established)
411                 tpriv->stats[CurrEstab]--;
412         if (newstate == Established)
413                 tpriv->stats[CurrEstab]++;
414
415         /**
416         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
417                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
418         **/
419
420         switch (newstate) {
421                 case Closed:
422                         qclose(s->rq);
423                         qclose(s->wq);
424                         qclose(s->eq);
425                         break;
426
427                 case Close_wait:        /* Remote closes */
428                         qhangup(s->rq, NULL);
429                         break;
430         }
431
432         tcb->state = newstate;
433
434         if (oldstate == Syn_sent && newstate != Closed)
435                 Fsconnected(s, NULL);
436 }
437
438 static char *tcpconnect(struct conv *c, char **argv, int argc)
439 {
440         char *e;
441
442         e = Fsstdconnect(c, argv, argc);
443         if (e != NULL)
444                 return e;
445         tcpstart(c, TCP_CONNECT);
446
447         return NULL;
448 }
449
450 static int tcpstate(struct conv *c, char *state, int n)
451 {
452         Tcpctl *s;
453
454         s = (Tcpctl *) (c->ptcl);
455
456         return snprintf(state, n,
457                                         "%s qin %d qout %d srtt %d mdev %d cwin %lu swin %lu>>%d rwin %lu>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
458                                         tcpstates[s->state],
459                                         c->rq ? qlen(c->rq) : 0,
460                                         c->wq ? qlen(c->wq) : 0,
461                                         s->srtt, s->mdev,
462                                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
463                                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
464                                         s->katimer.start, s->katimer.count);
465 }
466
467 static int tcpinuse(struct conv *c)
468 {
469         Tcpctl *s;
470
471         s = (Tcpctl *) (c->ptcl);
472         return s->state != Closed;
473 }
474
475 static char *tcpannounce(struct conv *c, char **argv, int argc)
476 {
477         char *e;
478
479         e = Fsstdannounce(c, argv, argc);
480         if (e != NULL)
481                 return e;
482         tcpstart(c, TCP_LISTEN);
483         Fsconnected(c, NULL);
484
485         return NULL;
486 }
487
488 /*
489  *  tcpclose is always called with the q locked
490  */
491 static void tcpclose(struct conv *c)
492 {
493         Tcpctl *tcb;
494
495         tcb = (Tcpctl *) c->ptcl;
496
497         qhangup(c->rq, NULL);
498         qhangup(c->wq, NULL);
499         qhangup(c->eq, NULL);
500         qflush(c->rq);
501
502         switch (tcb->state) {
503                 case Listen:
504                         /*
505                          *  reset any incoming calls to this listener
506                          */
507                         Fsconnected(c, "Hangup");
508
509                         localclose(c, NULL);
510                         break;
511                 case Closed:
512                 case Syn_sent:
513                         localclose(c, NULL);
514                         break;
515                 case Syn_received:
516                 case Established:
517                         tcb->flgcnt++;
518                         tcb->snd.nxt++;
519                         tcpsetstate(c, Finwait1);
520                         tcpoutput(c);
521                         break;
522                 case Close_wait:
523                         tcb->flgcnt++;
524                         tcb->snd.nxt++;
525                         tcpsetstate(c, Last_ack);
526                         tcpoutput(c);
527                         break;
528         }
529 }
530
/*
 *  Callback registered for the write queue in tcpcreate(); x is the
 *  struct conv.  Under the conversation qlock it either recomputes the
 *  receive window and calls tcpoutput(), or, in a state that cannot send,
 *  closes the conversation locally with "Hangup".
 */
void tcpkick(void *x)
{
        ERRSTACK(1);
        struct conv *s = x;
        Tcpctl *tcb;

        tcb = (Tcpctl *) s->ptcl;

        /* error handler is installed before the lock is taken; it drops the lock and rethrows */
        if (waserror()) {
                qunlock(&s->qlock);
                nexterror();
        }
        qlock(&s->qlock);

        switch (tcb->state) {
                case Syn_sent:
                case Syn_received:
                case Established:
                case Close_wait:
                        /*
                         * Push data
                         */
                        tcprcvwin(s);
                        tcpoutput(s);
                        break;
                default:
                        localclose(s, "Hangup");
                        break;
        }

        qunlock(&s->qlock);
        poperror();
}
564
565 void tcprcvwin(struct conv *s)
566 {       /* Call with tcb locked */
567         int w;
568         Tcpctl *tcb;
569
570         tcb = (Tcpctl *) s->ptcl;
571         w = tcb->window - qlen(s->rq);
572         if (w < 0)
573                 w = 0;
574         tcb->rcv.wnd = w;
575         if (w == 0)
576                 tcb->rcv.blocked = 1;
577 }
578
/*
 *  Ack timer callback (installed as acktimer.func in inittcpctl() and also
 *  passed to qopen() for the read queue in tcpcreate()); v is the struct
 *  conv.  Unless the conversation is Closed, sets FORCE in the tcb flags,
 *  refreshes the receive window and calls tcpoutput(), all under the
 *  conversation qlock.
 */
void tcpacktimer(void *v)
{
        ERRSTACK(1);
        Tcpctl *tcb;
        struct conv *s;

        s = v;
        tcb = (Tcpctl *) s->ptcl;

        /* error handler is installed before the lock is taken; it drops the lock and rethrows */
        if (waserror()) {
                qunlock(&s->qlock);
                nexterror();
        }
        qlock(&s->qlock);
        if (tcb->state != Closed) {
                tcb->flags |= FORCE;
                tcprcvwin(s);
                tcpoutput(s);
        }
        qunlock(&s->qlock);
        poperror();
}
601
602 static void tcpcreate(struct conv *c)
603 {
604         c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
605         c->wq = qopen((3 * QMAX) / 2, Qkick, tcpkick, c);
606 }
607
608 static void timerstate(struct tcppriv *priv, Tcptimer * t, int newstate)
609 {
610         if (newstate != TcptimerON) {
611                 if (t->state == TcptimerON) {
612                         // unchain
613                         if (priv->timers == t) {
614                                 priv->timers = t->next;
615                                 if (t->prev != NULL)
616                                         panic("timerstate1");
617                         }
618                         if (t->next)
619                                 t->next->prev = t->prev;
620                         if (t->prev)
621                                 t->prev->next = t->next;
622                         t->next = t->prev = NULL;
623                 }
624         } else {
625                 if (t->state != TcptimerON) {
626                         // chain
627                         if (t->prev != NULL || t->next != NULL)
628                                 panic("timerstate2");
629                         t->prev = NULL;
630                         t->next = priv->timers;
631                         if (t->next)
632                                 t->next->prev = t;
633                         priv->timers = t;
634                 }
635         }
636         t->state = newstate;
637 }
638
/*
 *  Timer task, started once per protocol instance by tcpstart(); a is the
 *  struct Proto.  Loops forever: waits via udelay_sched(MSPTICK * 1000),
 *  counts down every running timer, runs the callbacks of those that
 *  expired, then calls limborexmit().
 */
void tcpackproc(void *a)
{
        ERRSTACK(1);
        Tcptimer *t, *tp, *timeo;
        struct Proto *tcp;
        struct tcppriv *priv;
        int loop;

        tcp = a;
        priv = tcp->priv;

        for (;;) {
                udelay_sched(MSPTICK * 1000);

                /* pass 1, under the timer lock: tick and collect expired timers */
                qlock(&priv->tl);
                timeo = NULL;
                loop = 0;
                for (t = priv->timers; t != NULL; t = tp) {
                        /* guard against a corrupted (looping) list */
                        if (loop++ > 10000)
                                panic("tcpackproc1");
                        /* timerstate() below clears t->next, so save it first */
                        tp = t->next;
                        if (t->state == TcptimerON) {
                                t->count--;
                                if (t->count == 0) {
                                        timerstate(priv, t, TcptimerDONE);
                                        t->readynext = timeo;
                                        timeo = t;
                                }
                        }
                }
                qunlock(&priv->tl);

                /* pass 2, lock dropped: run callbacks of timers still marked DONE */
                loop = 0;
                for (t = timeo; t != NULL; t = t->readynext) {
                        if (loop++ > 10000)
                                panic("tcpackproc2");
                        if (t->state == TcptimerDONE && t->func != NULL) {
                                /* discard error style */
                                if (!waserror())
                                        (*t->func) (t->arg);
                                poperror();
                        }
                }

                limborexmit(tcp);
        }
}
686
687 void tcpgo(struct tcppriv *priv, Tcptimer * t)
688 {
689         if (t == NULL || t->start == 0)
690                 return;
691
692         qlock(&priv->tl);
693         t->count = t->start;
694         timerstate(priv, t, TcptimerON);
695         qunlock(&priv->tl);
696 }
697
698 void tcphalt(struct tcppriv *priv, Tcptimer * t)
699 {
700         if (t == NULL)
701                 return;
702
703         qlock(&priv->tl);
704         timerstate(priv, t, TcptimerOFF);
705         qunlock(&priv->tl);
706 }
707
/*
 *  Exponential backoff multiplier: returns 2^n.
 *
 *  Shifting by a negative count, or far enough to reach the sign bit, is
 *  undefined behaviour in C, so n is clamped: negative values are treated
 *  as 0 and large values saturate at the biggest power of two an int holds.
 */
int backoff(int n)
{
        const int maxshift = (int) (sizeof(int) * 8) - 2;

        if (n < 0)
                n = 0;
        else if (n > maxshift)
                n = maxshift;

        return 1 << n;
}
712
713 void localclose(struct conv *s, char *reason)
714 {       /* called with tcb locked */
715         Tcpctl *tcb;
716         Reseq *rp, *rp1;
717         struct tcppriv *tpriv;
718
719         tpriv = s->p->priv;
720         tcb = (Tcpctl *) s->ptcl;
721
722         iphtrem(&tpriv->ht, s);
723
724         tcphalt(tpriv, &tcb->timer);
725         tcphalt(tpriv, &tcb->rtt_timer);
726         tcphalt(tpriv, &tcb->acktimer);
727         tcphalt(tpriv, &tcb->katimer);
728
729         /* Flush reassembly queue; nothing more can arrive */
730         for (rp = tcb->reseq; rp != NULL; rp = rp1) {
731                 rp1 = rp->next;
732                 freeblist(rp->bp);
733                 kfree(rp);
734         }
735         tcb->reseq = NULL;
736
737         if (tcb->state == Syn_sent)
738                 Fsconnected(s, reason);
739
740         qhangup(s->rq, reason);
741         qhangup(s->wq, reason);
742
743         tcpsetstate(s, Closed);
744
745         /* listener will check the rq state */
746         if (s->state == Announced)
747                 rendez_wakeup(&s->listenr);
748 }
749
750 /* mtu (- TCP + IP hdr len) of 1st hop */
751 int tcpmtu(struct Proto *tcp, uint8_t * addr, int version, int *scale)
752 {
753         struct Ipifc *ifc;
754         int mtu;
755
756         ifc = findipifc(tcp->f, addr, 0);
757         switch (version) {
758                 default:
759                 case V4:
760                         mtu = DEF_MSS;
761                         if (ifc != NULL)
762                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
763                         break;
764                 case V6:
765                         mtu = DEF_MSS6;
766                         if (ifc != NULL)
767                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
768                         break;
769         }
770         if (ifc != NULL) {
771                 if (ifc->mbps > 100)
772                         *scale = HaveWS | 3;
773                 else if (ifc->mbps > 10)
774                         *scale = HaveWS | 1;
775                 else
776                         *scale = HaveWS | 0;
777         } else
778                 *scale = HaveWS | 0;
779
780         return mtu;
781 }
782
/*
 *  Reset the control block of conversation s to its initial values: zero
 *  everything, set the round-trip estimate and the timers, build the
 *  prototype header (not for TCP_LISTEN), and install the default segment
 *  size and unscaled window.  Panics on an unknown s->ipversion when a
 *  prototype header is needed.
 */
void inittcpctl(struct conv *s, int mode)
{
        Tcpctl *tcb;
        Tcp4hdr *h4;
        Tcp6hdr *h6;
        int mss;

        tcb = (Tcpctl *) s->ptcl;

        memset(tcb, 0, sizeof(Tcpctl));

        tcb->ssthresh = 65535;
        /* srtt is kept scaled by 1 << LOGAGAIN */
        tcb->srtt = tcp_irtt << LOGAGAIN;
        tcb->mdev = 0;

        /* setup timers */
        /* start values are in timer ticks of MSPTICK ms */
        tcb->timer.start = tcp_irtt / MSPTICK;
        tcb->timer.func = tcptimeout;
        tcb->timer.arg = s;
        /* rtt_timer has no callback; func stays NULL from the memset above */
        tcb->rtt_timer.start = MAX_TIME;
        tcb->acktimer.start = TCP_ACK / MSPTICK;
        tcb->acktimer.func = tcpacktimer;
        tcb->acktimer.arg = s;
        tcb->katimer.start = DEF_KAT / MSPTICK;
        tcb->katimer.func = tcpkeepalive;
        tcb->katimer.arg = s;

        mss = DEF_MSS;

        /* create a prototype(pseudo) header */
        if (mode != TCP_LISTEN) {
                /* no local address chosen yet: pick one suitable for raddr */
                if (ipcmp(s->laddr, IPnoaddr) == 0)
                        findlocalip(s->p->f, s->laddr, s->raddr);

                switch (s->ipversion) {
                        case V4:
                                h4 = &tcb->protohdr.tcp4hdr;
                                memset(h4, 0, sizeof(*h4));
                                h4->proto = IP_TCPPROTO;
                                hnputs(h4->tcpsport, s->lport);
                                hnputs(h4->tcpdport, s->rport);
                                v6tov4(h4->tcpsrc, s->laddr);
                                v6tov4(h4->tcpdst, s->raddr);
                                break;
                        case V6:
                                h6 = &tcb->protohdr.tcp6hdr;
                                memset(h6, 0, sizeof(*h6));
                                h6->proto = IP_TCPPROTO;
                                hnputs(h6->tcpsport, s->lport);
                                hnputs(h6->tcpdport, s->rport);
                                ipmove(h6->tcpsrc, s->laddr);
                                ipmove(h6->tcpdst, s->raddr);
                                /* v6 starts from the smaller default segment size */
                                mss = DEF_MSS6;
                                break;
                        default:
                                panic("inittcpctl: version %d", s->ipversion);
                }
        }

        /* the congestion window starts at one segment */
        tcb->mss = tcb->cwind = mss;

        /* default is no window scaling */
        tcb->window = QMAX;
        tcb->rcv.wnd = QMAX;
        tcb->rcv.scale = 0;
        tcb->snd.scale = 0;
        qsetlimit(s->rq, QMAX);
}
851
852 /*
853  *  called with s qlocked
854  */
855 void tcpstart(struct conv *s, int mode)
856 {
857         Tcpctl *tcb;
858         struct tcppriv *tpriv;
859         /* tcpackproc needs to free this if it ever exits */
860         char *kpname = kmalloc(KNAMELEN, KMALLOC_WAIT);
861
862         tpriv = s->p->priv;
863
864         if (tpriv->ackprocstarted == 0) {
865                 qlock(&tpriv->apl);
866                 if (tpriv->ackprocstarted == 0) {
867                         snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
868                         ktask(kpname, tcpackproc, s->p);
869                         tpriv->ackprocstarted = 1;
870                 }
871                 qunlock(&tpriv->apl);
872         }
873
874         tcb = (Tcpctl *) s->ptcl;
875
876         inittcpctl(s, mode);
877
878         iphtadd(&tpriv->ht, s);
879         switch (mode) {
880                 case TCP_LISTEN:
881                         tpriv->stats[PassiveOpens]++;
882                         tcb->flags |= CLONE;
883                         tcpsetstate(s, Listen);
884                         break;
885
886                 case TCP_CONNECT:
887                         tpriv->stats[ActiveOpens]++;
888                         tcb->flags |= ACTIVE;
889                         tcpsndsyn(s, tcb);
890                         tcpsetstate(s, Syn_sent);
891                         tcpoutput(s);
892                         break;
893         }
894 }
895
896 static char *tcpflag(uint16_t flag)
897 {
898         static char buf[128];
899
900         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
901         if (flag & URG)
902                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
903         if (flag & ACK)
904                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
905         if (flag & PSH)
906                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
907         if (flag & RST)
908                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
909         if (flag & SYN)
910                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
911         if (flag & FIN)
912                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
913
914         return buf;
915 }
916
917 struct block *htontcp6(Tcp * tcph, struct block *data, Tcp6hdr * ph,
918                                            Tcpctl * tcb)
919 {
920         int dlen;
921         Tcp6hdr *h;
922         uint16_t csum;
923         uint16_t hdrlen, optpad = 0;
924         uint8_t *opt;
925
926         hdrlen = TCP6_HDRSIZE;
927         if (tcph->flags & SYN) {
928                 if (tcph->mss)
929                         hdrlen += MSS_LENGTH;
930                 if (tcph->ws)
931                         hdrlen += WS_LENGTH;
932                 optpad = hdrlen & 3;
933                 if (optpad)
934                         optpad = 4 - optpad;
935                 hdrlen += optpad;
936         }
937
938         if (data) {
939                 dlen = blocklen(data);
940                 data = padblock(data, hdrlen + TCP6_PKT);
941                 if (data == NULL)
942                         return NULL;
943         } else {
944                 dlen = 0;
945                 data = allocb(hdrlen + TCP6_PKT + 64);  /* the 64 pad is to meet mintu's */
946                 if (data == NULL)
947                         return NULL;
948                 data->wp += hdrlen + TCP6_PKT;
949         }
950
951         /* copy in pseudo ip header plus port numbers */
952         h = (Tcp6hdr *) (data->rp);
953         memmove(h, ph, TCP6_TCBPHDRSZ);
954
955         /* compose pseudo tcp header, do cksum calculation */
956         hnputl(h->vcf, hdrlen + dlen);
957         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
958         h->ttl = ph->proto;
959
960         /* copy in variable bits */
961         hnputl(h->tcpseq, tcph->seq);
962         hnputl(h->tcpack, tcph->ack);
963         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
964         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
965         hnputs(h->tcpurg, tcph->urg);
966
967         if (tcph->flags & SYN) {
968                 opt = h->tcpopt;
969                 if (tcph->mss != 0) {
970                         *opt++ = MSSOPT;
971                         *opt++ = MSS_LENGTH;
972                         hnputs(opt, tcph->mss);
973                         opt += 2;
974                 }
975                 if (tcph->ws != 0) {
976                         *opt++ = WSOPT;
977                         *opt++ = WS_LENGTH;
978                         *opt++ = tcph->ws;
979                 }
980                 while (optpad-- > 0)
981                         *opt++ = NOOPOPT;
982         }
983
984         if (tcb != NULL && tcb->nochecksum) {
985                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
986         } else {
987                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
988                 hnputs(h->tcpcksum, csum);
989         }
990
991         /* move from pseudo header back to normal ip header */
992         memset(h->vcf, 0, 4);
993         h->vcf[0] = IP_VER6;
994         hnputs(h->ploadlen, hdrlen + dlen);
995         h->proto = ph->proto;
996
997         return data;
998 }
999
1000 struct block *htontcp4(Tcp * tcph, struct block *data, Tcp4hdr * ph,
1001                                            Tcpctl * tcb)
1002 {
1003         int dlen;
1004         Tcp4hdr *h;
1005         uint16_t csum;
1006         uint16_t hdrlen, optpad = 0;
1007         uint8_t *opt;
1008
1009         hdrlen = TCP4_HDRSIZE;
1010         if (tcph->flags & SYN) {
1011                 if (tcph->mss)
1012                         hdrlen += MSS_LENGTH;
1013                 if (tcph->ws)
1014                         hdrlen += WS_LENGTH;
1015                 optpad = hdrlen & 3;
1016                 if (optpad)
1017                         optpad = 4 - optpad;
1018                 hdrlen += optpad;
1019         }
1020
1021         if (data) {
1022                 dlen = blocklen(data);
1023                 data = padblock(data, hdrlen + TCP4_PKT);
1024                 if (data == NULL)
1025                         return NULL;
1026         } else {
1027                 dlen = 0;
1028                 data = allocb(hdrlen + TCP4_PKT + 64);  /* the 64 pad is to meet mintu's */
1029                 if (data == NULL)
1030                         return NULL;
1031                 data->wp += hdrlen + TCP4_PKT;
1032         }
1033
1034         /* copy in pseudo ip header plus port numbers */
1035         h = (Tcp4hdr *) (data->rp);
1036         memmove(h, ph, TCP4_TCBPHDRSZ);
1037
1038         /* copy in variable bits */
1039         hnputs(h->tcplen, hdrlen + dlen);
1040         hnputl(h->tcpseq, tcph->seq);
1041         hnputl(h->tcpack, tcph->ack);
1042         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1043         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1044         hnputs(h->tcpurg, tcph->urg);
1045
1046         if (tcph->flags & SYN) {
1047                 opt = h->tcpopt;
1048                 if (tcph->mss != 0) {
1049                         *opt++ = MSSOPT;
1050                         *opt++ = MSS_LENGTH;
1051                         hnputs(opt, tcph->mss);
1052                         opt += 2;
1053                 }
1054                 if (tcph->ws != 0) {
1055                         *opt++ = WSOPT;
1056                         *opt++ = WS_LENGTH;
1057                         *opt++ = tcph->ws;
1058                 }
1059                 while (optpad-- > 0)
1060                         *opt++ = NOOPOPT;
1061         }
1062
1063         if (tcb != NULL && tcb->nochecksum) {
1064                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1065         } else {
1066                 csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
1067                 hnputs(h->tcpcksum, csum);
1068                 data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
1069                 data->checksum_offset = ph->tcpcksum - ph->tcpsport;
1070                 data->flag |= Btcpck;
1071         }
1072
1073         return data;
1074 }
1075
1076 int ntohtcp6(Tcp * tcph, struct block **bpp)
1077 {
1078         Tcp6hdr *h;
1079         uint8_t *optr;
1080         uint16_t hdrlen;
1081         uint16_t optlen;
1082         int n;
1083
1084         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
1085         if (*bpp == NULL)
1086                 return -1;
1087
1088         h = (Tcp6hdr *) ((*bpp)->rp);
1089         tcph->source = nhgets(h->tcpsport);
1090         tcph->dest = nhgets(h->tcpdport);
1091         tcph->seq = nhgetl(h->tcpseq);
1092         tcph->ack = nhgetl(h->tcpack);
1093         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1094         if (hdrlen < TCP6_HDRSIZE) {
1095                 freeblist(*bpp);
1096                 return -1;
1097         }
1098
1099         tcph->flags = h->tcpflag[1];
1100         tcph->wnd = nhgets(h->tcpwin);
1101         tcph->urg = nhgets(h->tcpurg);
1102         tcph->mss = 0;
1103         tcph->ws = 0;
1104         tcph->len = nhgets(h->ploadlen) - hdrlen;
1105
1106         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
1107         if (*bpp == NULL)
1108                 return -1;
1109
1110         optr = h->tcpopt;
1111         n = hdrlen - TCP6_HDRSIZE;
1112         while (n > 0 && *optr != EOLOPT) {
1113                 if (*optr == NOOPOPT) {
1114                         n--;
1115                         optr++;
1116                         continue;
1117                 }
1118                 optlen = optr[1];
1119                 if (optlen < 2 || optlen > n)
1120                         break;
1121                 switch (*optr) {
1122                         case MSSOPT:
1123                                 if (optlen == MSS_LENGTH)
1124                                         tcph->mss = nhgets(optr + 2);
1125                                 break;
1126                         case WSOPT:
1127                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1128                                         tcph->ws = HaveWS | *(optr + 2);
1129                                 break;
1130                 }
1131                 n -= optlen;
1132                 optr += optlen;
1133         }
1134         return hdrlen;
1135 }
1136
1137 int ntohtcp4(Tcp * tcph, struct block **bpp)
1138 {
1139         Tcp4hdr *h;
1140         uint8_t *optr;
1141         uint16_t hdrlen;
1142         uint16_t optlen;
1143         int n;
1144
1145         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
1146         if (*bpp == NULL)
1147                 return -1;
1148
1149         h = (Tcp4hdr *) ((*bpp)->rp);
1150         tcph->source = nhgets(h->tcpsport);
1151         tcph->dest = nhgets(h->tcpdport);
1152         tcph->seq = nhgetl(h->tcpseq);
1153         tcph->ack = nhgetl(h->tcpack);
1154
1155         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1156         if (hdrlen < TCP4_HDRSIZE) {
1157                 freeblist(*bpp);
1158                 return -1;
1159         }
1160
1161         tcph->flags = h->tcpflag[1];
1162         tcph->wnd = nhgets(h->tcpwin);
1163         tcph->urg = nhgets(h->tcpurg);
1164         tcph->mss = 0;
1165         tcph->ws = 0;
1166         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1167
1168         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1169         if (*bpp == NULL)
1170                 return -1;
1171
1172         optr = h->tcpopt;
1173         n = hdrlen - TCP4_HDRSIZE;
1174         while (n > 0 && *optr != EOLOPT) {
1175                 if (*optr == NOOPOPT) {
1176                         n--;
1177                         optr++;
1178                         continue;
1179                 }
1180                 optlen = optr[1];
1181                 if (optlen < 2 || optlen > n)
1182                         break;
1183                 switch (*optr) {
1184                         case MSSOPT:
1185                                 if (optlen == MSS_LENGTH)
1186                                         tcph->mss = nhgets(optr + 2);
1187                                 break;
1188                         case WSOPT:
1189                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1190                                         tcph->ws = HaveWS | *(optr + 2);
1191                                 break;
1192                 }
1193                 n -= optlen;
1194                 optr += optlen;
1195         }
1196         return hdrlen;
1197 }
1198
1199 /*
1200  *  For outgiing calls, generate an initial sequence
1201  *  number and put a SYN on the send queue
1202  */
1203 void tcpsndsyn(struct conv *s, Tcpctl * tcb)
1204 {
1205         tcb->iss = (nrand(1 << 16) << 16) | nrand(1 << 16);
1206         tcb->rttseq = tcb->iss;
1207         tcb->snd.wl2 = tcb->iss;
1208         tcb->snd.una = tcb->iss;
1209         tcb->snd.ptr = tcb->rttseq;
1210         tcb->snd.nxt = tcb->rttseq;
1211         tcb->flgcnt++;
1212         tcb->flags |= FORCE;
1213         tcb->sndsyntime = NOW;
1214
1215         /* set desired mss and scale */
1216         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1217 }
1218
1219 void
1220 sndrst(struct Proto *tcp, uint8_t * source, uint8_t * dest,
1221            uint16_t length, Tcp * seg, uint8_t version, char *reason)
1222 {
1223         struct block *hbp;
1224         uint8_t rflags;
1225         struct tcppriv *tpriv;
1226         Tcp4hdr ph4;
1227         Tcp6hdr ph6;
1228
1229         netlog(tcp->f, Logtcp, "sndrst: %s", reason);
1230
1231         tpriv = tcp->priv;
1232
1233         if (seg->flags & RST)
1234                 return;
1235
1236         /* make pseudo header */
1237         switch (version) {
1238                 case V4:
1239                         memset(&ph4, 0, sizeof(ph4));
1240                         ph4.vihl = IP_VER4;
1241                         v6tov4(ph4.tcpsrc, dest);
1242                         v6tov4(ph4.tcpdst, source);
1243                         ph4.proto = IP_TCPPROTO;
1244                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1245                         hnputs(ph4.tcpsport, seg->dest);
1246                         hnputs(ph4.tcpdport, seg->source);
1247                         break;
1248                 case V6:
1249                         memset(&ph6, 0, sizeof(ph6));
1250                         ph6.vcf[0] = IP_VER6;
1251                         ipmove(ph6.tcpsrc, dest);
1252                         ipmove(ph6.tcpdst, source);
1253                         ph6.proto = IP_TCPPROTO;
1254                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1255                         hnputs(ph6.tcpsport, seg->dest);
1256                         hnputs(ph6.tcpdport, seg->source);
1257                         break;
1258                 default:
1259                         panic("sndrst: version %d", version);
1260         }
1261
1262         tpriv->stats[OutRsts]++;
1263         rflags = RST;
1264
1265         /* convince the other end that this reset is in band */
1266         if (seg->flags & ACK) {
1267                 seg->seq = seg->ack;
1268                 seg->ack = 0;
1269         } else {
1270                 rflags |= ACK;
1271                 seg->ack = seg->seq;
1272                 seg->seq = 0;
1273                 if (seg->flags & SYN)
1274                         seg->ack++;
1275                 seg->ack += length;
1276                 if (seg->flags & FIN)
1277                         seg->ack++;
1278         }
1279         seg->flags = rflags;
1280         seg->wnd = 0;
1281         seg->urg = 0;
1282         seg->mss = 0;
1283         seg->ws = 0;
1284         switch (version) {
1285                 case V4:
1286                         hbp = htontcp4(seg, NULL, &ph4, NULL);
1287                         if (hbp == NULL)
1288                                 return;
1289                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1290                         break;
1291                 case V6:
1292                         hbp = htontcp6(seg, NULL, &ph6, NULL);
1293                         if (hbp == NULL)
1294                                 return;
1295                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1296                         break;
1297                 default:
1298                         panic("sndrst2: version %d", version);
1299         }
1300 }
1301
1302 /*
1303  *  send a reset to the remote side and close the conversation
1304  *  called with s qlocked
1305  */
1306 char *tcphangup(struct conv *s)
1307 {
1308         ERRSTACK(2);
1309         Tcp seg;
1310         Tcpctl *tcb;
1311         struct block *hbp;
1312
1313         tcb = (Tcpctl *) s->ptcl;
1314         if (waserror()) {
1315                 poperror();
1316                 return commonerror();
1317         }
1318         if (ipcmp(s->raddr, IPnoaddr)) {
1319                 /* discard error style, poperror regardless */
1320                 if (!waserror()) {
1321                         seg.flags = RST | ACK;
1322                         seg.ack = tcb->rcv.nxt;
1323                         tcb->rcv.una = 0;
1324                         seg.seq = tcb->snd.ptr;
1325                         seg.wnd = 0;
1326                         seg.urg = 0;
1327                         seg.mss = 0;
1328                         seg.ws = 0;
1329                         switch (s->ipversion) {
1330                                 case V4:
1331                                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1332                                         hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
1333                                         ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1334                                         break;
1335                                 case V6:
1336                                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1337                                         hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
1338                                         ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1339                                         break;
1340                                 default:
1341                                         panic("tcphangup: version %d", s->ipversion);
1342                         }
1343                 }
1344                 poperror();
1345         }
1346         localclose(s, NULL);
1347         poperror();
1348         return NULL;
1349 }
1350
1351 /*
1352  *  (re)send a SYN ACK
1353  */
1354 int sndsynack(struct Proto *tcp, Limbo * lp)
1355 {
1356         struct block *hbp;
1357         Tcp4hdr ph4;
1358         Tcp6hdr ph6;
1359         Tcp seg;
1360         int scale;
1361
1362         /* make pseudo header */
1363         switch (lp->version) {
1364                 case V4:
1365                         memset(&ph4, 0, sizeof(ph4));
1366                         ph4.vihl = IP_VER4;
1367                         v6tov4(ph4.tcpsrc, lp->laddr);
1368                         v6tov4(ph4.tcpdst, lp->raddr);
1369                         ph4.proto = IP_TCPPROTO;
1370                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1371                         hnputs(ph4.tcpsport, lp->lport);
1372                         hnputs(ph4.tcpdport, lp->rport);
1373                         break;
1374                 case V6:
1375                         memset(&ph6, 0, sizeof(ph6));
1376                         ph6.vcf[0] = IP_VER6;
1377                         ipmove(ph6.tcpsrc, lp->laddr);
1378                         ipmove(ph6.tcpdst, lp->raddr);
1379                         ph6.proto = IP_TCPPROTO;
1380                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1381                         hnputs(ph6.tcpsport, lp->lport);
1382                         hnputs(ph6.tcpdport, lp->rport);
1383                         break;
1384                 default:
1385                         panic("sndrst: version %d", lp->version);
1386         }
1387
1388         seg.seq = lp->iss;
1389         seg.ack = lp->irs + 1;
1390         seg.flags = SYN | ACK;
1391         seg.urg = 0;
1392         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1393         seg.wnd = QMAX;
1394
1395         /* if the other side set scale, we should too */
1396         if (lp->rcvscale) {
1397                 seg.ws = scale;
1398                 lp->sndscale = scale;
1399         } else {
1400                 seg.ws = 0;
1401                 lp->sndscale = 0;
1402         }
1403
1404         switch (lp->version) {
1405                 case V4:
1406                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1407                         if (hbp == NULL)
1408                                 return -1;
1409                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1410                         break;
1411                 case V6:
1412                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1413                         if (hbp == NULL)
1414                                 return -1;
1415                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1416                         break;
1417                 default:
1418                         panic("sndsnack: version %d", lp->version);
1419         }
1420         lp->lastsend = NOW;
1421         return 0;
1422 }
1423
/*
 *  Limbo hash: last two address bytes plus the port, masked to the
 *  table size.  Every argument is parenthesised so that expression
 *  arguments expand safely.
 */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1425
1426 /*
1427  *  put a call into limbo and respond with a SYN ACK
1428  *
1429  *  called with proto locked
1430  */
1431 static void
1432 limbo(struct conv *s, uint8_t * source, uint8_t * dest, Tcp * seg, int version)
1433 {
1434         Limbo *lp, **l;
1435         struct tcppriv *tpriv;
1436         int h;
1437
1438         tpriv = s->p->priv;
1439         h = hashipa(source, seg->source);
1440
1441         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1442                 lp = *l;
1443                 if (lp->lport != seg->dest || lp->rport != seg->source
1444                         || lp->version != version)
1445                         continue;
1446                 if (ipcmp(lp->raddr, source) != 0)
1447                         continue;
1448                 if (ipcmp(lp->laddr, dest) != 0)
1449                         continue;
1450
1451                 /* each new SYN restarts the retransmits */
1452                 lp->irs = seg->seq;
1453                 break;
1454         }
1455         lp = *l;
1456         if (lp == NULL) {
1457                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1458                         lp = tpriv->lht[h];
1459                         tpriv->lht[h] = lp->next;
1460                         lp->next = NULL;
1461                 } else {
1462                         lp = kzmalloc(sizeof(*lp), 0);
1463                         if (lp == NULL)
1464                                 return;
1465                         tpriv->nlimbo++;
1466                 }
1467                 *l = lp;
1468                 lp->version = version;
1469                 ipmove(lp->laddr, dest);
1470                 ipmove(lp->raddr, source);
1471                 lp->lport = seg->dest;
1472                 lp->rport = seg->source;
1473                 lp->mss = seg->mss;
1474                 lp->rcvscale = seg->ws;
1475                 lp->irs = seg->seq;
1476                 lp->iss = (nrand(1 << 16) << 16) | nrand(1 << 16);
1477         }
1478
1479         if (sndsynack(s->p, lp) < 0) {
1480                 *l = lp->next;
1481                 tpriv->nlimbo--;
1482                 kfree(lp);
1483         }
1484 }
1485
1486 /*
1487  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1488  */
1489 static void limborexmit(struct Proto *tcp)
1490 {
1491         struct tcppriv *tpriv;
1492         Limbo **l, *lp;
1493         int h;
1494         int seen;
1495         uint32_t now;
1496
1497         tpriv = tcp->priv;
1498
1499         if (!canqlock(&tcp->qlock))
1500                 return;
1501         seen = 0;
1502         now = NOW;
1503         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1504                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1505                         lp = *l;
1506                         seen++;
1507                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1508                                 continue;
1509
1510                         /* time it out after 1 second */
1511                         if (++(lp->rexmits) > 5) {
1512                                 tpriv->nlimbo--;
1513                                 *l = lp->next;
1514                                 kfree(lp);
1515                                 continue;
1516                         }
1517
1518                         /* if we're being attacked, don't bother resending SYN ACK's */
1519                         if (tpriv->nlimbo > 100)
1520                                 continue;
1521
1522                         if (sndsynack(tcp, lp) < 0) {
1523                                 tpriv->nlimbo--;
1524                                 *l = lp->next;
1525                                 kfree(lp);
1526                                 continue;
1527                         }
1528
1529                         l = &lp->next;
1530                 }
1531         }
1532         qunlock(&tcp->qlock);
1533 }
1534
1535 /*
1536  *  lookup call in limbo.  if found, throw it out.
1537  *
1538  *  called with proto locked
1539  */
1540 static void
1541 limborst(struct conv *s, Tcp * segp, uint8_t * src, uint8_t * dst,
1542                  uint8_t version)
1543 {
1544         Limbo *lp, **l;
1545         int h;
1546         struct tcppriv *tpriv;
1547
1548         tpriv = s->p->priv;
1549
1550         /* find a call in limbo */
1551         h = hashipa(src, segp->source);
1552         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1553                 lp = *l;
1554                 if (lp->lport != segp->dest || lp->rport != segp->source
1555                         || lp->version != version)
1556                         continue;
1557                 if (ipcmp(lp->laddr, dst) != 0)
1558                         continue;
1559                 if (ipcmp(lp->raddr, src) != 0)
1560                         continue;
1561
1562                 /* RST can only follow the SYN */
1563                 if (segp->seq == lp->irs + 1) {
1564                         tpriv->nlimbo--;
1565                         *l = lp->next;
1566                         kfree(lp);
1567                 }
1568                 break;
1569         }
1570 }
1571
1572 /*
1573  *  come here when we finally get an ACK to our SYN-ACK.
1574  *  lookup call in limbo.  if found, create a new conversation
1575  *
1576  *  called with proto locked
1577  */
1578 static struct conv *tcpincoming(struct conv *s, Tcp * segp, uint8_t * src,
1579                                                                 uint8_t * dst, uint8_t version)
1580 {
1581         struct conv *new;
1582         Tcpctl *tcb;
1583         struct tcppriv *tpriv;
1584         Tcp4hdr *h4;
1585         Tcp6hdr *h6;
1586         Limbo *lp, **l;
1587         int h;
1588
1589         /* unless it's just an ack, it can't be someone coming out of limbo */
1590         if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
1591                 return NULL;
1592
1593         tpriv = s->p->priv;
1594
1595         /* find a call in limbo */
1596         h = hashipa(src, segp->source);
1597         for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
1598                 netlog(s->p->f, Logtcp,
1599                            "tcpincoming s %I,0x%x/%I,0x%x d %I,0x%x/%I,0x%x v %d/%d", src,
1600                            segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
1601                            lp->lport, version, lp->version);
1602
1603                 if (lp->lport != segp->dest || lp->rport != segp->source
1604                         || lp->version != version)
1605                         continue;
1606                 if (ipcmp(lp->laddr, dst) != 0)
1607                         continue;
1608                 if (ipcmp(lp->raddr, src) != 0)
1609                         continue;
1610
1611                 /* we're assuming no data with the initial SYN */
1612                 if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
1613                         netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx",
1614                                    segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
1615                         lp = NULL;
1616                 } else {
1617                         tpriv->nlimbo--;
1618                         *l = lp->next;
1619                 }
1620                 break;
1621         }
1622         if (lp == NULL)
1623                 return NULL;
1624
1625         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1626         if (new == NULL)
1627                 return NULL;
1628
1629         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1630         tcb = (Tcpctl *) new->ptcl;
1631         tcb->flags &= ~CLONE;
1632         tcb->timer.arg = new;
1633         tcb->timer.state = TcptimerOFF;
1634         tcb->acktimer.arg = new;
1635         tcb->acktimer.state = TcptimerOFF;
1636         tcb->katimer.arg = new;
1637         tcb->katimer.state = TcptimerOFF;
1638         tcb->rtt_timer.arg = new;
1639         tcb->rtt_timer.state = TcptimerOFF;
1640
1641         tcb->irs = lp->irs;
1642         tcb->rcv.nxt = tcb->irs + 1;
1643         tcb->rcv.urg = tcb->rcv.nxt;
1644
1645         tcb->iss = lp->iss;
1646         tcb->rttseq = tcb->iss;
1647         tcb->snd.wl2 = tcb->iss;
1648         tcb->snd.una = tcb->iss + 1;
1649         tcb->snd.ptr = tcb->iss + 1;
1650         tcb->snd.nxt = tcb->iss + 1;
1651         tcb->flgcnt = 0;
1652         tcb->flags |= SYNACK;
1653
1654         /* our sending max segment size cannot be bigger than what he asked for */
1655         if (lp->mss != 0 && lp->mss < tcb->mss)
1656                 tcb->mss = lp->mss;
1657
1658         /* window scaling */
1659         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1660
1661         /* the congestion window always starts out as a single segment */
1662         tcb->snd.wnd = segp->wnd;
1663         tcb->cwind = tcb->mss;
1664
1665         /* set initial round trip time */
1666         tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
1667         tcpsynackrtt(new);
1668
1669         kfree(lp);
1670
1671         /* set up proto header */
1672         switch (version) {
1673                 case V4:
1674                         h4 = &tcb->protohdr.tcp4hdr;
1675                         memset(h4, 0, sizeof(*h4));
1676                         h4->proto = IP_TCPPROTO;
1677                         hnputs(h4->tcpsport, new->lport);
1678                         hnputs(h4->tcpdport, new->rport);
1679                         v6tov4(h4->tcpsrc, dst);
1680                         v6tov4(h4->tcpdst, src);
1681                         break;
1682                 case V6:
1683                         h6 = &tcb->protohdr.tcp6hdr;
1684                         memset(h6, 0, sizeof(*h6));
1685                         h6->proto = IP_TCPPROTO;
1686                         hnputs(h6->tcpsport, new->lport);
1687                         hnputs(h6->tcpdport, new->rport);
1688                         ipmove(h6->tcpsrc, dst);
1689                         ipmove(h6->tcpdst, src);
1690                         break;
1691                 default:
1692                         panic("tcpincoming: version %d", new->ipversion);
1693         }
1694
1695         tcpsetstate(new, Established);
1696
1697         iphtadd(&tpriv->ht, new);
1698
1699         return new;
1700 }
1701
/*
 *  Return 1 if x lies in the inclusive sequence range [low, high],
 *  0 otherwise.  When low > high the range is taken to wrap around the
 *  top of the 32-bit space.
 */
int seq_within(uint32_t x, uint32_t low, uint32_t high)
{
	int at_or_above = (low <= x);
	int at_or_below = (x <= high);

	/* plain range: both bounds must hold */
	if (low <= high)
		return at_or_above && at_or_below;

	/* wrapped range: either side of the wrap is inside */
	return at_or_above || at_or_below;
}
1713
/* Return non-zero if x precedes y, judged by the sign of (int)(x - y). */
int seq_lt(uint32_t x, uint32_t y)
{
	const uint32_t diff = x - y;

	return (int)diff < 0;
}
1718
/* Return non-zero if x precedes or equals y, judged by the sign of (int)(x - y). */
int seq_le(uint32_t x, uint32_t y)
{
	const uint32_t diff = x - y;

	return (int)diff <= 0;
}
1723
/* Return non-zero if x follows y, judged by the sign of (int)(x - y). */
int seq_gt(uint32_t x, uint32_t y)
{
	const uint32_t diff = x - y;

	return (int)diff > 0;
}
1728
/* Return non-zero if x follows or equals y, judged by the sign of (int)(x - y). */
int seq_ge(uint32_t x, uint32_t y)
{
	const uint32_t diff = x - y;

	return (int)diff >= 0;
}
1733
1734 /*
1735  *  use the time between the first SYN and it's ack as the
1736  *  initial round trip time
1737  */
1738 void tcpsynackrtt(struct conv *s)
1739 {
1740         Tcpctl *tcb;
1741         int delta;
1742         struct tcppriv *tpriv;
1743
1744         tcb = (Tcpctl *) s->ptcl;
1745         tpriv = s->p->priv;
1746
1747         delta = NOW - tcb->sndsyntime;
1748         tcb->srtt = delta << LOGAGAIN;
1749         tcb->mdev = delta << LOGDGAIN;
1750
1751         /* halt round trip timer */
1752         tcphalt(tpriv, &tcb->rtt_timer);
1753 }
1754
1755 void update(struct conv *s, Tcp * seg)
1756 {
1757         int rtt, delta;
1758         Tcpctl *tcb;
1759         uint32_t acked;
1760         uint32_t expand;
1761         struct tcppriv *tpriv;
1762
1763         tpriv = s->p->priv;
1764         tcb = (Tcpctl *) s->ptcl;
1765
1766         /* if everything has been acked, force output(?) */
1767         if (seq_gt(seg->ack, tcb->snd.nxt)) {
1768                 tcb->flags |= FORCE;
1769                 return;
1770         }
1771
1772         /* added by Dong Lin for fast retransmission */
1773         if (seg->ack == tcb->snd.una
1774                 && tcb->snd.una != tcb->snd.nxt
1775                 && seg->len == 0 && seg->wnd == tcb->snd.wnd) {
1776
1777                 /* this is a pure ack w/o window update */
1778                 netlog(s->p->f, Logtcprxmt, "dupack %lu ack %lu sndwnd %d advwin %d\n",
1779                            tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1780
1781                 if (++tcb->snd.dupacks == TCPREXMTTHRESH) {
1782                         /*
1783                          *  tahoe tcp rxt the packet, half sshthresh,
1784                          *  and set cwnd to one packet
1785                          */
1786                         tcb->snd.recovery = 1;
1787                         tcb->snd.rxt = tcb->snd.nxt;
1788                         netlog(s->p->f, Logtcprxmt, "fast rxt %lu, nxt %lu\n", tcb->snd.una,
1789                                    tcb->snd.nxt);
1790                         tcprxmit(s);
1791                 } else {
1792                         /* do reno tcp here. */
1793                 }
1794         }
1795
1796         /*
1797          *  update window
1798          */
1799         if (seq_gt(seg->ack, tcb->snd.wl2)
1800                 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
1801                 tcb->snd.wnd = seg->wnd;
1802                 tcb->snd.wl2 = seg->ack;
1803         }
1804
1805         if (!seq_gt(seg->ack, tcb->snd.una)) {
1806                 /*
1807                  *  don't let us hangup if sending into a closed window and
1808                  *  we're still getting acks
1809                  */
1810                 if ((tcb->flags & RETRAN) && tcb->snd.wnd == 0) {
1811                         tcb->backedoff = MAXBACKMS / 4;
1812                 }
1813                 return;
1814         }
1815
1816         /*
1817          *  any positive ack turns off fast rxt,
1818          *  (should we do new-reno on partial acks?)
1819          */
1820         if (!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1821                 tcb->snd.dupacks = 0;
1822                 tcb->snd.recovery = 0;
1823         } else
1824                 netlog(s->p->f, Logtcp, "rxt next %lu, cwin %u\n", seg->ack,
1825                            tcb->cwind);
1826
1827         /* Compute the new send window size */
1828         acked = seg->ack - tcb->snd.una;
1829
1830         /* avoid slow start and timers for SYN acks */
1831         if ((tcb->flags & SYNACK) == 0) {
1832                 tcb->flags |= SYNACK;
1833                 acked--;
1834                 tcb->flgcnt--;
1835                 goto done;
1836         }
1837
1838         /* slow start as long as we're not recovering from lost packets */
1839         if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
1840                 if (tcb->cwind < tcb->ssthresh) {
1841                         expand = tcb->mss;
1842                         if (acked < expand)
1843                                 expand = acked;
1844                 } else
1845                         expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1846
1847                 if (tcb->cwind + expand < tcb->cwind)
1848                         expand = tcb->snd.wnd - tcb->cwind;
1849                 if (tcb->cwind + expand > tcb->snd.wnd)
1850                         expand = tcb->snd.wnd - tcb->cwind;
1851                 tcb->cwind += expand;
1852         }
1853
1854         /* Adjust the timers according to the round trip time */
1855         if (tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1856                 tcphalt(tpriv, &tcb->rtt_timer);
1857                 if ((tcb->flags & RETRAN) == 0) {
1858                         tcb->backoff = 0;
1859                         tcb->backedoff = 0;
1860                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1861                         if (rtt == 0)
1862                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
1863                         rtt *= MSPTICK;
1864                         if (tcb->srtt == 0) {
1865                                 tcb->srtt = rtt << LOGAGAIN;
1866                                 tcb->mdev = rtt << LOGDGAIN;
1867                         } else {
1868                                 delta = rtt - (tcb->srtt >> LOGAGAIN);
1869                                 tcb->srtt += delta;
1870                                 if (tcb->srtt <= 0)
1871                                         tcb->srtt = 1;
1872
1873                                 delta = abs(delta) - (tcb->mdev >> LOGDGAIN);
1874                                 tcb->mdev += delta;
1875                                 if (tcb->mdev <= 0)
1876                                         tcb->mdev = 1;
1877                         }
1878                         tcpsettimer(tcb);
1879                 }
1880         }
1881
1882 done:
1883         if (qdiscard(s->wq, acked) < acked)
1884                 tcb->flgcnt--;
1885
1886         tcb->snd.una = seg->ack;
1887         if (seq_gt(seg->ack, tcb->snd.urg))
1888                 tcb->snd.urg = seg->ack;
1889
1890         if (tcb->snd.una != tcb->snd.nxt)
1891                 tcpgo(tpriv, &tcb->timer);
1892         else
1893                 tcphalt(tpriv, &tcb->timer);
1894
1895         if (seq_lt(tcb->snd.ptr, tcb->snd.una))
1896                 tcb->snd.ptr = tcb->snd.una;
1897
1898         tcb->flags &= ~RETRAN;
1899         tcb->backoff = 0;
1900         tcb->backedoff = 0;
1901 }
1902
1903 void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
1904 {
1905         ERRSTACK(1);
1906         Tcp seg;
1907         Tcp4hdr *h4;
1908         Tcp6hdr *h6;
1909         int hdrlen;
1910         Tcpctl *tcb;
1911         uint16_t length;
1912         uint8_t source[IPaddrlen], dest[IPaddrlen];
1913         struct conv *s;
1914         struct Fs *f;
1915         struct tcppriv *tpriv;
1916         uint8_t version;
1917
1918         f = tcp->f;
1919         tpriv = tcp->priv;
1920
1921         tpriv->stats[InSegs]++;
1922
1923         h4 = (Tcp4hdr *) (bp->rp);
1924         h6 = (Tcp6hdr *) (bp->rp);
1925
1926         if ((h4->vihl & 0xF0) == IP_VER4) {
1927                 version = V4;
1928                 length = nhgets(h4->length);
1929                 v4tov6(dest, h4->tcpdst);
1930                 v4tov6(source, h4->tcpsrc);
1931
1932                 h4->Unused = 0;
1933                 hnputs(h4->tcplen, length - TCP4_PKT);
1934                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1935                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
1936                         tpriv->stats[CsumErrs]++;
1937                         tpriv->stats[InErrs]++;
1938                         netlog(f, Logtcp, "bad tcp proto cksum\n");
1939                         freeblist(bp);
1940                         return;
1941                 }
1942
1943                 hdrlen = ntohtcp4(&seg, &bp);
1944                 if (hdrlen < 0) {
1945                         tpriv->stats[HlenErrs]++;
1946                         tpriv->stats[InErrs]++;
1947                         netlog(f, Logtcp, "bad tcp hdr len\n");
1948                         return;
1949                 }
1950
1951                 /* trim the packet to the size claimed by the datagram */
1952                 length -= hdrlen + TCP4_PKT;
1953                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
1954                 if (bp == NULL) {
1955                         tpriv->stats[LenErrs]++;
1956                         tpriv->stats[InErrs]++;
1957                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
1958                         return;
1959                 }
1960         } else {
1961                 int ttl = h6->ttl;
1962                 int proto = h6->proto;
1963
1964                 version = V6;
1965                 length = nhgets(h6->ploadlen);
1966                 ipmove(dest, h6->tcpdst);
1967                 ipmove(source, h6->tcpsrc);
1968
1969                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
1970                 h6->ttl = proto;
1971                 hnputl(h6->vcf, length);
1972                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
1973                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
1974                         tpriv->stats[CsumErrs]++;
1975                         tpriv->stats[InErrs]++;
1976                         netlog(f, Logtcp, "bad tcp proto cksum\n");
1977                         freeblist(bp);
1978                         return;
1979                 }
1980                 h6->ttl = ttl;
1981                 h6->proto = proto;
1982                 hnputs(h6->ploadlen, length);
1983
1984                 hdrlen = ntohtcp6(&seg, &bp);
1985                 if (hdrlen < 0) {
1986                         tpriv->stats[HlenErrs]++;
1987                         tpriv->stats[InErrs]++;
1988                         netlog(f, Logtcp, "bad tcp hdr len\n");
1989                         return;
1990                 }
1991
1992                 /* trim the packet to the size claimed by the datagram */
1993                 length -= hdrlen;
1994                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
1995                 if (bp == NULL) {
1996                         tpriv->stats[LenErrs]++;
1997                         tpriv->stats[InErrs]++;
1998                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
1999                         return;
2000                 }
2001         }
2002
2003         /* lock protocol while searching for a conversation */
2004         qlock(&tcp->qlock);
2005
2006         /* Look for a matching conversation */
2007         s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2008         if (s == NULL) {
2009                 netlog(f, Logtcp, "iphtlook failed");
2010 reset:
2011                 qunlock(&tcp->qlock);
2012                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2013                 freeblist(bp);
2014                 return;
2015         }
2016
2017         /* if it's a listener, look for the right flags and get a new conv */
2018         tcb = (Tcpctl *) s->ptcl;
2019         if (tcb->state == Listen) {
2020                 if (seg.flags & RST) {
2021                         limborst(s, &seg, source, dest, version);
2022                         qunlock(&tcp->qlock);
2023                         freeblist(bp);
2024                         return;
2025                 }
2026
2027                 /* if this is a new SYN, put the call into limbo */
2028                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2029                         limbo(s, source, dest, &seg, version);
2030                         qunlock(&tcp->qlock);
2031                         freeblist(bp);
2032                         return;
2033                 }
2034
2035                 /*
2036                  *  if there's a matching call in limbo, tcpincoming will
2037                  *  return it in state Syn_received
2038                  */
2039                 s = tcpincoming(s, &seg, source, dest, version);
2040                 if (s == NULL)
2041                         goto reset;
2042         }
2043
2044         /* The rest of the input state machine is run with the control block
2045          * locked and implements the state machine directly out of the RFC.
2046          * Out-of-band data is ignored - it was always a bad idea.
2047          */
2048         tcb = (Tcpctl *) s->ptcl;
2049         if (waserror()) {
2050                 qunlock(&s->qlock);
2051                 nexterror();
2052         }
2053         qlock(&s->qlock);
2054         qunlock(&tcp->qlock);
2055
2056         /* fix up window */
2057         seg.wnd <<= tcb->rcv.scale;
2058
2059         /* every input packet in puts off the keep alive time out */
2060         tcpsetkacounter(tcb);
2061
2062         switch (tcb->state) {
2063                 case Closed:
2064                         sndrst(tcp, source, dest, length, &seg, version,
2065                                    "sending to Closed");
2066                         goto raise;
2067                 case Syn_sent:
2068                         if (seg.flags & ACK) {
2069                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2070                                         sndrst(tcp, source, dest, length, &seg, version,
2071                                                    "bad seq in Syn_sent");
2072                                         goto raise;
2073                                 }
2074                         }
2075                         if (seg.flags & RST) {
2076                                 if (seg.flags & ACK)
2077                                         localclose(s, Econrefused);
2078                                 goto raise;
2079                         }
2080
2081                         if (seg.flags & SYN) {
2082                                 procsyn(s, &seg);
2083                                 if (seg.flags & ACK) {
2084                                         update(s, &seg);
2085                                         tcpsynackrtt(s);
2086                                         tcpsetstate(s, Established);
2087                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2088                                 } else {
2089                                         tcb->time = NOW;
2090                                         tcpsetstate(s, Syn_received);   /* DLP - shouldn't this be a reset? */
2091                                 }
2092
2093                                 if (length != 0 || (seg.flags & FIN))
2094                                         break;
2095
2096                                 freeblist(bp);
2097                                 goto output;
2098                         } else
2099                                 freeblist(bp);
2100
2101                         qunlock(&s->qlock);
2102                         poperror();
2103                         return;
2104                 case Syn_received:
2105                         /* doesn't matter if it's the correct ack, we're just trying to set timing */
2106                         if (seg.flags & ACK)
2107                                 tcpsynackrtt(s);
2108                         break;
2109         }
2110
2111         /*
2112          *  One DOS attack is to open connections to us and then forget about them,
2113          *  thereby tying up a conv at no long term cost to the attacker.
2114          *  This is an attempt to defeat these stateless DOS attacks.  See
2115          *  corresponding code in tcpsendka().
2116          */
2117         if (tcb->state != Syn_received && (seg.flags & RST) == 0) {
2118                 if (tcpporthogdefense
2119                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2120                                                   tcb->snd.una - (1 << 29))) {
2121                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2122                                    source, seg.source, dest, seg.dest, seg.flags,
2123                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2124                         localclose(s, "stateless hog");
2125                 }
2126         }
2127
2128         /* Cut the data to fit the receive window */
2129         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2130                 netlog(f, Logtcp, "tcp len < 0, %lu %d\n", seg.seq, length);
2131                 update(s, &seg);
2132                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2133                         tcphalt(tpriv, &tcb->rtt_timer);
2134                         tcphalt(tpriv, &tcb->acktimer);
2135                         tcphalt(tpriv, &tcb->katimer);
2136                         tcpsetstate(s, Time_wait);
2137                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2138                         tcpgo(tpriv, &tcb->timer);
2139                 }
2140                 if (!(seg.flags & RST)) {
2141                         tcb->flags |= FORCE;
2142                         goto output;
2143                 }
2144                 qunlock(&s->qlock);
2145                 poperror();
2146                 return;
2147         }
2148
2149         /* Cannot accept so answer with a rst */
2150         if (length && tcb->state == Closed) {
2151                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2152                 goto raise;
2153         }
2154
2155         /* The segment is beyond the current receive pointer so
2156          * queue the data in the resequence queue
2157          */
2158         if (seg.seq != tcb->rcv.nxt)
2159                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2160                         update(s, &seg);
2161                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2162                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2163                                            s->lport);
2164                         tcb->flags |= FORCE;
2165                         goto output;
2166                 }
2167
2168         /*
2169          *  keep looping till we've processed this packet plus any
2170          *  adjacent packets in the resequence queue
2171          */
2172         for (;;) {
2173                 if (seg.flags & RST) {
2174                         if (tcb->state == Established) {
2175                                 tpriv->stats[EstabResets]++;
2176                                 if (tcb->rcv.nxt != seg.seq)
2177                                         printd
2178                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2179                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2180                                                  seg.seq);
2181                         }
2182                         localclose(s, Econrefused);
2183                         goto raise;
2184                 }
2185
2186                 if ((seg.flags & ACK) == 0)
2187                         goto raise;
2188
2189                 switch (tcb->state) {
2190                         case Syn_received:
2191                                 if (!seq_within(seg.ack, tcb->snd.una + 1, tcb->snd.nxt)) {
2192                                         sndrst(tcp, source, dest, length, &seg, version,
2193                                                    "bad seq in Syn_received");
2194                                         goto raise;
2195                                 }
2196                                 update(s, &seg);
2197                                 tcpsetstate(s, Established);
2198                         case Established:
2199                         case Close_wait:
2200                                 update(s, &seg);
2201                                 break;
2202                         case Finwait1:
2203                                 update(s, &seg);
2204                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2205                                         tcphalt(tpriv, &tcb->rtt_timer);
2206                                         tcphalt(tpriv, &tcb->acktimer);
2207                                         tcpsetkacounter(tcb);
2208                                         tcb->time = NOW;
2209                                         tcpsetstate(s, Finwait2);
2210                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2211                                         tcpgo(tpriv, &tcb->katimer);
2212                                 }
2213                                 break;
2214                         case Finwait2:
2215                                 update(s, &seg);
2216                                 break;
2217                         case Closing:
2218                                 update(s, &seg);
2219                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2220                                         tcphalt(tpriv, &tcb->rtt_timer);
2221                                         tcphalt(tpriv, &tcb->acktimer);
2222                                         tcphalt(tpriv, &tcb->katimer);
2223                                         tcpsetstate(s, Time_wait);
2224                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2225                                         tcpgo(tpriv, &tcb->timer);
2226                                 }
2227                                 break;
2228                         case Last_ack:
2229                                 update(s, &seg);
2230                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2231                                         localclose(s, NULL);
2232                                         goto raise;
2233                                 }
2234                         case Time_wait:
2235                                 tcb->flags |= FORCE;
2236                                 if (tcb->timer.state != TcptimerON)
2237                                         tcpgo(tpriv, &tcb->timer);
2238                 }
2239
2240                 if ((seg.flags & URG) && seg.urg) {
2241                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2242                                 tcb->rcv.urg = seg.urg + seg.seq;
2243                                 pullblock(&bp, seg.urg);
2244                         }
2245                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2246                         tcb->rcv.urg = tcb->rcv.nxt;
2247
2248                 if (length == 0) {
2249                         if (bp != NULL)
2250                                 freeblist(bp);
2251                 } else {
2252                         switch (tcb->state) {
2253                                 default:
2254                                         /* Ignore segment text */
2255                                         if (bp != NULL)
2256                                                 freeblist(bp);
2257                                         break;
2258
2259                                 case Syn_received:
2260                                 case Established:
2261                                 case Finwait1:
2262                                         /* If we still have some data place on
2263                                          * receive queue
2264                                          */
2265                                         if (bp) {
2266                                                 bp = packblock(bp);
2267                                                 if (bp == NULL)
2268                                                         panic("tcp packblock");
2269                                                 qpassnolim(s->rq, bp);
2270                                                 bp = NULL;
2271
2272                                                 /*
2273                                                  *  Force an ack every 2 data messages.  This is
2274                                                  *  a hack for rob to make his home system run
2275                                                  *  faster.
2276                                                  *
2277                                                  *  this also keeps the standard TCP congestion
2278                                                  *  control working since it needs an ack every
2279                                                  *  2 max segs worth.  This is not quite that,
2280                                                  *  but under a real stream is equivalent since
2281                                                  *  every packet has a max seg in it.
2282                                                  */
2283                                                 if (++(tcb->rcv.una) >= 2)
2284                                                         tcb->flags |= FORCE;
2285                                         }
2286                                         tcb->rcv.nxt += length;
2287
2288                                         /*
2289                                          *  update our rcv window
2290                                          */
2291                                         tcprcvwin(s);
2292
2293                                         /*
2294                                          *  turn on the acktimer if there's something
2295                                          *  to ack
2296                                          */
2297                                         if (tcb->acktimer.state != TcptimerON)
2298                                                 tcpgo(tpriv, &tcb->acktimer);
2299
2300                                         break;
2301                                 case Finwait2:
2302                                         /* no process to read the data, send a reset */
2303                                         if (bp != NULL)
2304                                                 freeblist(bp);
2305                                         sndrst(tcp, source, dest, length, &seg, version,
2306                                                    "send to Finwait2");
2307                                         qunlock(&s->qlock);
2308                                         poperror();
2309                                         return;
2310                         }
2311                 }
2312
2313                 if (seg.flags & FIN) {
2314                         tcb->flags |= FORCE;
2315
2316                         switch (tcb->state) {
2317                                 case Syn_received:
2318                                 case Established:
2319                                         tcb->rcv.nxt++;
2320                                         tcpsetstate(s, Close_wait);
2321                                         break;
2322                                 case Finwait1:
2323                                         tcb->rcv.nxt++;
2324                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2325                                                 tcphalt(tpriv, &tcb->rtt_timer);
2326                                                 tcphalt(tpriv, &tcb->acktimer);
2327                                                 tcphalt(tpriv, &tcb->katimer);
2328                                                 tcpsetstate(s, Time_wait);
2329                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2330                                                 tcpgo(tpriv, &tcb->timer);
2331                                         } else
2332                                                 tcpsetstate(s, Closing);
2333                                         break;
2334                                 case Finwait2:
2335                                         tcb->rcv.nxt++;
2336                                         tcphalt(tpriv, &tcb->rtt_timer);
2337                                         tcphalt(tpriv, &tcb->acktimer);
2338                                         tcphalt(tpriv, &tcb->katimer);
2339                                         tcpsetstate(s, Time_wait);
2340                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2341                                         tcpgo(tpriv, &tcb->timer);
2342                                         break;
2343                                 case Close_wait:
2344                                 case Closing:
2345                                 case Last_ack:
2346                                         break;
2347                                 case Time_wait:
2348                                         tcpgo(tpriv, &tcb->timer);
2349                                         break;
2350                         }
2351                 }
2352
2353                 /*
2354                  *  get next adjacent segment from the resequence queue.
2355                  *  dump/trim any overlapping segments
2356                  */
2357                 for (;;) {
2358                         if (tcb->reseq == NULL)
2359                                 goto output;
2360
2361                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2362                                 goto output;
2363
2364                         getreseq(tcb, &seg, &bp, &length);
2365
2366                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2367                                 break;
2368                 }
2369         }
2370 output:
2371         tcpoutput(s);
2372         qunlock(&s->qlock);
2373         poperror();
2374         return;
2375 raise:
2376         qunlock(&s->qlock);
2377         poperror();
2378         freeblist(bp);
2379         tcpkick(s);
2380 }
2381
2382 /*
2383  *  always enters and exits with the s locked.  We drop
2384  *  the lock to ipoput the packet so some care has to be
2385  *  taken by callers.
2386  */
2387 void tcpoutput(struct conv *s)
2388 {
2389         Tcp seg;
2390         int msgs;
2391         Tcpctl *tcb;
2392         struct block *hbp, *bp;
2393         int sndcnt, n;
2394         uint32_t ssize, dsize, usable, sent;
2395         struct Fs *f;
2396         struct tcppriv *tpriv;
2397         uint8_t version;
2398
2399         f = s->p->f;
2400         tpriv = s->p->priv;
2401         version = s->ipversion;
2402
2403         for (msgs = 0; msgs < 100; msgs++) {
2404                 tcb = (Tcpctl *) s->ptcl;
2405
2406                 switch (tcb->state) {
2407                         case Listen:
2408                         case Closed:
2409                         case Finwait2:
2410                                 return;
2411                 }
2412
2413                 /* force an ack when a window has opened up */
2414                 if (tcb->rcv.blocked && tcb->rcv.wnd > 0) {
2415                         tcb->rcv.blocked = 0;
2416                         tcb->flags |= FORCE;
2417                 }
2418
2419                 sndcnt = qlen(s->wq) + tcb->flgcnt;
2420                 sent = tcb->snd.ptr - tcb->snd.una;
2421
2422                 /* Don't send anything else until our SYN has been acked */
2423                 if (tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2424                         break;
2425
2426                 /* Compute usable segment based on offered window and limit
2427                  * window probes to one
2428                  */
2429                 if (tcb->snd.wnd == 0) {
2430                         if (sent != 0) {
2431                                 if ((tcb->flags & FORCE) == 0)
2432                                         break;
2433 //              tcb->snd.ptr = tcb->snd.una;
2434                         }
2435                         usable = 1;
2436                 } else {
2437                         usable = tcb->cwind;
2438                         if (tcb->snd.wnd < usable)
2439                                 usable = tcb->snd.wnd;
2440                         usable -= sent;
2441                 }
2442                 ssize = sndcnt - sent;
2443                 if (ssize && usable < 2)
2444                         netlog(s->p->f, Logtcp, "throttled snd.wnd %lu cwind %lu\n",
2445                                    tcb->snd.wnd, tcb->cwind);
2446                 if (usable < ssize)
2447                         ssize = usable;
2448                 if (tcb->mss < ssize)
2449                         ssize = tcb->mss;
2450                 dsize = ssize;
2451                 seg.urg = 0;
2452
2453                 if (ssize == 0)
2454                         if ((tcb->flags & FORCE) == 0)
2455                                 break;
2456
2457                 tcb->flags &= ~FORCE;
2458                 tcprcvwin(s);
2459
2460                 /* By default we will generate an ack */
2461                 tcphalt(tpriv, &tcb->acktimer);
2462                 tcb->rcv.una = 0;
2463                 seg.source = s->lport;
2464                 seg.dest = s->rport;
2465                 seg.flags = ACK;
2466                 seg.mss = 0;
2467                 seg.ws = 0;
2468                 switch (tcb->state) {
2469                         case Syn_sent:
2470                                 seg.flags = 0;
2471                                 if (tcb->snd.ptr == tcb->iss) {
2472                                         seg.flags |= SYN;
2473                                         dsize--;
2474                                         seg.mss = tcb->mss;
2475                                         seg.ws = tcb->scale;
2476                                 }
2477                                 break;
2478                         case Syn_received:
2479                                 /*
2480                                  *  don't send any data with a SYN/ACK packet
2481                                  *  because Linux rejects the packet in its
2482                                  *  attempt to solve the SYN attack problem
2483                                  */
2484                                 if (tcb->snd.ptr == tcb->iss) {
2485                                         seg.flags |= SYN;
2486                                         dsize = 0;
2487                                         ssize = 1;
2488                                         seg.mss = tcb->mss;
2489                                         seg.ws = tcb->scale;
2490                                 }
2491                                 break;
2492                 }
2493                 seg.seq = tcb->snd.ptr;
2494                 seg.ack = tcb->rcv.nxt;
2495                 seg.wnd = tcb->rcv.wnd;
2496
2497                 /* Pull out data to send */
2498                 bp = NULL;
2499                 if (dsize != 0) {
2500                         bp = qcopy(s->wq, dsize, sent);
2501                         if (BLEN(bp) != dsize) {
2502                                 seg.flags |= FIN;
2503                                 dsize--;
2504                         }
2505                 }
2506
2507                 if (sent + dsize == sndcnt)
2508                         seg.flags |= PSH;
2509
2510                 /* keep track of balance of resent data */
2511                 if (seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2512                         n = tcb->snd.nxt - tcb->snd.ptr;
2513                         if (ssize < n)
2514                                 n = ssize;
2515                         tcb->resent += n;
2516                         netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr 0x%lx nxt 0x%lx\n",
2517                                    s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr,
2518                                    tcb->snd.nxt);
2519                         tpriv->stats[RetransSegs]++;
2520                 }
2521
2522                 tcb->snd.ptr += ssize;
2523
2524                 /* Pull up the send pointer so we can accept acks
2525                  * for this window
2526                  */
2527                 if (seq_gt(tcb->snd.ptr, tcb->snd.nxt))
2528                         tcb->snd.nxt = tcb->snd.ptr;
2529
2530                 /* Build header, link data and compute cksum */
2531                 switch (version) {
2532                         case V4:
2533                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2534                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2535                                 if (hbp == NULL) {
2536                                         freeblist(bp);
2537                                         return;
2538                                 }
2539                                 break;
2540                         case V6:
2541                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2542                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2543                                 if (hbp == NULL) {
2544                                         freeblist(bp);
2545                                         return;
2546                                 }
2547                                 break;
2548                         default:
2549                                 hbp = NULL;     /* to suppress a warning */
2550                                 panic("tcpoutput: version %d", version);
2551                 }
2552
2553                 /* Start the transmission timers if there is new data and we
2554                  * expect acknowledges
2555                  */
2556                 if (ssize != 0) {
2557                         if (tcb->timer.state != TcptimerON)
2558                                 tcpgo(tpriv, &tcb->timer);
2559
2560                         /*  If round trip timer isn't running, start it.
2561                          *  measure the longest packet only in case the
2562                          *  transmission time dominates RTT
2563                          */
2564                         if (tcb->rtt_timer.state != TcptimerON)
2565                                 if (ssize == tcb->mss) {
2566                                         tcpgo(tpriv, &tcb->rtt_timer);
2567                                         tcb->rttseq = tcb->snd.ptr;
2568                                 }
2569                 }
2570
2571                 tpriv->stats[OutSegs]++;
2572
2573                 /* put off the next keep alive */
2574                 tcpgo(tpriv, &tcb->katimer);
2575
2576                 switch (version) {
2577                         case V4:
2578                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2579                                         /* a negative return means no route */
2580                                         localclose(s, "no route");
2581                                 }
2582                                 break;
2583                         case V6:
2584                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2585                                         /* a negative return means no route */
2586                                         localclose(s, "no route");
2587                                 }
2588                                 break;
2589                         default:
2590                                 panic("tcpoutput2: version %d", version);
2591                 }
2592                 if ((msgs % 4) == 1) {
2593                         qunlock(&s->qlock);
2594                         kthread_yield();
2595                         qlock(&s->qlock);
2596                 }
2597         }
2598 }
2599
2600 /*
2601  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
2602  */
2603 void tcpsendka(struct conv *s)
2604 {
2605         Tcp seg;
2606         Tcpctl *tcb;
2607         struct block *hbp, *dbp;
2608
2609         tcb = (Tcpctl *) s->ptcl;
2610
2611         dbp = NULL;
2612         seg.urg = 0;
2613         seg.source = s->lport;
2614         seg.dest = s->rport;
2615         seg.flags = ACK | PSH;
2616         seg.mss = 0;
2617         seg.ws = 0;
2618         if (tcpporthogdefense)
2619                 seg.seq = tcb->snd.una - (1 << 30) - nrand(1 << 20);
2620         else
2621                 seg.seq = tcb->snd.una - 1;
2622         seg.ack = tcb->rcv.nxt;
2623         tcb->rcv.una = 0;
2624         seg.wnd = tcb->rcv.wnd;
2625         if (tcb->state == Finwait2) {
2626                 seg.flags |= FIN;
2627         } else {
2628                 dbp = allocb(1);
2629                 dbp->wp++;
2630         }
2631
2632         if (isv4(s->raddr)) {
2633                 /* Build header, link data and compute cksum */
2634                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2635                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2636                 if (hbp == NULL) {
2637                         freeblist(dbp);
2638                         return;
2639                 }
2640                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2641         } else {
2642                 /* Build header, link data and compute cksum */
2643                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2644                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2645                 if (hbp == NULL) {
2646                         freeblist(dbp);
2647                         return;
2648                 }
2649                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2650         }
2651 }
2652
2653 /*
2654  *  set connection to time out after 12 minutes
2655  */
2656 void tcpsetkacounter(Tcpctl * tcb)
2657 {
2658         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
2659         if (tcb->kacounter < 3)
2660                 tcb->kacounter = 3;
2661 }
2662
2663 /*
2664  *  if we've timed out, close the connection
2665  *  otherwise, send a keepalive and restart the timer
2666  */
2667 void tcpkeepalive(void *v)
2668 {
2669         ERRSTACK(1);
2670         Tcpctl *tcb;
2671         struct conv *s;
2672
2673         s = v;
2674         tcb = (Tcpctl *) s->ptcl;
2675         if (waserror()) {
2676                 qunlock(&s->qlock);
2677                 nexterror();
2678         }
2679         qlock(&s->qlock);
2680         if (tcb->state != Closed) {
2681                 if (--(tcb->kacounter) <= 0) {
2682                         localclose(s, Etimedout);
2683                 } else {
2684                         tcpsendka(s);
2685                         tcpgo(s->p->priv, &tcb->katimer);
2686                 }
2687         }
2688         qunlock(&s->qlock);
2689         poperror();
2690 }
2691
2692 /*
2693  *  start keepalive timer
2694  */
2695 char *tcpstartka(struct conv *s, char **f, int n)
2696 {
2697         Tcpctl *tcb;
2698         int x;
2699
2700         tcb = (Tcpctl *) s->ptcl;
2701         if (tcb->state != Established)
2702                 return "connection must be in Establised state";
2703         if (n > 1) {
2704                 x = atoi(f[1]);
2705                 if (x >= MSPTICK)
2706                         tcb->katimer.start = x / MSPTICK;
2707         }
2708         tcpsetkacounter(tcb);
2709         tcpgo(s->p->priv, &tcb->katimer);
2710
2711         return NULL;
2712 }
2713
2714 /*
2715  *  turn checksums on/off
2716  */
2717 char *tcpsetchecksum(struct conv *s, char **f, int unused)
2718 {
2719         Tcpctl *tcb;
2720
2721         tcb = (Tcpctl *) s->ptcl;
2722         tcb->nochecksum = !atoi(f[1]);
2723
2724         return NULL;
2725 }
2726
2727 void tcprxmit(struct conv *s)
2728 {
2729         Tcpctl *tcb;
2730
2731         tcb = (Tcpctl *) s->ptcl;
2732
2733         tcb->flags |= RETRAN | FORCE;
2734         tcb->snd.ptr = tcb->snd.una;
2735
2736         /*
2737          *  We should be halving the slow start threshhold (down to one
2738          *  mss) but leaving it at mss seems to work well enough
2739          */
2740         tcb->ssthresh = tcb->mss;
2741
2742         /*
2743          *  pull window down to a single packet
2744          */
2745         tcb->cwind = tcb->mss;
2746         tcpoutput(s);
2747 }
2748
2749 void tcptimeout(void *arg)
2750 {
2751         ERRSTACK(1);
2752         struct conv *s;
2753         Tcpctl *tcb;
2754         int maxback;
2755         struct tcppriv *tpriv;
2756
2757         s = (struct conv *)arg;
2758         tpriv = s->p->priv;
2759         tcb = (Tcpctl *) s->ptcl;
2760
2761         if (waserror()) {
2762                 qunlock(&s->qlock);
2763                 nexterror();
2764         }
2765         qlock(&s->qlock);
2766         switch (tcb->state) {
2767                 default:
2768                         tcb->backoff++;
2769                         if (tcb->state == Syn_sent)
2770                                 maxback = MAXBACKMS / 2;
2771                         else
2772                                 maxback = MAXBACKMS;
2773                         tcb->backedoff += tcb->timer.start * MSPTICK;
2774                         if (tcb->backedoff >= maxback) {
2775                                 localclose(s, Etimedout);
2776                                 break;
2777                         }
2778                         netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lx %d/%d\n",
2779                                    tcb->snd.una, tcb->timer.start, NOW);
2780                         tcpsettimer(tcb);
2781                         tcprxmit(s);
2782                         tpriv->stats[RetransTimeouts]++;
2783                         tcb->snd.dupacks = 0;
2784                         break;
2785                 case Time_wait:
2786                         localclose(s, NULL);
2787                         break;
2788                 case Closed:
2789                         break;
2790         }
2791         qunlock(&s->qlock);
2792         poperror();
2793 }
2794
2795 int inwindow(Tcpctl * tcb, int seq)
2796 {
2797         return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1);
2798 }
2799
2800 /*
2801  *  set up state for a received SYN (or SYN ACK) packet
2802  */
2803 void procsyn(struct conv *s, Tcp * seg)
2804 {
2805         Tcpctl *tcb;
2806
2807         tcb = (Tcpctl *) s->ptcl;
2808         tcb->flags |= FORCE;
2809
2810         tcb->rcv.nxt = seg->seq + 1;
2811         tcb->rcv.urg = tcb->rcv.nxt;
2812         tcb->irs = seg->seq;
2813
2814         /* our sending max segment size cannot be bigger than what he asked for */
2815         if (seg->mss != 0 && seg->mss < tcb->mss)
2816                 tcb->mss = seg->mss;
2817
2818         /* the congestion window always starts out as a single segment */
2819         tcb->snd.wnd = seg->wnd;
2820         tcb->cwind = tcb->mss;
2821 }
2822
2823 int
2824 addreseq(Tcpctl * tcb, struct tcppriv *tpriv, Tcp * seg,
2825                  struct block *bp, uint16_t length)
2826 {
2827         Reseq *rp, *rp1;
2828         int i, rqlen, qmax;
2829
2830         rp = kzmalloc(sizeof(Reseq), 0);
2831         if (rp == NULL) {
2832                 freeblist(bp);  /* bp always consumed by add_reseq */
2833                 return 0;
2834         }
2835
2836         rp->seg = *seg;
2837         rp->bp = bp;
2838         rp->length = length;
2839
2840         /* Place on reassembly list sorting by starting seq number */
2841         rp1 = tcb->reseq;
2842         if (rp1 == NULL || seq_lt(seg->seq, rp1->seg.seq)) {
2843                 rp->next = rp1;
2844                 tcb->reseq = rp;
2845                 if (rp->next != NULL)
2846                         tpriv->stats[OutOfOrder]++;
2847                 return 0;
2848         }
2849
2850         rqlen = 0;
2851         for (i = 0;; i++) {
2852                 rqlen += rp1->length;
2853                 if (rp1->next == NULL || seq_lt(seg->seq, rp1->next->seg.seq)) {
2854                         rp->next = rp1->next;
2855                         rp1->next = rp;
2856                         if (rp->next != NULL)
2857                                 tpriv->stats[OutOfOrder]++;
2858                         break;
2859                 }
2860                 rp1 = rp1->next;
2861         }
2862         qmax = QMAX << tcb->rcv.scale;
2863         if (rqlen > qmax) {
2864                 printd("resequence queue > window: %d > %d\n", rqlen, qmax);
2865                 i = 0;
2866                 for (rp1 = tcb->reseq; rp1 != NULL; rp1 = rp1->next) {
2867                         printd("0x%#lx 0x%#lx 0x%#x\n", rp1->seg.seq,
2868                                    rp1->seg.ack, rp1->seg.flags);
2869                         if (i++ > 10) {
2870                                 printd("...\n");
2871                                 break;
2872                         }
2873                 }
2874
2875                 // delete entire reassembly queue; wait for retransmit.
2876                 // - should we be smarter and only delete the tail?
2877                 for (rp = tcb->reseq; rp != NULL; rp = rp1) {
2878                         rp1 = rp->next;
2879                         freeblist(rp->bp);
2880                         kfree(rp);
2881                 }
2882                 tcb->reseq = NULL;
2883
2884                 return -1;
2885         }
2886         return 0;
2887 }
2888
2889 void getreseq(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2890 {
2891         Reseq *rp;
2892
2893         rp = tcb->reseq;
2894         if (rp == NULL)
2895                 return;
2896
2897         tcb->reseq = rp->next;
2898
2899         *seg = rp->seg;
2900         *bp = rp->bp;
2901         *length = rp->length;
2902
2903         kfree(rp);
2904 }
2905
2906 int tcptrim(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2907 {
2908         uint16_t len;
2909         uint8_t accept;
2910         int dupcnt, excess;
2911
2912         accept = 0;
2913         len = *length;
2914         if (seg->flags & SYN)
2915                 len++;
2916         if (seg->flags & FIN)
2917                 len++;
2918
2919         if (tcb->rcv.wnd == 0) {
2920                 if (len == 0 && seg->seq == tcb->rcv.nxt)
2921                         return 0;
2922         } else {
2923                 /* Some part of the segment should be in the window */
2924                 if (inwindow(tcb, seg->seq))
2925                         accept++;
2926                 else if (len != 0) {
2927                         if (inwindow(tcb, seg->seq + len - 1) ||
2928                                 seq_within(tcb->rcv.nxt, seg->seq, seg->seq + len - 1))
2929                                 accept++;
2930                 }
2931         }
2932         if (!accept) {
2933                 freeblist(*bp);
2934                 return -1;
2935         }
2936         dupcnt = tcb->rcv.nxt - seg->seq;
2937         if (dupcnt > 0) {
2938                 tcb->rerecv += dupcnt;
2939                 if (seg->flags & SYN) {
2940                         seg->flags &= ~SYN;
2941                         seg->seq++;
2942
2943                         if (seg->urg > 1)
2944                                 seg->urg--;
2945                         else
2946                                 seg->flags &= ~URG;
2947                         dupcnt--;
2948                 }
2949                 if (dupcnt > 0) {
2950                         pullblock(bp, (uint16_t) dupcnt);
2951                         seg->seq += dupcnt;
2952                         *length -= dupcnt;
2953
2954                         if (seg->urg > dupcnt)
2955                                 seg->urg -= dupcnt;
2956                         else {
2957                                 seg->flags &= ~URG;
2958                                 seg->urg = 0;
2959                         }
2960                 }
2961         }
2962         excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
2963         if (excess > 0) {
2964                 tcb->rerecv += excess;
2965                 *length -= excess;
2966                 *bp = trimblock(*bp, 0, *length);
2967                 if (*bp == NULL)
2968                         panic("presotto is a boofhead");
2969                 seg->flags &= ~FIN;
2970         }
2971         return 0;
2972 }
2973
2974 void tcpadvise(struct Proto *tcp, struct block *bp, char *msg)
2975 {
2976         Tcp4hdr *h4;
2977         Tcp6hdr *h6;
2978         Tcpctl *tcb;
2979         uint8_t source[IPaddrlen];
2980         uint8_t dest[IPaddrlen];
2981         uint16_t psource, pdest;
2982         struct conv *s, **p;
2983
2984         h4 = (Tcp4hdr *) (bp->rp);
2985         h6 = (Tcp6hdr *) (bp->rp);
2986
2987         if ((h4->vihl & 0xF0) == IP_VER4) {
2988                 v4tov6(dest, h4->tcpdst);
2989                 v4tov6(source, h4->tcpsrc);
2990                 psource = nhgets(h4->tcpsport);
2991                 pdest = nhgets(h4->tcpdport);
2992         } else {
2993                 ipmove(dest, h6->tcpdst);
2994                 ipmove(source, h6->tcpsrc);
2995                 psource = nhgets(h6->tcpsport);
2996                 pdest = nhgets(h6->tcpdport);
2997         }
2998
2999         /* Look for a connection */
3000         qlock(&tcp->qlock);
3001         for (p = tcp->conv; *p; p++) {
3002                 s = *p;
3003                 tcb = (Tcpctl *) s->ptcl;
3004                 if (s->rport == pdest)
3005                         if (s->lport == psource)
3006                                 if (tcb->state != Closed)
3007                                         if (ipcmp(s->raddr, dest) == 0)
3008                                                 if (ipcmp(s->laddr, source) == 0) {
3009                                                         qlock(&s->qlock);
3010                                                         qunlock(&tcp->qlock);
3011                                                         switch (tcb->state) {
3012                                                                 case Syn_sent:
3013                                                                         localclose(s, msg);
3014                                                                         break;
3015                                                         }
3016                                                         qunlock(&s->qlock);
3017                                                         freeblist(bp);
3018                                                         return;
3019                                                 }
3020         }
3021         qunlock(&tcp->qlock);
3022         freeblist(bp);
3023 }
3024
3025 static char *tcpporthogdefensectl(char *val)
3026 {
3027         if (strcmp(val, "on") == 0)
3028                 tcpporthogdefense = 1;
3029         else if (strcmp(val, "off") == 0)
3030                 tcpporthogdefense = 0;
3031         else
3032                 return "unknown value for tcpporthogdefense";
3033         return NULL;
3034 }
3035
/*
 *  Dispatch a TCP control request.  Called with c qlocked.
 *
 *  f: the request split into words, f[0] being the command
 *  n: number of words in f
 *
 *  Recognised commands:
 *      hangup                  (no arguments)
 *      keepalive [ms]
 *      checksum value
 *      tcpporthogdefense on|off
 *
 *  "checksum" and "tcpporthogdefense" read f[1] unconditionally in their
 *  handlers, so a request without that argument is rejected here rather
 *  than passed down.
 *
 *  Returns NULL on success or a static error string.
 */
char *tcpctl(struct conv *c, char **f, int n)
{
        if (n == 1 && strcmp(f[0], "hangup") == 0)
                return tcphangup(c);
        if (n >= 1 && strcmp(f[0], "keepalive") == 0)
                return tcpstartka(c, f, n);
        if (n >= 1 && strcmp(f[0], "checksum") == 0) {
                if (n < 2)
                        return "checksum requires an argument";
                return tcpsetchecksum(c, f, n);
        }
        if (n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0) {
                if (n < 2)
                        return "tcpporthogdefense requires an argument";
                return tcpporthogdefensectl(f[1]);
        }
        return "unknown control request";
}
3049
3050 int tcpstats(struct Proto *tcp, char *buf, int len)
3051 {
3052         struct tcppriv *priv;
3053         char *p, *e;
3054         int i;
3055
3056         priv = tcp->priv;
3057         p = buf;
3058         e = p + len;
3059         for (i = 0; i < Nstats; i++)
3060                 p = seprintf(p, e, "%s: %u\n", statnames[i], priv->stats[i]);
3061         return p - buf;
3062 }
3063
3064 /*
3065  *  garbage collect any stale conversations:
3066  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3067  *      - Finwait2 after 5 minutes
3068  *
3069  *  this is called whenever we run out of channels.  Both checks are
3070  *  of questionable validity so we try to use them only when we're
3071  *  up against the wall.
3072  */
3073 int tcpgc(struct Proto *tcp)
3074 {
3075         struct conv *c, **pp, **ep;
3076         int n;
3077         Tcpctl *tcb;
3078
3079         n = 0;
3080         ep = &tcp->conv[tcp->nc];
3081         for (pp = tcp->conv; pp < ep; pp++) {
3082                 c = *pp;
3083                 if (c == NULL)
3084                         break;
3085                 if (!canqlock(&c->qlock))
3086                         continue;
3087                 tcb = (Tcpctl *) c->ptcl;
3088                 switch (tcb->state) {
3089                         case Syn_received:
3090                                 if (NOW - tcb->time > 5000) {
3091                                         localclose(c, "timed out");
3092                                         n++;
3093                                 }
3094                                 break;
3095                         case Finwait2:
3096                                 if (NOW - tcb->time > 5 * 60 * 1000) {
3097                                         localclose(c, "timed out");
3098                                         n++;
3099                                 }
3100                                 break;
3101                 }
3102                 qunlock(&c->qlock);
3103         }
3104         return n;
3105 }
3106
3107 void tcpsettimer(Tcpctl * tcb)
3108 {
3109         int x;
3110
3111         /* round trip dependency */
3112         x = backoff(tcb->backoff) *
3113                 (tcb->mdev + (tcb->srtt >> LOGAGAIN) + MSPTICK) / MSPTICK;
3114
3115         /* bounded twixt 1/2 and 64 seconds */
3116         if (x < 500 / MSPTICK)
3117                 x = 500 / MSPTICK;
3118         else if (x > (64000 / MSPTICK))
3119                 x = 64000 / MSPTICK;
3120         tcb->timer.start = x;
3121 }
3122
3123 void tcpinit(struct Fs *fs)
3124 {
3125         struct Proto *tcp;
3126         struct tcppriv *tpriv;
3127
3128         tcp = kzmalloc(sizeof(struct Proto), 0);
3129         tpriv = tcp->priv = kzmalloc(sizeof(struct tcppriv), 0);
3130         qlock_init(&tpriv->tl);
3131         qlock_init(&tpriv->apl);
3132         tcp->name = "tcp";
3133         tcp->connect = tcpconnect;
3134         tcp->announce = tcpannounce;
3135         tcp->ctl = tcpctl;
3136         tcp->state = tcpstate;
3137         tcp->create = tcpcreate;
3138         tcp->close = tcpclose;
3139         tcp->rcv = tcpiput;
3140         tcp->advise = tcpadvise;
3141         tcp->stats = tcpstats;
3142         tcp->inuse = tcpinuse;
3143         tcp->gc = tcpgc;
3144         tcp->ipproto = IP_TCPPROTO;
3145         tcp->nc = scalednconv();
3146         tcp->ptclsize = sizeof(Tcpctl);
3147         tpriv->stats[MaxConn] = tcp->nc;
3148
3149         Fsproto(fs, tcp);
3150 }
3151
3152 void
3153 tcpsetscale(struct conv *s, Tcpctl * tcb, uint16_t rcvscale, uint16_t sndscale)
3154 {
3155         if (rcvscale) {
3156                 tcb->rcv.scale = rcvscale & 0xff;
3157                 tcb->snd.scale = sndscale & 0xff;
3158                 tcb->window = QMAX << tcb->snd.scale;
3159                 qsetlimit(s->rq, tcb->window);
3160         } else {
3161                 tcb->rcv.scale = 0;
3162                 tcb->snd.scale = 0;
3163                 tcb->window = QMAX;
3164                 qsetlimit(s->rq, tcb->window);
3165         }
3166 }