Implement TSO
[akaros.git] / kern / src / net / tcp.c
1 // INFERNO
2 #include <vfs.h>
3 #include <kfs.h>
4 #include <slab.h>
5 #include <kmalloc.h>
6 #include <kref.h>
7 #include <string.h>
8 #include <stdio.h>
9 #include <assert.h>
10 #include <error.h>
11 #include <cpio.h>
12 #include <pmap.h>
13 #include <smp.h>
14 #include <ip.h>
15
16 #include <vfs.h>
17 #include <kfs.h>
18 #include <slab.h>
19 #include <kmalloc.h>
20 #include <kref.h>
21 #include <string.h>
22 #include <stdio.h>
23 #include <assert.h>
24 #include <error.h>
25 #include <cpio.h>
26 #include <pmap.h>
27 #include <smp.h>
28 #include <ip.h>
29
/*
 * Protocol-wide constants: queue limit, header sizes, timer states and
 * intervals, TCP header flag bits, option codes and lengths, tcb flag bits,
 * connection states and limbo table sizing.
 */
enum {
	QMAX = 64 * 1024 - 1,	/* default queue limit / receive window */
	IP_TCPPROTO = 6,	/* value stored in the proto field of the prototype headers */

	TCP4_IPLEN = 8,
	TCP4_PHDRSIZE = 12,
	TCP4_HDRSIZE = 20,
	TCP4_TCBPHDRSZ = 40,
	TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE,

	TCP6_IPLEN = 0,
	TCP6_PHDRSIZE = 40,
	TCP6_HDRSIZE = 20,
	TCP6_TCBPHDRSZ = 60,
	TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE,

	/* Tcptimer.state values */
	TcptimerOFF = 0,
	TcptimerON = 1,
	TcptimerDONE = 2,
	MAX_TIME = (1 << 20),	/* Forever */
	TCP_ACK = 50,	/* Timed ack sequence in ms */
	MAXBACKMS = 9 * 60 * 1000,	/* longest backoff time (ms) before hangup */

	/* TCP header flag bits */
	URG = 0x20,	/* Data marked urgent */
	ACK = 0x10,	/* Acknowledge is valid */
	PSH = 0x08,	/* Whole data pipe is pushed */
	RST = 0x04,	/* Reset connection */
	SYN = 0x02,	/* Pkt. is synchronise */
	FIN = 0x01,	/* Start close down */

	/* TCP option kinds and the header bytes each option occupies */
	EOLOPT = 0,
	NOOPOPT = 1,
	MSSOPT = 2,
	MSS_LENGTH = 4,	/* Length in bytes of the maximum segment size option */
	WSOPT = 3,
	WS_LENGTH = 3,	/* Length in bytes of the window scale option */
	MSL2 = 10,
	MSPTICK = 50,	/* Milliseconds per timer tick */
	DEF_MSS = 1460,	/* Default maximum segment size */
	DEF_MSS6 = 1280,	/* Default maximum segment size (min) for v6 */
	DEF_RTT = 500,	/* Default round trip */
	DEF_KAT = 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN = 0,	/* Listen connection */
	TCP_CONNECT = 1,	/* Outgoing connection */
	SYNACK_RXTIMER = 250,	/* ms between SYNACK retransmits */

	TCPREXMTTHRESH = 3,	/* dupack threshold for rxt */

	/* bits ORed into Tcpctl.flags */
	FORCE = 1,
	CLONE = 2,
	RETRAN = 4,
	ACTIVE = 8,
	SYNACK = 16,
	TSO = 32,

	LOGAGAIN = 3,
	LOGDGAIN = 2,

	Closed = 0,	/* Connection states */
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo = 1000,	/* maximum procs waiting for response to SYN ACK */
	NLHT = 256,	/* hash table size, must be a power of 2 */
	LHTMASK = NLHT - 1,

	HaveWS = 1 << 8,	/* ORed into the scale value returned by tcpmtu() */
};
106
/*
 * Printable names of the connection states, indexed by state value
 * (Closed .. Time_wait).  Must correspond to the enumeration above.
 */
char *tcpstates[] = {
	"Closed", "Listen", "Syn_sent", "Syn_received",
	"Established", "Finwait1", "Finwait2", "Close_wait",
	"Closing", "Last_ack", "Time_wait"
};
113
/*
 *  One countdown timer.  Running timers are linked through next/prev on the
 *  list headed by tcppriv.timers; tcpackproc decrements count once per pass
 *  and calls func(arg) for timers that reach zero.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer {
	Tcptimer *next;		/* links on the active timer list */
	Tcptimer *prev;
	Tcptimer *readynext;	/* links timers that expired in the same pass */
	int state;		/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int start;		/* value count is reloaded with by tcpgo() */
	int count;		/* passes remaining until expiry */
	void (*func) (void *);	/* expiry callback, may be NULL */
	void *arg;		/* argument handed to func */
};
125
/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr {
	uint8_t vihl;				/* Version and header length */
	uint8_t tos;				/* Type of service */
	uint8_t length[2];			/* packet length */
	uint8_t id[2];				/* Identification */
	uint8_t frag[2];			/* Fragment information */
	uint8_t Unused;
	uint8_t proto;
	uint8_t tcplen[2];
	uint8_t tcpsrc[4];
	uint8_t tcpdst[4];
	uint8_t tcpsport[2];
	uint8_t tcpdport[2];
	uint8_t tcpseq[4];
	uint8_t tcpack[4];
	uint8_t tcpflag[2];
	uint8_t tcpwin[2];
	uint8_t tcpcksum[2];
	uint8_t tcpurg[2];
	/* Options segment */
	uint8_t tcpopt[1];
};
153
/*
 *  v6 counterpart of Tcp4hdr: IPv6 header fields followed by the
 *  TCP header fields.
 */
typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr {
	uint8_t vcf[4];
	uint8_t ploadlen[2];
	uint8_t proto;
	uint8_t ttl;
	uint8_t tcpsrc[IPaddrlen];
	uint8_t tcpdst[IPaddrlen];
	uint8_t tcpsport[2];
	uint8_t tcpdport[2];
	uint8_t tcpseq[4];
	uint8_t tcpack[4];
	uint8_t tcpflag[2];
	uint8_t tcpwin[2];
	uint8_t tcpcksum[2];
	uint8_t tcpurg[2];
	/* Options segment */
	uint8_t tcpopt[1];
};
173
/*
 *  This represents the control info
 *  for a single packet.  It is derived from
 *  a packet in ntohtcp{4,6}() and stuck into
 *  a packet in htontcp{4,6}().
 *  All fields are kept in host form here.
 */
typedef struct Tcp Tcp;
struct Tcp {
	uint16_t source;
	uint16_t dest;
	uint32_t seq;
	uint32_t ack;
	uint8_t flags;
	uint16_t ws;				/* window scale option (if not zero) */
	uint32_t wnd;
	uint16_t urg;
	uint16_t mss;				/* max segment size option (if not zero) */
	uint16_t len;				/* size of data */
};
193
/*
 *  This header is malloc'd to thread together fragments
 *  waiting to be coalesced.  Entries hang off Tcpctl.reseq and are
 *  released (block list and header) by localclose().
 */
typedef struct Reseq Reseq;
struct Reseq {
	Reseq *next;		/* next queued fragment */
	Tcp seg;		/* control info of the queued segment */
	struct block *bp;	/* the segment's data */
	uint16_t length;
};
205
/*
 *  Per-conversation TCP control block (stored in conv->ptcl).
 *  The qlock in the Conv locks this structure.
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl {
	uint8_t state;				/* Connection state */
	uint8_t type;				/* Listening or active connection */
	uint8_t code;				/* Icmp code */
	struct {
		uint32_t una;			/* Unacked data pointer */
		uint32_t nxt;			/* Next sequence expected */
		uint32_t ptr;			/* Data pointer */
		uint32_t wnd;			/* Tcp send window */
		uint32_t urg;			/* Urgent data pointer */
		uint32_t wl2;
		int scale;			/* how much to right shift window in xmitted packets */
		/* to implement tahoe and reno TCP */
		uint32_t dupacks;		/* number of duplicate acks rcvd */
		int recovery;			/* loss recovery flag */
		uint32_t rxt;			/* right window marker for recovery */
	} snd;
	struct {
		uint32_t nxt;			/* Receive pointer to next uint8_t slot */
		uint32_t wnd;			/* Receive window incoming */
		uint32_t urg;			/* Urgent pointer */
		int blocked;			/* set by tcprcvwin() when the window closes to 0 */
		int una;			/* unacked data segs */
		int scale;			/* how much to left shift window in rcved packets */
	} rcv;
	uint32_t iss;				/* Initial sequence number */
	int sawwsopt;				/* true if we saw a wsopt on the incoming SYN */
	uint32_t cwind;				/* Congestion window */
	int scale;				/* desired snd.scale */
	uint16_t ssthresh;			/* Slow start threshold */
	int resent;				/* Bytes just resent */
	int irs;				/* Initial received sequence */
	uint16_t mss;				/* Maximum segment size */
	int rerecv;				/* Overlap of data re-received */
	uint32_t window;			/* Receive window */
	uint8_t backoff;			/* Exponential backoff counter */
	int backedoff;				/* ms we've backed off for rexmits */
	uint8_t flags;				/* State flags */
	Reseq *reseq;				/* Resequencing queue */
	Tcptimer timer;				/* Activity timer */
	Tcptimer acktimer;			/* Acknowledge timer */
	Tcptimer rtt_timer;			/* Round trip timer */
	Tcptimer katimer;			/* keep alive timer */
	uint32_t rttseq;			/* Round trip sequence */
	int srtt;				/* Shortened round trip */
	int mdev;				/* Mean deviation of round trip */
	int kacounter;				/* count down for keep alive */
	unsigned int sndsyntime;		/* time syn sent */
	uint32_t time;				/* time Finwait2 or Syn_received was sent */
	int nochecksum;				/* non-zero means don't send checksums */
	int flgcnt;				/* number of flags in the sequence (FIN,SEQ) */

	union {
		Tcp4hdr tcp4hdr;
		Tcp6hdr tcp6hdr;
	} protohdr;				/* prototype header */
};
267
/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 */
typedef struct Limbo Limbo;
struct Limbo {
	Limbo *next;

	uint8_t laddr[IPaddrlen];
	uint8_t raddr[IPaddrlen];
	uint16_t lport;
	uint16_t rport;
	uint32_t irs;				/* initial received sequence */
	uint32_t iss;				/* initial sent sequence */
	uint16_t mss;				/* mss from the other end */
	uint16_t rcvscale;			/* how much to scale rcvd windows */
	uint16_t sndscale;			/* how much to scale sent windows */
	uint32_t lastsend;			/* last time we sent a synack */
	uint8_t version;			/* v4 or v6 */
	uint8_t rexmits;			/* number of retransmissions */
};
298
/* Tunables; tcp_irtt seeds srtt and the activity timer in inittcpctl() */
int tcp_irtt = DEF_RTT;			/* Initial guess at round trip time */
uint16_t tcp_mss = DEF_MSS;		/* Maximum segment size to be sent */
301
/*
 * Indices into tcppriv.stats[].  Nstats is the number of counters and
 * must stay last.
 */
enum {
	/* MIB stats */
	MaxConn,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats
};

/*
 * Printable name of each counter, indexed by the enumeration above.
 * Uses standard C99 designated initializers ("[index] = value") rather
 * than the obsolete GNU "[index] value" spelling.
 */
static char *statnames[] = {
	[MaxConn] = "MaxConn",
	[ActiveOpens] = "ActiveOpens",
	[PassiveOpens] = "PassiveOpens",
	[EstabResets] = "EstabResets",
	[CurrEstab] = "CurrEstab",
	[InSegs] = "InSegs",
	[OutSegs] = "OutSegs",
	[RetransSegs] = "RetransSegs",
	[RetransTimeouts] = "RetransTimeouts",
	[InErrs] = "InErrs",
	[OutRsts] = "OutRsts",
	[CsumErrs] = "CsumErrs",
	[HlenErrs] = "HlenErrs",
	[LenErrs] = "LenErrs",
	[OutOfOrder] = "OutOfOrder",
};
342
343 typedef struct Tcppriv Tcppriv;
344 struct tcppriv {
345         /* List of active timers */
346         qlock_t tl;
347         Tcptimer *timers;
348
349         /* hash table for matching conversations */
350         struct Ipht ht;
351
352         /* calls in limbo waiting for an ACK to our SYN ACK */
353         int nlimbo;
354         Limbo *lht[NLHT];
355
356         /* for keeping track of tcpackproc */
357         qlock_t apl;
358         int ackprocstarted;
359
360         uint32_t stats[Nstats];
361 };
362
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
373
/* Forward declarations for routines used before their definition */
int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, uint16_t);
void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
void localclose(struct conv *, char *unused_char_p_t);
void procsyn(struct conv *, Tcp *);
void tcpiput(struct Proto *, struct Ipifc *, struct block *);
void tcpoutput(struct conv *);
int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
void tcpstart(struct conv *, int);
void tcptimeout(void *);
void tcpsndsyn(struct conv *, Tcpctl *);
void tcprcvwin(struct conv *);
void tcpacktimer(void *);
void tcpkeepalive(void *);
void tcpsetkacounter(Tcpctl *);
void tcprxmit(struct conv *);
void tcpsettimer(Tcpctl *);
void tcpsynackrtt(struct conv *);
void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);

/* file-local helpers for calls held in limbo */
static void limborexmit(struct Proto *);
static void limbo(struct conv *, uint8_t * unused_uint8_p_t, uint8_t *, Tcp *,
				  int);
396
397 void tcpsetstate(struct conv *s, uint8_t newstate)
398 {
399         Tcpctl *tcb;
400         uint8_t oldstate;
401         struct tcppriv *tpriv;
402
403         tpriv = s->p->priv;
404
405         tcb = (Tcpctl *) s->ptcl;
406
407         oldstate = tcb->state;
408         if (oldstate == newstate)
409                 return;
410
411         if (oldstate == Established)
412                 tpriv->stats[CurrEstab]--;
413         if (newstate == Established)
414                 tpriv->stats[CurrEstab]++;
415
416         /**
417         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
418                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
419         **/
420
421         switch (newstate) {
422                 case Closed:
423                         qclose(s->rq);
424                         qclose(s->wq);
425                         qclose(s->eq);
426                         break;
427
428                 case Close_wait:        /* Remote closes */
429                         qhangup(s->rq, NULL);
430                         break;
431         }
432
433         tcb->state = newstate;
434
435         if (oldstate == Syn_sent && newstate != Closed)
436                 Fsconnected(s, NULL);
437 }
438
439 static char *tcpconnect(struct conv *c, char **argv, int argc)
440 {
441         char *e;
442
443         e = Fsstdconnect(c, argv, argc);
444         if (e != NULL)
445                 return e;
446         tcpstart(c, TCP_CONNECT);
447
448         return NULL;
449 }
450
451 static int tcpstate(struct conv *c, char *state, int n)
452 {
453         Tcpctl *s;
454
455         s = (Tcpctl *) (c->ptcl);
456
457         return snprintf(state, n,
458                                         "%s qin %d qout %d srtt %d mdev %d cwin %lu swin %lu>>%d rwin %lu>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
459                                         tcpstates[s->state],
460                                         c->rq ? qlen(c->rq) : 0,
461                                         c->wq ? qlen(c->wq) : 0,
462                                         s->srtt, s->mdev,
463                                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
464                                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
465                                         s->katimer.start, s->katimer.count);
466 }
467
468 static int tcpinuse(struct conv *c)
469 {
470         Tcpctl *s;
471
472         s = (Tcpctl *) (c->ptcl);
473         return s->state != Closed;
474 }
475
476 static char *tcpannounce(struct conv *c, char **argv, int argc)
477 {
478         char *e;
479
480         e = Fsstdannounce(c, argv, argc);
481         if (e != NULL)
482                 return e;
483         tcpstart(c, TCP_LISTEN);
484         Fsconnected(c, NULL);
485
486         return NULL;
487 }
488
489 /*
490  *  tcpclose is always called with the q locked
491  */
492 static void tcpclose(struct conv *c)
493 {
494         Tcpctl *tcb;
495
496         tcb = (Tcpctl *) c->ptcl;
497
498         qhangup(c->rq, NULL);
499         qhangup(c->wq, NULL);
500         qhangup(c->eq, NULL);
501         qflush(c->rq);
502
503         switch (tcb->state) {
504                 case Listen:
505                         /*
506                          *  reset any incoming calls to this listener
507                          */
508                         Fsconnected(c, "Hangup");
509
510                         localclose(c, NULL);
511                         break;
512                 case Closed:
513                 case Syn_sent:
514                         localclose(c, NULL);
515                         break;
516                 case Syn_received:
517                 case Established:
518                         tcb->flgcnt++;
519                         tcb->snd.nxt++;
520                         tcpsetstate(c, Finwait1);
521                         tcpoutput(c);
522                         break;
523                 case Close_wait:
524                         tcb->flgcnt++;
525                         tcb->snd.nxt++;
526                         tcpsetstate(c, Last_ack);
527                         tcpoutput(c);
528                         break;
529         }
530 }
531
532 void tcpkick(void *x)
533 {
534         ERRSTACK(1);
535         struct conv *s = x;
536         Tcpctl *tcb;
537
538         tcb = (Tcpctl *) s->ptcl;
539
540         if (waserror()) {
541                 qunlock(&s->qlock);
542                 nexterror();
543         }
544         qlock(&s->qlock);
545
546         switch (tcb->state) {
547                 case Syn_sent:
548                 case Syn_received:
549                 case Established:
550                 case Close_wait:
551                         /*
552                          * Push data
553                          */
554                         tcprcvwin(s);
555                         tcpoutput(s);
556                         break;
557                 default:
558                         localclose(s, "Hangup");
559                         break;
560         }
561
562         qunlock(&s->qlock);
563         poperror();
564 }
565
566 void tcprcvwin(struct conv *s)
567 {       /* Call with tcb locked */
568         int w;
569         Tcpctl *tcb;
570
571         tcb = (Tcpctl *) s->ptcl;
572         w = tcb->window - qlen(s->rq);
573         if (w < 0)
574                 w = 0;
575         tcb->rcv.wnd = w;
576         if (w == 0)
577                 tcb->rcv.blocked = 1;
578 }
579
580 void tcpacktimer(void *v)
581 {
582         ERRSTACK(1);
583         Tcpctl *tcb;
584         struct conv *s;
585
586         s = v;
587         tcb = (Tcpctl *) s->ptcl;
588
589         if (waserror()) {
590                 qunlock(&s->qlock);
591                 nexterror();
592         }
593         qlock(&s->qlock);
594         if (tcb->state != Closed) {
595                 tcb->flags |= FORCE;
596                 tcprcvwin(s);
597                 tcpoutput(s);
598         }
599         qunlock(&s->qlock);
600         poperror();
601 }
602
/*
 * Allocate the queues of a new conversation: a read queue limited to QMAX
 * whose kick routine is tcpacktimer, and a write queue of 8 * QMAX whose
 * kick routine is tcpkick.  Both routines receive the conversation.
 */
static void tcpcreate(struct conv *c)
{
	c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
	c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
}
608
609 static void timerstate(struct tcppriv *priv, Tcptimer * t, int newstate)
610 {
611         if (newstate != TcptimerON) {
612                 if (t->state == TcptimerON) {
613                         // unchain
614                         if (priv->timers == t) {
615                                 priv->timers = t->next;
616                                 if (t->prev != NULL)
617                                         panic("timerstate1");
618                         }
619                         if (t->next)
620                                 t->next->prev = t->prev;
621                         if (t->prev)
622                                 t->prev->next = t->next;
623                         t->next = t->prev = NULL;
624                 }
625         } else {
626                 if (t->state != TcptimerON) {
627                         // chain
628                         if (t->prev != NULL || t->next != NULL)
629                                 panic("timerstate2");
630                         t->prev = NULL;
631                         t->next = priv->timers;
632                         if (t->next)
633                                 t->next->prev = t;
634                         priv->timers = t;
635                 }
636         }
637         t->state = newstate;
638 }
639
/*
 * Timer task; a is the struct Proto for tcp.  Loops forever: sleeps via
 * udelay_sched(MSPTICK * 1000), counts down every running timer, runs the
 * callbacks of those that expired, then calls limborexmit().
 */
void tcpackproc(void *a)
{
	ERRSTACK(1);
	Tcptimer *t, *tp, *timeo;
	struct Proto *tcp;
	struct tcppriv *priv;
	int loop;

	tcp = a;
	priv = tcp->priv;

	for (;;) {
		udelay_sched(MSPTICK * 1000);

		/* pass 1, under the timer lock: decrement running timers and
		 * collect the ones that hit zero on the readynext list */
		qlock(&priv->tl);
		timeo = NULL;
		loop = 0;
		for (t = priv->timers; t != NULL; t = tp) {
			/* guard against a corrupted (cyclic) list */
			if (loop++ > 10000)
				panic("tcpackproc1");
			/* save the successor: timerstate() clears t->next */
			tp = t->next;
			if (t->state == TcptimerON) {
				t->count--;
				if (t->count == 0) {
					timerstate(priv, t, TcptimerDONE);
					t->readynext = timeo;
					timeo = t;
				}
			}
		}
		qunlock(&priv->tl);

		/* pass 2, without the timer lock: run the callbacks of timers
		 * that are still marked done */
		loop = 0;
		for (t = timeo; t != NULL; t = t->readynext) {
			if (loop++ > 10000)
				panic("tcpackproc2");
			if (t->state == TcptimerDONE && t->func != NULL) {
				/* discard error style */
				if (!waserror())
					(*t->func) (t->arg);
				poperror();
			}
		}

		limborexmit(tcp);
	}
}
687
688 void tcpgo(struct tcppriv *priv, Tcptimer * t)
689 {
690         if (t == NULL || t->start == 0)
691                 return;
692
693         qlock(&priv->tl);
694         t->count = t->start;
695         timerstate(priv, t, TcptimerON);
696         qunlock(&priv->tl);
697 }
698
699 void tcphalt(struct tcppriv *priv, Tcptimer * t)
700 {
701         if (t == NULL)
702                 return;
703
704         qlock(&priv->tl);
705         timerstate(priv, t, TcptimerOFF);
706         qunlock(&priv->tl);
707 }
708
/*
 * Return the backoff multiplier 2^n.
 *
 * Shifting by a negative count, or far enough to reach the sign bit, is
 * undefined for int, so the shift count is clamped: a negative n yields 1
 * and a large n saturates at the largest power of two an int can hold.
 * Results for in-range n are unchanged.
 */
int backoff(int n)
{
	/* highest shift that keeps the result a positive int */
	enum { MAXSHIFT = (int)(sizeof(int) * 8) - 2 };

	if (n < 0)
		return 1;
	if (n > MAXSHIFT)
		n = MAXSHIFT;

	return 1 << n;
}
713
714 void localclose(struct conv *s, char *reason)
715 {       /* called with tcb locked */
716         Tcpctl *tcb;
717         Reseq *rp, *rp1;
718         struct tcppriv *tpriv;
719
720         tpriv = s->p->priv;
721         tcb = (Tcpctl *) s->ptcl;
722
723         iphtrem(&tpriv->ht, s);
724
725         tcphalt(tpriv, &tcb->timer);
726         tcphalt(tpriv, &tcb->rtt_timer);
727         tcphalt(tpriv, &tcb->acktimer);
728         tcphalt(tpriv, &tcb->katimer);
729
730         /* Flush reassembly queue; nothing more can arrive */
731         for (rp = tcb->reseq; rp != NULL; rp = rp1) {
732                 rp1 = rp->next;
733                 freeblist(rp->bp);
734                 kfree(rp);
735         }
736         tcb->reseq = NULL;
737
738         if (tcb->state == Syn_sent)
739                 Fsconnected(s, reason);
740
741         qhangup(s->rq, reason);
742         qhangup(s->wq, reason);
743
744         tcpsetstate(s, Closed);
745
746         /* listener will check the rq state */
747         if (s->state == Announced)
748                 rendez_wakeup(&s->listenr);
749 }
750
751 /* mtu (- TCP + IP hdr len) of 1st hop */
752 int tcpmtu(struct Proto *tcp, uint8_t * addr, int version, int *scale,
753            uint8_t *flags)
754 {
755         struct Ipifc *ifc;
756         int mtu;
757
758         ifc = findipifc(tcp->f, addr, 0);
759         switch (version) {
760                 default:
761                 case V4:
762                         mtu = DEF_MSS;
763                         if (ifc != NULL)
764                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
765                         break;
766                 case V6:
767                         mtu = DEF_MSS6;
768                         if (ifc != NULL)
769                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
770                         break;
771         }
772         *flags &= ~TSO;
773
774         if (ifc != NULL) {
775                 if (ifc->mbps > 100)
776                         *scale = HaveWS | 3;
777                 else if (ifc->mbps > 10)
778                         *scale = HaveWS | 1;
779                 else
780                         *scale = HaveWS | 0;
781                 if (ifc->feat & NETF_TSO)
782                         *flags |= TSO;
783         } else
784                 *scale = HaveWS | 0;
785
786         return mtu;
787 }
788
789 void inittcpctl(struct conv *s, int mode)
790 {
791         Tcpctl *tcb;
792         Tcp4hdr *h4;
793         Tcp6hdr *h6;
794         int mss;
795
796         tcb = (Tcpctl *) s->ptcl;
797
798         memset(tcb, 0, sizeof(Tcpctl));
799
800         tcb->ssthresh = 65535;
801         tcb->srtt = tcp_irtt << LOGAGAIN;
802         tcb->mdev = 0;
803
804         /* setup timers */
805         tcb->timer.start = tcp_irtt / MSPTICK;
806         tcb->timer.func = tcptimeout;
807         tcb->timer.arg = s;
808         tcb->rtt_timer.start = MAX_TIME;
809         tcb->acktimer.start = TCP_ACK / MSPTICK;
810         tcb->acktimer.func = tcpacktimer;
811         tcb->acktimer.arg = s;
812         tcb->katimer.start = DEF_KAT / MSPTICK;
813         tcb->katimer.func = tcpkeepalive;
814         tcb->katimer.arg = s;
815
816         mss = DEF_MSS;
817
818         /* create a prototype(pseudo) header */
819         if (mode != TCP_LISTEN) {
820                 if (ipcmp(s->laddr, IPnoaddr) == 0)
821                         findlocalip(s->p->f, s->laddr, s->raddr);
822
823                 switch (s->ipversion) {
824                         case V4:
825                                 h4 = &tcb->protohdr.tcp4hdr;
826                                 memset(h4, 0, sizeof(*h4));
827                                 h4->proto = IP_TCPPROTO;
828                                 hnputs(h4->tcpsport, s->lport);
829                                 hnputs(h4->tcpdport, s->rport);
830                                 v6tov4(h4->tcpsrc, s->laddr);
831                                 v6tov4(h4->tcpdst, s->raddr);
832                                 break;
833                         case V6:
834                                 h6 = &tcb->protohdr.tcp6hdr;
835                                 memset(h6, 0, sizeof(*h6));
836                                 h6->proto = IP_TCPPROTO;
837                                 hnputs(h6->tcpsport, s->lport);
838                                 hnputs(h6->tcpdport, s->rport);
839                                 ipmove(h6->tcpsrc, s->laddr);
840                                 ipmove(h6->tcpdst, s->raddr);
841                                 mss = DEF_MSS6;
842                                 break;
843                         default:
844                                 panic("inittcpctl: version %d", s->ipversion);
845                 }
846         }
847
848         tcb->mss = tcb->cwind = mss;
849
850         /* default is no window scaling */
851         tcb->window = QMAX;
852         tcb->rcv.wnd = QMAX;
853         tcb->rcv.scale = 0;
854         tcb->snd.scale = 0;
855         qsetlimit(s->rq, QMAX);
856 }
857
858 /*
859  *  called with s qlocked
860  */
861 void tcpstart(struct conv *s, int mode)
862 {
863         Tcpctl *tcb;
864         struct tcppriv *tpriv;
865         /* tcpackproc needs to free this if it ever exits */
866         char *kpname = kmalloc(KNAMELEN, KMALLOC_WAIT);
867
868         tpriv = s->p->priv;
869
870         if (tpriv->ackprocstarted == 0) {
871                 qlock(&tpriv->apl);
872                 if (tpriv->ackprocstarted == 0) {
873                         snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
874                         ktask(kpname, tcpackproc, s->p);
875                         tpriv->ackprocstarted = 1;
876                 }
877                 qunlock(&tpriv->apl);
878         }
879
880         tcb = (Tcpctl *) s->ptcl;
881
882         inittcpctl(s, mode);
883
884         iphtadd(&tpriv->ht, s);
885         switch (mode) {
886                 case TCP_LISTEN:
887                         tpriv->stats[PassiveOpens]++;
888                         tcb->flags |= CLONE;
889                         tcpsetstate(s, Listen);
890                         break;
891
892                 case TCP_CONNECT:
893                         tpriv->stats[ActiveOpens]++;
894                         tcb->flags |= ACTIVE;
895                         tcpsndsyn(s, tcb);
896                         tcpsetstate(s, Syn_sent);
897                         tcpoutput(s);
898                         break;
899         }
900 }
901
902 static char *tcpflag(uint16_t flag)
903 {
904         static char buf[128];
905
906         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
907         if (flag & URG)
908                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
909         if (flag & ACK)
910                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
911         if (flag & PSH)
912                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
913         if (flag & RST)
914                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
915         if (flag & SYN)
916                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
917         if (flag & FIN)
918                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
919
920         return buf;
921 }
922
923 struct block *htontcp6(Tcp * tcph, struct block *data, Tcp6hdr * ph,
924                                            Tcpctl * tcb)
925 {
926         int dlen;
927         Tcp6hdr *h;
928         uint16_t csum;
929         uint16_t hdrlen, optpad = 0;
930         uint8_t *opt;
931
932         hdrlen = TCP6_HDRSIZE;
933         if (tcph->flags & SYN) {
934                 if (tcph->mss)
935                         hdrlen += MSS_LENGTH;
936                 if (tcph->ws)
937                         hdrlen += WS_LENGTH;
938                 optpad = hdrlen & 3;
939                 if (optpad)
940                         optpad = 4 - optpad;
941                 hdrlen += optpad;
942         }
943
944         if (data) {
945                 dlen = blocklen(data);
946                 data = padblock(data, hdrlen + TCP6_PKT);
947                 if (data == NULL)
948                         return NULL;
949         } else {
950                 dlen = 0;
951                 data = allocb(hdrlen + TCP6_PKT + 64);  /* the 64 pad is to meet mintu's */
952                 if (data == NULL)
953                         return NULL;
954                 data->wp += hdrlen + TCP6_PKT;
955         }
956
957         /* copy in pseudo ip header plus port numbers */
958         h = (Tcp6hdr *) (data->rp);
959         memmove(h, ph, TCP6_TCBPHDRSZ);
960
961         /* compose pseudo tcp header, do cksum calculation */
962         hnputl(h->vcf, hdrlen + dlen);
963         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
964         h->ttl = ph->proto;
965
966         /* copy in variable bits */
967         hnputl(h->tcpseq, tcph->seq);
968         hnputl(h->tcpack, tcph->ack);
969         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
970         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
971         hnputs(h->tcpurg, tcph->urg);
972
973         if (tcph->flags & SYN) {
974                 opt = h->tcpopt;
975                 if (tcph->mss != 0) {
976                         *opt++ = MSSOPT;
977                         *opt++ = MSS_LENGTH;
978                         hnputs(opt, tcph->mss);
979                         opt += 2;
980                 }
981                 if (tcph->ws != 0) {
982                         *opt++ = WSOPT;
983                         *opt++ = WS_LENGTH;
984                         *opt++ = tcph->ws;
985                 }
986                 while (optpad-- > 0)
987                         *opt++ = NOOPOPT;
988         }
989
990         if (tcb != NULL && tcb->nochecksum) {
991                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
992         } else {
993                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
994                 hnputs(h->tcpcksum, csum);
995         }
996
997         /* move from pseudo header back to normal ip header */
998         memset(h->vcf, 0, 4);
999         h->vcf[0] = IP_VER6;
1000         hnputs(h->ploadlen, hdrlen + dlen);
1001         h->proto = ph->proto;
1002
1003         return data;
1004 }
1005
1006 struct block *htontcp4(Tcp * tcph, struct block *data, Tcp4hdr * ph,
1007                                            Tcpctl * tcb)
1008 {
1009         int dlen;
1010         Tcp4hdr *h;
1011         uint16_t csum;
1012         uint16_t hdrlen, optpad = 0;
1013         uint8_t *opt;
1014
1015         hdrlen = TCP4_HDRSIZE;
1016         if (tcph->flags & SYN) {
1017                 if (tcph->mss)
1018                         hdrlen += MSS_LENGTH;
1019                 if (tcph->ws)
1020                         hdrlen += WS_LENGTH;
1021                 optpad = hdrlen & 3;
1022                 if (optpad)
1023                         optpad = 4 - optpad;
1024                 hdrlen += optpad;
1025         }
1026
1027         if (data) {
1028                 dlen = blocklen(data);
1029                 data = padblock(data, hdrlen + TCP4_PKT);
1030                 if (data == NULL)
1031                         return NULL;
1032         } else {
1033                 dlen = 0;
1034                 data = allocb(hdrlen + TCP4_PKT + 64);  /* the 64 pad is to meet mintu's */
1035                 if (data == NULL)
1036                         return NULL;
1037                 data->wp += hdrlen + TCP4_PKT;
1038         }
1039
1040         /* copy in pseudo ip header plus port numbers */
1041         h = (Tcp4hdr *) (data->rp);
1042         memmove(h, ph, TCP4_TCBPHDRSZ);
1043
1044         /* copy in variable bits */
1045         hnputs(h->tcplen, hdrlen + dlen);
1046         hnputl(h->tcpseq, tcph->seq);
1047         hnputl(h->tcpack, tcph->ack);
1048         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1049         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1050         hnputs(h->tcpurg, tcph->urg);
1051
1052         if (tcph->flags & SYN) {
1053                 opt = h->tcpopt;
1054                 if (tcph->mss != 0) {
1055                         *opt++ = MSSOPT;
1056                         *opt++ = MSS_LENGTH;
1057                         hnputs(opt, tcph->mss);
1058                         opt += 2;
1059                 }
1060                 if (tcph->ws != 0) {
1061                         *opt++ = WSOPT;
1062                         *opt++ = WS_LENGTH;
1063                         *opt++ = tcph->ws;
1064                 }
1065                 while (optpad-- > 0)
1066                         *opt++ = NOOPOPT;
1067         }
1068
1069         if (tcb != NULL && tcb->nochecksum) {
1070                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1071         } else {
1072                 csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
1073                 hnputs(h->tcpcksum, csum);
1074                 data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
1075                 data->checksum_offset = ph->tcpcksum - ph->tcpsport;
1076                 data->flag |= Btcpck;
1077         }
1078
1079         return data;
1080 }
1081
1082 int ntohtcp6(Tcp * tcph, struct block **bpp)
1083 {
1084         Tcp6hdr *h;
1085         uint8_t *optr;
1086         uint16_t hdrlen;
1087         uint16_t optlen;
1088         int n;
1089
1090         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
1091         if (*bpp == NULL)
1092                 return -1;
1093
1094         h = (Tcp6hdr *) ((*bpp)->rp);
1095         tcph->source = nhgets(h->tcpsport);
1096         tcph->dest = nhgets(h->tcpdport);
1097         tcph->seq = nhgetl(h->tcpseq);
1098         tcph->ack = nhgetl(h->tcpack);
1099         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1100         if (hdrlen < TCP6_HDRSIZE) {
1101                 freeblist(*bpp);
1102                 return -1;
1103         }
1104
1105         tcph->flags = h->tcpflag[1];
1106         tcph->wnd = nhgets(h->tcpwin);
1107         tcph->urg = nhgets(h->tcpurg);
1108         tcph->mss = 0;
1109         tcph->ws = 0;
1110         tcph->len = nhgets(h->ploadlen) - hdrlen;
1111
1112         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
1113         if (*bpp == NULL)
1114                 return -1;
1115
1116         optr = h->tcpopt;
1117         n = hdrlen - TCP6_HDRSIZE;
1118         while (n > 0 && *optr != EOLOPT) {
1119                 if (*optr == NOOPOPT) {
1120                         n--;
1121                         optr++;
1122                         continue;
1123                 }
1124                 optlen = optr[1];
1125                 if (optlen < 2 || optlen > n)
1126                         break;
1127                 switch (*optr) {
1128                         case MSSOPT:
1129                                 if (optlen == MSS_LENGTH)
1130                                         tcph->mss = nhgets(optr + 2);
1131                                 break;
1132                         case WSOPT:
1133                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1134                                         tcph->ws = HaveWS | *(optr + 2);
1135                                 break;
1136                 }
1137                 n -= optlen;
1138                 optr += optlen;
1139         }
1140         return hdrlen;
1141 }
1142
1143 int ntohtcp4(Tcp * tcph, struct block **bpp)
1144 {
1145         Tcp4hdr *h;
1146         uint8_t *optr;
1147         uint16_t hdrlen;
1148         uint16_t optlen;
1149         int n;
1150
1151         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
1152         if (*bpp == NULL)
1153                 return -1;
1154
1155         h = (Tcp4hdr *) ((*bpp)->rp);
1156         tcph->source = nhgets(h->tcpsport);
1157         tcph->dest = nhgets(h->tcpdport);
1158         tcph->seq = nhgetl(h->tcpseq);
1159         tcph->ack = nhgetl(h->tcpack);
1160
1161         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1162         if (hdrlen < TCP4_HDRSIZE) {
1163                 freeblist(*bpp);
1164                 return -1;
1165         }
1166
1167         tcph->flags = h->tcpflag[1];
1168         tcph->wnd = nhgets(h->tcpwin);
1169         tcph->urg = nhgets(h->tcpurg);
1170         tcph->mss = 0;
1171         tcph->ws = 0;
1172         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1173
1174         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1175         if (*bpp == NULL)
1176                 return -1;
1177
1178         optr = h->tcpopt;
1179         n = hdrlen - TCP4_HDRSIZE;
1180         while (n > 0 && *optr != EOLOPT) {
1181                 if (*optr == NOOPOPT) {
1182                         n--;
1183                         optr++;
1184                         continue;
1185                 }
1186                 optlen = optr[1];
1187                 if (optlen < 2 || optlen > n)
1188                         break;
1189                 switch (*optr) {
1190                         case MSSOPT:
1191                                 if (optlen == MSS_LENGTH)
1192                                         tcph->mss = nhgets(optr + 2);
1193                                 break;
1194                         case WSOPT:
1195                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1196                                         tcph->ws = HaveWS | *(optr + 2);
1197                                 break;
1198                 }
1199                 n -= optlen;
1200                 optr += optlen;
1201         }
1202         return hdrlen;
1203 }
1204
1205 /*
1206  *  For outgiing calls, generate an initial sequence
1207  *  number and put a SYN on the send queue
1208  */
1209 void tcpsndsyn(struct conv *s, Tcpctl * tcb)
1210 {
1211         tcb->iss = (nrand(1 << 16) << 16) | nrand(1 << 16);
1212         tcb->rttseq = tcb->iss;
1213         tcb->snd.wl2 = tcb->iss;
1214         tcb->snd.una = tcb->iss;
1215         tcb->snd.ptr = tcb->rttseq;
1216         tcb->snd.nxt = tcb->rttseq;
1217         tcb->flgcnt++;
1218         tcb->flags |= FORCE;
1219         tcb->sndsyntime = NOW;
1220
1221         /* set desired mss and scale */
1222         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale,
1223                           &tcb->flags);
1224 }
1225
1226 void
1227 sndrst(struct Proto *tcp, uint8_t * source, uint8_t * dest,
1228            uint16_t length, Tcp * seg, uint8_t version, char *reason)
1229 {
1230         struct block *hbp;
1231         uint8_t rflags;
1232         struct tcppriv *tpriv;
1233         Tcp4hdr ph4;
1234         Tcp6hdr ph6;
1235
1236         netlog(tcp->f, Logtcp, "sndrst: %s", reason);
1237
1238         tpriv = tcp->priv;
1239
1240         if (seg->flags & RST)
1241                 return;
1242
1243         /* make pseudo header */
1244         switch (version) {
1245                 case V4:
1246                         memset(&ph4, 0, sizeof(ph4));
1247                         ph4.vihl = IP_VER4;
1248                         v6tov4(ph4.tcpsrc, dest);
1249                         v6tov4(ph4.tcpdst, source);
1250                         ph4.proto = IP_TCPPROTO;
1251                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1252                         hnputs(ph4.tcpsport, seg->dest);
1253                         hnputs(ph4.tcpdport, seg->source);
1254                         break;
1255                 case V6:
1256                         memset(&ph6, 0, sizeof(ph6));
1257                         ph6.vcf[0] = IP_VER6;
1258                         ipmove(ph6.tcpsrc, dest);
1259                         ipmove(ph6.tcpdst, source);
1260                         ph6.proto = IP_TCPPROTO;
1261                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1262                         hnputs(ph6.tcpsport, seg->dest);
1263                         hnputs(ph6.tcpdport, seg->source);
1264                         break;
1265                 default:
1266                         panic("sndrst: version %d", version);
1267         }
1268
1269         tpriv->stats[OutRsts]++;
1270         rflags = RST;
1271
1272         /* convince the other end that this reset is in band */
1273         if (seg->flags & ACK) {
1274                 seg->seq = seg->ack;
1275                 seg->ack = 0;
1276         } else {
1277                 rflags |= ACK;
1278                 seg->ack = seg->seq;
1279                 seg->seq = 0;
1280                 if (seg->flags & SYN)
1281                         seg->ack++;
1282                 seg->ack += length;
1283                 if (seg->flags & FIN)
1284                         seg->ack++;
1285         }
1286         seg->flags = rflags;
1287         seg->wnd = 0;
1288         seg->urg = 0;
1289         seg->mss = 0;
1290         seg->ws = 0;
1291         switch (version) {
1292                 case V4:
1293                         hbp = htontcp4(seg, NULL, &ph4, NULL);
1294                         if (hbp == NULL)
1295                                 return;
1296                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1297                         break;
1298                 case V6:
1299                         hbp = htontcp6(seg, NULL, &ph6, NULL);
1300                         if (hbp == NULL)
1301                                 return;
1302                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1303                         break;
1304                 default:
1305                         panic("sndrst2: version %d", version);
1306         }
1307 }
1308
1309 /*
1310  *  send a reset to the remote side and close the conversation
1311  *  called with s qlocked
1312  */
1313 char *tcphangup(struct conv *s)
1314 {
1315         ERRSTACK(2);
1316         Tcp seg;
1317         Tcpctl *tcb;
1318         struct block *hbp;
1319
1320         tcb = (Tcpctl *) s->ptcl;
1321         if (waserror()) {
1322                 poperror();
1323                 return commonerror();
1324         }
1325         if (ipcmp(s->raddr, IPnoaddr)) {
1326                 /* discard error style, poperror regardless */
1327                 if (!waserror()) {
1328                         seg.flags = RST | ACK;
1329                         seg.ack = tcb->rcv.nxt;
1330                         tcb->rcv.una = 0;
1331                         seg.seq = tcb->snd.ptr;
1332                         seg.wnd = 0;
1333                         seg.urg = 0;
1334                         seg.mss = 0;
1335                         seg.ws = 0;
1336                         switch (s->ipversion) {
1337                                 case V4:
1338                                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1339                                         hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
1340                                         ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1341                                         break;
1342                                 case V6:
1343                                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1344                                         hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
1345                                         ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1346                                         break;
1347                                 default:
1348                                         panic("tcphangup: version %d", s->ipversion);
1349                         }
1350                 }
1351                 poperror();
1352         }
1353         localclose(s, NULL);
1354         poperror();
1355         return NULL;
1356 }
1357
1358 /*
1359  *  (re)send a SYN ACK
1360  */
1361 int sndsynack(struct Proto *tcp, Limbo * lp)
1362 {
1363         struct block *hbp;
1364         Tcp4hdr ph4;
1365         Tcp6hdr ph6;
1366         Tcp seg;
1367         int scale;
1368         uint8_t flag = 0;
1369
1370         /* make pseudo header */
1371         switch (lp->version) {
1372                 case V4:
1373                         memset(&ph4, 0, sizeof(ph4));
1374                         ph4.vihl = IP_VER4;
1375                         v6tov4(ph4.tcpsrc, lp->laddr);
1376                         v6tov4(ph4.tcpdst, lp->raddr);
1377                         ph4.proto = IP_TCPPROTO;
1378                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1379                         hnputs(ph4.tcpsport, lp->lport);
1380                         hnputs(ph4.tcpdport, lp->rport);
1381                         break;
1382                 case V6:
1383                         memset(&ph6, 0, sizeof(ph6));
1384                         ph6.vcf[0] = IP_VER6;
1385                         ipmove(ph6.tcpsrc, lp->laddr);
1386                         ipmove(ph6.tcpdst, lp->raddr);
1387                         ph6.proto = IP_TCPPROTO;
1388                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1389                         hnputs(ph6.tcpsport, lp->lport);
1390                         hnputs(ph6.tcpdport, lp->rport);
1391                         break;
1392                 default:
1393                         panic("sndrst: version %d", lp->version);
1394         }
1395
1396         seg.seq = lp->iss;
1397         seg.ack = lp->irs + 1;
1398         seg.flags = SYN | ACK;
1399         seg.urg = 0;
1400         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag);
1401         seg.wnd = QMAX;
1402
1403         /* if the other side set scale, we should too */
1404         if (lp->rcvscale) {
1405                 seg.ws = scale;
1406                 lp->sndscale = scale;
1407         } else {
1408                 seg.ws = 0;
1409                 lp->sndscale = 0;
1410         }
1411
1412         switch (lp->version) {
1413                 case V4:
1414                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1415                         if (hbp == NULL)
1416                                 return -1;
1417                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1418                         break;
1419                 case V6:
1420                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1421                         if (hbp == NULL)
1422                                 return -1;
1423                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1424                         break;
1425                 default:
1426                         panic("sndsnack: version %d", lp->version);
1427         }
1428         lp->lastsend = NOW;
1429         return 0;
1430 }
1431
/* limbo hash bucket for an address/port pair: last two address bytes plus the port, masked to the table size */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1433
1434 /*
1435  *  put a call into limbo and respond with a SYN ACK
1436  *
1437  *  called with proto locked
1438  */
1439 static void
1440 limbo(struct conv *s, uint8_t * source, uint8_t * dest, Tcp * seg, int version)
1441 {
1442         Limbo *lp, **l;
1443         struct tcppriv *tpriv;
1444         int h;
1445
1446         tpriv = s->p->priv;
1447         h = hashipa(source, seg->source);
1448
1449         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1450                 lp = *l;
1451                 if (lp->lport != seg->dest || lp->rport != seg->source
1452                         || lp->version != version)
1453                         continue;
1454                 if (ipcmp(lp->raddr, source) != 0)
1455                         continue;
1456                 if (ipcmp(lp->laddr, dest) != 0)
1457                         continue;
1458
1459                 /* each new SYN restarts the retransmits */
1460                 lp->irs = seg->seq;
1461                 break;
1462         }
1463         lp = *l;
1464         if (lp == NULL) {
1465                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1466                         lp = tpriv->lht[h];
1467                         tpriv->lht[h] = lp->next;
1468                         lp->next = NULL;
1469                 } else {
1470                         lp = kzmalloc(sizeof(*lp), 0);
1471                         if (lp == NULL)
1472                                 return;
1473                         tpriv->nlimbo++;
1474                 }
1475                 *l = lp;
1476                 lp->version = version;
1477                 ipmove(lp->laddr, dest);
1478                 ipmove(lp->raddr, source);
1479                 lp->lport = seg->dest;
1480                 lp->rport = seg->source;
1481                 lp->mss = seg->mss;
1482                 lp->rcvscale = seg->ws;
1483                 lp->irs = seg->seq;
1484                 lp->iss = (nrand(1 << 16) << 16) | nrand(1 << 16);
1485         }
1486
1487         if (sndsynack(s->p, lp) < 0) {
1488                 *l = lp->next;
1489                 tpriv->nlimbo--;
1490                 kfree(lp);
1491         }
1492 }
1493
1494 /*
1495  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1496  */
1497 static void limborexmit(struct Proto *tcp)
1498 {
1499         struct tcppriv *tpriv;
1500         Limbo **l, *lp;
1501         int h;
1502         int seen;
1503         uint32_t now;
1504
1505         tpriv = tcp->priv;
1506
1507         if (!canqlock(&tcp->qlock))
1508                 return;
1509         seen = 0;
1510         now = NOW;
1511         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1512                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1513                         lp = *l;
1514                         seen++;
1515                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1516                                 continue;
1517
1518                         /* time it out after 1 second */
1519                         if (++(lp->rexmits) > 5) {
1520                                 tpriv->nlimbo--;
1521                                 *l = lp->next;
1522                                 kfree(lp);
1523                                 continue;
1524                         }
1525
1526                         /* if we're being attacked, don't bother resending SYN ACK's */
1527                         if (tpriv->nlimbo > 100)
1528                                 continue;
1529
1530                         if (sndsynack(tcp, lp) < 0) {
1531                                 tpriv->nlimbo--;
1532                                 *l = lp->next;
1533                                 kfree(lp);
1534                                 continue;
1535                         }
1536
1537                         l = &lp->next;
1538                 }
1539         }
1540         qunlock(&tcp->qlock);
1541 }
1542
1543 /*
1544  *  lookup call in limbo.  if found, throw it out.
1545  *
1546  *  called with proto locked
1547  */
1548 static void
1549 limborst(struct conv *s, Tcp * segp, uint8_t * src, uint8_t * dst,
1550                  uint8_t version)
1551 {
1552         Limbo *lp, **l;
1553         int h;
1554         struct tcppriv *tpriv;
1555
1556         tpriv = s->p->priv;
1557
1558         /* find a call in limbo */
1559         h = hashipa(src, segp->source);
1560         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1561                 lp = *l;
1562                 if (lp->lport != segp->dest || lp->rport != segp->source
1563                         || lp->version != version)
1564                         continue;
1565                 if (ipcmp(lp->laddr, dst) != 0)
1566                         continue;
1567                 if (ipcmp(lp->raddr, src) != 0)
1568                         continue;
1569
1570                 /* RST can only follow the SYN */
1571                 if (segp->seq == lp->irs + 1) {
1572                         tpriv->nlimbo--;
1573                         *l = lp->next;
1574                         kfree(lp);
1575                 }
1576                 break;
1577         }
1578 }
1579
1580 /*
1581  *  come here when we finally get an ACK to our SYN-ACK.
1582  *  lookup call in limbo.  if found, create a new conversation
1583  *
1584  *  called with proto locked
1585  */
1586 static struct conv *tcpincoming(struct conv *s, Tcp * segp, uint8_t * src,
1587                                                                 uint8_t * dst, uint8_t version)
1588 {
1589         struct conv *new;
1590         Tcpctl *tcb;
1591         struct tcppriv *tpriv;
1592         Tcp4hdr *h4;
1593         Tcp6hdr *h6;
1594         Limbo *lp, **l;
1595         int h;
1596
1597         /* unless it's just an ack, it can't be someone coming out of limbo */
1598         if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
1599                 return NULL;
1600
1601         tpriv = s->p->priv;
1602
1603         /* find a call in limbo */
1604         h = hashipa(src, segp->source);
1605         for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
1606                 netlog(s->p->f, Logtcp,
1607                            "tcpincoming s %I,0x%x/%I,0x%x d %I,0x%x/%I,0x%x v %d/%d", src,
1608                            segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
1609                            lp->lport, version, lp->version);
1610
1611                 if (lp->lport != segp->dest || lp->rport != segp->source
1612                         || lp->version != version)
1613                         continue;
1614                 if (ipcmp(lp->laddr, dst) != 0)
1615                         continue;
1616                 if (ipcmp(lp->raddr, src) != 0)
1617                         continue;
1618
1619                 /* we're assuming no data with the initial SYN */
1620                 if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
1621                         netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx",
1622                                    segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
1623                         lp = NULL;
1624                 } else {
1625                         tpriv->nlimbo--;
1626                         *l = lp->next;
1627                 }
1628                 break;
1629         }
1630         if (lp == NULL)
1631                 return NULL;
1632
1633         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1634         if (new == NULL)
1635                 return NULL;
1636
1637         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1638         tcb = (Tcpctl *) new->ptcl;
1639         tcb->flags &= ~CLONE;
1640         tcb->timer.arg = new;
1641         tcb->timer.state = TcptimerOFF;
1642         tcb->acktimer.arg = new;
1643         tcb->acktimer.state = TcptimerOFF;
1644         tcb->katimer.arg = new;
1645         tcb->katimer.state = TcptimerOFF;
1646         tcb->rtt_timer.arg = new;
1647         tcb->rtt_timer.state = TcptimerOFF;
1648
1649         tcb->irs = lp->irs;
1650         tcb->rcv.nxt = tcb->irs + 1;
1651         tcb->rcv.urg = tcb->rcv.nxt;
1652
1653         tcb->iss = lp->iss;
1654         tcb->rttseq = tcb->iss;
1655         tcb->snd.wl2 = tcb->iss;
1656         tcb->snd.una = tcb->iss + 1;
1657         tcb->snd.ptr = tcb->iss + 1;
1658         tcb->snd.nxt = tcb->iss + 1;
1659         tcb->flgcnt = 0;
1660         tcb->flags |= SYNACK;
1661
1662         /* our sending max segment size cannot be bigger than what he asked for */
1663         if (lp->mss != 0 && lp->mss < tcb->mss)
1664                 tcb->mss = lp->mss;
1665
1666         /* window scaling */
1667         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1668
1669         /* the congestion window always starts out as a single segment */
1670         tcb->snd.wnd = segp->wnd;
1671         tcb->cwind = tcb->mss;
1672
1673         /* set initial round trip time */
1674         tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
1675         tcpsynackrtt(new);
1676
1677         kfree(lp);
1678
1679         /* set up proto header */
1680         switch (version) {
1681                 case V4:
1682                         h4 = &tcb->protohdr.tcp4hdr;
1683                         memset(h4, 0, sizeof(*h4));
1684                         h4->proto = IP_TCPPROTO;
1685                         hnputs(h4->tcpsport, new->lport);
1686                         hnputs(h4->tcpdport, new->rport);
1687                         v6tov4(h4->tcpsrc, dst);
1688                         v6tov4(h4->tcpdst, src);
1689                         break;
1690                 case V6:
1691                         h6 = &tcb->protohdr.tcp6hdr;
1692                         memset(h6, 0, sizeof(*h6));
1693                         h6->proto = IP_TCPPROTO;
1694                         hnputs(h6->tcpsport, new->lport);
1695                         hnputs(h6->tcpdport, new->rport);
1696                         ipmove(h6->tcpsrc, dst);
1697                         ipmove(h6->tcpdst, src);
1698                         break;
1699                 default:
1700                         panic("tcpincoming: version %d", new->ipversion);
1701         }
1702
1703         tcpsetstate(new, Established);
1704
1705         iphtadd(&tpriv->ht, new);
1706
1707         return new;
1708 }
1709
/*
 *  Return 1 if x lies in the inclusive range [low, high] of 32-bit
 *  sequence space, 0 otherwise.  When low > high the range is taken to
 *  wrap around through 0.
 */
int seq_within(uint32_t x, uint32_t low, uint32_t high)
{
	/* wrapped range: x is inside if it is at or past low, or at or before high */
	if (low > high)
		return x >= low || x <= high;

	return low <= x && x <= high;
}
1721
/* Sequence-space "x before y": true when the wrapped difference x - y is negative. */
int seq_lt(uint32_t x, uint32_t y)
{
	int gap = (int)(x - y);

	return gap < 0;
}
1726
/* Sequence-space "x at or before y": true when the wrapped difference x - y is not positive. */
int seq_le(uint32_t x, uint32_t y)
{
	int gap = (int)(x - y);

	return gap <= 0;
}
1731
/* Sequence-space "x after y": true when the wrapped difference x - y is positive. */
int seq_gt(uint32_t x, uint32_t y)
{
	int gap = (int)(x - y);

	return gap > 0;
}
1736
/* Sequence-space "x at or after y": true when the wrapped difference x - y is not negative. */
int seq_ge(uint32_t x, uint32_t y)
{
	int gap = (int)(x - y);

	return gap >= 0;
}
1741
1742 /*
1743  *  use the time between the first SYN and it's ack as the
1744  *  initial round trip time
1745  */
1746 void tcpsynackrtt(struct conv *s)
1747 {
1748         Tcpctl *tcb;
1749         int delta;
1750         struct tcppriv *tpriv;
1751
1752         tcb = (Tcpctl *) s->ptcl;
1753         tpriv = s->p->priv;
1754
1755         delta = NOW - tcb->sndsyntime;
1756         tcb->srtt = delta << LOGAGAIN;
1757         tcb->mdev = delta << LOGDGAIN;
1758
1759         /* halt round trip timer */
1760         tcphalt(tpriv, &tcb->rtt_timer);
1761 }
1762
1763 void update(struct conv *s, Tcp * seg)
1764 {
1765         int rtt, delta;
1766         Tcpctl *tcb;
1767         uint32_t acked;
1768         uint32_t expand;
1769         struct tcppriv *tpriv;
1770
1771         tpriv = s->p->priv;
1772         tcb = (Tcpctl *) s->ptcl;
1773
1774         /* if everything has been acked, force output(?) */
1775         if (seq_gt(seg->ack, tcb->snd.nxt)) {
1776                 tcb->flags |= FORCE;
1777                 return;
1778         }
1779
1780         /* added by Dong Lin for fast retransmission */
1781         if (seg->ack == tcb->snd.una
1782                 && tcb->snd.una != tcb->snd.nxt
1783                 && seg->len == 0 && seg->wnd == tcb->snd.wnd) {
1784
1785                 /* this is a pure ack w/o window update */
1786                 netlog(s->p->f, Logtcprxmt, "dupack %lu ack %lu sndwnd %d advwin %d\n",
1787                            tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1788
1789                 if (++tcb->snd.dupacks == TCPREXMTTHRESH) {
1790                         /*
1791                          *  tahoe tcp rxt the packet, half sshthresh,
1792                          *  and set cwnd to one packet
1793                          */
1794                         tcb->snd.recovery = 1;
1795                         tcb->snd.rxt = tcb->snd.nxt;
1796                         netlog(s->p->f, Logtcprxmt, "fast rxt %lu, nxt %lu\n", tcb->snd.una,
1797                                    tcb->snd.nxt);
1798                         tcprxmit(s);
1799                 } else {
1800                         /* do reno tcp here. */
1801                 }
1802         }
1803
1804         /*
1805          *  update window
1806          */
1807         if (seq_gt(seg->ack, tcb->snd.wl2)
1808                 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
1809                 tcb->snd.wnd = seg->wnd;
1810                 tcb->snd.wl2 = seg->ack;
1811         }
1812
1813         if (!seq_gt(seg->ack, tcb->snd.una)) {
1814                 /*
1815                  *  don't let us hangup if sending into a closed window and
1816                  *  we're still getting acks
1817                  */
1818                 if ((tcb->flags & RETRAN) && tcb->snd.wnd == 0) {
1819                         tcb->backedoff = MAXBACKMS / 4;
1820                 }
1821                 return;
1822         }
1823
1824         /*
1825          *  any positive ack turns off fast rxt,
1826          *  (should we do new-reno on partial acks?)
1827          */
1828         if (!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1829                 tcb->snd.dupacks = 0;
1830                 tcb->snd.recovery = 0;
1831         } else
1832                 netlog(s->p->f, Logtcp, "rxt next %lu, cwin %u\n", seg->ack,
1833                            tcb->cwind);
1834
1835         /* Compute the new send window size */
1836         acked = seg->ack - tcb->snd.una;
1837
1838         /* avoid slow start and timers for SYN acks */
1839         if ((tcb->flags & SYNACK) == 0) {
1840                 tcb->flags |= SYNACK;
1841                 acked--;
1842                 tcb->flgcnt--;
1843                 goto done;
1844         }
1845
1846         /* slow start as long as we're not recovering from lost packets */
1847         if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
1848                 if (tcb->cwind < tcb->ssthresh) {
1849                         expand = tcb->mss;
1850                         if (acked < expand)
1851                                 expand = acked;
1852                 } else
1853                         expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1854
1855                 if (tcb->cwind + expand < tcb->cwind)
1856                         expand = tcb->snd.wnd - tcb->cwind;
1857                 if (tcb->cwind + expand > tcb->snd.wnd)
1858                         expand = tcb->snd.wnd - tcb->cwind;
1859                 tcb->cwind += expand;
1860         }
1861
1862         /* Adjust the timers according to the round trip time */
1863         if (tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1864                 tcphalt(tpriv, &tcb->rtt_timer);
1865                 if ((tcb->flags & RETRAN) == 0) {
1866                         tcb->backoff = 0;
1867                         tcb->backedoff = 0;
1868                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1869                         if (rtt == 0)
1870                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
1871                         rtt *= MSPTICK;
1872                         if (tcb->srtt == 0) {
1873                                 tcb->srtt = rtt << LOGAGAIN;
1874                                 tcb->mdev = rtt << LOGDGAIN;
1875                         } else {
1876                                 delta = rtt - (tcb->srtt >> LOGAGAIN);
1877                                 tcb->srtt += delta;
1878                                 if (tcb->srtt <= 0)
1879                                         tcb->srtt = 1;
1880
1881                                 delta = abs(delta) - (tcb->mdev >> LOGDGAIN);
1882                                 tcb->mdev += delta;
1883                                 if (tcb->mdev <= 0)
1884                                         tcb->mdev = 1;
1885                         }
1886                         tcpsettimer(tcb);
1887                 }
1888         }
1889
1890 done:
1891         if (qdiscard(s->wq, acked) < acked)
1892                 tcb->flgcnt--;
1893
1894         tcb->snd.una = seg->ack;
1895         if (seq_gt(seg->ack, tcb->snd.urg))
1896                 tcb->snd.urg = seg->ack;
1897
1898         if (tcb->snd.una != tcb->snd.nxt)
1899                 tcpgo(tpriv, &tcb->timer);
1900         else
1901                 tcphalt(tpriv, &tcb->timer);
1902
1903         if (seq_lt(tcb->snd.ptr, tcb->snd.una))
1904                 tcb->snd.ptr = tcb->snd.una;
1905
1906         tcb->flags &= ~RETRAN;
1907         tcb->backoff = 0;
1908         tcb->backedoff = 0;
1909 }
1910
1911 void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
1912 {
1913         ERRSTACK(1);
1914         Tcp seg;
1915         Tcp4hdr *h4;
1916         Tcp6hdr *h6;
1917         int hdrlen;
1918         Tcpctl *tcb;
1919         uint16_t length;
1920         uint8_t source[IPaddrlen], dest[IPaddrlen];
1921         struct conv *s;
1922         struct Fs *f;
1923         struct tcppriv *tpriv;
1924         uint8_t version;
1925
1926         f = tcp->f;
1927         tpriv = tcp->priv;
1928
1929         tpriv->stats[InSegs]++;
1930
1931         h4 = (Tcp4hdr *) (bp->rp);
1932         h6 = (Tcp6hdr *) (bp->rp);
1933
1934         if ((h4->vihl & 0xF0) == IP_VER4) {
1935                 version = V4;
1936                 length = nhgets(h4->length);
1937                 v4tov6(dest, h4->tcpdst);
1938                 v4tov6(source, h4->tcpsrc);
1939
1940                 h4->Unused = 0;
1941                 hnputs(h4->tcplen, length - TCP4_PKT);
1942                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1943                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
1944                         tpriv->stats[CsumErrs]++;
1945                         tpriv->stats[InErrs]++;
1946                         netlog(f, Logtcp, "bad tcp proto cksum\n");
1947                         freeblist(bp);
1948                         return;
1949                 }
1950
1951                 hdrlen = ntohtcp4(&seg, &bp);
1952                 if (hdrlen < 0) {
1953                         tpriv->stats[HlenErrs]++;
1954                         tpriv->stats[InErrs]++;
1955                         netlog(f, Logtcp, "bad tcp hdr len\n");
1956                         return;
1957                 }
1958
1959                 /* trim the packet to the size claimed by the datagram */
1960                 length -= hdrlen + TCP4_PKT;
1961                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
1962                 if (bp == NULL) {
1963                         tpriv->stats[LenErrs]++;
1964                         tpriv->stats[InErrs]++;
1965                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
1966                         return;
1967                 }
1968         } else {
1969                 int ttl = h6->ttl;
1970                 int proto = h6->proto;
1971
1972                 version = V6;
1973                 length = nhgets(h6->ploadlen);
1974                 ipmove(dest, h6->tcpdst);
1975                 ipmove(source, h6->tcpsrc);
1976
1977                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
1978                 h6->ttl = proto;
1979                 hnputl(h6->vcf, length);
1980                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
1981                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
1982                         tpriv->stats[CsumErrs]++;
1983                         tpriv->stats[InErrs]++;
1984                         netlog(f, Logtcp, "bad tcp proto cksum\n");
1985                         freeblist(bp);
1986                         return;
1987                 }
1988                 h6->ttl = ttl;
1989                 h6->proto = proto;
1990                 hnputs(h6->ploadlen, length);
1991
1992                 hdrlen = ntohtcp6(&seg, &bp);
1993                 if (hdrlen < 0) {
1994                         tpriv->stats[HlenErrs]++;
1995                         tpriv->stats[InErrs]++;
1996                         netlog(f, Logtcp, "bad tcp hdr len\n");
1997                         return;
1998                 }
1999
2000                 /* trim the packet to the size claimed by the datagram */
2001                 length -= hdrlen;
2002                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2003                 if (bp == NULL) {
2004                         tpriv->stats[LenErrs]++;
2005                         tpriv->stats[InErrs]++;
2006                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2007                         return;
2008                 }
2009         }
2010
2011         /* lock protocol while searching for a conversation */
2012         qlock(&tcp->qlock);
2013
2014         /* Look for a matching conversation */
2015         s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2016         if (s == NULL) {
2017                 netlog(f, Logtcp, "iphtlook failed");
2018 reset:
2019                 qunlock(&tcp->qlock);
2020                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2021                 freeblist(bp);
2022                 return;
2023         }
2024
2025         /* if it's a listener, look for the right flags and get a new conv */
2026         tcb = (Tcpctl *) s->ptcl;
2027         if (tcb->state == Listen) {
2028                 if (seg.flags & RST) {
2029                         limborst(s, &seg, source, dest, version);
2030                         qunlock(&tcp->qlock);
2031                         freeblist(bp);
2032                         return;
2033                 }
2034
2035                 /* if this is a new SYN, put the call into limbo */
2036                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2037                         limbo(s, source, dest, &seg, version);
2038                         qunlock(&tcp->qlock);
2039                         freeblist(bp);
2040                         return;
2041                 }
2042
2043                 /*
2044                  *  if there's a matching call in limbo, tcpincoming will
2045                  *  return it in state Syn_received
2046                  */
2047                 s = tcpincoming(s, &seg, source, dest, version);
2048                 if (s == NULL)
2049                         goto reset;
2050         }
2051
2052         /* The rest of the input state machine is run with the control block
2053          * locked and implements the state machine directly out of the RFC.
2054          * Out-of-band data is ignored - it was always a bad idea.
2055          */
2056         tcb = (Tcpctl *) s->ptcl;
2057         if (waserror()) {
2058                 qunlock(&s->qlock);
2059                 nexterror();
2060         }
2061         qlock(&s->qlock);
2062         qunlock(&tcp->qlock);
2063
2064         /* fix up window */
2065         seg.wnd <<= tcb->rcv.scale;
2066
2067         /* every input packet in puts off the keep alive time out */
2068         tcpsetkacounter(tcb);
2069
2070         switch (tcb->state) {
2071                 case Closed:
2072                         sndrst(tcp, source, dest, length, &seg, version,
2073                                    "sending to Closed");
2074                         goto raise;
2075                 case Syn_sent:
2076                         if (seg.flags & ACK) {
2077                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2078                                         sndrst(tcp, source, dest, length, &seg, version,
2079                                                    "bad seq in Syn_sent");
2080                                         goto raise;
2081                                 }
2082                         }
2083                         if (seg.flags & RST) {
2084                                 if (seg.flags & ACK)
2085                                         localclose(s, Econrefused);
2086                                 goto raise;
2087                         }
2088
2089                         if (seg.flags & SYN) {
2090                                 procsyn(s, &seg);
2091                                 if (seg.flags & ACK) {
2092                                         update(s, &seg);
2093                                         tcpsynackrtt(s);
2094                                         tcpsetstate(s, Established);
2095                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2096                                 } else {
2097                                         tcb->time = NOW;
2098                                         tcpsetstate(s, Syn_received);   /* DLP - shouldn't this be a reset? */
2099                                 }
2100
2101                                 if (length != 0 || (seg.flags & FIN))
2102                                         break;
2103
2104                                 freeblist(bp);
2105                                 goto output;
2106                         } else
2107                                 freeblist(bp);
2108
2109                         qunlock(&s->qlock);
2110                         poperror();
2111                         return;
2112                 case Syn_received:
2113                         /* doesn't matter if it's the correct ack, we're just trying to set timing */
2114                         if (seg.flags & ACK)
2115                                 tcpsynackrtt(s);
2116                         break;
2117         }
2118
2119         /*
2120          *  One DOS attack is to open connections to us and then forget about them,
2121          *  thereby tying up a conv at no long term cost to the attacker.
2122          *  This is an attempt to defeat these stateless DOS attacks.  See
2123          *  corresponding code in tcpsendka().
2124          */
2125         if (tcb->state != Syn_received && (seg.flags & RST) == 0) {
2126                 if (tcpporthogdefense
2127                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2128                                                   tcb->snd.una - (1 << 29))) {
2129                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2130                                    source, seg.source, dest, seg.dest, seg.flags,
2131                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2132                         localclose(s, "stateless hog");
2133                 }
2134         }
2135
2136         /* Cut the data to fit the receive window */
2137         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2138                 netlog(f, Logtcp, "tcp len < 0, %lu %d\n", seg.seq, length);
2139                 update(s, &seg);
2140                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2141                         tcphalt(tpriv, &tcb->rtt_timer);
2142                         tcphalt(tpriv, &tcb->acktimer);
2143                         tcphalt(tpriv, &tcb->katimer);
2144                         tcpsetstate(s, Time_wait);
2145                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2146                         tcpgo(tpriv, &tcb->timer);
2147                 }
2148                 if (!(seg.flags & RST)) {
2149                         tcb->flags |= FORCE;
2150                         goto output;
2151                 }
2152                 qunlock(&s->qlock);
2153                 poperror();
2154                 return;
2155         }
2156
2157         /* Cannot accept so answer with a rst */
2158         if (length && tcb->state == Closed) {
2159                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2160                 goto raise;
2161         }
2162
2163         /* The segment is beyond the current receive pointer so
2164          * queue the data in the resequence queue
2165          */
2166         if (seg.seq != tcb->rcv.nxt)
2167                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2168                         update(s, &seg);
2169                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2170                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2171                                            s->lport);
2172                         tcb->flags |= FORCE;
2173                         goto output;
2174                 }
2175
2176         /*
2177          *  keep looping till we've processed this packet plus any
2178          *  adjacent packets in the resequence queue
2179          */
2180         for (;;) {
2181                 if (seg.flags & RST) {
2182                         if (tcb->state == Established) {
2183                                 tpriv->stats[EstabResets]++;
2184                                 if (tcb->rcv.nxt != seg.seq)
2185                                         printd
2186                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2187                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2188                                                  seg.seq);
2189                         }
2190                         localclose(s, Econrefused);
2191                         goto raise;
2192                 }
2193
2194                 if ((seg.flags & ACK) == 0)
2195                         goto raise;
2196
2197                 switch (tcb->state) {
2198                         case Syn_received:
2199                                 if (!seq_within(seg.ack, tcb->snd.una + 1, tcb->snd.nxt)) {
2200                                         sndrst(tcp, source, dest, length, &seg, version,
2201                                                    "bad seq in Syn_received");
2202                                         goto raise;
2203                                 }
2204                                 update(s, &seg);
2205                                 tcpsetstate(s, Established);
2206                         case Established:
2207                         case Close_wait:
2208                                 update(s, &seg);
2209                                 break;
2210                         case Finwait1:
2211                                 update(s, &seg);
2212                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2213                                         tcphalt(tpriv, &tcb->rtt_timer);
2214                                         tcphalt(tpriv, &tcb->acktimer);
2215                                         tcpsetkacounter(tcb);
2216                                         tcb->time = NOW;
2217                                         tcpsetstate(s, Finwait2);
2218                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2219                                         tcpgo(tpriv, &tcb->katimer);
2220                                 }
2221                                 break;
2222                         case Finwait2:
2223                                 update(s, &seg);
2224                                 break;
2225                         case Closing:
2226                                 update(s, &seg);
2227                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2228                                         tcphalt(tpriv, &tcb->rtt_timer);
2229                                         tcphalt(tpriv, &tcb->acktimer);
2230                                         tcphalt(tpriv, &tcb->katimer);
2231                                         tcpsetstate(s, Time_wait);
2232                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2233                                         tcpgo(tpriv, &tcb->timer);
2234                                 }
2235                                 break;
2236                         case Last_ack:
2237                                 update(s, &seg);
2238                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2239                                         localclose(s, NULL);
2240                                         goto raise;
2241                                 }
2242                         case Time_wait:
2243                                 tcb->flags |= FORCE;
2244                                 if (tcb->timer.state != TcptimerON)
2245                                         tcpgo(tpriv, &tcb->timer);
2246                 }
2247
2248                 if ((seg.flags & URG) && seg.urg) {
2249                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2250                                 tcb->rcv.urg = seg.urg + seg.seq;
2251                                 pullblock(&bp, seg.urg);
2252                         }
2253                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2254                         tcb->rcv.urg = tcb->rcv.nxt;
2255
2256                 if (length == 0) {
2257                         if (bp != NULL)
2258                                 freeblist(bp);
2259                 } else {
2260                         switch (tcb->state) {
2261                                 default:
2262                                         /* Ignore segment text */
2263                                         if (bp != NULL)
2264                                                 freeblist(bp);
2265                                         break;
2266
2267                                 case Syn_received:
2268                                 case Established:
2269                                 case Finwait1:
2270                                         /* If we still have some data place on
2271                                          * receive queue
2272                                          */
2273                                         if (bp) {
2274                                                 bp = packblock(bp);
2275                                                 if (bp == NULL)
2276                                                         panic("tcp packblock");
2277                                                 qpassnolim(s->rq, bp);
2278                                                 bp = NULL;
2279
2280                                                 /*
2281                                                  *  Force an ack every 2 data messages.  This is
2282                                                  *  a hack for rob to make his home system run
2283                                                  *  faster.
2284                                                  *
2285                                                  *  this also keeps the standard TCP congestion
2286                                                  *  control working since it needs an ack every
2287                                                  *  2 max segs worth.  This is not quite that,
2288                                                  *  but under a real stream is equivalent since
2289                                                  *  every packet has a max seg in it.
2290                                                  */
2291                                                 if (++(tcb->rcv.una) >= 2)
2292                                                         tcb->flags |= FORCE;
2293                                         }
2294                                         tcb->rcv.nxt += length;
2295
2296                                         /*
2297                                          *  update our rcv window
2298                                          */
2299                                         tcprcvwin(s);
2300
2301                                         /*
2302                                          *  turn on the acktimer if there's something
2303                                          *  to ack
2304                                          */
2305                                         if (tcb->acktimer.state != TcptimerON)
2306                                                 tcpgo(tpriv, &tcb->acktimer);
2307
2308                                         break;
2309                                 case Finwait2:
2310                                         /* no process to read the data, send a reset */
2311                                         if (bp != NULL)
2312                                                 freeblist(bp);
2313                                         sndrst(tcp, source, dest, length, &seg, version,
2314                                                    "send to Finwait2");
2315                                         qunlock(&s->qlock);
2316                                         poperror();
2317                                         return;
2318                         }
2319                 }
2320
2321                 if (seg.flags & FIN) {
2322                         tcb->flags |= FORCE;
2323
2324                         switch (tcb->state) {
2325                                 case Syn_received:
2326                                 case Established:
2327                                         tcb->rcv.nxt++;
2328                                         tcpsetstate(s, Close_wait);
2329                                         break;
2330                                 case Finwait1:
2331                                         tcb->rcv.nxt++;
2332                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2333                                                 tcphalt(tpriv, &tcb->rtt_timer);
2334                                                 tcphalt(tpriv, &tcb->acktimer);
2335                                                 tcphalt(tpriv, &tcb->katimer);
2336                                                 tcpsetstate(s, Time_wait);
2337                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2338                                                 tcpgo(tpriv, &tcb->timer);
2339                                         } else
2340                                                 tcpsetstate(s, Closing);
2341                                         break;
2342                                 case Finwait2:
2343                                         tcb->rcv.nxt++;
2344                                         tcphalt(tpriv, &tcb->rtt_timer);
2345                                         tcphalt(tpriv, &tcb->acktimer);
2346                                         tcphalt(tpriv, &tcb->katimer);
2347                                         tcpsetstate(s, Time_wait);
2348                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2349                                         tcpgo(tpriv, &tcb->timer);
2350                                         break;
2351                                 case Close_wait:
2352                                 case Closing:
2353                                 case Last_ack:
2354                                         break;
2355                                 case Time_wait:
2356                                         tcpgo(tpriv, &tcb->timer);
2357                                         break;
2358                         }
2359                 }
2360
2361                 /*
2362                  *  get next adjacent segment from the resequence queue.
2363                  *  dump/trim any overlapping segments
2364                  */
2365                 for (;;) {
2366                         if (tcb->reseq == NULL)
2367                                 goto output;
2368
2369                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2370                                 goto output;
2371
2372                         getreseq(tcb, &seg, &bp, &length);
2373
2374                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2375                                 break;
2376                 }
2377         }
2378 output:
2379         tcpoutput(s);
2380         qunlock(&s->qlock);
2381         poperror();
2382         return;
2383 raise:
2384         qunlock(&s->qlock);
2385         poperror();
2386         freeblist(bp);
2387         tcpkick(s);
2388 }
2389
2390 /*
2391  *  always enters and exits with the s locked.  We drop
2392  *  the lock to ipoput the packet so some care has to be
2393  *  taken by callers.
2394  */
2395 void tcpoutput(struct conv *s)
2396 {
2397         Tcp seg;
2398         int msgs;
2399         Tcpctl *tcb;
2400         struct block *hbp, *bp;
2401         int sndcnt, n;
2402         uint32_t ssize, dsize, usable, sent;
2403         struct Fs *f;
2404         struct tcppriv *tpriv;
2405         uint8_t version;
2406
2407         f = s->p->f;
2408         tpriv = s->p->priv;
2409         version = s->ipversion;
2410
2411         for (msgs = 0; msgs < 100; msgs++) {
2412                 tcb = (Tcpctl *) s->ptcl;
2413
2414                 switch (tcb->state) {
2415                         case Listen:
2416                         case Closed:
2417                         case Finwait2:
2418                                 return;
2419                 }
2420
2421                 /* force an ack when a window has opened up */
2422                 if (tcb->rcv.blocked && tcb->rcv.wnd > 0) {
2423                         tcb->rcv.blocked = 0;
2424                         tcb->flags |= FORCE;
2425                 }
2426
2427                 sndcnt = qlen(s->wq) + tcb->flgcnt;
2428                 sent = tcb->snd.ptr - tcb->snd.una;
2429
2430                 /* Don't send anything else until our SYN has been acked */
2431                 if (tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2432                         break;
2433
2434                 /* Compute usable segment based on offered window and limit
2435                  * window probes to one
2436                  */
2437                 if (tcb->snd.wnd == 0) {
2438                         if (sent != 0) {
2439                                 if ((tcb->flags & FORCE) == 0)
2440                                         break;
2441 //              tcb->snd.ptr = tcb->snd.una;
2442                         }
2443                         usable = 1;
2444                 } else {
2445                         usable = tcb->cwind;
2446                         if (tcb->snd.wnd < usable)
2447                                 usable = tcb->snd.wnd;
2448                         usable -= sent;
2449                 }
2450                 ssize = sndcnt - sent;
2451                 if (ssize && usable < 2)
2452                         netlog(s->p->f, Logtcp, "throttled snd.wnd %lu cwind %lu\n",
2453                                    tcb->snd.wnd, tcb->cwind);
2454                 if (usable < ssize)
2455                         ssize = usable;
2456                 if (ssize > tcb->mss) {
2457                         if ((tcb->flags & TSO) == 0) {
2458                                 ssize = tcb->mss;
2459                         } else {
2460                                 int segs, window;
2461
2462                                 /*  Don't send too much.  32K is arbitrary..
2463                                  */
2464                                 if (ssize > 32 * 1024)
2465                                         ssize = 32 * 1024;
2466
2467                                 /* Clamp xmit to an integral MSS to
2468                                  * avoid ragged tail segments causing
2469                                  * poor link utilization.  Also
2470                                  * account for each segment sent in
2471                                  * msg heuristic, and round up to the
2472                                  * next multiple of 4, to ensure we
2473                                  * still yeild.
2474                                  */
2475                                 segs = ssize / tcb->mss;
2476                                 ssize = segs * tcb->mss;
2477                                 msgs += segs;
2478                                 if (segs > 3)
2479                                         msgs = (msgs + 4) & ~3;
2480                         }
2481                 }
2482
2483                 dsize = ssize;
2484                 seg.urg = 0;
2485
2486                 if (ssize == 0)
2487                         if ((tcb->flags & FORCE) == 0)
2488                                 break;
2489
2490                 tcb->flags &= ~FORCE;
2491                 tcprcvwin(s);
2492
2493                 /* By default we will generate an ack */
2494                 tcphalt(tpriv, &tcb->acktimer);
2495                 tcb->rcv.una = 0;
2496                 seg.source = s->lport;
2497                 seg.dest = s->rport;
2498                 seg.flags = ACK;
2499                 seg.mss = 0;
2500                 seg.ws = 0;
2501                 switch (tcb->state) {
2502                         case Syn_sent:
2503                                 seg.flags = 0;
2504                                 if (tcb->snd.ptr == tcb->iss) {
2505                                         seg.flags |= SYN;
2506                                         dsize--;
2507                                         seg.mss = tcb->mss;
2508                                         seg.ws = tcb->scale;
2509                                 }
2510                                 break;
2511                         case Syn_received:
2512                                 /*
2513                                  *  don't send any data with a SYN/ACK packet
2514                                  *  because Linux rejects the packet in its
2515                                  *  attempt to solve the SYN attack problem
2516                                  */
2517                                 if (tcb->snd.ptr == tcb->iss) {
2518                                         seg.flags |= SYN;
2519                                         dsize = 0;
2520                                         ssize = 1;
2521                                         seg.mss = tcb->mss;
2522                                         seg.ws = tcb->scale;
2523                                 }
2524                                 break;
2525                 }
2526                 seg.seq = tcb->snd.ptr;
2527                 seg.ack = tcb->rcv.nxt;
2528                 seg.wnd = tcb->rcv.wnd;
2529
2530                 /* Pull out data to send */
2531                 bp = NULL;
2532                 if (dsize != 0) {
2533                         bp = qcopy(s->wq, dsize, sent);
2534                         if (BLEN(bp) != dsize) {
2535                                 seg.flags |= FIN;
2536                                 dsize--;
2537                         }
2538                         if (BLEN(bp) > tcb->mss) {
2539                                 bp->flag |= Btso;
2540                                 bp->mss = tcb->mss;
2541                         }
2542                 }
2543
2544                 if (sent + dsize == sndcnt)
2545                         seg.flags |= PSH;
2546
2547                 /* keep track of balance of resent data */
2548                 if (seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2549                         n = tcb->snd.nxt - tcb->snd.ptr;
2550                         if (ssize < n)
2551                                 n = ssize;
2552                         tcb->resent += n;
2553                         netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr 0x%lx nxt 0x%lx\n",
2554                                    s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr,
2555                                    tcb->snd.nxt);
2556                         tpriv->stats[RetransSegs]++;
2557                 }
2558
2559                 tcb->snd.ptr += ssize;
2560
2561                 /* Pull up the send pointer so we can accept acks
2562                  * for this window
2563                  */
2564                 if (seq_gt(tcb->snd.ptr, tcb->snd.nxt))
2565                         tcb->snd.nxt = tcb->snd.ptr;
2566
2567                 /* Build header, link data and compute cksum */
2568                 switch (version) {
2569                         case V4:
2570                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2571                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2572                                 if (hbp == NULL) {
2573                                         freeblist(bp);
2574                                         return;
2575                                 }
2576                                 break;
2577                         case V6:
2578                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2579                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2580                                 if (hbp == NULL) {
2581                                         freeblist(bp);
2582                                         return;
2583                                 }
2584                                 break;
2585                         default:
2586                                 hbp = NULL;     /* to suppress a warning */
2587                                 panic("tcpoutput: version %d", version);
2588                 }
2589
2590                 /* Start the transmission timers if there is new data and we
2591                  * expect acknowledges
2592                  */
2593                 if (ssize != 0) {
2594                         if (tcb->timer.state != TcptimerON)
2595                                 tcpgo(tpriv, &tcb->timer);
2596
2597                         /*  If round trip timer isn't running, start it.
2598                          *  measure the longest packet only in case the
2599                          *  transmission time dominates RTT
2600                          */
2601                         if (tcb->rtt_timer.state != TcptimerON)
2602                                 if (ssize == tcb->mss) {
2603                                         tcpgo(tpriv, &tcb->rtt_timer);
2604                                         tcb->rttseq = tcb->snd.ptr;
2605                                 }
2606                 }
2607
2608                 tpriv->stats[OutSegs]++;
2609
2610                 /* put off the next keep alive */
2611                 tcpgo(tpriv, &tcb->katimer);
2612
2613                 switch (version) {
2614                         case V4:
2615                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2616                                         /* a negative return means no route */
2617                                         localclose(s, "no route");
2618                                 }
2619                                 break;
2620                         case V6:
2621                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2622                                         /* a negative return means no route */
2623                                         localclose(s, "no route");
2624                                 }
2625                                 break;
2626                         default:
2627                                 panic("tcpoutput2: version %d", version);
2628                 }
2629                 if ((msgs % 4) == 1) {
2630                         qunlock(&s->qlock);
2631                         kthread_yield();
2632                         qlock(&s->qlock);
2633                 }
2634         }
2635 }
2636
2637 /*
2638  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
2639  */
2640 void tcpsendka(struct conv *s)
2641 {
2642         Tcp seg;
2643         Tcpctl *tcb;
2644         struct block *hbp, *dbp;
2645
2646         tcb = (Tcpctl *) s->ptcl;
2647
2648         dbp = NULL;
2649         seg.urg = 0;
2650         seg.source = s->lport;
2651         seg.dest = s->rport;
2652         seg.flags = ACK | PSH;
2653         seg.mss = 0;
2654         seg.ws = 0;
2655         if (tcpporthogdefense)
2656                 seg.seq = tcb->snd.una - (1 << 30) - nrand(1 << 20);
2657         else
2658                 seg.seq = tcb->snd.una - 1;
2659         seg.ack = tcb->rcv.nxt;
2660         tcb->rcv.una = 0;
2661         seg.wnd = tcb->rcv.wnd;
2662         if (tcb->state == Finwait2) {
2663                 seg.flags |= FIN;
2664         } else {
2665                 dbp = allocb(1);
2666                 dbp->wp++;
2667         }
2668
2669         if (isv4(s->raddr)) {
2670                 /* Build header, link data and compute cksum */
2671                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2672                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2673                 if (hbp == NULL) {
2674                         freeblist(dbp);
2675                         return;
2676                 }
2677                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2678         } else {
2679                 /* Build header, link data and compute cksum */
2680                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2681                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2682                 if (hbp == NULL) {
2683                         freeblist(dbp);
2684                         return;
2685                 }
2686                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2687         }
2688 }
2689
2690 /*
2691  *  set connection to time out after 12 minutes
2692  */
2693 void tcpsetkacounter(Tcpctl * tcb)
2694 {
2695         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
2696         if (tcb->kacounter < 3)
2697                 tcb->kacounter = 3;
2698 }
2699
2700 /*
2701  *  if we've timed out, close the connection
2702  *  otherwise, send a keepalive and restart the timer
2703  */
2704 void tcpkeepalive(void *v)
2705 {
2706         ERRSTACK(1);
2707         Tcpctl *tcb;
2708         struct conv *s;
2709
2710         s = v;
2711         tcb = (Tcpctl *) s->ptcl;
2712         if (waserror()) {
2713                 qunlock(&s->qlock);
2714                 nexterror();
2715         }
2716         qlock(&s->qlock);
2717         if (tcb->state != Closed) {
2718                 if (--(tcb->kacounter) <= 0) {
2719                         localclose(s, Etimedout);
2720                 } else {
2721                         tcpsendka(s);
2722                         tcpgo(s->p->priv, &tcb->katimer);
2723                 }
2724         }
2725         qunlock(&s->qlock);
2726         poperror();
2727 }
2728
2729 /*
2730  *  start keepalive timer
2731  */
2732 char *tcpstartka(struct conv *s, char **f, int n)
2733 {
2734         Tcpctl *tcb;
2735         int x;
2736
2737         tcb = (Tcpctl *) s->ptcl;
2738         if (tcb->state != Established)
2739                 return "connection must be in Establised state";
2740         if (n > 1) {
2741                 x = atoi(f[1]);
2742                 if (x >= MSPTICK)
2743                         tcb->katimer.start = x / MSPTICK;
2744         }
2745         tcpsetkacounter(tcb);
2746         tcpgo(s->p->priv, &tcb->katimer);
2747
2748         return NULL;
2749 }
2750
2751 /*
2752  *  turn checksums on/off
2753  */
2754 char *tcpsetchecksum(struct conv *s, char **f, int unused)
2755 {
2756         Tcpctl *tcb;
2757
2758         tcb = (Tcpctl *) s->ptcl;
2759         tcb->nochecksum = !atoi(f[1]);
2760
2761         return NULL;
2762 }
2763
2764 void tcprxmit(struct conv *s)
2765 {
2766         Tcpctl *tcb;
2767
2768         tcb = (Tcpctl *) s->ptcl;
2769
2770         tcb->flags |= RETRAN | FORCE;
2771         tcb->snd.ptr = tcb->snd.una;
2772
2773         /*
2774          *  We should be halving the slow start threshhold (down to one
2775          *  mss) but leaving it at mss seems to work well enough
2776          */
2777         tcb->ssthresh = tcb->mss;
2778
2779         /*
2780          *  pull window down to a single packet
2781          */
2782         tcb->cwind = tcb->mss;
2783         tcpoutput(s);
2784 }
2785
2786 void tcptimeout(void *arg)
2787 {
2788         ERRSTACK(1);
2789         struct conv *s;
2790         Tcpctl *tcb;
2791         int maxback;
2792         struct tcppriv *tpriv;
2793
2794         s = (struct conv *)arg;
2795         tpriv = s->p->priv;
2796         tcb = (Tcpctl *) s->ptcl;
2797
2798         if (waserror()) {
2799                 qunlock(&s->qlock);
2800                 nexterror();
2801         }
2802         qlock(&s->qlock);
2803         switch (tcb->state) {
2804                 default:
2805                         tcb->backoff++;
2806                         if (tcb->state == Syn_sent)
2807                                 maxback = MAXBACKMS / 2;
2808                         else
2809                                 maxback = MAXBACKMS;
2810                         tcb->backedoff += tcb->timer.start * MSPTICK;
2811                         if (tcb->backedoff >= maxback) {
2812                                 localclose(s, Etimedout);
2813                                 break;
2814                         }
2815                         netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lx %d/%d\n",
2816                                    tcb->snd.una, tcb->timer.start, NOW);
2817                         tcpsettimer(tcb);
2818                         tcprxmit(s);
2819                         tpriv->stats[RetransTimeouts]++;
2820                         tcb->snd.dupacks = 0;
2821                         break;
2822                 case Time_wait:
2823                         localclose(s, NULL);
2824                         break;
2825                 case Closed:
2826                         break;
2827         }
2828         qunlock(&s->qlock);
2829         poperror();
2830 }
2831
2832 int inwindow(Tcpctl * tcb, int seq)
2833 {
2834         return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1);
2835 }
2836
2837 /*
2838  *  set up state for a received SYN (or SYN ACK) packet
2839  */
2840 void procsyn(struct conv *s, Tcp * seg)
2841 {
2842         Tcpctl *tcb;
2843
2844         tcb = (Tcpctl *) s->ptcl;
2845         tcb->flags |= FORCE;
2846
2847         tcb->rcv.nxt = seg->seq + 1;
2848         tcb->rcv.urg = tcb->rcv.nxt;
2849         tcb->irs = seg->seq;
2850
2851         /* our sending max segment size cannot be bigger than what he asked for */
2852         if (seg->mss != 0 && seg->mss < tcb->mss)
2853                 tcb->mss = seg->mss;
2854
2855         /* the congestion window always starts out as a single segment */
2856         tcb->snd.wnd = seg->wnd;
2857         tcb->cwind = tcb->mss;
2858 }
2859
2860 int
2861 addreseq(Tcpctl * tcb, struct tcppriv *tpriv, Tcp * seg,
2862                  struct block *bp, uint16_t length)
2863 {
2864         Reseq *rp, *rp1;
2865         int i, rqlen, qmax;
2866
2867         rp = kzmalloc(sizeof(Reseq), 0);
2868         if (rp == NULL) {
2869                 freeblist(bp);  /* bp always consumed by add_reseq */
2870                 return 0;
2871         }
2872
2873         rp->seg = *seg;
2874         rp->bp = bp;
2875         rp->length = length;
2876
2877         /* Place on reassembly list sorting by starting seq number */
2878         rp1 = tcb->reseq;
2879         if (rp1 == NULL || seq_lt(seg->seq, rp1->seg.seq)) {
2880                 rp->next = rp1;
2881                 tcb->reseq = rp;
2882                 if (rp->next != NULL)
2883                         tpriv->stats[OutOfOrder]++;
2884                 return 0;
2885         }
2886
2887         rqlen = 0;
2888         for (i = 0;; i++) {
2889                 rqlen += rp1->length;
2890                 if (rp1->next == NULL || seq_lt(seg->seq, rp1->next->seg.seq)) {
2891                         rp->next = rp1->next;
2892                         rp1->next = rp;
2893                         if (rp->next != NULL)
2894                                 tpriv->stats[OutOfOrder]++;
2895                         break;
2896                 }
2897                 rp1 = rp1->next;
2898         }
2899         qmax = QMAX << tcb->rcv.scale;
2900         if (rqlen > qmax) {
2901                 printd("resequence queue > window: %d > %d\n", rqlen, qmax);
2902                 i = 0;
2903                 for (rp1 = tcb->reseq; rp1 != NULL; rp1 = rp1->next) {
2904                         printd("0x%#lx 0x%#lx 0x%#x\n", rp1->seg.seq,
2905                                    rp1->seg.ack, rp1->seg.flags);
2906                         if (i++ > 10) {
2907                                 printd("...\n");
2908                                 break;
2909                         }
2910                 }
2911
2912                 // delete entire reassembly queue; wait for retransmit.
2913                 // - should we be smarter and only delete the tail?
2914                 for (rp = tcb->reseq; rp != NULL; rp = rp1) {
2915                         rp1 = rp->next;
2916                         freeblist(rp->bp);
2917                         kfree(rp);
2918                 }
2919                 tcb->reseq = NULL;
2920
2921                 return -1;
2922         }
2923         return 0;
2924 }
2925
2926 void getreseq(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2927 {
2928         Reseq *rp;
2929
2930         rp = tcb->reseq;
2931         if (rp == NULL)
2932                 return;
2933
2934         tcb->reseq = rp->next;
2935
2936         *seg = rp->seg;
2937         *bp = rp->bp;
2938         *length = rp->length;
2939
2940         kfree(rp);
2941 }
2942
2943 int tcptrim(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2944 {
2945         uint16_t len;
2946         uint8_t accept;
2947         int dupcnt, excess;
2948
2949         accept = 0;
2950         len = *length;
2951         if (seg->flags & SYN)
2952                 len++;
2953         if (seg->flags & FIN)
2954                 len++;
2955
2956         if (tcb->rcv.wnd == 0) {
2957                 if (len == 0 && seg->seq == tcb->rcv.nxt)
2958                         return 0;
2959         } else {
2960                 /* Some part of the segment should be in the window */
2961                 if (inwindow(tcb, seg->seq))
2962                         accept++;
2963                 else if (len != 0) {
2964                         if (inwindow(tcb, seg->seq + len - 1) ||
2965                                 seq_within(tcb->rcv.nxt, seg->seq, seg->seq + len - 1))
2966                                 accept++;
2967                 }
2968         }
2969         if (!accept) {
2970                 freeblist(*bp);
2971                 return -1;
2972         }
2973         dupcnt = tcb->rcv.nxt - seg->seq;
2974         if (dupcnt > 0) {
2975                 tcb->rerecv += dupcnt;
2976                 if (seg->flags & SYN) {
2977                         seg->flags &= ~SYN;
2978                         seg->seq++;
2979
2980                         if (seg->urg > 1)
2981                                 seg->urg--;
2982                         else
2983                                 seg->flags &= ~URG;
2984                         dupcnt--;
2985                 }
2986                 if (dupcnt > 0) {
2987                         pullblock(bp, (uint16_t) dupcnt);
2988                         seg->seq += dupcnt;
2989                         *length -= dupcnt;
2990
2991                         if (seg->urg > dupcnt)
2992                                 seg->urg -= dupcnt;
2993                         else {
2994                                 seg->flags &= ~URG;
2995                                 seg->urg = 0;
2996                         }
2997                 }
2998         }
2999         excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3000         if (excess > 0) {
3001                 tcb->rerecv += excess;
3002                 *length -= excess;
3003                 *bp = trimblock(*bp, 0, *length);
3004                 if (*bp == NULL)
3005                         panic("presotto is a boofhead");
3006                 seg->flags &= ~FIN;
3007         }
3008         return 0;
3009 }
3010
3011 void tcpadvise(struct Proto *tcp, struct block *bp, char *msg)
3012 {
3013         Tcp4hdr *h4;
3014         Tcp6hdr *h6;
3015         Tcpctl *tcb;
3016         uint8_t source[IPaddrlen];
3017         uint8_t dest[IPaddrlen];
3018         uint16_t psource, pdest;
3019         struct conv *s, **p;
3020
3021         h4 = (Tcp4hdr *) (bp->rp);
3022         h6 = (Tcp6hdr *) (bp->rp);
3023
3024         if ((h4->vihl & 0xF0) == IP_VER4) {
3025                 v4tov6(dest, h4->tcpdst);
3026                 v4tov6(source, h4->tcpsrc);
3027                 psource = nhgets(h4->tcpsport);
3028                 pdest = nhgets(h4->tcpdport);
3029         } else {
3030                 ipmove(dest, h6->tcpdst);
3031                 ipmove(source, h6->tcpsrc);
3032                 psource = nhgets(h6->tcpsport);
3033                 pdest = nhgets(h6->tcpdport);
3034         }
3035
3036         /* Look for a connection */
3037         qlock(&tcp->qlock);
3038         for (p = tcp->conv; *p; p++) {
3039                 s = *p;
3040                 tcb = (Tcpctl *) s->ptcl;
3041                 if (s->rport == pdest)
3042                         if (s->lport == psource)
3043                                 if (tcb->state != Closed)
3044                                         if (ipcmp(s->raddr, dest) == 0)
3045                                                 if (ipcmp(s->laddr, source) == 0) {
3046                                                         qlock(&s->qlock);
3047                                                         qunlock(&tcp->qlock);
3048                                                         switch (tcb->state) {
3049                                                                 case Syn_sent:
3050                                                                         localclose(s, msg);
3051                                                                         break;
3052                                                         }
3053                                                         qunlock(&s->qlock);
3054                                                         freeblist(bp);
3055                                                         return;
3056                                                 }
3057         }
3058         qunlock(&tcp->qlock);
3059         freeblist(bp);
3060 }
3061
3062 static char *tcpporthogdefensectl(char *val)
3063 {
3064         if (strcmp(val, "on") == 0)
3065                 tcpporthogdefense = 1;
3066         else if (strcmp(val, "off") == 0)
3067                 tcpporthogdefense = 0;
3068         else
3069                 return "unknown value for tcpporthogdefense";
3070         return NULL;
3071 }
3072
/*
 *  Dispatch a control request; f holds n parsed fields, f[0] being the
 *  verb.  Called with c qlocked.
 *
 *    hangup                   (exactly one field)
 *    keepalive [ms]           (argument optional)
 *    checksum <value>         (argument required)
 *    tcpporthogdefense on|off (argument required)
 *
 *  Returns NULL on success, or an error string.  Requests whose handler
 *  reads f[1] are rejected here when that field was not supplied, so the
 *  handlers never look past the parsed fields.
 */
char *tcpctl(struct conv *c, char **f, int n)
{
	if (n < 1)
		return "unknown control request";

	if (n == 1 && strcmp(f[0], "hangup") == 0)
		return tcphangup(c);
	if (strcmp(f[0], "keepalive") == 0)
		return tcpstartka(c, f, n);
	if (strcmp(f[0], "checksum") == 0) {
		if (n < 2)
			return "checksum needs an argument";
		return tcpsetchecksum(c, f, n);
	}
	if (strcmp(f[0], "tcpporthogdefense") == 0) {
		if (n < 2)
			return "tcpporthogdefense needs an argument";
		return tcpporthogdefensectl(f[1]);
	}
	return "unknown control request";
}
3086
3087 int tcpstats(struct Proto *tcp, char *buf, int len)
3088 {
3089         struct tcppriv *priv;
3090         char *p, *e;
3091         int i;
3092
3093         priv = tcp->priv;
3094         p = buf;
3095         e = p + len;
3096         for (i = 0; i < Nstats; i++)
3097                 p = seprintf(p, e, "%s: %u\n", statnames[i], priv->stats[i]);
3098         return p - buf;
3099 }
3100
3101 /*
3102  *  garbage collect any stale conversations:
3103  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3104  *      - Finwait2 after 5 minutes
3105  *
3106  *  this is called whenever we run out of channels.  Both checks are
3107  *  of questionable validity so we try to use them only when we're
3108  *  up against the wall.
3109  */
3110 int tcpgc(struct Proto *tcp)
3111 {
3112         struct conv *c, **pp, **ep;
3113         int n;
3114         Tcpctl *tcb;
3115
3116         n = 0;
3117         ep = &tcp->conv[tcp->nc];
3118         for (pp = tcp->conv; pp < ep; pp++) {
3119                 c = *pp;
3120                 if (c == NULL)
3121                         break;
3122                 if (!canqlock(&c->qlock))
3123                         continue;
3124                 tcb = (Tcpctl *) c->ptcl;
3125                 switch (tcb->state) {
3126                         case Syn_received:
3127                                 if (NOW - tcb->time > 5000) {
3128                                         localclose(c, "timed out");
3129                                         n++;
3130                                 }
3131                                 break;
3132                         case Finwait2:
3133                                 if (NOW - tcb->time > 5 * 60 * 1000) {
3134                                         localclose(c, "timed out");
3135                                         n++;
3136                                 }
3137                                 break;
3138                 }
3139                 qunlock(&c->qlock);
3140         }
3141         return n;
3142 }
3143
3144 void tcpsettimer(Tcpctl * tcb)
3145 {
3146         int x;
3147
3148         /* round trip dependency */
3149         x = backoff(tcb->backoff) *
3150                 (tcb->mdev + (tcb->srtt >> LOGAGAIN) + MSPTICK) / MSPTICK;
3151
3152         /* bounded twixt 1/2 and 64 seconds */
3153         if (x < 500 / MSPTICK)
3154                 x = 500 / MSPTICK;
3155         else if (x > (64000 / MSPTICK))
3156                 x = 64000 / MSPTICK;
3157         tcb->timer.start = x;
3158 }
3159
3160 void tcpinit(struct Fs *fs)
3161 {
3162         struct Proto *tcp;
3163         struct tcppriv *tpriv;
3164
3165         tcp = kzmalloc(sizeof(struct Proto), 0);
3166         tpriv = tcp->priv = kzmalloc(sizeof(struct tcppriv), 0);
3167         qlock_init(&tpriv->tl);
3168         qlock_init(&tpriv->apl);
3169         tcp->name = "tcp";
3170         tcp->connect = tcpconnect;
3171         tcp->announce = tcpannounce;
3172         tcp->ctl = tcpctl;
3173         tcp->state = tcpstate;
3174         tcp->create = tcpcreate;
3175         tcp->close = tcpclose;
3176         tcp->rcv = tcpiput;
3177         tcp->advise = tcpadvise;
3178         tcp->stats = tcpstats;
3179         tcp->inuse = tcpinuse;
3180         tcp->gc = tcpgc;
3181         tcp->ipproto = IP_TCPPROTO;
3182        &