kproc -> ktask
[akaros.git] / kern / src / net / tcp.c
1 #include        "u.h"
2 #include        "../port/lib.h"
3 #include        "mem.h"
4 #include        "dat.h"
5 #include        "fns.h"
6 #include        "../port/error.h"
7
8 #include        "ip.h"
9
/*
 *  Protocol-wide constants: header geometry, timer parameters,
 *  TCP flag bits, option codes, defaults, tcb flag bits and the
 *  connection state numbers.
 */
enum
{
	QMAX		= 64*1024-1,	/* queue limit and unscaled window size */
	IP_TCPPROTO	= 6,		/* value stored in the header proto field */

	/* IPv4 header sizes */
	TCP4_IPLEN	= 8,
	TCP4_PHDRSIZE	= 12,
	TCP4_HDRSIZE	= 20,
	TCP4_TCBPHDRSZ	= 40,
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,

	/* IPv6 header sizes */
	TCP6_IPLEN	= 0,
	TCP6_PHDRSIZE	= 40,
	TCP6_HDRSIZE	= 20,
	TCP6_TCBPHDRSZ	= 60,
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,

	/* Tcptimer.state values */
	TcptimerOFF	= 0,
	TcptimerON	= 1,
	TcptimerDONE	= 2,

	MAX_TIME	= (1<<20),	/* effectively "never expires" */
	TCP_ACK		= 50,		/* delayed ack interval, in ms */
	MAXBACKMS	= 9*60*1000,	/* give up after backing off this many ms */

	/* bits of the TCP flags byte */
	URG		= 0x20,		/* urgent data present */
	ACK		= 0x10,		/* ack field is valid */
	PSH		= 0x08,		/* push */
	RST		= 0x04,		/* reset the connection */
	SYN		= 0x02,		/* synchronise sequence numbers */
	FIN		= 0x01,		/* sender is closing */

	/* TCP option kinds and their encoded lengths */
	EOLOPT		= 0,
	NOOPOPT		= 1,
	MSSOPT		= 2,
	MSS_LENGTH	= 4,		/* bytes taken by the MSS option */
	WSOPT		= 3,
	WS_LENGTH	= 3,		/* bytes taken by the window scale option */

	MSL2		= 10,
	MSPTICK		= 50,		/* ms per timer tick */
	DEF_MSS		= 1460,		/* default segment size */
	DEF_MSS6	= 1280,		/* default (minimum) segment size for v6 */
	DEF_RTT		= 500,		/* default round trip time */
	DEF_KAT		= 120000,	/* default ms between keep alives */
	TCP_LISTEN	= 0,		/* tcpstart mode: listening connection */
	TCP_CONNECT	= 1,		/* tcpstart mode: outgoing connection */
	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH	= 3,		/* duplicate ack threshold for retransmit */

	/* Tcpctl.flags bits */
	FORCE		= 1,
	CLONE		= 2,
	RETRAN		= 4,
	ACTIVE		= 8,
	SYNACK		= 16,

	LOGAGAIN	= 3,
	LOGDGAIN	= 2,

	/* connection states; tcpstates[] is indexed by these */
	Closed		= 0,
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo	= 1000,		/* max procs waiting for a response to SYN ACK */
	NLHT		= 256,		/* limbo hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	HaveWS		= 1<<8,		/* or'ed into the scale value tcpmtu() hands back */
};
86
/*
 *  Printable names of the connection states; the order must
 *  track the state values in the enumeration above.
 */
char *tcpstates[] =
{
	"Closed",
	"Listen",
	"Syn_sent",
	"Syn_received",
	"Established",
	"Finwait1",
	"Finwait2",
	"Close_wait",
	"Closing",
	"Last_ack",
	"Time_wait"
};
94
/*
 *  One countdown timer.  Running timers are kept on a doubly
 *  linked list (next/prev); expired ones are strung together
 *  through readynext until their func has been called.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next;		/* list of running timers */
	Tcptimer	*prev;
	Tcptimer	*readynext;	/* list of timers that just expired */
	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int	start;			/* count is loaded from this when started */
	int	count;			/* ticks left */
	void	(*func)(void*);		/* called with arg on expiry */
	void	*arg;
};
107
108 /*
109  *  v4 and v6 pseudo headers used for
110  *  checksuming tcp
111  */
112 typedef struct Tcp4hdr Tcp4hdr;
113 struct Tcp4hdr
114 {
115         uchar   vihl;           /* Version and header length */
116         uchar   tos;            /* Type of service */
117         uchar   length[2];      /* packet length */
118         uchar   id[2];          /* Identification */
119         uchar   frag[2];        /* Fragment information */
120         uchar   Unused;
121         uchar   proto;
122         uchar   tcplen[2];
123         uchar   tcpsrc[4];
124         uchar   tcpdst[4];
125         uchar   tcpsport[2];
126         uchar   tcpdport[2];
127         uchar   tcpseq[4];
128         uchar   tcpack[4];
129         uchar   tcpflag[2];
130         uchar   tcpwin[2];
131         uchar   tcpcksum[2];
132         uchar   tcpurg[2];
133         /* Options segment */
134         uchar   tcpopt[1];
135 };
136
137 typedef struct Tcp6hdr Tcp6hdr;
138 struct Tcp6hdr
139 {
140         uchar   vcf[4];
141         uchar   ploadlen[2];
142         uchar   proto;
143         uchar   ttl;
144         uchar   tcpsrc[IPaddrlen];
145         uchar   tcpdst[IPaddrlen];
146         uchar   tcpsport[2];
147         uchar   tcpdport[2];
148         uchar   tcpseq[4];
149         uchar   tcpack[4];
150         uchar   tcpflag[2];
151         uchar   tcpwin[2];
152         uchar   tcpcksum[2];
153         uchar   tcpurg[2];
154         /* Options segment */
155         uchar   tcpopt[1];
156 };
157
158 /*
159  *  this represents the control info
160  *  for a single packet.  It is derived from
161  *  a packet in ntohtcp{4,6}() and stuck into
162  *  a packet in htontcp{4,6}().
163  */
164 typedef struct Tcp Tcp;
165 struct  Tcp
166 {
167         ushort  source;
168         ushort  dest;
169         ulong   seq;
170         ulong   ack;
171         uchar   flags;
172         ushort  ws;     /* window scale option (if not zero) */
173         ulong   wnd;
174         ushort  urg;
175         ushort  mss;    /* max segment size option (if not zero) */
176         ushort  len;    /* size of data */
177 };
178
179 /*
180  *  this header is malloc'd to thread together fragments
181  *  waiting to be coalesced
182  */
183 typedef struct Reseq Reseq;
184 struct Reseq
185 {
186         Reseq   *next;
187         Tcp     seg;
188         Block   *bp;
189         ushort  length;
190 };
191
192 /*
193  *  the qlock in the Conv locks this structure
194  */
195 typedef struct Tcpctl Tcpctl;
196 struct Tcpctl
197 {
198         uchar   state;                  /* Connection state */
199         uchar   type;                   /* Listening or active connection */
200         uchar   code;                   /* Icmp code */
201         struct {
202                 ulong   una;            /* Unacked data pointer */
203                 ulong   nxt;            /* Next sequence expected */
204                 ulong   ptr;            /* Data pointer */
205                 ulong   wnd;            /* Tcp send window */
206                 ulong   urg;            /* Urgent data pointer */
207                 ulong   wl2;
208                 int     scale;          /* how much to right shift window in xmitted packets */
209                 /* to implement tahoe and reno TCP */
210                 ulong   dupacks;        /* number of duplicate acks rcvd */
211                 int     recovery;       /* loss recovery flag */
212                 ulong   rxt;            /* right window marker for recovery */
213         } snd;
214         struct {
215                 ulong   nxt;            /* Receive pointer to next uchar slot */
216                 ulong   wnd;            /* Receive window incoming */
217                 ulong   urg;            /* Urgent pointer */
218                 int     blocked;
219                 int     una;            /* unacked data segs */
220                 int     scale;          /* how much to left shift window in rcved packets */
221         } rcv;
222         ulong   iss;                    /* Initial sequence number */
223         int     sawwsopt;               /* true if we saw a wsopt on the incoming SYN */
224         ulong   cwind;                  /* Congestion window */
225         int     scale;                  /* desired snd.scale */
226         ushort  ssthresh;               /* Slow start threshold */
227         int     resent;                 /* Bytes just resent */
228         int     irs;                    /* Initial received squence */
229         ushort  mss;                    /* Mean segment size */
230         int     rerecv;                 /* Overlap of data rerecevived */
231         ulong   window;                 /* Recevive window */
232         uchar   backoff;                /* Exponential backoff counter */
233         int     backedoff;              /* ms we've backed off for rexmits */
234         uchar   flags;                  /* State flags */
235         Reseq   *reseq;                 /* Resequencing queue */
236         Tcptimer        timer;                  /* Activity timer */
237         Tcptimer        acktimer;               /* Acknowledge timer */
238         Tcptimer        rtt_timer;              /* Round trip timer */
239         Tcptimer        katimer;                /* keep alive timer */
240         ulong   rttseq;                 /* Round trip sequence */
241         int     srtt;                   /* Shortened round trip */
242         int     mdev;                   /* Mean deviation of round trip */
243         int     kacounter;              /* count down for keep alive */
244         uint    sndsyntime;             /* time syn sent */
245         ulong   time;                   /* time Finwait2 or Syn_received was sent */
246         int     nochecksum;             /* non-zero means don't send checksums */
247         int     flgcnt;                 /* number of flags in the sequence (FIN,SEQ) */
248
249         union {
250                 Tcp4hdr tcp4hdr;
251                 Tcp6hdr tcp6hdr;
252         } protohdr;             /* prototype header */
253 };
254
255 /*
256  *  New calls are put in limbo rather than having a conversation structure
257  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
258  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
259  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
260  *
261  *  In particular they aren't on a listener's queue so that they don't figure
262  *  in the input queue limit.
263  *
264  *  If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue
265  *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
266  *  there is no hashing of this list.
267  */
268 typedef struct Limbo Limbo;
269 struct Limbo
270 {
271         Limbo   *next;
272
273         uchar   laddr[IPaddrlen];
274         uchar   raddr[IPaddrlen];
275         ushort  lport;
276         ushort  rport;
277         ulong   irs;            /* initial received sequence */
278         ulong   iss;            /* initial sent sequence */
279         ushort  mss;            /* mss from the other end */
280         ushort  rcvscale;       /* how much to scale rcvd windows */
281         ushort  sndscale;       /* how much to scale sent windows */
282         ulong   lastsend;       /* last time we sent a synack */
283         uchar   version;        /* v4 or v6 */
284         uchar   rexmits;        /* number of retransmissions */
285 };
286
287 int     tcp_irtt = DEF_RTT;     /* Initial guess at round trip time */
288 ushort  tcp_mss = DEF_MSS;      /* Maximum segment size to be sent */
289
/*
 *  Indices into Tcppriv.stats[].  Nstats is the number of counters.
 */
enum {
	/* MIB stats */
	MaxConn,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats
};

/*
 *  Printable name of each counter, indexed by the enumeration above.
 *  Standard C99 designators ("[idx] = value") are used so the table
 *  does not depend on the "[idx] value" compiler extension.
 */
static char *statnames[] =
{
	[MaxConn]		= "MaxConn",
	[ActiveOpens]		= "ActiveOpens",
	[PassiveOpens]		= "PassiveOpens",
	[EstabResets]		= "EstabResets",
	[CurrEstab]		= "CurrEstab",
	[InSegs]		= "InSegs",
	[OutSegs]		= "OutSegs",
	[RetransSegs]		= "RetransSegs",
	[RetransTimeouts]	= "RetransTimeouts",
	[InErrs]		= "InErrs",
	[OutRsts]		= "OutRsts",
	[CsumErrs]		= "CsumErrs",
	[HlenErrs]		= "HlenErrs",
	[LenErrs]		= "LenErrs",
	[OutOfOrder]		= "OutOfOrder",
};
331
332 typedef struct Tcppriv Tcppriv;
333 struct Tcppriv
334 {
335         /* List of active timers */
336         QLock   tl;
337         Tcptimer *timers;
338
339         /* hash table for matching conversations */
340         Ipht    ht;
341
342         /* calls in limbo waiting for an ACK to our SYN ACK */
343         int     nlimbo;
344         Limbo   *lht[NLHT];
345
346         /* for keeping track of tcpackproc */
347         QLock   apl;
348         int     ackprocstarted;
349
350         ulong   stats[Nstats];
351 };
352
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.
 *  If that number gets acked by the other end, we shut down the
 *  connection.  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
363
364 int     addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
365 void    getreseq(Tcpctl*, Tcp*, Block**, ushort*);
366 void    localclose(Conv*, char*);
367 void    procsyn(Conv*, Tcp*);
368 void    tcpiput(Proto*, Ipifc*, Block*);
369 void    tcpoutput(Conv*);
370 int     tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
371 void    tcpstart(Conv*, int);
372 void    tcptimeout(void*);
373 void    tcpsndsyn(Conv*, Tcpctl*);
374 void    tcprcvwin(Conv*);
375 void    tcpacktimer(void*);
376 void    tcpkeepalive(void*);
377 void    tcpsetkacounter(Tcpctl*);
378 void    tcprxmit(Conv*);
379 void    tcpsettimer(Tcpctl*);
380 void    tcpsynackrtt(Conv*);
381 void    tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
382
383 static void limborexmit(Proto*);
384 static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
385
386 void
387 tcpsetstate(Conv *s, uchar newstate)
388 {
389         Tcpctl *tcb;
390         uchar oldstate;
391         Tcppriv *tpriv;
392
393         tpriv = s->p->priv;
394
395         tcb = (Tcpctl*)s->ptcl;
396
397         oldstate = tcb->state;
398         if(oldstate == newstate)
399                 return;
400
401         if(oldstate == Established)
402                 tpriv->stats[CurrEstab]--;
403         if(newstate == Established)
404                 tpriv->stats[CurrEstab]++;
405
406         /**
407         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
408                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
409         **/
410
411         switch(newstate) {
412         case Closed:
413                 qclose(s->rq);
414                 qclose(s->wq);
415                 qclose(s->eq);
416                 break;
417
418         case Close_wait:                /* Remote closes */
419                 qhangup(s->rq, nil);
420                 break;
421         }
422
423         tcb->state = newstate;
424
425         if(oldstate == Syn_sent && newstate != Closed)
426                 Fsconnected(s, nil);
427 }
428
429 static char*
430 tcpconnect(Conv *c, char **argv, int argc)
431 {
432         char *e;
433
434         e = Fsstdconnect(c, argv, argc);
435         if(e != nil)
436                 return e;
437         tcpstart(c, TCP_CONNECT);
438
439         return nil;
440 }
441
442 static int
443 tcpstate(Conv *c, char *state, int n)
444 {
445         Tcpctl *s;
446
447         s = (Tcpctl*)(c->ptcl);
448
449         return snprint(state, n,
450                 "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
451                 tcpstates[s->state],
452                 c->rq ? qlen(c->rq) : 0,
453                 c->wq ? qlen(c->wq) : 0,
454                 s->srtt, s->mdev,
455                 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
456                 s->timer.start, s->timer.count, s->rerecv,
457                 s->katimer.start, s->katimer.count);
458 }
459
460 static int
461 tcpinuse(Conv *c)
462 {
463         Tcpctl *s;
464
465         s = (Tcpctl*)(c->ptcl);
466         return s->state != Closed;
467 }
468
469 static char*
470 tcpannounce(Conv *c, char **argv, int argc)
471 {
472         char *e;
473
474         e = Fsstdannounce(c, argv, argc);
475         if(e != nil)
476                 return e;
477         tcpstart(c, TCP_LISTEN);
478         Fsconnected(c, nil);
479
480         return nil;
481 }
482
483 /*
484  *  tcpclose is always called with the q locked
485  */
486 static void
487 tcpclose(Conv *c)
488 {
489         Tcpctl *tcb;
490
491         tcb = (Tcpctl*)c->ptcl;
492
493         qhangup(c->rq, nil);
494         qhangup(c->wq, nil);
495         qhangup(c->eq, nil);
496         qflush(c->rq);
497
498         switch(tcb->state) {
499         case Listen:
500                 /*
501                  *  reset any incoming calls to this listener
502                  */
503                 Fsconnected(c, "Hangup");
504
505                 localclose(c, nil);
506                 break;
507         case Closed:
508         case Syn_sent:
509                 localclose(c, nil);
510                 break;
511         case Syn_received:
512         case Established:
513                 tcb->flgcnt++;
514                 tcb->snd.nxt++;
515                 tcpsetstate(c, Finwait1);
516                 tcpoutput(c);
517                 break;
518         case Close_wait:
519                 tcb->flgcnt++;
520                 tcb->snd.nxt++;
521                 tcpsetstate(c, Last_ack);
522                 tcpoutput(c);
523                 break;
524         }
525 }
526
527 void
528 tcpkick(void *x)
529 {
530         Conv *s = x;
531         Tcpctl *tcb;
532
533         tcb = (Tcpctl*)s->ptcl;
534
535         if(waserror()){
536                 qunlock(s);
537                 nexterror();
538         }
539         qlock(s);
540
541         switch(tcb->state) {
542         case Syn_sent:
543         case Syn_received:
544         case Established:
545         case Close_wait:
546                 /*
547                  * Push data
548                  */
549                 tcprcvwin(s);
550                 tcpoutput(s);
551                 break;
552         default:
553                 localclose(s, "Hangup");
554                 break;
555         }
556
557         qunlock(s);
558         poperror();
559 }
560
561 void
562 tcprcvwin(Conv *s)                              /* Call with tcb locked */
563 {
564         int w;
565         Tcpctl *tcb;
566
567         tcb = (Tcpctl*)s->ptcl;
568         w = tcb->window - qlen(s->rq);
569         if(w < 0)
570                 w = 0;
571         tcb->rcv.wnd = w;
572         if(w == 0)
573                 tcb->rcv.blocked = 1;
574 }
575
576 void
577 tcpacktimer(void *v)
578 {
579         Tcpctl *tcb;
580         Conv *s;
581
582         s = v;
583         tcb = (Tcpctl*)s->ptcl;
584
585         if(waserror()){
586                 qunlock(s);
587                 nexterror();
588         }
589         qlock(s);
590         if(tcb->state != Closed){
591                 tcb->flags |= FORCE;
592                 tcprcvwin(s);
593                 tcpoutput(s);
594         }
595         qunlock(s);
596         poperror();
597 }
598
599 static void
600 tcpcreate(Conv *c)
601 {
602         c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
603         c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c);
604 }
605
606 static void
607 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
608 {
609         if(newstate != TcptimerON){
610                 if(t->state == TcptimerON){
611                         // unchain
612                         if(priv->timers == t){
613                                 priv->timers = t->next;
614                                 if(t->prev != nil)
615                                         panic("timerstate1");
616                         }
617                         if(t->next)
618                                 t->next->prev = t->prev;
619                         if(t->prev)
620                                 t->prev->next = t->next;
621                         t->next = t->prev = nil;
622                 }
623         } else {
624                 if(t->state != TcptimerON){
625                         // chain
626                         if(t->prev != nil || t->next != nil)
627                                 panic("timerstate2");
628                         t->prev = nil;
629                         t->next = priv->timers;
630                         if(t->next)
631                                 t->next->prev = t;
632                         priv->timers = t;
633                 }
634         }
635         t->state = newstate;
636 }
637
638 void
639 tcpackproc(void *a)
640 {
641         Tcptimer *t, *tp, *timeo;
642         Proto *tcp;
643         Tcppriv *priv;
644         int loop;
645
646         tcp = a;
647         priv = tcp->priv;
648
649         for(;;) {
650                 udelay_sched(MSPTICK * 1000);
651
652                 qlock(&priv->tl);
653                 timeo = nil;
654                 loop = 0;
655                 for(t = priv->timers; t != nil; t = tp) {
656                         if(loop++ > 10000)
657                                 panic("tcpackproc1");
658                         tp = t->next;
659                         if(t->state == TcptimerON) {
660                                 t->count--;
661                                 if(t->count == 0) {
662                                         timerstate(priv, t, TcptimerDONE);
663                                         t->readynext = timeo;
664                                         timeo = t;
665                                 }
666                         }
667                 }
668                 qunlock(&priv->tl);
669
670                 loop = 0;
671                 for(t = timeo; t != nil; t = t->readynext) {
672                         if(loop++ > 10000)
673                                 panic("tcpackproc2");
674                         if(t->state == TcptimerDONE && t->func != nil && !waserror()){
675                                 (*t->func)(t->arg);
676                                 poperror();
677                         }
678                 }
679
680                 limborexmit(tcp);
681         }
682 }
683
684 void
685 tcpgo(Tcppriv *priv, Tcptimer *t)
686 {
687         if(t == nil || t->start == 0)
688                 return;
689
690         qlock(&priv->tl);
691         t->count = t->start;
692         timerstate(priv, t, TcptimerON);
693         qunlock(&priv->tl);
694 }
695
696 void
697 tcphalt(Tcppriv *priv, Tcptimer *t)
698 {
699         if(t == nil)
700                 return;
701
702         qlock(&priv->tl);
703         timerstate(priv, t, TcptimerOFF);
704         qunlock(&priv->tl);
705 }
706
/*
 *  Exponential backoff multiplier: 2 to the power n.
 *
 *  Shifting an int by a negative count, or far enough to reach the
 *  sign bit, is undefined behaviour, so the count is clamped to
 *  0..Maxshift.  Every n in that range gives the same result as a
 *  plain 1<<n.
 */
int
backoff(int n)
{
	enum { Maxshift = 30 };	/* 1<<30 is the largest power of 2 in a 32-bit int */

	if(n < 0)
		n = 0;
	else if(n > Maxshift)
		n = Maxshift;

	return 1 << n;
}
712
713 void
714 localclose(Conv *s, char *reason)       /* called with tcb locked */
715 {
716         Tcpctl *tcb;
717         Reseq *rp,*rp1;
718         Tcppriv *tpriv;
719
720         tpriv = s->p->priv;
721         tcb = (Tcpctl*)s->ptcl;
722
723         iphtrem(&tpriv->ht, s);
724
725         tcphalt(tpriv, &tcb->timer);
726         tcphalt(tpriv, &tcb->rtt_timer);
727         tcphalt(tpriv, &tcb->acktimer);
728         tcphalt(tpriv, &tcb->katimer);
729
730         /* Flush reassembly queue; nothing more can arrive */
731         for(rp = tcb->reseq; rp != nil; rp = rp1) {
732                 rp1 = rp->next;
733                 freeblist(rp->bp);
734                 free(rp);
735         }
736         tcb->reseq = nil;
737
738         if(tcb->state == Syn_sent)
739                 Fsconnected(s, reason);
740         if(s->state == Announced)
741                 rendez_wakeup(&s->listenr);
742
743         qhangup(s->rq, reason);
744         qhangup(s->wq, reason);
745
746         tcpsetstate(s, Closed);
747 }
748
749 /* mtu (- TCP + IP hdr len) of 1st hop */
750 int
751 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale)
752 {
753         Ipifc *ifc;
754         int mtu;
755
756         ifc = findipifc(tcp->f, addr, 0);
757         switch(version){
758         default:
759         case V4:
760                 mtu = DEF_MSS;
761                 if(ifc != nil)
762                         mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
763                 break;
764         case V6:
765                 mtu = DEF_MSS6;
766                 if(ifc != nil)
767                         mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
768                 break;
769         }
770         if(ifc != nil){
771                 if(ifc->mbps > 100)
772                         *scale = HaveWS | 3;
773                 else if(ifc->mbps > 10)
774                         *scale = HaveWS | 1;
775                 else
776                         *scale = HaveWS | 0;
777         } else
778                 *scale = HaveWS | 0;
779
780         return mtu;
781 }
782
783 void
784 inittcpctl(Conv *s, int mode)
785 {
786         Tcpctl *tcb;
787         Tcp4hdr* h4;
788         Tcp6hdr* h6;
789         int mss;
790
791         tcb = (Tcpctl*)s->ptcl;
792
793         memset(tcb, 0, sizeof(Tcpctl));
794
795         tcb->ssthresh = 65535;
796         tcb->srtt = tcp_irtt<<LOGAGAIN;
797         tcb->mdev = 0;
798
799         /* setup timers */
800         tcb->timer.start = tcp_irtt / MSPTICK;
801         tcb->timer.func = tcptimeout;
802         tcb->timer.arg = s;
803         tcb->rtt_timer.start = MAX_TIME;
804         tcb->acktimer.start = TCP_ACK / MSPTICK;
805         tcb->acktimer.func = tcpacktimer;
806         tcb->acktimer.arg = s;
807         tcb->katimer.start = DEF_KAT / MSPTICK;
808         tcb->katimer.func = tcpkeepalive;
809         tcb->katimer.arg = s;
810
811         mss = DEF_MSS;
812
813         /* create a prototype(pseudo) header */
814         if(mode != TCP_LISTEN){
815                 if(ipcmp(s->laddr, IPnoaddr) == 0)
816                         findlocalip(s->p->f, s->laddr, s->raddr);
817
818                 switch(s->ipversion){
819                 case V4:
820                         h4 = &tcb->protohdr.tcp4hdr;
821                         memset(h4, 0, sizeof(*h4));
822                         h4->proto = IP_TCPPROTO;
823                         hnputs(h4->tcpsport, s->lport);
824                         hnputs(h4->tcpdport, s->rport);
825                         v6tov4(h4->tcpsrc, s->laddr);
826                         v6tov4(h4->tcpdst, s->raddr);
827                         break;
828                 case V6:
829                         h6 = &tcb->protohdr.tcp6hdr;
830                         memset(h6, 0, sizeof(*h6));
831                         h6->proto = IP_TCPPROTO;
832                         hnputs(h6->tcpsport, s->lport);
833                         hnputs(h6->tcpdport, s->rport);
834                         ipmove(h6->tcpsrc, s->laddr);
835                         ipmove(h6->tcpdst, s->raddr);
836                         mss = DEF_MSS6;
837                         break;
838                 default:
839                         panic("inittcpctl: version %d", s->ipversion);
840                 }
841         }
842
843         tcb->mss = tcb->cwind = mss;
844
845         /* default is no window scaling */
846         tcb->window = QMAX;
847         tcb->rcv.wnd = QMAX;
848         tcb->rcv.scale = 0;
849         tcb->snd.scale = 0;
850         qsetlimit(s->rq, QMAX);
851 }
852
853 /*
854  *  called with s qlocked
855  */
856 void
857 tcpstart(Conv *s, int mode)
858 {
859         Tcpctl *tcb;
860         Tcppriv *tpriv;
861         char kpname[KNAMELEN];
862
863         tpriv = s->p->priv;
864
865         if(tpriv->ackprocstarted == 0){
866                 qlock(&tpriv->apl);
867                 if(tpriv->ackprocstarted == 0){
868                         sprint(kpname, "#I%dtcpack", s->p->f->dev);
869                         ktask(kpname, tcpackproc, s->p);
870                         tpriv->ackprocstarted = 1;
871                 }
872                 qunlock(&tpriv->apl);
873         }
874
875         tcb = (Tcpctl*)s->ptcl;
876
877         inittcpctl(s, mode);
878
879         iphtadd(&tpriv->ht, s);
880         switch(mode) {
881         case TCP_LISTEN:
882                 tpriv->stats[PassiveOpens]++;
883                 tcb->flags |= CLONE;
884                 tcpsetstate(s, Listen);
885                 break;
886
887         case TCP_CONNECT:
888                 tpriv->stats[ActiveOpens]++;
889                 tcb->flags |= ACTIVE;
890                 tcpsndsyn(s, tcb);
891                 tcpsetstate(s, Syn_sent);
892                 tcpoutput(s);
893                 break;
894         }
895 }
896
897 static char*
898 tcpflag(ushort flag)
899 {
900         static char buf[128];
901
902         sprint(buf, "%d", flag>>10);    /* Head len */
903         if(flag & URG)
904                 strcat(buf, " URG");
905         if(flag & ACK)
906                 strcat(buf, " ACK");
907         if(flag & PSH)
908                 strcat(buf, " PSH");
909         if(flag & RST)
910                 strcat(buf, " RST");
911         if(flag & SYN)
912                 strcat(buf, " SYN");
913         if(flag & FIN)
914                 strcat(buf, " FIN");
915
916         return buf;
917 }
918
919 Block *
920 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
921 {
922         int dlen;
923         Tcp6hdr *h;
924         ushort csum;
925         ushort hdrlen, optpad = 0;
926         uchar *opt;
927
928         hdrlen = TCP6_HDRSIZE;
929         if(tcph->flags & SYN){
930                 if(tcph->mss)
931                         hdrlen += MSS_LENGTH;
932                 if(tcph->ws)
933                         hdrlen += WS_LENGTH;
934                 optpad = hdrlen & 3;
935                 if(optpad)
936                         optpad = 4 - optpad;
937                 hdrlen += optpad;
938         }
939
940         if(data) {
941                 dlen = blocklen(data);
942                 data = padblock(data, hdrlen + TCP6_PKT);
943                 if(data == nil)
944                         return nil;
945         }
946         else {
947                 dlen = 0;
948                 data = allocb(hdrlen + TCP6_PKT + 64);  /* the 64 pad is to meet mintu's */
949                 if(data == nil)
950                         return nil;
951                 data->wp += hdrlen + TCP6_PKT;
952         }
953
954         /* copy in pseudo ip header plus port numbers */
955         h = (Tcp6hdr *)(data->rp);
956         memmove(h, ph, TCP6_TCBPHDRSZ);
957
958         /* compose pseudo tcp header, do cksum calculation */
959         hnputl(h->vcf, hdrlen + dlen);
960         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
961         h->ttl = ph->proto;
962
963         /* copy in variable bits */
964         hnputl(h->tcpseq, tcph->seq);
965         hnputl(h->tcpack, tcph->ack);
966         hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
967         hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
968         hnputs(h->tcpurg, tcph->urg);
969
970         if(tcph->flags & SYN){
971                 opt = h->tcpopt;
972                 if(tcph->mss != 0){
973                         *opt++ = MSSOPT;
974                         *opt++ = MSS_LENGTH;
975                         hnputs(opt, tcph->mss);
976                         opt += 2;
977                 }
978                 if(tcph->ws != 0){
979                         *opt++ = WSOPT;
980                         *opt++ = WS_LENGTH;
981                         *opt++ = tcph->ws;
982                 }
983                 while(optpad-- > 0)
984                         *opt++ = NOOPOPT;
985         }
986
987         if(tcb != nil && tcb->nochecksum){
988                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
989         } else {
990                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
991                 hnputs(h->tcpcksum, csum);
992         }
993
994         /* move from pseudo header back to normal ip header */
995         memset(h->vcf, 0, 4);
996         h->vcf[0] = IP_VER6;
997         hnputs(h->ploadlen, hdrlen+dlen);
998         h->proto = ph->proto;
999
1000         return data;
1001 }
1002
1003 Block *
1004 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1005 {
1006         int dlen;
1007         Tcp4hdr *h;
1008         ushort csum;
1009         ushort hdrlen, optpad = 0;
1010         uchar *opt;
1011
1012         hdrlen = TCP4_HDRSIZE;
1013         if(tcph->flags & SYN){
1014                 if(tcph->mss)
1015                         hdrlen += MSS_LENGTH;
1016                 if(tcph->ws)
1017                         hdrlen += WS_LENGTH;
1018                 optpad = hdrlen & 3;
1019                 if(optpad)
1020                         optpad = 4 - optpad;
1021                 hdrlen += optpad;
1022         }
1023
1024         if(data) {
1025                 dlen = blocklen(data);
1026                 data = padblock(data, hdrlen + TCP4_PKT);
1027                 if(data == nil)
1028                         return nil;
1029         }
1030         else {
1031                 dlen = 0;
1032                 data = allocb(hdrlen + TCP4_PKT + 64);  /* the 64 pad is to meet mintu's */
1033                 if(data == nil)
1034                         return nil;
1035                 data->wp += hdrlen + TCP4_PKT;
1036         }
1037
1038         /* copy in pseudo ip header plus port numbers */
1039         h = (Tcp4hdr *)(data->rp);
1040         memmove(h, ph, TCP4_TCBPHDRSZ);
1041
1042         /* copy in variable bits */
1043         hnputs(h->tcplen, hdrlen + dlen);
1044         hnputl(h->tcpseq, tcph->seq);
1045         hnputl(h->tcpack, tcph->ack);
1046         hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1047         hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1048         hnputs(h->tcpurg, tcph->urg);
1049
1050         if(tcph->flags & SYN){
1051                 opt = h->tcpopt;
1052                 if(tcph->mss != 0){
1053                         *opt++ = MSSOPT;
1054                         *opt++ = MSS_LENGTH;
1055                         hnputs(opt, tcph->mss);
1056                         opt += 2;
1057                 }
1058                 if(tcph->ws != 0){
1059                         *opt++ = WSOPT;
1060                         *opt++ = WS_LENGTH;
1061                         *opt++ = tcph->ws;
1062                 }
1063                 while(optpad-- > 0)
1064                         *opt++ = NOOPOPT;
1065         }
1066
1067         if(tcb != nil && tcb->nochecksum){
1068                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1069         } else {
1070                 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1071                 hnputs(h->tcpcksum, csum);
1072         }
1073
1074         return data;
1075 }
1076
1077 int
1078 ntohtcp6(Tcp *tcph, Block **bpp)
1079 {
1080         Tcp6hdr *h;
1081         uchar *optr;
1082         ushort hdrlen;
1083         ushort optlen;
1084         int n;
1085
1086         *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1087         if(*bpp == nil)
1088                 return -1;
1089
1090         h = (Tcp6hdr *)((*bpp)->rp);
1091         tcph->source = nhgets(h->tcpsport);
1092         tcph->dest = nhgets(h->tcpdport);
1093         tcph->seq = nhgetl(h->tcpseq);
1094         tcph->ack = nhgetl(h->tcpack);
1095         hdrlen = (h->tcpflag[0]>>2) & ~3;
1096         if(hdrlen < TCP6_HDRSIZE) {
1097                 freeblist(*bpp);
1098                 return -1;
1099         }
1100
1101         tcph->flags = h->tcpflag[1];
1102         tcph->wnd = nhgets(h->tcpwin);
1103         tcph->urg = nhgets(h->tcpurg);
1104         tcph->mss = 0;
1105         tcph->ws = 0;
1106         tcph->len = nhgets(h->ploadlen) - hdrlen;
1107
1108         *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1109         if(*bpp == nil)
1110                 return -1;
1111
1112         optr = h->tcpopt;
1113         n = hdrlen - TCP6_HDRSIZE;
1114         while(n > 0 && *optr != EOLOPT) {
1115                 if(*optr == NOOPOPT) {
1116                         n--;
1117                         optr++;
1118                         continue;
1119                 }
1120                 optlen = optr[1];
1121                 if(optlen < 2 || optlen > n)
1122                         break;
1123                 switch(*optr) {
1124                 case MSSOPT:
1125                         if(optlen == MSS_LENGTH)
1126                                 tcph->mss = nhgets(optr+2);
1127                         break;
1128                 case WSOPT:
1129                         if(optlen == WS_LENGTH && *(optr+2) <= 14)
1130                                 tcph->ws = HaveWS | *(optr+2);
1131                         break;
1132                 }
1133                 n -= optlen;
1134                 optr += optlen;
1135         }
1136         return hdrlen;
1137 }
1138
1139 int
1140 ntohtcp4(Tcp *tcph, Block **bpp)
1141 {
1142         Tcp4hdr *h;
1143         uchar *optr;
1144         ushort hdrlen;
1145         ushort optlen;
1146         int n;
1147
1148         *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1149         if(*bpp == nil)
1150                 return -1;
1151
1152         h = (Tcp4hdr *)((*bpp)->rp);
1153         tcph->source = nhgets(h->tcpsport);
1154         tcph->dest = nhgets(h->tcpdport);
1155         tcph->seq = nhgetl(h->tcpseq);
1156         tcph->ack = nhgetl(h->tcpack);
1157
1158         hdrlen = (h->tcpflag[0]>>2) & ~3;
1159         if(hdrlen < TCP4_HDRSIZE) {
1160                 freeblist(*bpp);
1161                 return -1;
1162         }
1163
1164         tcph->flags = h->tcpflag[1];
1165         tcph->wnd = nhgets(h->tcpwin);
1166         tcph->urg = nhgets(h->tcpurg);
1167         tcph->mss = 0;
1168         tcph->ws = 0;
1169         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1170
1171         *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1172         if(*bpp == nil)
1173                 return -1;
1174
1175         optr = h->tcpopt;
1176         n = hdrlen - TCP4_HDRSIZE;
1177         while(n > 0 && *optr != EOLOPT) {
1178                 if(*optr == NOOPOPT) {
1179                         n--;
1180                         optr++;
1181                         continue;
1182                 }
1183                 optlen = optr[1];
1184                 if(optlen < 2 || optlen > n)
1185                         break;
1186                 switch(*optr) {
1187                 case MSSOPT:
1188                         if(optlen == MSS_LENGTH)
1189                                 tcph->mss = nhgets(optr+2);
1190                         break;
1191                 case WSOPT:
1192                         if(optlen == WS_LENGTH && *(optr+2) <= 14)
1193                                 tcph->ws = HaveWS | *(optr+2);
1194                         break;
1195                 }
1196                 n -= optlen;
1197                 optr += optlen;
1198         }
1199         return hdrlen;
1200 }
1201
1202 /*
1203  *  For outgiing calls, generate an initial sequence
1204  *  number and put a SYN on the send queue
1205  */
1206 void
1207 tcpsndsyn(Conv *s, Tcpctl *tcb)
1208 {
1209         tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1210         tcb->rttseq = tcb->iss;
1211         tcb->snd.wl2 = tcb->iss;
1212         tcb->snd.una = tcb->iss;
1213         tcb->snd.ptr = tcb->rttseq;
1214         tcb->snd.nxt = tcb->rttseq;
1215         tcb->flgcnt++;
1216         tcb->flags |= FORCE;
1217         tcb->sndsyntime = NOW;
1218
1219         /* set desired mss and scale */
1220         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1221 }
1222
1223 void
1224 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1225 {
1226         Block *hbp;
1227         uchar rflags;
1228         Tcppriv *tpriv;
1229         Tcp4hdr ph4;
1230         Tcp6hdr ph6;
1231
1232         netlog(tcp->f, Logtcp, "sndrst: %s", reason);
1233
1234         tpriv = tcp->priv;
1235
1236         if(seg->flags & RST)
1237                 return;
1238
1239         /* make pseudo header */
1240         switch(version) {
1241         case V4:
1242                 memset(&ph4, 0, sizeof(ph4));
1243                 ph4.vihl = IP_VER4;
1244                 v6tov4(ph4.tcpsrc, dest);
1245                 v6tov4(ph4.tcpdst, source);
1246                 ph4.proto = IP_TCPPROTO;
1247                 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1248                 hnputs(ph4.tcpsport, seg->dest);
1249                 hnputs(ph4.tcpdport, seg->source);
1250                 break;
1251         case V6:
1252                 memset(&ph6, 0, sizeof(ph6));
1253                 ph6.vcf[0] = IP_VER6;
1254                 ipmove(ph6.tcpsrc, dest);
1255                 ipmove(ph6.tcpdst, source);
1256                 ph6.proto = IP_TCPPROTO;
1257                 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1258                 hnputs(ph6.tcpsport, seg->dest);
1259                 hnputs(ph6.tcpdport, seg->source);
1260                 break;
1261         default:
1262                 panic("sndrst: version %d", version);
1263         }
1264
1265         tpriv->stats[OutRsts]++;
1266         rflags = RST;
1267
1268         /* convince the other end that this reset is in band */
1269         if(seg->flags & ACK) {
1270                 seg->seq = seg->ack;
1271                 seg->ack = 0;
1272         }
1273         else {
1274                 rflags |= ACK;
1275                 seg->ack = seg->seq;
1276                 seg->seq = 0;
1277                 if(seg->flags & SYN)
1278                         seg->ack++;
1279                 seg->ack += length;
1280                 if(seg->flags & FIN)
1281                         seg->ack++;
1282         }
1283         seg->flags = rflags;
1284         seg->wnd = 0;
1285         seg->urg = 0;
1286         seg->mss = 0;
1287         seg->ws = 0;
1288         switch(version) {
1289         case V4:
1290                 hbp = htontcp4(seg, nil, &ph4, nil);
1291                 if(hbp == nil)
1292                         return;
1293                 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1294                 break;
1295         case V6:
1296                 hbp = htontcp6(seg, nil, &ph6, nil);
1297                 if(hbp == nil)
1298                         return;
1299                 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1300                 break;
1301         default:
1302                 panic("sndrst2: version %d", version);
1303         }
1304 }
1305
1306 /*
1307  *  send a reset to the remote side and close the conversation
1308  *  called with s qlocked
1309  */
1310 char*
1311 tcphangup(Conv *s)
1312 {
1313         Tcp seg;
1314         Tcpctl *tcb;
1315         Block *hbp;
1316
1317         tcb = (Tcpctl*)s->ptcl;
1318         if(waserror())
1319                 return commonerror();
1320         if(s->raddr != 0) {
1321                 if(!waserror()){
1322                         seg.flags = RST | ACK;
1323                         seg.ack = tcb->rcv.nxt;
1324                         tcb->rcv.una = 0;
1325                         seg.seq = tcb->snd.ptr;
1326                         seg.wnd = 0;
1327                         seg.urg = 0;
1328                         seg.mss = 0;
1329                         seg.ws = 0;
1330                         switch(s->ipversion) {
1331                         case V4:
1332                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1333                                 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1334                                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1335                                 break;
1336                         case V6:
1337                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1338                                 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1339                                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1340                                 break;
1341                         default:
1342                                 panic("tcphangup: version %d", s->ipversion);
1343                         }
1344                         poperror();
1345                 }
1346         }
1347         localclose(s, nil);
1348         poperror();
1349         return nil;
1350 }
1351
1352 /*
1353  *  (re)send a SYN ACK
1354  */
1355 int
1356 sndsynack(Proto *tcp, Limbo *lp)
1357 {
1358         Block *hbp;
1359         Tcp4hdr ph4;
1360         Tcp6hdr ph6;
1361         Tcp seg;
1362         int scale;
1363
1364         /* make pseudo header */
1365         switch(lp->version) {
1366         case V4:
1367                 memset(&ph4, 0, sizeof(ph4));
1368                 ph4.vihl = IP_VER4;
1369                 v6tov4(ph4.tcpsrc, lp->laddr);
1370                 v6tov4(ph4.tcpdst, lp->raddr);
1371                 ph4.proto = IP_TCPPROTO;
1372                 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1373                 hnputs(ph4.tcpsport, lp->lport);
1374                 hnputs(ph4.tcpdport, lp->rport);
1375                 break;
1376         case V6:
1377                 memset(&ph6, 0, sizeof(ph6));
1378                 ph6.vcf[0] = IP_VER6;
1379                 ipmove(ph6.tcpsrc, lp->laddr);
1380                 ipmove(ph6.tcpdst, lp->raddr);
1381                 ph6.proto = IP_TCPPROTO;
1382                 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1383                 hnputs(ph6.tcpsport, lp->lport);
1384                 hnputs(ph6.tcpdport, lp->rport);
1385                 break;
1386         default:
1387                 panic("sndrst: version %d", lp->version);
1388         }
1389
1390         seg.seq = lp->iss;
1391         seg.ack = lp->irs+1;
1392         seg.flags = SYN|ACK;
1393         seg.urg = 0;
1394         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1395         seg.wnd = QMAX;
1396
1397         /* if the other side set scale, we should too */
1398         if(lp->rcvscale){
1399                 seg.ws = scale;
1400                 lp->sndscale = scale;
1401         } else {
1402                 seg.ws = 0;
1403                 lp->sndscale = 0;
1404         }
1405
1406         switch(lp->version) {
1407         case V4:
1408                 hbp = htontcp4(&seg, nil, &ph4, nil);
1409                 if(hbp == nil)
1410                         return -1;
1411                 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1412                 break;
1413         case V6:
1414                 hbp = htontcp6(&seg, nil, &ph6, nil);
1415                 if(hbp == nil)
1416                         return -1;
1417                 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1418                 break;
1419         default:
1420                 panic("sndsnack: version %d", lp->version);
1421         }
1422         lp->lastsend = NOW;
1423         return 0;
1424 }
1425
/* limbo hash bucket: last two address bytes plus the port, masked to the table size */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1427
1428 /*
1429  *  put a call into limbo and respond with a SYN ACK
1430  *
1431  *  called with proto locked
1432  */
1433 static void
1434 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1435 {
1436         Limbo *lp, **l;
1437         Tcppriv *tpriv;
1438         int h;
1439
1440         tpriv = s->p->priv;
1441         h = hashipa(source, seg->source);
1442
1443         for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1444                 lp = *l;
1445                 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1446                         continue;
1447                 if(ipcmp(lp->raddr, source) != 0)
1448                         continue;
1449                 if(ipcmp(lp->laddr, dest) != 0)
1450                         continue;
1451
1452                 /* each new SYN restarts the retransmits */
1453                 lp->irs = seg->seq;
1454                 break;
1455         }
1456         lp = *l;
1457         if(lp == nil){
1458                 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1459                         lp = tpriv->lht[h];
1460                         tpriv->lht[h] = lp->next;
1461                         lp->next = nil;
1462                 } else {
1463                         lp = malloc(sizeof(*lp));
1464                         if(lp == nil)
1465                                 return;
1466                         tpriv->nlimbo++;
1467                 }
1468                 *l = lp;
1469                 lp->version = version;
1470                 ipmove(lp->laddr, dest);
1471                 ipmove(lp->raddr, source);
1472                 lp->lport = seg->dest;
1473                 lp->rport = seg->source;
1474                 lp->mss = seg->mss;
1475                 lp->rcvscale = seg->ws;
1476                 lp->irs = seg->seq;
1477                 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1478         }
1479
1480         if(sndsynack(s->p, lp) < 0){
1481                 *l = lp->next;
1482                 tpriv->nlimbo--;
1483                 free(lp);
1484         }
1485 }
1486
1487 /*
1488  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1489  */
1490 static void
1491 limborexmit(Proto *tcp)
1492 {
1493         Tcppriv *tpriv;
1494         Limbo **l, *lp;
1495         int h;
1496         int seen;
1497         ulong now;
1498
1499         tpriv = tcp->priv;
1500
1501         if(!canqlock(tcp))
1502                 return;
1503         seen = 0;
1504         now = NOW;
1505         for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1506                 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1507                         lp = *l;
1508                         seen++;
1509                         if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1510                                 continue;
1511
1512                         /* time it out after 1 second */
1513                         if(++(lp->rexmits) > 5){
1514                                 tpriv->nlimbo--;
1515                                 *l = lp->next;
1516                                 free(lp);
1517                                 continue;
1518                         }
1519
1520                         /* if we're being attacked, don't bother resending SYN ACK's */
1521                         if(tpriv->nlimbo > 100)
1522                                 continue;
1523
1524                         if(sndsynack(tcp, lp) < 0){
1525                                 tpriv->nlimbo--;
1526                                 *l = lp->next;
1527                                 free(lp);
1528                                 continue;
1529                         }
1530
1531                         l = &lp->next;
1532                 }
1533         }
1534         qunlock(tcp);
1535 }
1536
1537 /*
1538  *  lookup call in limbo.  if found, throw it out.
1539  *
1540  *  called with proto locked
1541  */
1542 static void
1543 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1544 {
1545         Limbo *lp, **l;
1546         int h;
1547         Tcppriv *tpriv;
1548
1549         tpriv = s->p->priv;
1550
1551         /* find a call in limbo */
1552         h = hashipa(src, segp->source);
1553         for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1554                 lp = *l;
1555                 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1556                         continue;
1557                 if(ipcmp(lp->laddr, dst) != 0)
1558                         continue;
1559                 if(ipcmp(lp->raddr, src) != 0)
1560                         continue;
1561
1562                 /* RST can only follow the SYN */
1563                 if(segp->seq == lp->irs+1){
1564                         tpriv->nlimbo--;
1565                         *l = lp->next;
1566                         free(lp);
1567                 }
1568                 break;
1569         }
1570 }
1571
1572 /*
1573  *  come here when we finally get an ACK to our SYN-ACK.
1574  *  lookup call in limbo.  if found, create a new conversation
1575  *
1576  *  called with proto locked
1577  */
1578 static Conv*
1579 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1580 {
1581         Conv *new;
1582         Tcpctl *tcb;
1583         Tcppriv *tpriv;
1584         Tcp4hdr *h4;
1585         Tcp6hdr *h6;
1586         Limbo *lp, **l;
1587         int h;
1588
1589         /* unless it's just an ack, it can't be someone coming out of limbo */
1590         if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1591                 return nil;
1592
1593         tpriv = s->p->priv;
1594
1595         /* find a call in limbo */
1596         h = hashipa(src, segp->source);
1597         for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1598                 netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d",
1599                         src, segp->source, lp->raddr, lp->rport,
1600                         dst, segp->dest, lp->laddr, lp->lport,
1601                         version, lp->version
1602                 );
1603
1604                 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1605                         continue;
1606                 if(ipcmp(lp->laddr, dst) != 0)
1607                         continue;
1608                 if(ipcmp(lp->raddr, src) != 0)
1609                         continue;
1610
1611                 /* we're assuming no data with the initial SYN */
1612                 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1613                         netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux",
1614                                 segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1615                         lp = nil;
1616                 } else {
1617                         tpriv->nlimbo--;
1618                         *l = lp->next;
1619                 }
1620                 break;
1621         }
1622         if(lp == nil)
1623                 return nil;
1624
1625         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1626         if(new == nil)
1627                 return nil;
1628
1629         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1630         tcb = (Tcpctl*)new->ptcl;
1631         tcb->flags &= ~CLONE;
1632         tcb->timer.arg = new;
1633         tcb->timer.state = TcptimerOFF;
1634         tcb->acktimer.arg = new;
1635         tcb->acktimer.state = TcptimerOFF;
1636         tcb->katimer.arg = new;
1637         tcb->katimer.state = TcptimerOFF;
1638         tcb->rtt_timer.arg = new;
1639         tcb->rtt_timer.state = TcptimerOFF;
1640
1641         tcb->irs = lp->irs;
1642         tcb->rcv.nxt = tcb->irs+1;
1643         tcb->rcv.urg = tcb->rcv.nxt;
1644
1645         tcb->iss = lp->iss;
1646         tcb->rttseq = tcb->iss;
1647         tcb->snd.wl2 = tcb->iss;
1648         tcb->snd.una = tcb->iss+1;
1649         tcb->snd.ptr = tcb->iss+1;
1650         tcb->snd.nxt = tcb->iss+1;
1651         tcb->flgcnt = 0;
1652         tcb->flags |= SYNACK;
1653
1654         /* our sending max segment size cannot be bigger than what he asked for */
1655         if(lp->mss != 0 && lp->mss < tcb->mss)
1656                 tcb->mss = lp->mss;
1657
1658         /* window scaling */
1659         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1660
1661         /* the congestion window always starts out as a single segment */
1662         tcb->snd.wnd = segp->wnd;
1663         tcb->cwind = tcb->mss;
1664
1665         /* set initial round trip time */
1666         tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1667         tcpsynackrtt(new);
1668
1669         free(lp);
1670
1671         /* set up proto header */
1672         switch(version){
1673         case V4:
1674                 h4 = &tcb->protohdr.tcp4hdr;
1675                 memset(h4, 0, sizeof(*h4));
1676                 h4->proto = IP_TCPPROTO;
1677                 hnputs(h4->tcpsport, new->lport);
1678                 hnputs(h4->tcpdport, new->rport);
1679                 v6tov4(h4->tcpsrc, dst);
1680                 v6tov4(h4->tcpdst, src);
1681                 break;
1682         case V6:
1683                 h6 = &tcb->protohdr.tcp6hdr;
1684                 memset(h6, 0, sizeof(*h6));
1685                 h6->proto = IP_TCPPROTO;
1686                 hnputs(h6->tcpsport, new->lport);
1687                 hnputs(h6->tcpdport, new->rport);
1688                 ipmove(h6->tcpsrc, dst);
1689                 ipmove(h6->tcpdst, src);
1690                 break;
1691         default:
1692                 panic("tcpincoming: version %d", new->ipversion);
1693         }
1694
1695         tcpsetstate(new, Established);
1696
1697         iphtadd(&tpriv->ht, new);
1698
1699         return new;
1700 }
1701
/*
 *  Return 1 if x lies in the inclusive range [low, high], else 0.
 *  When low > high the range is taken to wrap around the top of
 *  the number space.
 */
int
seq_within(ulong x, ulong low, ulong high)
{
	if(low <= high)
		return low <= x && x <= high;

	/* wrapped range: everything from low upward, or from 0 up to high */
	return x >= low || x <= high;
}
1715
/* sequence-space x < y: true when the difference x-y, taken as an int, is negative */
int
seq_lt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff < 0;
}
1721
/* sequence-space x <= y: true when the difference x-y, taken as an int, is not positive */
int
seq_le(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff <= 0;
}
1727
/* sequence-space x > y: true when the difference x-y, taken as an int, is positive */
int
seq_gt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff > 0;
}
1733
/* sequence-space x >= y: true when the difference x-y, taken as an int, is not negative */
int
seq_ge(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff >= 0;
}
1739
1740 /*
1741  *  use the time between the first SYN and it's ack as the
1742  *  initial round trip time
1743  */
1744 void
1745 tcpsynackrtt(Conv *s)
1746 {
1747         Tcpctl *tcb;
1748         int delta;
1749         Tcppriv *tpriv;
1750
1751         tcb = (Tcpctl*)s->ptcl;
1752         tpriv = s->p->priv;
1753
1754         delta = NOW - tcb->sndsyntime;
1755         tcb->srtt = delta<<LOGAGAIN;
1756         tcb->mdev = delta<<LOGDGAIN;
1757
1758         /* halt round trip timer */
1759         tcphalt(tpriv, &tcb->rtt_timer);
1760 }
1761
/*
 *  Process the acknowledgement carried by incoming segment seg for
 *  conversation s: detect duplicate acks (triggering a fast
 *  retransmit at TCPREXMTTHRESH), update the send window, grow the
 *  congestion window, fold a round trip sample into srtt/mdev,
 *  discard acked data from the write queue and restart or halt the
 *  retransmit timer.
 */
1762 void
1763 update(Conv *s, Tcp *seg)
1764 {
1765         int rtt, delta;
1766         Tcpctl *tcb;
1767         ulong acked;
1768         ulong expand;
1769         Tcppriv *tpriv;
1770
1771         tpriv = s->p->priv;
1772         tcb = (Tcpctl*)s->ptcl;
1773
1774         /* the ack is beyond snd.nxt, i.e. past anything sent: just force output(?) */
1775         if(seq_gt(seg->ack, tcb->snd.nxt)) {
1776                 tcb->flags |= FORCE;
1777                 return;
1778         }
1779
1780         /* added by Dong Lin for fast retransmission */
             /*
              * a duplicate ack: it repeats snd.una while data is still
              * outstanding, carries no data and does not change the window.
              */
1781         if(seg->ack == tcb->snd.una
1782         && tcb->snd.una != tcb->snd.nxt
1783         && seg->len == 0
1784         && seg->wnd == tcb->snd.wnd) {
1785
1786                 /* this is a pure ack w/o window update */
1787                 netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %d advwin %d\n",
1788                         tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1789
1790                 if(++tcb->snd.dupacks == TCPREXMTTHRESH) {
1791                         /*
1792                          *  tahoe tcp rxt the packet, half sshthresh,
1793                          *  and set cwnd to one packet
1794                          */
                             /* remember snd.nxt: recovery lasts until an ack reaches it (see below) */
1795                         tcb->snd.recovery = 1;
1796                         tcb->snd.rxt = tcb->snd.nxt;
1797                         netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt);
1798                         tcprxmit(s);
1799                 } else {
1800                         /* do reno tcp here. */
1801                 }
1802         }
1803
1804         /*
1805          *  update window: accept it from a newer ack, or from an
              *  ack equal to the last one used if it opens the window
1806          */
1807         if(seq_gt(seg->ack, tcb->snd.wl2)
1808         || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1809                 tcb->snd.wnd = seg->wnd;
1810                 tcb->snd.wl2 = seg->ack;
1811         }
1812
             /* nothing new is acknowledged: no further processing */
1813         if(!seq_gt(seg->ack, tcb->snd.una)){
1814                 /*
1815                  *  don't let us hangup if sending into a closed window and
1816                  *  we're still getting acks
1817                  */
1818                 if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){
1819                         tcb->backedoff = MAXBACKMS/4;
1820                 }
1821                 return;
1822         }
1823
1824         /*
1825          *  any positive ack turns off fast rxt,
1826          *  (should we do new-reno on partial acks?)
1827          */
1828         if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1829                 tcb->snd.dupacks = 0;
1830                 tcb->snd.recovery = 0;
1831         } else
1832                 netlog(s->p->f, Logtcp, "rxt next %lud, cwin %ud\n", seg->ack, tcb->cwind);
1833
1834         /* number of sequence units newly acknowledged */
1835         acked = seg->ack - tcb->snd.una;
1836
1837         /* avoid slow start and timers for SYN acks */
             /*
              * SYNACK not yet set: this is the first ack of our SYN.
              * presumably acked-- is because the SYN takes a sequence
              * number but no bytes in the write queue -- TODO confirm.
              */
1838         if((tcb->flags & SYNACK) == 0) {
1839                 tcb->flags |= SYNACK;
1840                 acked--;
1841                 tcb->flgcnt--;
1842                 goto done;
1843         }
1844
1845         /* slow start as long as we're not recovering from lost packets */
1846         if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
                     /* below ssthresh grow by what was acked (at most one mss); above it by mss*mss/cwind */
1847                 if(tcb->cwind < tcb->ssthresh) {
1848                         expand = tcb->mss;
1849                         if(acked < expand)
1850                                 expand = acked;
1851                 }
1852                 else
1853                         expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1854
                     /* clamp on arithmetic wrap-around and never exceed the send window */
1855                 if(tcb->cwind + expand < tcb->cwind)
1856                         expand = tcb->snd.wnd - tcb->cwind;
1857                 if(tcb->cwind + expand > tcb->snd.wnd)
1858                         expand = tcb->snd.wnd - tcb->cwind;
1859                 tcb->cwind += expand;
1860         }
1861
1862         /* Adjust the timers according to the round trip time */
1863         if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1864                 tcphalt(tpriv, &tcb->rtt_timer);
                     /* only take a sample when the segment was not retransmitted */
1865                 if((tcb->flags&RETRAN) == 0) {
1866                         tcb->backoff = 0;
1867                         tcb->backedoff = 0;
                             /* elapsed timer ticks, converted to ms below */
1868                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1869                         if(rtt == 0)
1870                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
1871                         rtt *= MSPTICK;
1872                         if(tcb->srtt == 0) {
1873                                 tcb->srtt = rtt << LOGAGAIN;
1874                                 tcb->mdev = rtt << LOGDGAIN;
1875                         } else {
                                     /* srtt and mdev are kept scaled by LOGAGAIN and LOGDGAIN bits */
1876                                 delta = rtt - (tcb->srtt>>LOGAGAIN);
1877                                 tcb->srtt += delta;
1878                                 if(tcb->srtt <= 0)
1879                                         tcb->srtt = 1;
1880
1881                                 delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
1882                                 tcb->mdev += delta;
1883                                 if(tcb->mdev <= 0)
1884                                         tcb->mdev = 1;
1885                         }
1886                         tcpsettimer(tcb);
1887                 }
1888         }
1889
1890 done:
             /*
              * drop the acked bytes from the write queue.  if fewer were
              * discarded than acked, presumably the excess acknowledges
              * a flag (e.g. FIN) rather than data -- TODO confirm.
              */
1891         if(qdiscard(s->wq, acked) < acked)
1892                 tcb->flgcnt--;
1893
1894         tcb->snd.una = seg->ack;
1895         if(seq_gt(seg->ack, tcb->snd.urg))
1896                 tcb->snd.urg = seg->ack;
1897
             /* keep the retransmit timer running only while data is outstanding */
1898         if(tcb->snd.una != tcb->snd.nxt)
1899                 tcpgo(tpriv, &tcb->timer);
1900         else
1901                 tcphalt(tpriv, &tcb->timer);
1902
             /* the send pointer must never lag behind what has been acked */
1903         if(seq_lt(tcb->snd.ptr, tcb->snd.una))
1904                 tcb->snd.ptr = tcb->snd.una;
1905
1906         tcb->flags &= ~RETRAN;
1907         tcb->backoff = 0;
1908         tcb->backedoff = 0;
1909 }
1910
/*
 *  tcpiput: process one received TCP segment.
 *
 *  tcp is the protocol instance and bp the received block list; the
 *  Ipifc argument is unnamed and unused.  The start of bp is read as
 *  a Tcp4hdr or a Tcp6hdr, chosen by the version nibble.
 *
 *  Steps: verify the checksum, convert the header into seg
 *  (ntohtcp4/ntohtcp6), trim bp to the payload, find the conversation
 *  (sending a reset if there is none), then run the input state
 *  machine with the conversation locked.
 *
 *  Locking: tcp is qlocked for the lookup and the listener handling;
 *  s is qlocked before tcp is released and is unlocked on every exit
 *  taken after that point.
 *
 *  Exits once s is locked: "output" calls tcpoutput(s) and unlocks;
 *  "raise" unlocks, frees bp and calls tcpkick(s); a few paths unlock
 *  and return directly.
 */
1911 void
1912 tcpiput(Proto *tcp, Ipifc*, Block *bp)
1913 {
1914         Tcp seg;
1915         Tcp4hdr *h4;
1916         Tcp6hdr *h6;
1917         int hdrlen;
1918         Tcpctl *tcb;
1919         ushort length;
1920         uchar source[IPaddrlen], dest[IPaddrlen];
1921         Conv *s;
1922         Fs *f;
1923         Tcppriv *tpriv;
1924         uchar version;
1925
1926         f = tcp->f;
1927         tpriv = tcp->priv;
1928
1929         tpriv->stats[InSegs]++;
1930
             /* both pointers alias the start of the block; only one is used below */
1931         h4 = (Tcp4hdr*)(bp->rp);
1932         h6 = (Tcp6hdr*)(bp->rp);
1933
1934         if((h4->vihl&0xF0)==IP_VER4) {
1935                 version = V4;
1936                 length = nhgets(h4->length);
                     /* addresses are kept in v6 form from here on */
1937                 v4tov6(dest, h4->tcpdst);
1938                 v4tov6(source, h4->tcpsrc);
1939
                     /* fill in the header fields that the checksum below covers */
1940                 h4->Unused = 0;
1941                 hnputs(h4->tcplen, length-TCP4_PKT);
                     /*
                      *  NOTE(review): the checksum is not verified when the block
                      *  is flagged Btcpck, nor when the checksum field is all zero.
                      */
1942                 if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1943                         ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
1944                         tpriv->stats[CsumErrs]++;
1945                         tpriv->stats[InErrs]++;
1946                         netlog(f, Logtcp, "bad tcp proto cksum\n");
1947                         freeblist(bp);
1948                         return;
1949                 }
1950
1951                 hdrlen = ntohtcp4(&seg, &bp);
1952                 if(hdrlen < 0){
1953                         tpriv->stats[HlenErrs]++;
1954                         tpriv->stats[InErrs]++;
1955                         netlog(f, Logtcp, "bad tcp hdr len\n");
                             /* NOTE(review): bp is not freed here; presumably ntohtcp4 released it -- confirm */
1956                         return;
1957                 }
1958
1959                 /* trim the packet to the size claimed by the datagram */
                     /*
                      *  length is a ushort: if the datagram claims less than the
                      *  headers this wraps to a large value.
                      *  NOTE(review): presumably trimblock then fails -- confirm.
                      */
1960                 length -= hdrlen+TCP4_PKT;
1961                 bp = trimblock(bp, hdrlen+TCP4_PKT, length);
1962                 if(bp == nil){
1963                         tpriv->stats[LenErrs]++;
1964                         tpriv->stats[InErrs]++;
1965                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
1966                         return;
1967                 }
1968         }
1969         else {
                     /* saved because the header is rewritten in place for the checksum */
1970                 int ttl = h6->ttl;
1971                 int proto = h6->proto;
1972
1973                 version = V6;
1974                 length = nhgets(h6->ploadlen);
1975                 ipmove(dest, h6->tcpdst);
1976                 ipmove(source, h6->tcpsrc);
1977
1978                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
1979                 h6->ttl = proto;
1980                 hnputl(h6->vcf, length);
                     /*
                      *  NOTE(review): unlike the v4 branch this does not look at
                      *  Btcpck; a zero checksum field is accepted unchecked here too.
                      */
1981                 if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
1982                         ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) {
1983                         tpriv->stats[CsumErrs]++;
1984                         tpriv->stats[InErrs]++;
1985                         netlog(f, Logtcp, "bad tcp proto cksum\n");
1986                         freeblist(bp);
1987                         return;
1988                 }
                     /* put back ttl, proto and ploadlen; vcf is left holding the length */
1989                 h6->ttl = ttl;
1990                 h6->proto = proto;
1991                 hnputs(h6->ploadlen, length);
1992
1993                 hdrlen = ntohtcp6(&seg, &bp);
1994                 if(hdrlen < 0){
1995                         tpriv->stats[HlenErrs]++;
1996                         tpriv->stats[InErrs]++;
1997                         netlog(f, Logtcp, "bad tcp hdr len\n");
                             /* NOTE(review): bp is not freed here; presumably ntohtcp6 released it -- confirm */
1998                         return;
1999                 }
2000
2001                 /* trim the packet to the size claimed by the datagram */
2002                 length -= hdrlen;
2003                 bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2004                 if(bp == nil){
2005                         tpriv->stats[LenErrs]++;
2006                         tpriv->stats[InErrs]++;
2007                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2008                         return;
2009                 }
2010         }
2011
2012         /* lock protocol while searching for a conversation */
2013         qlock(tcp);
2014
2015         /* Look for a matching conversation */
2016         s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2017         if(s == nil){
2018                 netlog(f, Logtcp, "iphtlook failed");
             /* also entered by goto from the listener code below, with tcp still locked */
2019 reset:
2020                 qunlock(tcp);
2021                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2022                 freeblist(bp);
2023                 return;
2024         }
2025
2026         /* if it's a listener, look for the right flags and get a new conv */
2027         tcb = (Tcpctl*)s->ptcl;
2028         if(tcb->state == Listen){
2029                 if(seg.flags & RST){
2030                         limborst(s, &seg, source, dest, version);
2031                         qunlock(tcp);
2032                         freeblist(bp);
2033                         return;
2034                 }
2035
2036                 /* if this is a new SYN, put the call into limbo */
2037                 if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2038                         limbo(s, source, dest, &seg, version);
2039                         qunlock(tcp);
2040                         freeblist(bp);
2041                         return;
2042                 }
2043
2044                 /*
2045                  *  if there's a matching call in limbo, tcpincoming will
2046                  *  return it in state Syn_received
2047                  */
2048                 s = tcpincoming(s, &seg, source, dest, version);
2049                 if(s == nil)
2050                         goto reset;
2051         }
2052
2053         /* The rest of the input state machine is run with the control block
2054          * locked and implements the state machine directly out of the RFC.
2055          * Out-of-band data is ignored - it was always a bad idea.
2056          */
2057         tcb = (Tcpctl*)s->ptcl;
             /* if an error is raised further down, unlock s and pass the error on */
2058         if(waserror()){
2059                 qunlock(s);
2060                 nexterror();
2061         }
             /* s is locked before the protocol lock is released */
2062         qlock(s);
2063         qunlock(tcp);
2064
2065         /* fix up window */
2066         seg.wnd <<= tcb->rcv.scale;
2067
2068         /* every input packet puts off the keep alive time out */
2069         tcpsetkacounter(tcb);
2070
             /* states handled before the segment is trimmed; any other state drops through */
2071         switch(tcb->state) {
2072         case Closed:
2073                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2074                 goto raise;
2075         case Syn_sent:
2076                 if(seg.flags & ACK) {
2077                         if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2078                                 sndrst(tcp, source, dest, length, &seg, version,
2079                                          "bad seq in Syn_sent");
2080                                 goto raise;
2081                         }
2082                 }
                     /* a reset closes the conversation only if it also carries an ACK */
2083                 if(seg.flags & RST) {
2084                         if(seg.flags & ACK)
2085                                 localclose(s, Econrefused);
2086                         goto raise;
2087                 }
2088
2089                 if(seg.flags & SYN) {
2090                         procsyn(s, &seg);
2091                         if(seg.flags & ACK){
2092                                 update(s, &seg);
2093                                 tcpsynackrtt(s);
2094                                 tcpsetstate(s, Established);
2095                                 tcpsetscale(s, tcb, seg.ws, tcb->scale);
2096                         }
2097                         else {
2098                                 tcb->time = NOW;
2099                                 tcpsetstate(s, Syn_received);   /* DLP - shouldn't this be a reset? */
2100                         }
2101
                             /* data or a FIN came with the SYN: go on to the common processing */
2102                         if(length != 0 || (seg.flags & FIN))
2103                                 break;
2104
2105                         freeblist(bp);
2106                         goto output;
2107                 }
2108                 else
2109                         freeblist(bp);
2110
                     /* neither RST nor SYN: the segment was dropped above */
2111                 qunlock(s);
2112                 poperror();
2113                 return;
2114         case Syn_received:
2115                 /* doesn't matter if it's the correct ack, we're just trying to set timing */
2116                 if(seg.flags & ACK)
2117                         tcpsynackrtt(s);
2118                 break;
2119         }
2120
2121         /*
2122          *  One DOS attack is to open connections to us and then forget about them,
2123          *  thereby tying up a conv at no long term cost to the attacker.
2124          *  This is an attempt to defeat these stateless DOS attacks.  See
2125          *  corresponding code in tcpsendka().
2126          */
2127         if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2128                 if(tcpporthogdefense
2129                 && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2130                         print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2131                                 source, seg.source, dest, seg.dest, seg.flags,
2132                                 tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
                             /* NOTE(review): processing of this segment carries on after the close */
2133                         localclose(s, "stateless hog");
2134                 }
2135         }
2136
2137         /* Cut the data to fit the receive window */
2138         if(tcptrim(tcb, &seg, &bp, &length) == -1) {
                     /* NOTE(review): bp is not freed on this path; presumably tcptrim released it -- confirm */
2139                 netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length);
2140                 update(s, &seg);
2141                 if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2142                         tcphalt(tpriv, &tcb->rtt_timer);
2143                         tcphalt(tpriv, &tcb->acktimer);
2144                         tcphalt(tpriv, &tcb->katimer);
2145                         tcpsetstate(s, Time_wait);
2146                         tcb->timer.start = MSL2*(1000 / MSPTICK);
2147                         tcpgo(tpriv, &tcb->timer);
2148                 }
                     /* anything but a reset gets a forced ack in reply */
2149                 if(!(seg.flags & RST)) {
2150                         tcb->flags |= FORCE;
2151                         goto output;
2152                 }
2153                 qunlock(s);
2154                 poperror();
2155                 return;
2156         }
2157
2158         /* Cannot accept so answer with a rst */
2159         if(length && tcb->state == Closed) {
2160                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2161                 goto raise;
2162         }
2163
2164         /* The segment is beyond the current receive pointer so
2165          * queue the data in the resequence queue
2166          */
2167         if(seg.seq != tcb->rcv.nxt)
2168         if(length != 0 || (seg.flags & (SYN|FIN))) {
2169                 update(s, &seg);
                     /* bp is handed to addreseq and is not touched again here */
2170                 if(addreseq(tcb, tpriv, &seg, bp, length) < 0)
2171                         print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
2172                 tcb->flags |= FORCE;
2173                 goto output;
2174         }
2175
2176         /*
2177          *  keep looping till we've processed this packet plus any
2178          *  adjacent packets in the resequence queue
2179          */
2180         for(;;) {
2181                 if(seg.flags & RST) {
2182                         if(tcb->state == Established) {
2183                                 tpriv->stats[EstabResets]++;
2184                                 if(tcb->rcv.nxt != seg.seq)
2185                                         print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
2186                         }
2187                         localclose(s, Econrefused);
2188                         goto raise;
2189                 }
2190
                     /* from here on a segment without ACK is dropped */
2191                 if((seg.flags&ACK) == 0)
2192                         goto raise;
2193
                     /* ack processing, by state */
2194                 switch(tcb->state) {
2195                 case Syn_received:
2196                         if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2197                                 sndrst(tcp, source, dest, length, &seg, version,
2198                                         "bad seq in Syn_received");
2199                                 goto raise;
2200                         }
2201                         update(s, &seg);
2202                         tcpsetstate(s, Established);
                             /* no break: falls into Established, so update() runs a second time */
2203                 case Established:
2204                 case Close_wait:
2205                         update(s, &seg);
2206                         break;
2207                 case Finwait1:
2208                         update(s, &seg);
                             /* nothing left to send or to be acked: move to Finwait2 */
2209                         if(qlen(s->wq)+tcb->flgcnt == 0){
2210                                 tcphalt(tpriv, &tcb->rtt_timer);
2211                                 tcphalt(tpriv, &tcb->acktimer);
2212                                 tcpsetkacounter(tcb);
2213                                 tcb->time = NOW;
2214                                 tcpsetstate(s, Finwait2);
2215                                 tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2216                                 tcpgo(tpriv, &tcb->katimer);
2217                         }
2218                         break;
2219                 case Finwait2:
2220                         update(s, &seg);
2221                         break;
2222                 case Closing:
2223                         update(s, &seg);
2224                         if(qlen(s->wq)+tcb->flgcnt == 0) {
2225                                 tcphalt(tpriv, &tcb->rtt_timer);
2226                                 tcphalt(tpriv, &tcb->acktimer);
2227                                 tcphalt(tpriv, &tcb->katimer);
2228                                 tcpsetstate(s, Time_wait);
2229                                 tcb->timer.start = MSL2*(1000 / MSPTICK);
2230                                 tcpgo(tpriv, &tcb->timer);
2231                         }
2232                         break;
2233                 case Last_ack:
2234                         update(s, &seg);
2235                         if(qlen(s->wq)+tcb->flgcnt == 0) {
2236                                 localclose(s, nil);
2237                                 goto raise;
2238                         }
                             /* no break: otherwise handled like Time_wait */
2239                 case Time_wait:
2240                         tcb->flags |= FORCE;
2241                         if(tcb->timer.state != TcptimerON)
2242                                 tcpgo(tpriv, &tcb->timer);
2243                 }
2244
                     /*
                      *  keep rcv.urg from falling behind.
                      *  NOTE(review): pullblock presumably drops the seg.urg
                      *  urgent bytes from the front of bp -- confirm.
                      */
2245                 if((seg.flags&URG) && seg.urg) {
2246                         if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2247                                 tcb->rcv.urg = seg.urg + seg.seq;
2248                                 pullblock(&bp, seg.urg);
2249                         }
2250                 }
2251                 else
2252                 if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2253                         tcb->rcv.urg = tcb->rcv.nxt;
2254
                     /* segment text, by state */
2255                 if(length == 0) {
2256                         if(bp != nil)
2257                                 freeblist(bp);
2258                 }
2259                 else {
2260                         switch(tcb->state){
2261                         default:
2262                                 /* Ignore segment text */
2263                                 if(bp != nil)
2264                                         freeblist(bp);
2265                                 break;
2266
2267                         case Syn_received:
2268                         case Established:
2269                         case Finwait1:
2270                                 /* If we still have some data place on
2271                                  * receive queue
2272                                  */
2273                                 if(bp) {
2274                                         bp = packblock(bp);
2275                                         if(bp == nil)
2276                                                 panic("tcp packblock");
2277                                         qpassnolim(s->rq, bp);
2278                                         bp = nil;
2279
2280                                         /*
2281                                          *  Force an ack every 2 data messages.  This is
2282                                          *  a hack for rob to make his home system run
2283                                          *  faster.
2284                                          *
2285                                          *  this also keeps the standard TCP congestion
2286                                          *  control working since it needs an ack every
2287                                          *  2 max segs worth.  This is not quite that,
2288                                          *  but under a real stream is equivalent since
2289                                          *  every packet has a max seg in it.
2290                                          */
                                             /* rcv.una is the count of data messages referred to above */
2291                                         if(++(tcb->rcv.una) >= 2)
2292                                                 tcb->flags |= FORCE;
2293                                 }
2294                                 tcb->rcv.nxt += length;
2295
2296                                 /*
2297                                  *  update our rcv window
2298                                  */
2299                                 tcprcvwin(s);
2300
2301                                 /*
2302                                  *  turn on the acktimer if there's something
2303                                  *  to ack
2304                                  */
2305                                 if(tcb->acktimer.state != TcptimerON)
2306                                         tcpgo(tpriv, &tcb->acktimer);
2307
2308                                 break;
2309                         case Finwait2:
2310                                 /* no process to read the data, send a reset */
2311                                 if(bp != nil)
2312                                         freeblist(bp);
2313                                 sndrst(tcp, source, dest, length, &seg, version,
2314                                         "send to Finwait2");
2315                                 qunlock(s);
2316                                 poperror();
2317                                 return;
2318                         }
2319                 }
2320
                     /* FIN processing, by state; a FIN always forces an ack */
2321                 if(seg.flags & FIN) {
2322                         tcb->flags |= FORCE;
2323
2324                         switch(tcb->state) {
2325                         case Syn_received:
2326                         case Established:
2327                                 tcb->rcv.nxt++;
2328                                 tcpsetstate(s, Close_wait);
2329                                 break;
2330                         case Finwait1:
2331                                 tcb->rcv.nxt++;
2332                                 if(qlen(s->wq)+tcb->flgcnt == 0) {
2333                                         tcphalt(tpriv, &tcb->rtt_timer);
2334                                         tcphalt(tpriv, &tcb->acktimer);
2335                                         tcphalt(tpriv, &tcb->katimer);
2336                                         tcpsetstate(s, Time_wait);
2337                                         tcb->timer.start = MSL2*(1000/MSPTICK);
2338                                         tcpgo(tpriv, &tcb->timer);
2339                                 }
2340                                 else
2341                                         tcpsetstate(s, Closing);
2342                                 break;
2343                         case Finwait2:
2344                                 tcb->rcv.nxt++;
2345                                 tcphalt(tpriv, &tcb->rtt_timer);
2346                                 tcphalt(tpriv, &tcb->acktimer);
2347                                 tcphalt(tpriv, &tcb->katimer);
2348                                 tcpsetstate(s, Time_wait);
2349                                 tcb->timer.start = MSL2 * (1000/MSPTICK);
2350                                 tcpgo(tpriv, &tcb->timer);
2351                                 break;
2352                         case Close_wait:
2353                         case Closing:
2354                         case Last_ack:
2355                                 break;
2356                         case Time_wait:
2357                                 tcpgo(tpriv, &tcb->timer);
2358                                 break;
2359                         }
2360                 }
2361
2362                 /*
2363                  *  get next adjacent segment from the resequence queue.
2364                  *  dump/trim any overlapping segments
2365                  */
2366                 for(;;) {
2367                         if(tcb->reseq == nil)
2368                                 goto output;
2369
                             /* the head of the queue starts beyond rcv.nxt: nothing adjacent yet */
2370                         if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2371                                 goto output;
2372
                             /* seg, bp and length now describe the queued segment */
2373                         getreseq(tcb, &seg, &bp, &length);
2374
2375                         if(tcptrim(tcb, &seg, &bp, &length) == 0)
2376                                 break;
2377                 }
2378         }
     /* normal exit: send whatever is due, then release s */
2379 output:
2380         tcpoutput(s);
2381         qunlock(s);
2382         poperror();
2383         return;
     /* drop exit: release s, free the segment and call tcpkick(s) */
2384 raise:
2385         qunlock(s);
2386         poperror();
2387         freeblist(bp);
2388         tcpkick(s);
2389 }
2390
2391 /*
2392  *  always enters and exits with the s locked.  We drop
2393  *  the lock to ipoput the packet so some care has to be
2394  *  taken by callers.
      *
      *  tcpoutput: send what the peer's window, the congestion window
      *  and the write queue of s currently allow, at most 100 segments
      *  per call.  A segment that takes no sequence space is sent only
      *  when FORCE is set.
      *
      *  In the code below the lock on s is released around the sched()
      *  call made after the 2nd, 6th, 10th ... segment, so the
      *  conversation may have changed when the loop resumes; tcb is
      *  reloaded from s at the top of every pass.
      *
      *  Returns without sending in Listen, Closed and Finwait2, and
      *  gives up when a header block cannot be built.
2395  */
2396 void
2397 tcpoutput(Conv *s)
2398 {
2399         Tcp seg;
2400         int msgs;
2401         Tcpctl *tcb;
2402         Block *hbp, *bp;
2403         int sndcnt, n;
2404         ulong ssize, dsize, usable, sent;
2405         Fs *f;
2406         Tcppriv *tpriv;
2407         uchar version;
2408
2409         f = s->p->f;
2410         tpriv = s->p->priv;
2411         version = s->ipversion;
2412
2413         for(msgs = 0; msgs < 100; msgs++) {
2414                 tcb = (Tcpctl*)s->ptcl;
2415
2416                 switch(tcb->state) {
2417                 case Listen:
2418                 case Closed:
2419                 case Finwait2:
2420                         return;
2421                 }
2422
2423                 /* force an ack when a window has opened up */
2424                 if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2425                         tcb->rcv.blocked = 0;
2426                         tcb->flags |= FORCE;
2427                 }
2428
                     /*
                      *  sndcnt: queued bytes plus flgcnt;
                      *  sent: how far snd.ptr is ahead of snd.una.
                      */
2429                 sndcnt = qlen(s->wq)+tcb->flgcnt;
2430                 sent = tcb->snd.ptr - tcb->snd.una;
2431
2432                 /* Don't send anything else until our SYN has been acked */
2433                 if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2434                         break;
2435
2436                 /* Compute usable segment based on offered window and limit
2437                  * window probes to one
2438                  */
2439                 if(tcb->snd.wnd == 0){
2440                         if(sent != 0) {
2441                                 if((tcb->flags&FORCE) == 0)
2442                                         break;
2443 //                              tcb->snd.ptr = tcb->snd.una;
2444                         }
2445                         usable = 1;
2446                 }
2447                 else {
2448                         usable = tcb->cwind;
2449                         if(tcb->snd.wnd < usable)
2450                                 usable = tcb->snd.wnd;
                             /*
                              *  snd.ptr can be further ahead of snd.una than the
                              *  smaller of the two windows (the peer shrank its
                              *  window, or cwind was cut back).  usable is unsigned,
                              *  so clamp at zero; a plain subtraction would wrap
                              *  to a huge window and send past both limits.
                              */
2451                         usable = usable >= sent ? usable - sent : 0;
2452                 }
                     /*
                      *  ssize is the sequence space this segment takes, limited
                      *  by what is unsent, the usable window and the mss;
                      *  dsize is the number of bytes to copy from the queue.
                      */
2453                 ssize = sndcnt-sent;
2454                 if(ssize && usable < 2)
2455                         netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n",
2456                                 tcb->snd.wnd, tcb->cwind);
2457                 if(usable < ssize)
2458                         ssize = usable;
2459                 if(tcb->mss < ssize)
2460                         ssize = tcb->mss;
2461                 dsize = ssize;
2462                 seg.urg = 0;
2463
                     /* nothing to send and no ack demanded: done */
2464                 if(ssize == 0)
2465                 if((tcb->flags&FORCE) == 0)
2466                         break;
2467
2468                 tcb->flags &= ~FORCE;
2469                 tcprcvwin(s);
2470
2471                 /* By default we will generate an ack */
2472                 tcphalt(tpriv, &tcb->acktimer);
2473                 tcb->rcv.una = 0;
2474                 seg.source = s->lport;
2475                 seg.dest = s->rport;
2476                 seg.flags = ACK;
2477                 seg.mss = 0;
2478                 seg.ws = 0;
2479                 switch(tcb->state){
2480                 case Syn_sent:
2481                         seg.flags = 0;
                             /* first transmission: the SYN takes one unit of ssize, so one less byte of data */
2482                         if(tcb->snd.ptr == tcb->iss){
2483                                 seg.flags |= SYN;
2484                                 dsize--;
2485                                 seg.mss = tcb->mss;
2486                                 seg.ws = tcb->scale;
2487                         }
2488                         break;
2489                 case Syn_received:
2490                         /*
2491                          *  don't send any data with a SYN/ACK packet
2492                          *  because Linux rejects the packet in its
2493                          *  attempt to solve the SYN attack problem
2494                          */
2495                         if(tcb->snd.ptr == tcb->iss){
2496                                 seg.flags |= SYN;
2497                                 dsize = 0;
2498                                 ssize = 1;
2499                                 seg.mss = tcb->mss;
2500                                 seg.ws = tcb->scale;
2501                         }
2502                         break;
2503                 }
2504                 seg.seq = tcb->snd.ptr;
2505                 seg.ack = tcb->rcv.nxt;
2506                 seg.wnd = tcb->rcv.wnd;
2507
2508                 /* Pull out data to send */
2509                 bp = nil;
2510                 if(dsize != 0) {
2511                         bp = qcopy(s->wq, dsize, sent);
                             /*
                              *  fewer bytes came back than asked for: the missing
                              *  unit is sent as a FIN (presumably the one counted
                              *  in flgcnt -- confirm)
                              */
2512                         if(BLEN(bp) != dsize) {
2513                                 seg.flags |= FIN;
2514                                 dsize--;
2515                         }
2516                 }
2517
                     /* this segment empties the queue: push */
2518                 if(sent+dsize == sndcnt)
2519                         seg.flags |= PSH;
2520
2521                 /* keep track of balance of resent data */
2522                 if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2523                         n = tcb->snd.nxt - tcb->snd.ptr;
2524                         if(ssize < n)
2525                                 n = ssize;
2526                         tcb->resent += n;
2527                         netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr %lux nxt %lux\n",
2528                                 s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt);
2529                         tpriv->stats[RetransSegs]++;
2530                 }
2531
2532                 tcb->snd.ptr += ssize;
2533
2534                 /* Pull up the send pointer so we can accept acks
2535                  * for this window
2536                  */
2537                 if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2538                         tcb->snd.nxt = tcb->snd.ptr;
2539
2540                 /* Build header, link data and compute cksum */
2541                 switch(version){
2542                 case V4:
2543                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2544                         hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2545                         if(hbp == nil) {
2546                                 freeblist(bp);
2547                                 return;
2548                         }
2549                         break;
2550                 case V6:
2551                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2552                         hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2553                         if(hbp == nil) {
2554                                 freeblist(bp);
2555                                 return;
2556                         }
2557                         break;
2558                 default:
2559                         hbp = nil;      /* to suppress a warning */
2560                         panic("tcpoutput: version %d", version);
2561                 }
2562
2563                 /* Start the transmission timers if there is new data and we
2564                  * expect acknowledges
2565                  */
2566                 if(ssize != 0){
2567                         if(tcb->timer.state != TcptimerON)
2568                                 tcpgo(tpriv, &tcb->timer);
2569
2570                         /*  If round trip timer isn't running, start it.
2571                          *  measure the longest packet only in case the
2572                          *  transmission time dominates RTT
2573                          */
2574                         if(tcb->rtt_timer.state != TcptimerON)
2575                         if(ssize == tcb->mss) {
2576                                 tcpgo(tpriv, &tcb->rtt_timer);
2577                                 tcb->rttseq = tcb->snd.ptr;
2578                         }
2579                 }
2580
2581                 tpriv->stats[OutSegs]++;
2582
2583                 /* put off the next keep alive */
2584                 tcpgo(tpriv, &tcb->katimer);
2585
                     /* the loop is not left after localclose; the next pass re-reads tcb->state */
2586                 switch(version){
2587                 case V4:
2588                         if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2589                                 /* a negative return means no route */
2590                                 localclose(s, "no route");
2591                         }
2592                         break;
2593                 case V6:
2594                         if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2595                                 /* a negative return means no route */
2596                                 localclose(s, "no route");
2597                         }
2598                         break;
2599                 default:
2600                         panic("tcpoutput2: version %d", version);
2601                 }
                     /* let others run, with s unlocked, after the 2nd, 6th, 10th ... segment */
2602                 if((msgs%4) == 1){
2603                         qunlock(s);
2604                         sched();
2605                         qlock(s);
2606                 }
2607         }
2608 }
2609
2610 /*
2611  *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
      *
      *  tcpsendka: build and send one keep alive segment for s.
      *
      *  The segment carries ACK|PSH and a sequence number below snd.una.
      *  In Finwait2 it carries a FIN and no data; in every other state
      *  it carries a single data byte.  That byte is set to zero so the
      *  segment never exposes whatever the newly allocated block held.
      *
      *  Nothing is reported to the caller: the results of ipoput4 and
      *  ipoput6 are ignored, and a failure to build the header only
      *  frees the data block.
2612  */
2613 void
2614 tcpsendka(Conv *s)
2615 {
2616         Tcp seg;
2617         Tcpctl *tcb;
2618         Block *hbp,*dbp;
2619
2620         tcb = (Tcpctl*)s->ptcl;
2621
2622         dbp = nil;
2623         seg.urg = 0;
2624         seg.source = s->lport;
2625         seg.dest = s->rport;
2626         seg.flags = ACK|PSH;
2627         seg.mss = 0;
2628         seg.ws = 0;
             /*
              *  with the port hog defense on, the sequence number is put far
              *  behind snd.una, with a random offset; tcpiput's "stateless
              *  hog" check looks for acks that fall in that region
              */
2629         if(tcpporthogdefense)
2630                 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2631         else
2632                 seg.seq = tcb->snd.una-1;
2633         seg.ack = tcb->rcv.nxt;
2634         tcb->rcv.una = 0;
2635         seg.wnd = tcb->rcv.wnd;
2636         if(tcb->state == Finwait2){
2637                 seg.flags |= FIN;
2638         } else {
                     /* one byte of payload, given a defined value before it is sent */
2639                 dbp = allocb(1);
2640                 *dbp->wp++ = 0;
2641         }
2642
2643         if(isv4(s->raddr)) {
2644                 /* Build header, link data and compute cksum */
2645                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2646                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2647                 if(hbp == nil) {
2648                         freeblist(dbp);
2649                         return;
2650                 }
2651                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2652         }
2653         else {
2654                 /* Build header, link data and compute cksum */
2655                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2656                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2657                 if(hbp == nil) {
2658                         freeblist(dbp);
2659                         return;
2660                 }
2661                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2662         }
2663 }
2664
2665 /*
2666  *  set connection to time out after 12 minutes
2667  */
2668 void
2669 tcpsetkacounter(Tcpctl *tcb)
2670 {
2671         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2672         if(tcb->kacounter < 3)
2673                 tcb->kacounter = 3;
2674 }
2675
2676 /*
2677  *  if we've timed out, close the connection
2678  *  otherwise, send a keepalive and restart the timer
2679  */
2680 void
2681 tcpkeepalive(void *v)
2682 {
2683         Tcpctl *tcb;
2684         Conv *s;
2685
2686         s = v;
2687         tcb = (Tcpctl*)s->ptcl;
2688         if(waserror()){
2689                 qunlock(s);
2690                 nexterror();
2691         }
2692         qlock(s);
2693         if(tcb->state != Closed){
2694                 if(--(tcb->kacounter) <= 0) {
2695                         localclose(s, Etimedout);
2696                 } else {
2697                         tcpsendka(s);
2698                         tcpgo(s->p->priv, &tcb->katimer);
2699                 }
2700         }
2701         qunlock(s);
2702         poperror();
2703 }
2704
2705 /*
2706  *  start keepalive timer
2707  */
2708 char*
2709 tcpstartka(Conv *s, char **f, int n)
2710 {
2711         Tcpctl *tcb;
2712         int x;
2713
2714         tcb = (Tcpctl*)s->ptcl;
2715         if(tcb->state != Established)
2716                 return "connection must be in Establised state";
2717         if(n > 1){
2718                 x = atoi(f[1]);
2719                 if(x >= MSPTICK)
2720                         tcb->katimer.start = x/MSPTICK;
2721         }
2722         tcpsetkacounter(tcb);
2723         tcpgo(s->p->priv, &tcb->katimer);
2724
2725         return nil;
2726 }
2727
2728 /*
2729  *  turn checksums on/off
2730  */
2731 char*
2732 tcpsetchecksum(Conv *s, char **f, int)
2733 {
2734         Tcpctl *tcb;
2735
2736         tcb = (Tcpctl*)s->ptcl;
2737         tcb->nochecksum = !atoi(f[1]);
2738
2739         return nil;
2740 }
2741
2742 void
2743 tcprxmit(Conv *s)
2744 {
2745         Tcpctl *tcb;
2746
2747         tcb = (Tcpctl*)s->ptcl;
2748
2749         tcb->flags |= RETRAN|FORCE;
2750         tcb->snd.ptr = tcb->snd.una;
2751
2752         /*
2753          *  We should be halving the slow start threshhold (down to one
2754          *  mss) but leaving it at mss seems to work well enough
2755          */
2756         tcb->ssthresh = tcb->mss;
2757
2758         /*
2759          *  pull window down to a single packet
2760          */
2761         tcb->cwind = tcb->mss;
2762         tcpoutput(s);
2763 }
2764
2765 void
2766 tcptimeout(void *arg)
2767 {
2768         Conv *s;
2769         Tcpctl *tcb;
2770         int maxback;
2771         Tcppriv *tpriv;
2772
2773         s = (Conv*)arg;
2774         tpriv = s->p->priv;
2775         tcb = (Tcpctl*)s->ptcl;
2776
2777         if(waserror()){
2778                 qunlock(s);
2779                 nexterror();
2780         }
2781         qlock(s);
2782         switch(tcb->state){
2783         default:
2784                 tcb->backoff++;
2785                 if(tcb->state == Syn_sent)
2786                         maxback = MAXBACKMS/2;
2787                 else
2788                         maxback = MAXBACKMS;
2789                 tcb->backedoff += tcb->timer.start * MSPTICK;
2790                 if(tcb->backedoff >= maxback) {
2791                         localclose(s, Etimedout);
2792                         break;
2793                 }
2794                 netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW);
2795                 tcpsettimer(tcb);
2796                 tcprxmit(s);
2797                 tpriv->stats[RetransTimeouts]++;
2798                 tcb->snd.dupacks = 0;
2799                 break;
2800         case Time_wait:
2801                 localclose(s, nil);
2802                 break;
2803         case Closed:
2804                 break;
2805         }
2806         qunlock(s);
2807         poperror();
2808 }
2809
2810 int
2811 inwindow(Tcpctl *tcb, int seq)
2812 {
2813         return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
2814 }
2815
2816 /*
2817  *  set up state for a received SYN (or SYN ACK) packet
2818  */
2819 void
2820 procsyn(Conv *s, Tcp *seg)
2821 {
2822         Tcpctl *tcb;
2823
2824         tcb = (Tcpctl*)s->ptcl;
2825         tcb->flags |= FORCE;
2826
2827         tcb->rcv.nxt = seg->seq + 1;
2828         tcb->rcv.urg = tcb->rcv.nxt;
2829         tcb->irs = seg->seq;
2830
2831         /* our sending max segment size cannot be bigger than what he asked for */
2832         if(seg->mss != 0 && seg->mss < tcb->mss)
2833                 tcb->mss = seg->mss;
2834
2835         /* the congestion window always starts out as a single segment */
2836         tcb->snd.wnd = seg->wnd;
2837         tcb->cwind = tcb->mss;
2838 }
2839
2840 int
2841 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
2842 {
2843         Reseq *rp, *rp1;
2844         int i, rqlen, qmax;
2845
2846         rp = malloc(sizeof(Reseq));
2847         if(rp == nil){
2848                 freeblist(bp);  /* bp always consumed by add_reseq */
2849                 return 0;
2850         }
2851
2852         rp->seg = *seg;
2853         rp->bp = bp;
2854         rp->length = length;
2855
2856         /* Place on reassembly list sorting by starting seq number */
2857         rp1 = tcb->reseq;
2858         if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) {
2859                 rp->next = rp1;
2860                 tcb->reseq = rp;
2861                 if(rp->next != nil)
2862                         tpriv->stats[OutOfOrder]++;
2863                 return 0;
2864         }
2865
2866         rqlen = 0;
2867         for(i = 0;; i++) {
2868                 rqlen += rp1->length;
2869                 if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) {
2870                         rp->next = rp1->next;
2871                         rp1->next = rp;
2872                         if(rp->next != nil)
2873                                 tpriv->stats[OutOfOrder]++;
2874                         break;
2875                 }
2876                 rp1 = rp1->next;
2877         }
2878         qmax = QMAX<<tcb->rcv.scale;
2879         if(rqlen > qmax){
2880                 print("resequence queue > window: %d > %d\n", rqlen, qmax);
2881                 i = 0;
2882                 for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){
2883                         print("%#lux %#lux %#ux\n", rp1->seg.seq,
2884                                 rp1->seg.ack, rp1->seg.flags);
2885                         if(i++ > 10){
2886                                 print("...\n");
2887                                 break;
2888                         }
2889                 }
2890
2891                 // delete entire reassembly queue; wait for retransmit.
2892                 // - should we be smarter and only delete the tail?
2893                 for(rp = tcb->reseq; rp != nil; rp = rp1){
2894                         rp1 = rp->next;
2895                         freeblist(rp->bp);
2896                         free(rp);
2897                 }
2898                 tcb->reseq = nil;
2899
2900                 return -1;
2901         }
2902         return 0;
2903 }
2904
2905 void
2906 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2907 {
2908         Reseq *rp;
2909
2910         rp = tcb->reseq;
2911         if(rp == nil)
2912                 return;
2913
2914         tcb->reseq = rp->next;
2915
2916         *seg = rp->seg;
2917         *bp = rp->bp;
2918         *length = rp->length;
2919
2920         free(rp);
2921 }
2922
2923 int
2924 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2925 {
2926         ushort len;
2927         uchar accept;
2928         int dupcnt, excess;
2929
2930         accept = 0;
2931         len = *length;
2932         if(seg->flags & SYN)
2933                 len++;
2934         if(seg->flags & FIN)
2935                 len++;
2936
2937         if(tcb->rcv.wnd == 0) {
2938                 if(len == 0 && seg->seq == tcb->rcv.nxt)
2939                         return 0;
2940         }
2941         else {
2942                 /* Some part of the segment should be in the window */
2943                 if(inwindow(tcb,seg->seq))
2944                         accept++;
2945                 else
2946                 if(len != 0) {
2947                         if(inwindow(tcb, seg->seq+len-1) ||
2948                         seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
2949                                 accept++;
2950                 }
2951         }
2952         if(!accept) {
2953                 freeblist(*bp);
2954                 return -1;
2955         }
2956         dupcnt = tcb->rcv.nxt - seg->seq;
2957         if(dupcnt > 0){
2958                 tcb->rerecv += dupcnt;
2959                 if(seg->flags & SYN){
2960                         seg->flags &= ~SYN;
2961                         seg->seq++;
2962
2963                         if(seg->urg > 1)
2964                                 seg->urg--;
2965                         else
2966                                 seg->flags &= ~URG;
2967                         dupcnt--;
2968                 }
2969                 if(dupcnt > 0){
2970                         pullblock(bp, (ushort)dupcnt);
2971                         seg->seq += dupcnt;
2972                         *length -= dupcnt;
2973
2974                         if(seg->urg > dupcnt)
2975                                 seg->urg -= dupcnt;
2976                         else {
2977                                 seg->flags &= ~URG;
2978                                 seg->urg = 0;
2979                         }
2980                 }
2981         }
2982         excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
2983         if(excess > 0) {
2984                 tcb->rerecv += excess;
2985                 *length -= excess;
2986                 *bp = trimblock(*bp, 0, *length);
2987                 if(*bp == nil)
2988                         panic("presotto is a boofhead");
2989                 seg->flags &= ~FIN;
2990         }
2991         return 0;
2992 }
2993
2994 void
2995 tcpadvise(Proto *tcp, Block *bp, char *msg)
2996 {
2997         Tcp4hdr *h4;
2998         Tcp6hdr *h6;
2999         Tcpctl *tcb;
3000         uchar source[IPaddrlen];
3001         uchar dest[IPaddrlen];
3002         ushort psource, pdest;
3003         Conv *s, **p;
3004
3005         h4 = (Tcp4hdr*)(bp->rp);
3006         h6 = (Tcp6hdr*)(bp->rp);
3007
3008         if((h4->vihl&0xF0)==IP_VER4) {
3009                 v4tov6(dest, h4->tcpdst);
3010                 v4tov6(source, h4->tcpsrc);
3011                 psource = nhgets(h4->tcpsport);
3012                 pdest = nhgets(h4->tcpdport);
3013         }
3014         else {
3015                 ipmove(dest, h6->tcpdst);
3016                 ipmove(source, h6->tcpsrc);
3017                 psource = nhgets(h6->tcpsport);
3018                 pdest = nhgets(h6->tcpdport);
3019         }
3020
3021         /* Look for a connection */
3022         qlock(tcp);
3023         for(p = tcp->conv; *p; p++) {
3024                 s = *p;
3025                 tcb = (Tcpctl*)s->ptcl;
3026                 if(s->rport == pdest)
3027                 if(s->lport == psource)
3028                 if(tcb->state != Closed)
3029                 if(ipcmp(s->raddr, dest) == 0)
3030                 if(ipcmp(s->laddr, source) == 0){
3031                         qlock(s);
3032                         qunlock(tcp);
3033                         switch(tcb->state){
3034                         case Syn_sent:
3035                                 localclose(s, msg);
3036                                 break;
3037                         }
3038                         qunlock(s);
3039                         freeblist(bp);
3040                         return;
3041                 }
3042         }
3043         qunlock(tcp);
3044         freeblist(bp);
3045 }
3046
3047 static char*
3048 tcpporthogdefensectl(char *val)
3049 {
3050         if(strcmp(val, "on") == 0)
3051                 tcpporthogdefense = 1;
3052         else if(strcmp(val, "off") == 0)
3053                 tcpporthogdefense = 0;
3054         else
3055                 return "unknown value for tcpporthogdefense";
3056         return nil;
3057 }
3058
3059 /* called with c qlocked */
3060 char*
3061 tcpctl(Conv* c, char** f, int n)
3062 {
3063         if(n == 1 && strcmp(f[0], "hangup") == 0)
3064                 return tcphangup(c);
3065         if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3066                 return tcpstartka(c, f, n);
3067         if(n >= 1 && strcmp(f[0], "checksum") == 0)
3068                 return tcpsetchecksum(c, f, n);
3069         if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3070                 return tcpporthogdefensectl(f[1]);
3071         return "unknown control request";
3072 }
3073
3074 int
3075 tcpstats(Proto *tcp, char *buf, int len)
3076 {
3077         Tcppriv *priv;
3078         char *p, *e;
3079         int i;
3080
3081         priv = tcp->priv;
3082         p = buf;
3083         e = p+len;
3084         for(i = 0; i < Nstats; i++)
3085                 p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]);
3086         return p - buf;
3087 }
3088
3089 /*
3090  *  garbage collect any stale conversations:
3091  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3092  *      - Finwait2 after 5 minutes
3093  *
3094  *  this is called whenever we run out of channels.  Both checks are
3095  *  of questionable validity so we try to use them only when we're
3096  *  up against the wall.
3097  */
3098 int
3099 tcpgc(Proto *tcp)
3100 {
3101         Conv *c, **pp, **ep;
3102         int n;
3103         Tcpctl *tcb;
3104
3105
3106         n = 0;
3107         ep = &tcp->conv[tcp->nc];
3108         for(pp = tcp->conv; pp < ep; pp++) {
3109                 c = *pp;
3110                 if(c == nil)
3111                         break;
3112                 if(!canqlock(c))
3113                         continue;
3114                 tcb = (Tcpctl*)c->ptcl;
3115                 switch(tcb->state){
3116                 case Syn_received:
3117                         if(NOW - tcb->time > 5000){
3118                                 localclose(c, "timed out");
3119                                 n++;
3120                         }
3121                         break;
3122                 case Finwait2:
3123                         if(NOW - tcb->time > 5*60*1000){
3124                                 localclose(c, "timed out");
3125                                 n++;
3126                         }
3127                         break;
3128                 }
3129                 qunlock(c);
3130         }
3131         return n;
3132 }
3133
3134 void
3135 tcpsettimer(Tcpctl *tcb)
3136 {
3137         int x;
3138
3139         /* round trip dependency */
3140         x = backoff(tcb->backoff) *
3141                 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3142
3143         /* bounded twixt 1/2 and 64 seconds */
3144         if(x < 500/MSPTICK)
3145                 x = 500/MSPTICK;
3146         else if(x > (64000/MSPTICK))
3147                 x = 64000/MSPTICK;
3148         tcb->timer.start = x;
3149 }
3150
3151 void
3152 tcpinit(Fs *fs)
3153 {
3154         Proto *tcp;
3155         Tcppriv *tpriv;
3156
3157         tcp = smalloc(sizeof(Proto));
3158         tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3159         tcp->name = "tcp";
3160         tcp->connect = tcpconnect;
3161         tcp->announce = tcpannounce;
3162         tcp->ctl = tcpctl;
3163         tcp->state = tcpstate;
3164         tcp->create = tcpcreate;
3165         tcp->close = tcpclose;
3166         tcp->rcv = tcpiput;
3167         tcp->advise = tcpadvise;
3168         tcp->stats = tcpstats;
3169         tcp->inuse = tcpinuse;
3170         tcp->gc = tcpgc;
3171         tcp->ipproto = IP_TCPPROTO;
3172         tcp->nc = scalednconv();
3173         tcp->ptclsize = sizeof(Tcpctl);
3174         tpriv->stats[MaxConn] = tcp->nc;
3175
3176         Fsproto(fs, tcp);
3177 }
3178
3179 void
3180 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3181 {
3182         if(rcvscale){
3183                 tcb->rcv.scale = rcvscale & 0xff;
3184                 tcb->snd.scale = sndscale & 0xff;
3185                 tcb->window = QMAX<<tcb->snd.scale;
3186                 qsetlimit(s->rq, tcb->window);
3187         } else {
3188                 tcb->rcv.scale = 0;
3189                 tcb->snd.scale = 0;
3190                 tcb->window = QMAX;
3191                 qsetlimit(s->rq, tcb->window);
3192         }
3193 }