8ae2be26c054b8a8bac8173b387be1ff9d52f6e1
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 #include <vfs.h>
44 #include <kfs.h>
45 #include <slab.h>
46 #include <kmalloc.h>
47 #include <kref.h>
48 #include <string.h>
49 #include <stdio.h>
50 #include <assert.h>
51 #include <error.h>
52 #include <cpio.h>
53 #include <pmap.h>
54 #include <smp.h>
55 #include <ip.h>
56
/*
 * Protocol-wide constants: queue limit, header sizes, timer parameters,
 * TCP header flag bits, option codes, per-connection flag bits and the
 * connection states.
 */
enum {
        QMAX = 64 * 1024 - 1,   /* queue limit and default window (see inittcpctl) */
        IP_TCPPROTO = 6,

        TCP4_IPLEN = 8,
        TCP4_PHDRSIZE = 12,
        TCP4_HDRSIZE = 20,
        TCP4_TCBPHDRSZ = 40,
        TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE,

        TCP6_IPLEN = 0,
        TCP6_PHDRSIZE = 40,
        TCP6_HDRSIZE = 20,
        TCP6_TCBPHDRSZ = 60,
        TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE,

        /* values of Tcptimer.state */
        TcptimerOFF = 0,
        TcptimerON = 1,
        TcptimerDONE = 2,
        MAX_TIME = (1 << 20),   /* Forever */
        TCP_ACK = 50,   /* Timed ack sequence in ms */
        MAXBACKMS = 9 * 60 * 1000,      /* longest backoff time (ms) before hangup */

        /* TCP header flag bits */
        URG = 0x20,     /* Data marked urgent */
        ACK = 0x10,     /* Acknowledge is valid */
        PSH = 0x08,     /* Whole data pipe is pushed */
        RST = 0x04,     /* Reset connection */
        SYN = 0x02,     /* Pkt. is synchronise */
        FIN = 0x01,     /* Start close down */

        /* TCP option codes and lengths */
        EOLOPT = 0,
        NOOPOPT = 1,
        MSSOPT = 2,
        MSS_LENGTH = 4, /* Maximum segment size */
        WSOPT = 3,
        WS_LENGTH = 3,  /* Bits to scale window size by */
        MSL2 = 10,
        MSPTICK = 50,   /* Milliseconds per timer tick */
        DEF_MSS = 1460, /* Default maximum segment */
        DEF_MSS6 = 1280,        /* Default maximum segment (min) for v6 */
        DEF_RTT = 500,  /* Default round trip */
        DEF_KAT = 120000,       /* Default time (ms) between keep alives */
        TCP_LISTEN = 0, /* Listen connection */
        TCP_CONNECT = 1,        /* Outgoing connection */
        SYNACK_RXTIMER = 250,   /* ms between SYNACK retransmits */

        TCPREXMTTHRESH = 3,     /* dupack threshhold for rxt */

        /* flag bits; FORCE, CLONE and ACTIVE are OR'd into Tcpctl.flags below */
        FORCE = 1,
        CLONE = 2,
        RETRAN = 4,
        ACTIVE = 8,
        SYNACK = 16,
        TSO = 32,

        /* shift counts; srtt is initialised to tcp_irtt << LOGAGAIN */
        LOGAGAIN = 3,
        LOGDGAIN = 2,

        Closed = 0,     /* Connection states */
        Listen,
        Syn_sent,
        Syn_received,
        Established,
        Finwait1,
        Finwait2,
        Close_wait,
        Closing,
        Last_ack,
        Time_wait,

        Maxlimbo = 1000,        /* maximum procs waiting for response to SYN ACK */
        NLHT = 256,     /* hash table size, must be a power of 2 */
        LHTMASK = NLHT - 1,

        HaveWS = 1 << 8,        /* OR'd into the scale value returned by tcpmtu() */
};
133
/*
 * Printable names of the connection states, indexed by state value
 * (Closed .. Time_wait).  Must correspond to the enumeration above.
 */
char *tcpstates[] = {
        "Closed", "Listen", "Syn_sent", "Syn_received",
        "Established", "Finwait1", "Finwait2", "Close_wait",
        "Closing", "Last_ack", "Time_wait"
};
140
/*
 *  A tick-driven timer.  Running timers are kept on the doubly linked
 *  list headed by tcppriv.timers; tcpackproc() counts them down once per
 *  tick and calls func(arg) for each one that reaches zero.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer {
        Tcptimer *next;         /* links on the tcppriv.timers list */
        Tcptimer *prev;
        Tcptimer *readynext;    /* chains expired timers inside tcpackproc() */
        int state;              /* TcptimerOFF, TcptimerON or TcptimerDONE */
        uint64_t start;         /* ticks loaded into count by tcpgo() */
        uint64_t count;         /* ticks remaining */
        void (*func) (void *);  /* called on expiry, may be NULL */
        void *arg;              /* argument handed to func */
};
152
/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr {
        uint8_t vihl;                           /* Version and header length */
        uint8_t tos;                            /* Type of service */
        uint8_t length[2];                      /* packet length */
        uint8_t id[2];                          /* Identification */
        uint8_t frag[2];                        /* Fragment information */
        uint8_t Unused;
        uint8_t proto;                          /* set to IP_TCPPROTO by inittcpctl() */
        uint8_t tcplen[2];
        uint8_t tcpsrc[4];                      /* source address */
        uint8_t tcpdst[4];                      /* destination address */
        uint8_t tcpsport[2];                    /* source port */
        uint8_t tcpdport[2];                    /* destination port */
        uint8_t tcpseq[4];
        uint8_t tcpack[4];
        uint8_t tcpflag[2];
        uint8_t tcpwin[2];
        uint8_t tcpcksum[2];
        uint8_t tcpurg[2];
        /* Options segment */
        uint8_t tcpopt[1];
};
180
/* IPv6 counterpart of Tcp4hdr; addresses are IPaddrlen bytes long. */
typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr {
        uint8_t vcf[4];
        uint8_t ploadlen[2];
        uint8_t proto;                          /* set to IP_TCPPROTO by inittcpctl() */
        uint8_t ttl;
        uint8_t tcpsrc[IPaddrlen];              /* source address */
        uint8_t tcpdst[IPaddrlen];              /* destination address */
        uint8_t tcpsport[2];                    /* source port */
        uint8_t tcpdport[2];                    /* destination port */
        uint8_t tcpseq[4];
        uint8_t tcpack[4];
        uint8_t tcpflag[2];
        uint8_t tcpwin[2];
        uint8_t tcpcksum[2];
        uint8_t tcpurg[2];
        /* Options segment */
        uint8_t tcpopt[1];
};
200
/*
 *  This represents the control info
 *  for a single packet.  It is derived from
 *  a packet in ntohtcp{4,6}() and stuck into
 *  a packet in htontcp{4,6}().
 */
typedef struct Tcp Tcp;
struct Tcp {
        uint16_t source;                        /* source port */
        uint16_t dest;                          /* destination port */
        uint32_t seq;                           /* sequence number */
        uint32_t ack;                           /* acknowledgement number */
        uint8_t flags;                          /* URG/ACK/PSH/RST/SYN/FIN bits */
        uint16_t ws;                            /* window scale option (if not zero) */
        uint32_t wnd;                           /* window */
        uint16_t urg;                           /* urgent pointer */
        uint16_t mss;                           /* max segment size option (if not zero) */
        uint16_t len;                           /* size of data */
};
220
/*
 *  This header is malloc'd to thread together fragments
 *  waiting to be coalesced.  The list hangs off Tcpctl.reseq and is
 *  freed entry by entry in localclose().
 */
typedef struct Reseq Reseq;
struct Reseq {
        Reseq *next;            /* next queued fragment */
        Tcp seg;                /* control info for the fragment */
        struct block *bp;       /* the fragment's data (freed with freeblist) */
        uint16_t length;
};
232
/*
 *  Per-conversation TCP control block, stored in conv->ptcl.
 *  The qlock in the Conv locks this structure.
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl {
        uint8_t state;                          /* Connection state */
        uint8_t type;                           /* Listening or active connection */
        uint8_t code;                           /* Icmp code */
        struct {
                uint32_t una;                   /* Unacked data pointer */
                uint32_t nxt;                   /* Next sequence expected */
                uint32_t ptr;                   /* Data pointer */
                uint32_t wnd;                   /* Tcp send window */
                uint32_t urg;                   /* Urgent data pointer */
                uint32_t wl2;
                int scale;                              /* how much to right shift window in xmitted packets */
                /* to implement tahoe and reno TCP */
                uint32_t dupacks;               /* number of duplicate acks rcvd */
                int recovery;                   /* loss recovery flag */
                uint32_t rxt;                   /* right window marker for recovery */
        } snd;
        struct {
                uint32_t nxt;                   /* Receive pointer to next uint8_t slot */
                uint32_t wnd;                   /* Receive window incoming */
                uint32_t urg;                   /* Urgent pointer */
                int blocked;                    /* set by tcprcvwin() when the window is 0 */
                int una;                                /* unacked data segs */
                int scale;                              /* how much to left shift window in rcved packets */
        } rcv;
        uint32_t iss;                           /* Initial sequence number */
        int sawwsopt;                           /* true if we saw a wsopt on the incoming SYN */
        uint32_t cwind;                         /* Congestion window */
        int scale;                                      /* desired snd.scale */
        uint16_t ssthresh;                      /* Slow start threshold */
        int resent;                                     /* Bytes just resent */
        int irs;                                        /* Initial received sequence */
        uint16_t mss;                           /* Maximum segment size */
        int rerecv;                                     /* Overlap of data re-received */
        uint32_t window;                        /* Receive window */
        uint8_t backoff;                        /* Exponential backoff counter */
        int backedoff;                          /* ms we've backed off for rexmits */
        uint8_t flags;                          /* State flags */
        Reseq *reseq;                           /* Resequencing queue */
        Tcptimer timer;                         /* Activity timer */
        Tcptimer acktimer;                      /* Acknowledge timer */
        Tcptimer rtt_timer;                     /* Round trip timer */
        Tcptimer katimer;                       /* keep alive timer */
        uint32_t rttseq;                        /* Round trip sequence */
        int srtt;                                       /* Shortened round trip */
        int mdev;                                       /* Mean deviation of round trip */
        int kacounter;                          /* count down for keep alive */
        uint64_t sndsyntime;            /* time syn sent */
        uint64_t time;                          /* time Finwait2 or Syn_received was sent */
        int nochecksum;                         /* non-zero means don't send checksums */
        int flgcnt;                                     /* number of flags in the sequence (FIN,SEQ) */

        union {
                Tcp4hdr tcp4hdr;
                Tcp6hdr tcp6hdr;
        } protohdr;                                     /* prototype header */
};
294
/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 */
typedef struct Limbo Limbo;
struct Limbo {
        Limbo *next;                            /* next call on the same list */

        uint8_t laddr[IPaddrlen];               /* local address */
        uint8_t raddr[IPaddrlen];               /* remote address */
        uint16_t lport;                         /* local port */
        uint16_t rport;                         /* remote port */
        uint32_t irs;                           /* initial received sequence */
        uint32_t iss;                           /* initial sent sequence */
        uint16_t mss;                           /* mss from the other end */
        uint16_t rcvscale;                      /* how much to scale rcvd windows */
        uint16_t sndscale;                      /* how much to scale sent windows */
        uint64_t lastsend;                      /* last time we sent a synack */
        uint8_t version;                        /* v4 or v6 */
        uint8_t rexmits;                        /* number of retransmissions */
};
325
/* Tunables; inittcpctl() derives srtt and the activity timer from tcp_irtt. */
int tcp_irtt = DEF_RTT;                 /* Initial guess at round trip time */
uint16_t tcp_mss = DEF_MSS;             /* Maximum segment size to be sent */
328
/*
 *  Indices into tcppriv.stats[].  Nstats is the number of counters and
 *  must stay last.
 */
enum {
        /* MIB stats */
        MaxConn,
        ActiveOpens,
        PassiveOpens,
        EstabResets,
        CurrEstab,
        InSegs,
        OutSegs,
        RetransSegs,
        RetransTimeouts,
        InErrs,
        OutRsts,

        /* non-MIB stats */
        CsumErrs,
        HlenErrs,
        LenErrs,
        OutOfOrder,

        Nstats
};

/*
 *  Printable name of each counter, indexed by the enumeration above.
 *  Standard C99 designated initializers ("[index] = value") are used;
 *  the "[index] value" form without '=' is an obsolete GNU extension.
 */
static char *statnames[] = {
        [MaxConn] = "MaxConn",
        [ActiveOpens] = "ActiveOpens",
        [PassiveOpens] = "PassiveOpens",
        [EstabResets] = "EstabResets",
        [CurrEstab] = "CurrEstab",
        [InSegs] = "InSegs",
        [OutSegs] = "OutSegs",
        [RetransSegs] = "RetransSegs",
        [RetransTimeouts] = "RetransTimeouts",
        [InErrs] = "InErrs",
        [OutRsts] = "OutRsts",
        [CsumErrs] = "CsumErrs",
        [HlenErrs] = "HlenErrs",
        [LenErrs] = "LenErrs",
        [OutOfOrder] = "OutOfOrder",
};
369
370 typedef struct Tcppriv Tcppriv;
371 struct tcppriv {
372         /* List of active timers */
373         qlock_t tl;
374         Tcptimer *timers;
375
376         /* hash table for matching conversations */
377         struct Ipht ht;
378
379         /* calls in limbo waiting for an ACK to our SYN ACK */
380         int nlimbo;
381         Limbo *lht[NLHT];
382
383         /* for keeping track of tcpackproc */
384         qlock_t apl;
385         int ackprocstarted;
386
387         uint32_t stats[Nstats];
388 };
389
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
400
/*
 *  Forward declarations.  localclose, tcpstart, tcprcvwin and tcpacktimer
 *  are among those defined further down in this file.
 */
int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, uint16_t);
void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
void localclose(struct conv *, char *unused_char_p_t);
void procsyn(struct conv *, Tcp *);
void tcpiput(struct Proto *, struct Ipifc *, struct block *);
void tcpoutput(struct conv *);
int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
void tcpstart(struct conv *, int);
void tcptimeout(void *);
void tcpsndsyn(struct conv *, Tcpctl *);
void tcprcvwin(struct conv *);
void tcpacktimer(void *);
void tcpkeepalive(void *);
void tcpsetkacounter(Tcpctl *);
void tcprxmit(struct conv *);
void tcpsettimer(Tcpctl *);
void tcpsynackrtt(struct conv *);
void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);

/* file-local helpers for calls held in limbo */
static void limborexmit(struct Proto *);
static void limbo(struct conv *, uint8_t * unused_uint8_p_t, uint8_t *, Tcp *,
                                  int);
423
424 void tcpsetstate(struct conv *s, uint8_t newstate)
425 {
426         Tcpctl *tcb;
427         uint8_t oldstate;
428         struct tcppriv *tpriv;
429
430         tpriv = s->p->priv;
431
432         tcb = (Tcpctl *) s->ptcl;
433
434         oldstate = tcb->state;
435         if (oldstate == newstate)
436                 return;
437
438         if (oldstate == Established)
439                 tpriv->stats[CurrEstab]--;
440         if (newstate == Established)
441                 tpriv->stats[CurrEstab]++;
442
443         /**
444         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
445                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
446         **/
447
448         switch (newstate) {
449                 case Closed:
450                         qclose(s->rq);
451                         qclose(s->wq);
452                         qclose(s->eq);
453                         break;
454
455                 case Close_wait:        /* Remote closes */
456                         qhangup(s->rq, NULL);
457                         break;
458         }
459
460         tcb->state = newstate;
461
462         if (oldstate == Syn_sent && newstate != Closed)
463                 Fsconnected(s, NULL);
464 }
465
/*
 * Connect request for conversation c: let Fsstdconnect() process the
 * arguments, then start the connection in TCP_CONNECT mode.
 */
static void tcpconnect(struct conv *c, char **argv, int argc)
{
        Fsstdconnect(c, argv, argc);
        tcpstart(c, TCP_CONNECT);
}
471
472 static int tcpstate(struct conv *c, char *state, int n)
473 {
474         Tcpctl *s;
475
476         s = (Tcpctl *) (c->ptcl);
477
478         return snprintf(state, n,
479                                         "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
480                                         tcpstates[s->state],
481                                         c->rq ? qlen(c->rq) : 0,
482                                         c->wq ? qlen(c->wq) : 0,
483                                         s->srtt, s->mdev,
484                                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
485                                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
486                                         s->katimer.start, s->katimer.count);
487 }
488
489 static int tcpinuse(struct conv *c)
490 {
491         Tcpctl *s;
492
493         s = (Tcpctl *) (c->ptcl);
494         return s->state != Closed;
495 }
496
497 static char *tcpannounce(struct conv *c, char **argv, int argc)
498 {
499         char *e;
500
501         e = Fsstdannounce(c, argv, argc);
502         if (e != NULL)
503                 return e;
504         tcpstart(c, TCP_LISTEN);
505         Fsconnected(c, NULL);
506
507         return NULL;
508 }
509
510 /*
511  *  tcpclose is always called with the q locked
512  */
513 static void tcpclose(struct conv *c)
514 {
515         Tcpctl *tcb;
516
517         tcb = (Tcpctl *) c->ptcl;
518
519         qhangup(c->rq, NULL);
520         qhangup(c->wq, NULL);
521         qhangup(c->eq, NULL);
522         qflush(c->rq);
523
524         switch (tcb->state) {
525                 case Listen:
526                         /*
527                          *  reset any incoming calls to this listener
528                          */
529                         Fsconnected(c, "Hangup");
530
531                         localclose(c, NULL);
532                         break;
533                 case Closed:
534                 case Syn_sent:
535                         localclose(c, NULL);
536                         break;
537                 case Syn_received:
538                 case Established:
539                         tcb->flgcnt++;
540                         tcb->snd.nxt++;
541                         tcpsetstate(c, Finwait1);
542                         tcpoutput(c);
543                         break;
544                 case Close_wait:
545                         tcb->flgcnt++;
546                         tcb->snd.nxt++;
547                         tcpsetstate(c, Last_ack);
548                         tcpoutput(c);
549                         break;
550         }
551 }
552
553 void tcpkick(void *x)
554 {
555         ERRSTACK(1);
556         struct conv *s = x;
557         Tcpctl *tcb;
558
559         tcb = (Tcpctl *) s->ptcl;
560
561         qlock(&s->qlock);
562         if (waserror()) {
563                 qunlock(&s->qlock);
564                 nexterror();
565         }
566
567         switch (tcb->state) {
568                 case Syn_sent:
569                 case Syn_received:
570                 case Established:
571                 case Close_wait:
572                         /*
573                          * Push data
574                          */
575                         tcprcvwin(s);
576                         tcpoutput(s);
577                         break;
578                 default:
579                         localclose(s, "Hangup");
580                         break;
581         }
582
583         qunlock(&s->qlock);
584         poperror();
585 }
586
587 void tcprcvwin(struct conv *s)
588 {       /* Call with tcb locked */
589         int w;
590         Tcpctl *tcb;
591
592         tcb = (Tcpctl *) s->ptcl;
593         w = tcb->window - qlen(s->rq);
594         if (w < 0)
595                 w = 0;
596         tcb->rcv.wnd = w;
597         if (w == 0)
598                 tcb->rcv.blocked = 1;
599 }
600
/*
 * Acknowledge-timer callback; also registered as the read queue's callback
 * by tcpcreate().  v is the conversation.  Unless the connection is Closed,
 * sets FORCE in the control block's flags, recomputes the receive window
 * and runs tcpoutput(), all under the conversation lock.
 */
void tcpacktimer(void *v)
{
        ERRSTACK(1);
        Tcpctl *tcb;
        struct conv *s;

        s = v;
        tcb = (Tcpctl *) s->ptcl;

        qlock(&s->qlock);
        if (waserror()) {
                /* drop the lock before passing the error on */
                qunlock(&s->qlock);
                nexterror();
        }
        if (tcb->state != Closed) {
                tcb->flags |= FORCE;
                tcprcvwin(s);
                tcpoutput(s);
        }
        qunlock(&s->qlock);
        poperror();
}
623
/*
 * Create the queues of conversation c: a read queue limited to QMAX bytes
 * with tcpacktimer as its callback, and a write queue of 8 * QMAX bytes
 * with tcpkick as its callback.
 */
static void tcpcreate(struct conv *c)
{
        c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
        c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
}
629
630 static void timerstate(struct tcppriv *priv, Tcptimer * t, int newstate)
631 {
632         if (newstate != TcptimerON) {
633                 if (t->state == TcptimerON) {
634                         // unchain
635                         if (priv->timers == t) {
636                                 priv->timers = t->next;
637                                 if (t->prev != NULL)
638                                         panic("timerstate1");
639                         }
640                         if (t->next)
641                                 t->next->prev = t->prev;
642                         if (t->prev)
643                                 t->prev->next = t->next;
644                         t->next = t->prev = NULL;
645                 }
646         } else {
647                 if (t->state != TcptimerON) {
648                         // chain
649                         if (t->prev != NULL || t->next != NULL)
650                                 panic("timerstate2");
651                         t->prev = NULL;
652                         t->next = priv->timers;
653                         if (t->next)
654                                 t->next->prev = t;
655                         priv->timers = t;
656                 }
657         }
658         t->state = newstate;
659 }
660
/*
 * Timer task; a is the struct Proto of this TCP instance.  Started once by
 * tcpstart() and never returns.
 *
 * Each iteration sleeps MSPTICK ms, then, under priv->tl, decrements every
 * running timer and moves the ones reaching zero to TcptimerDONE, chaining
 * them through readynext.  After dropping the lock it calls each expired
 * timer's function and finally runs limborexmit().
 */
void tcpackproc(void *a)
{
        ERRSTACK(1);
        Tcptimer *t, *tp, *timeo;
        struct Proto *tcp;
        struct tcppriv *priv;
        int loop;

        tcp = a;
        priv = tcp->priv;

        for (;;) {
                kthread_usleep(MSPTICK * 1000);

                qlock(&priv->tl);
                timeo = NULL;
                loop = 0;
                for (t = priv->timers; t != NULL; t = tp) {
                        /* sanity limit on the length of the timer list */
                        if (loop++ > 10000)
                                panic("tcpackproc1");
                        /* fetch next first: timerstate() below clears t->next */
                        tp = t->next;
                        if (t->state == TcptimerON) {
                                t->count--;
                                if (t->count == 0) {
                                        timerstate(priv, t, TcptimerDONE);
                                        t->readynext = timeo;
                                        timeo = t;
                                }
                        }
                }
                qunlock(&priv->tl);

                /* run the expired timers' callbacks without holding priv->tl */
                loop = 0;
                for (t = timeo; t != NULL; t = t->readynext) {
                        if (loop++ > 10000)
                                panic("tcpackproc2");
                        /* the state is rechecked here, after the lock was dropped */
                        if (t->state == TcptimerDONE && t->func != NULL) {
                                /* discard error style */
                                if (!waserror())
                                        (*t->func) (t->arg);
                                poperror();
                        }
                }

                limborexmit(tcp);
        }
}
708
709 void tcpgo(struct tcppriv *priv, Tcptimer * t)
710 {
711         if (t == NULL || t->start == 0)
712                 return;
713
714         qlock(&priv->tl);
715         t->count = t->start;
716         timerstate(priv, t, TcptimerON);
717         qunlock(&priv->tl);
718 }
719
720 void tcphalt(struct tcppriv *priv, Tcptimer * t)
721 {
722         if (t == NULL)
723                 return;
724
725         qlock(&priv->tl);
726         timerstate(priv, t, TcptimerOFF);
727         qunlock(&priv->tl);
728 }
729
/*
 * Return 2^n, the exponential backoff multiplier for backoff count n.
 *
 * Shifting by a negative count or by the width of int (or more) is
 * undefined, and shifting into the sign bit overflows, so n is clamped to
 * the range [0, width of int - 2].  Counts inside that range give the same
 * result as a plain "1 << n".
 */
int backoff(int n)
{
        const int max_shift = (int) (sizeof(int) * 8) - 2;

        if (n < 0)
                n = 0;
        else if (n > max_shift)
                n = max_shift;

        return 1 << n;
}
734
735 void localclose(struct conv *s, char *reason)
736 {       /* called with tcb locked */
737         Tcpctl *tcb;
738         Reseq *rp, *rp1;
739         struct tcppriv *tpriv;
740
741         tpriv = s->p->priv;
742         tcb = (Tcpctl *) s->ptcl;
743
744         iphtrem(&tpriv->ht, s);
745
746         tcphalt(tpriv, &tcb->timer);
747         tcphalt(tpriv, &tcb->rtt_timer);
748         tcphalt(tpriv, &tcb->acktimer);
749         tcphalt(tpriv, &tcb->katimer);
750
751         /* Flush reassembly queue; nothing more can arrive */
752         for (rp = tcb->reseq; rp != NULL; rp = rp1) {
753                 rp1 = rp->next;
754                 freeblist(rp->bp);
755                 kfree(rp);
756         }
757         tcb->reseq = NULL;
758
759         if (tcb->state == Syn_sent)
760                 Fsconnected(s, reason);
761
762         qhangup(s->rq, reason);
763         qhangup(s->wq, reason);
764
765         tcpsetstate(s, Closed);
766
767         /* listener will check the rq state */
768         if (s->state == Announced)
769                 rendez_wakeup(&s->listenr);
770 }
771
772 /* mtu (- TCP + IP hdr len) of 1st hop */
773 int tcpmtu(struct Proto *tcp, uint8_t * addr, int version, int *scale,
774            uint8_t *flags)
775 {
776         struct Ipifc *ifc;
777         int mtu;
778
779         ifc = findipifc(tcp->f, addr, 0);
780         switch (version) {
781                 default:
782                 case V4:
783                         mtu = DEF_MSS;
784                         if (ifc != NULL)
785                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
786                         break;
787                 case V6:
788                         mtu = DEF_MSS6;
789                         if (ifc != NULL)
790                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
791                         break;
792         }
793         *flags &= ~TSO;
794
795         if (ifc != NULL) {
796                 if (ifc->mbps > 100)
797                         *scale = HaveWS | 3;
798                 else if (ifc->mbps > 10)
799                         *scale = HaveWS | 1;
800                 else
801                         *scale = HaveWS | 0;
802                 if (ifc->feat & NETF_TSO)
803                         *flags |= TSO;
804         } else
805                 *scale = HaveWS | 0;
806
807         return mtu;
808 }
809
810 void inittcpctl(struct conv *s, int mode)
811 {
812         Tcpctl *tcb;
813         Tcp4hdr *h4;
814         Tcp6hdr *h6;
815         int mss;
816
817         tcb = (Tcpctl *) s->ptcl;
818
819         memset(tcb, 0, sizeof(Tcpctl));
820
821         tcb->ssthresh = 65535;
822         tcb->srtt = tcp_irtt << LOGAGAIN;
823         tcb->mdev = 0;
824
825         /* setup timers */
826         tcb->timer.start = tcp_irtt / MSPTICK;
827         tcb->timer.func = tcptimeout;
828         tcb->timer.arg = s;
829         tcb->rtt_timer.start = MAX_TIME;
830         tcb->acktimer.start = TCP_ACK / MSPTICK;
831         tcb->acktimer.func = tcpacktimer;
832         tcb->acktimer.arg = s;
833         tcb->katimer.start = DEF_KAT / MSPTICK;
834         tcb->katimer.func = tcpkeepalive;
835         tcb->katimer.arg = s;
836
837         mss = DEF_MSS;
838
839         /* create a prototype(pseudo) header */
840         if (mode != TCP_LISTEN) {
841                 if (ipcmp(s->laddr, IPnoaddr) == 0)
842                         findlocalip(s->p->f, s->laddr, s->raddr);
843
844                 switch (s->ipversion) {
845                         case V4:
846                                 h4 = &tcb->protohdr.tcp4hdr;
847                                 memset(h4, 0, sizeof(*h4));
848                                 h4->proto = IP_TCPPROTO;
849                                 hnputs(h4->tcpsport, s->lport);
850                                 hnputs(h4->tcpdport, s->rport);
851                                 v6tov4(h4->tcpsrc, s->laddr);
852                                 v6tov4(h4->tcpdst, s->raddr);
853                                 break;
854                         case V6:
855                                 h6 = &tcb->protohdr.tcp6hdr;
856                                 memset(h6, 0, sizeof(*h6));
857                                 h6->proto = IP_TCPPROTO;
858                                 hnputs(h6->tcpsport, s->lport);
859                                 hnputs(h6->tcpdport, s->rport);
860                                 ipmove(h6->tcpsrc, s->laddr);
861                                 ipmove(h6->tcpdst, s->raddr);
862                                 mss = DEF_MSS6;
863                                 break;
864                         default:
865                                 panic("inittcpctl: version %d", s->ipversion);
866                 }
867         }
868
869         tcb->mss = tcb->cwind = mss;
870
871         /* default is no window scaling */
872         tcb->window = QMAX;
873         tcb->rcv.wnd = QMAX;
874         tcb->rcv.scale = 0;
875         tcb->snd.scale = 0;
876         qsetlimit(s->rq, QMAX);
877 }
878
879 /*
880  *  called with s qlocked
881  */
882 void tcpstart(struct conv *s, int mode)
883 {
884         Tcpctl *tcb;
885         struct tcppriv *tpriv;
886         /* tcpackproc needs to free this if it ever exits */
887         char *kpname = kmalloc(KNAMELEN, KMALLOC_WAIT);
888
889         tpriv = s->p->priv;
890
891         if (tpriv->ackprocstarted == 0) {
892                 qlock(&tpriv->apl);
893                 if (tpriv->ackprocstarted == 0) {
894                         snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
895                         ktask(kpname, tcpackproc, s->p);
896                         tpriv->ackprocstarted = 1;
897                 }
898                 qunlock(&tpriv->apl);
899         }
900
901         tcb = (Tcpctl *) s->ptcl;
902
903         inittcpctl(s, mode);
904
905         iphtadd(&tpriv->ht, s);
906         switch (mode) {
907                 case TCP_LISTEN:
908                         tpriv->stats[PassiveOpens]++;
909                         tcb->flags |= CLONE;
910                         tcpsetstate(s, Listen);
911                         break;
912
913                 case TCP_CONNECT:
914                         tpriv->stats[ActiveOpens]++;
915                         tcb->flags |= ACTIVE;
916                         tcpsndsyn(s, tcb);
917                         tcpsetstate(s, Syn_sent);
918                         tcpoutput(s);
919                         break;
920         }
921 }
922
923 static char *tcpflag(uint16_t flag)
924 {
925         static char buf[128];
926
927         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
928         if (flag & URG)
929                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
930         if (flag & ACK)
931                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
932         if (flag & PSH)
933                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
934         if (flag & RST)
935                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
936         if (flag & SYN)
937                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
938         if (flag & FIN)
939                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
940
941         return buf;
942 }
943
944 struct block *htontcp6(Tcp * tcph, struct block *data, Tcp6hdr * ph,
945                                            Tcpctl * tcb)
946 {
947         int dlen;
948         Tcp6hdr *h;
949         uint16_t csum;
950         uint16_t hdrlen, optpad = 0;
951         uint8_t *opt;
952
953         hdrlen = TCP6_HDRSIZE;
954         if (tcph->flags & SYN) {
955                 if (tcph->mss)
956                         hdrlen += MSS_LENGTH;
957                 if (tcph->ws)
958                         hdrlen += WS_LENGTH;
959                 optpad = hdrlen & 3;
960                 if (optpad)
961                         optpad = 4 - optpad;
962                 hdrlen += optpad;
963         }
964
965         if (data) {
966                 dlen = blocklen(data);
967                 data = padblock(data, hdrlen + TCP6_PKT);
968                 if (data == NULL)
969                         return NULL;
970         } else {
971                 dlen = 0;
972                 data = allocb(hdrlen + TCP6_PKT + 64);  /* the 64 pad is to meet mintu's */
973                 if (data == NULL)
974                         return NULL;
975                 data->wp += hdrlen + TCP6_PKT;
976         }
977
978         /* copy in pseudo ip header plus port numbers */
979         h = (Tcp6hdr *) (data->rp);
980         memmove(h, ph, TCP6_TCBPHDRSZ);
981
982         /* compose pseudo tcp header, do cksum calculation */
983         hnputl(h->vcf, hdrlen + dlen);
984         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
985         h->ttl = ph->proto;
986
987         /* copy in variable bits */
988         hnputl(h->tcpseq, tcph->seq);
989         hnputl(h->tcpack, tcph->ack);
990         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
991         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
992         hnputs(h->tcpurg, tcph->urg);
993
994         if (tcph->flags & SYN) {
995                 opt = h->tcpopt;
996                 if (tcph->mss != 0) {
997                         *opt++ = MSSOPT;
998                         *opt++ = MSS_LENGTH;
999                         hnputs(opt, tcph->mss);
1000                         opt += 2;
1001                 }
1002                 if (tcph->ws != 0) {
1003                         *opt++ = WSOPT;
1004                         *opt++ = WS_LENGTH;
1005                         *opt++ = tcph->ws;
1006                 }
1007                 while (optpad-- > 0)
1008                         *opt++ = NOOPOPT;
1009         }
1010
1011         if (tcb != NULL && tcb->nochecksum) {
1012                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1013         } else {
1014                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
1015                 hnputs(h->tcpcksum, csum);
1016         }
1017
1018         /* move from pseudo header back to normal ip header */
1019         memset(h->vcf, 0, 4);
1020         h->vcf[0] = IP_VER6;
1021         hnputs(h->ploadlen, hdrlen + dlen);
1022         h->proto = ph->proto;
1023
1024         return data;
1025 }
1026
1027 struct block *htontcp4(Tcp * tcph, struct block *data, Tcp4hdr * ph,
1028                                            Tcpctl * tcb)
1029 {
1030         int dlen;
1031         Tcp4hdr *h;
1032         uint16_t csum;
1033         uint16_t hdrlen, optpad = 0;
1034         uint8_t *opt;
1035
1036         hdrlen = TCP4_HDRSIZE;
1037         if (tcph->flags & SYN) {
1038                 if (tcph->mss)
1039                         hdrlen += MSS_LENGTH;
1040                 if (tcph->ws)
1041                         hdrlen += WS_LENGTH;
1042                 optpad = hdrlen & 3;
1043                 if (optpad)
1044                         optpad = 4 - optpad;
1045                 hdrlen += optpad;
1046         }
1047
1048         if (data) {
1049                 dlen = blocklen(data);
1050                 data = padblock(data, hdrlen + TCP4_PKT);
1051                 if (data == NULL)
1052                         return NULL;
1053         } else {
1054                 dlen = 0;
1055                 data = allocb(hdrlen + TCP4_PKT + 64);  /* the 64 pad is to meet mintu's */
1056                 if (data == NULL)
1057                         return NULL;
1058                 data->wp += hdrlen + TCP4_PKT;
1059         }
1060
1061         /* copy in pseudo ip header plus port numbers */
1062         h = (Tcp4hdr *) (data->rp);
1063         memmove(h, ph, TCP4_TCBPHDRSZ);
1064
1065         /* copy in variable bits */
1066         hnputs(h->tcplen, hdrlen + dlen);
1067         hnputl(h->tcpseq, tcph->seq);
1068         hnputl(h->tcpack, tcph->ack);
1069         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1070         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1071         hnputs(h->tcpurg, tcph->urg);
1072
1073         if (tcph->flags & SYN) {
1074                 opt = h->tcpopt;
1075                 if (tcph->mss != 0) {
1076                         *opt++ = MSSOPT;
1077                         *opt++ = MSS_LENGTH;
1078                         hnputs(opt, tcph->mss);
1079                         opt += 2;
1080                 }
1081                 if (tcph->ws != 0) {
1082                         *opt++ = WSOPT;
1083                         *opt++ = WS_LENGTH;
1084                         *opt++ = tcph->ws;
1085                 }
1086                 while (optpad-- > 0)
1087                         *opt++ = NOOPOPT;
1088         }
1089
1090         if (tcb != NULL && tcb->nochecksum) {
1091                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1092         } else {
1093                 csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
1094                 hnputs(h->tcpcksum, csum);
1095                 data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
1096                 data->checksum_offset = ph->tcpcksum - ph->tcpsport;
1097                 data->flag |= Btcpck;
1098         }
1099
1100         return data;
1101 }
1102
1103 int ntohtcp6(Tcp * tcph, struct block **bpp)
1104 {
1105         Tcp6hdr *h;
1106         uint8_t *optr;
1107         uint16_t hdrlen;
1108         uint16_t optlen;
1109         int n;
1110
1111         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
1112         if (*bpp == NULL)
1113                 return -1;
1114
1115         h = (Tcp6hdr *) ((*bpp)->rp);
1116         tcph->source = nhgets(h->tcpsport);
1117         tcph->dest = nhgets(h->tcpdport);
1118         tcph->seq = nhgetl(h->tcpseq);
1119         tcph->ack = nhgetl(h->tcpack);
1120         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1121         if (hdrlen < TCP6_HDRSIZE) {
1122                 freeblist(*bpp);
1123                 return -1;
1124         }
1125
1126         tcph->flags = h->tcpflag[1];
1127         tcph->wnd = nhgets(h->tcpwin);
1128         tcph->urg = nhgets(h->tcpurg);
1129         tcph->mss = 0;
1130         tcph->ws = 0;
1131         tcph->len = nhgets(h->ploadlen) - hdrlen;
1132
1133         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
1134         if (*bpp == NULL)
1135                 return -1;
1136
1137         optr = h->tcpopt;
1138         n = hdrlen - TCP6_HDRSIZE;
1139         while (n > 0 && *optr != EOLOPT) {
1140                 if (*optr == NOOPOPT) {
1141                         n--;
1142                         optr++;
1143                         continue;
1144                 }
1145                 optlen = optr[1];
1146                 if (optlen < 2 || optlen > n)
1147                         break;
1148                 switch (*optr) {
1149                         case MSSOPT:
1150                                 if (optlen == MSS_LENGTH)
1151                                         tcph->mss = nhgets(optr + 2);
1152                                 break;
1153                         case WSOPT:
1154                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1155                                         tcph->ws = HaveWS | *(optr + 2);
1156                                 break;
1157                 }
1158                 n -= optlen;
1159                 optr += optlen;
1160         }
1161         return hdrlen;
1162 }
1163
1164 int ntohtcp4(Tcp * tcph, struct block **bpp)
1165 {
1166         Tcp4hdr *h;
1167         uint8_t *optr;
1168         uint16_t hdrlen;
1169         uint16_t optlen;
1170         int n;
1171
1172         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
1173         if (*bpp == NULL)
1174                 return -1;
1175
1176         h = (Tcp4hdr *) ((*bpp)->rp);
1177         tcph->source = nhgets(h->tcpsport);
1178         tcph->dest = nhgets(h->tcpdport);
1179         tcph->seq = nhgetl(h->tcpseq);
1180         tcph->ack = nhgetl(h->tcpack);
1181
1182         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1183         if (hdrlen < TCP4_HDRSIZE) {
1184                 freeblist(*bpp);
1185                 return -1;
1186         }
1187
1188         tcph->flags = h->tcpflag[1];
1189         tcph->wnd = nhgets(h->tcpwin);
1190         tcph->urg = nhgets(h->tcpurg);
1191         tcph->mss = 0;
1192         tcph->ws = 0;
1193         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1194
1195         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1196         if (*bpp == NULL)
1197                 return -1;
1198
1199         optr = h->tcpopt;
1200         n = hdrlen - TCP4_HDRSIZE;
1201         while (n > 0 && *optr != EOLOPT) {
1202                 if (*optr == NOOPOPT) {
1203                         n--;
1204                         optr++;
1205                         continue;
1206                 }
1207                 optlen = optr[1];
1208                 if (optlen < 2 || optlen > n)
1209                         break;
1210                 switch (*optr) {
1211                         case MSSOPT:
1212                                 if (optlen == MSS_LENGTH)
1213                                         tcph->mss = nhgets(optr + 2);
1214                                 break;
1215                         case WSOPT:
1216                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1217                                         tcph->ws = HaveWS | *(optr + 2);
1218                                 break;
1219                 }
1220                 n -= optlen;
1221                 optr += optlen;
1222         }
1223         return hdrlen;
1224 }
1225
1226 /*
1227  *  For outgiing calls, generate an initial sequence
1228  *  number and put a SYN on the send queue
1229  */
1230 void tcpsndsyn(struct conv *s, Tcpctl * tcb)
1231 {
1232         urandom_read(&tcb->iss, sizeof(tcb->iss));
1233         tcb->rttseq = tcb->iss;
1234         tcb->snd.wl2 = tcb->iss;
1235         tcb->snd.una = tcb->iss;
1236         tcb->snd.ptr = tcb->rttseq;
1237         tcb->snd.nxt = tcb->rttseq;
1238         tcb->flgcnt++;
1239         tcb->flags |= FORCE;
1240         tcb->sndsyntime = NOW;
1241
1242         /* set desired mss and scale */
1243         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale,
1244                           &tcb->flags);
1245 }
1246
1247 void
1248 sndrst(struct Proto *tcp, uint8_t * source, uint8_t * dest,
1249            uint16_t length, Tcp * seg, uint8_t version, char *reason)
1250 {
1251         struct block *hbp;
1252         uint8_t rflags;
1253         struct tcppriv *tpriv;
1254         Tcp4hdr ph4;
1255         Tcp6hdr ph6;
1256
1257         netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1258
1259         tpriv = tcp->priv;
1260
1261         if (seg->flags & RST)
1262                 return;
1263
1264         /* make pseudo header */
1265         switch (version) {
1266                 case V4:
1267                         memset(&ph4, 0, sizeof(ph4));
1268                         ph4.vihl = IP_VER4;
1269                         v6tov4(ph4.tcpsrc, dest);
1270                         v6tov4(ph4.tcpdst, source);
1271                         ph4.proto = IP_TCPPROTO;
1272                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1273                         hnputs(ph4.tcpsport, seg->dest);
1274                         hnputs(ph4.tcpdport, seg->source);
1275                         break;
1276                 case V6:
1277                         memset(&ph6, 0, sizeof(ph6));
1278                         ph6.vcf[0] = IP_VER6;
1279                         ipmove(ph6.tcpsrc, dest);
1280                         ipmove(ph6.tcpdst, source);
1281                         ph6.proto = IP_TCPPROTO;
1282                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1283                         hnputs(ph6.tcpsport, seg->dest);
1284                         hnputs(ph6.tcpdport, seg->source);
1285                         break;
1286                 default:
1287                         panic("sndrst: version %d", version);
1288         }
1289
1290         tpriv->stats[OutRsts]++;
1291         rflags = RST;
1292
1293         /* convince the other end that this reset is in band */
1294         if (seg->flags & ACK) {
1295                 seg->seq = seg->ack;
1296                 seg->ack = 0;
1297         } else {
1298                 rflags |= ACK;
1299                 seg->ack = seg->seq;
1300                 seg->seq = 0;
1301                 if (seg->flags & SYN)
1302                         seg->ack++;
1303                 seg->ack += length;
1304                 if (seg->flags & FIN)
1305                         seg->ack++;
1306         }
1307         seg->flags = rflags;
1308         seg->wnd = 0;
1309         seg->urg = 0;
1310         seg->mss = 0;
1311         seg->ws = 0;
1312         switch (version) {
1313                 case V4:
1314                         hbp = htontcp4(seg, NULL, &ph4, NULL);
1315                         if (hbp == NULL)
1316                                 return;
1317                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1318                         break;
1319                 case V6:
1320                         hbp = htontcp6(seg, NULL, &ph6, NULL);
1321                         if (hbp == NULL)
1322                                 return;
1323                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1324                         break;
1325                 default:
1326                         panic("sndrst2: version %d", version);
1327         }
1328 }
1329
1330 /*
1331  *  send a reset to the remote side and close the conversation
1332  *  called with s qlocked
1333  */
1334 static void tcphangup(struct conv *s)
1335 {
1336         ERRSTACK(1);
1337         Tcp seg;
1338         Tcpctl *tcb;
1339         struct block *hbp;
1340
1341         tcb = (Tcpctl *) s->ptcl;
1342         if (ipcmp(s->raddr, IPnoaddr)) {
1343                 /* discard error style, poperror regardless */
1344                 if (!waserror()) {
1345                         seg.flags = RST | ACK;
1346                         seg.ack = tcb->rcv.nxt;
1347                         tcb->rcv.una = 0;
1348                         seg.seq = tcb->snd.ptr;
1349                         seg.wnd = 0;
1350                         seg.urg = 0;
1351                         seg.mss = 0;
1352                         seg.ws = 0;
1353                         switch (s->ipversion) {
1354                                 case V4:
1355                                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1356                                         hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
1357                                         ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1358                                         break;
1359                                 case V6:
1360                                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1361                                         hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
1362                                         ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1363                                         break;
1364                                 default:
1365                                         panic("tcphangup: version %d", s->ipversion);
1366                         }
1367                 }
1368                 poperror();
1369         }
1370         localclose(s, NULL);
1371 }
1372
1373 /*
1374  *  (re)send a SYN ACK
1375  */
1376 int sndsynack(struct Proto *tcp, Limbo * lp)
1377 {
1378         struct block *hbp;
1379         Tcp4hdr ph4;
1380         Tcp6hdr ph6;
1381         Tcp seg;
1382         int scale;
1383         uint8_t flag = 0;
1384
1385         /* make pseudo header */
1386         switch (lp->version) {
1387                 case V4:
1388                         memset(&ph4, 0, sizeof(ph4));
1389                         ph4.vihl = IP_VER4;
1390                         v6tov4(ph4.tcpsrc, lp->laddr);
1391                         v6tov4(ph4.tcpdst, lp->raddr);
1392                         ph4.proto = IP_TCPPROTO;
1393                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1394                         hnputs(ph4.tcpsport, lp->lport);
1395                         hnputs(ph4.tcpdport, lp->rport);
1396                         break;
1397                 case V6:
1398                         memset(&ph6, 0, sizeof(ph6));
1399                         ph6.vcf[0] = IP_VER6;
1400                         ipmove(ph6.tcpsrc, lp->laddr);
1401                         ipmove(ph6.tcpdst, lp->raddr);
1402                         ph6.proto = IP_TCPPROTO;
1403                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1404                         hnputs(ph6.tcpsport, lp->lport);
1405                         hnputs(ph6.tcpdport, lp->rport);
1406                         break;
1407                 default:
1408                         panic("sndrst: version %d", lp->version);
1409         }
1410
1411         seg.seq = lp->iss;
1412         seg.ack = lp->irs + 1;
1413         seg.flags = SYN | ACK;
1414         seg.urg = 0;
1415         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag);
1416         seg.wnd = QMAX;
1417
1418         /* if the other side set scale, we should too */
1419         if (lp->rcvscale) {
1420                 seg.ws = scale;
1421                 lp->sndscale = scale;
1422         } else {
1423                 seg.ws = 0;
1424                 lp->sndscale = 0;
1425         }
1426
1427         switch (lp->version) {
1428                 case V4:
1429                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1430                         if (hbp == NULL)
1431                                 return -1;
1432                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1433                         break;
1434                 case V6:
1435                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1436                         if (hbp == NULL)
1437                                 return -1;
1438                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1439                         break;
1440                 default:
1441                         panic("sndsnack: version %d", lp->version);
1442         }
1443         lp->lastsend = NOW;
1444         return 0;
1445 }
1446
/* hash the last two bytes of address a plus port p into an lht[] index;
 * p is parenthesised so that expression arguments hash as a single value */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1448
1449 /*
1450  *  put a call into limbo and respond with a SYN ACK
1451  *
1452  *  called with proto locked
1453  */
1454 static void
1455 limbo(struct conv *s, uint8_t * source, uint8_t * dest, Tcp * seg, int version)
1456 {
1457         Limbo *lp, **l;
1458         struct tcppriv *tpriv;
1459         int h;
1460
1461         tpriv = s->p->priv;
1462         h = hashipa(source, seg->source);
1463
1464         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1465                 lp = *l;
1466                 if (lp->lport != seg->dest || lp->rport != seg->source
1467                         || lp->version != version)
1468                         continue;
1469                 if (ipcmp(lp->raddr, source) != 0)
1470                         continue;
1471                 if (ipcmp(lp->laddr, dest) != 0)
1472                         continue;
1473
1474                 /* each new SYN restarts the retransmits */
1475                 lp->irs = seg->seq;
1476                 break;
1477         }
1478         lp = *l;
1479         if (lp == NULL) {
1480                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1481                         lp = tpriv->lht[h];
1482                         tpriv->lht[h] = lp->next;
1483                         lp->next = NULL;
1484                 } else {
1485                         lp = kzmalloc(sizeof(*lp), 0);
1486                         if (lp == NULL)
1487                                 return;
1488                         tpriv->nlimbo++;
1489                 }
1490                 *l = lp;
1491                 lp->version = version;
1492                 ipmove(lp->laddr, dest);
1493                 ipmove(lp->raddr, source);
1494                 lp->lport = seg->dest;
1495                 lp->rport = seg->source;
1496                 lp->mss = seg->mss;
1497                 lp->rcvscale = seg->ws;
1498                 lp->irs = seg->seq;
1499                 urandom_read(&lp->iss, sizeof(lp->iss));
1500         }
1501
1502         if (sndsynack(s->p, lp) < 0) {
1503                 *l = lp->next;
1504                 tpriv->nlimbo--;
1505                 kfree(lp);
1506         }
1507 }
1508
1509 /*
1510  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1511  */
1512 static void limborexmit(struct Proto *tcp)
1513 {
1514         struct tcppriv *tpriv;
1515         Limbo **l, *lp;
1516         int h;
1517         int seen;
1518         uint64_t now;
1519
1520         tpriv = tcp->priv;
1521
1522         if (!canqlock(&tcp->qlock))
1523                 return;
1524         seen = 0;
1525         now = NOW;
1526         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1527                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1528                         lp = *l;
1529                         seen++;
1530                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1531                                 continue;
1532
1533                         /* time it out after 1 second */
1534                         if (++(lp->rexmits) > 5) {
1535                                 tpriv->nlimbo--;
1536                                 *l = lp->next;
1537                                 kfree(lp);
1538                                 continue;
1539                         }
1540
1541                         /* if we're being attacked, don't bother resending SYN ACK's */
1542                         if (tpriv->nlimbo > 100)
1543                                 continue;
1544
1545                         if (sndsynack(tcp, lp) < 0) {
1546                                 tpriv->nlimbo--;
1547                                 *l = lp->next;
1548                                 kfree(lp);
1549                                 continue;
1550                         }
1551
1552                         l = &lp->next;
1553                 }
1554         }
1555         qunlock(&tcp->qlock);
1556 }
1557
1558 /*
1559  *  lookup call in limbo.  if found, throw it out.
1560  *
1561  *  called with proto locked
1562  */
1563 static void
1564 limborst(struct conv *s, Tcp * segp, uint8_t * src, uint8_t * dst,
1565                  uint8_t version)
1566 {
1567         Limbo *lp, **l;
1568         int h;
1569         struct tcppriv *tpriv;
1570
1571         tpriv = s->p->priv;
1572
1573         /* find a call in limbo */
1574         h = hashipa(src, segp->source);
1575         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1576                 lp = *l;
1577                 if (lp->lport != segp->dest || lp->rport != segp->source
1578                         || lp->version != version)
1579                         continue;
1580                 if (ipcmp(lp->laddr, dst) != 0)
1581                         continue;
1582                 if (ipcmp(lp->raddr, src) != 0)
1583                         continue;
1584
1585                 /* RST can only follow the SYN */
1586                 if (segp->seq == lp->irs + 1) {
1587                         tpriv->nlimbo--;
1588                         *l = lp->next;
1589                         kfree(lp);
1590                 }
1591                 break;
1592         }
1593 }
1594
1595 /*
1596  *  come here when we finally get an ACK to our SYN-ACK.
1597  *  lookup call in limbo.  if found, create a new conversation
1598  *
1599  *  called with proto locked
1600  */
1601 static struct conv *tcpincoming(struct conv *s, Tcp * segp, uint8_t * src,
1602                                                                 uint8_t * dst, uint8_t version)
1603 {
1604         struct conv *new;
1605         Tcpctl *tcb;
1606         struct tcppriv *tpriv;
1607         Tcp4hdr *h4;
1608         Tcp6hdr *h6;
1609         Limbo *lp, **l;
1610         int h;
1611
1612         /* unless it's just an ack, it can't be someone coming out of limbo */
1613         if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
1614                 return NULL;
1615
1616         tpriv = s->p->priv;
1617
1618         /* find a call in limbo */
1619         h = hashipa(src, segp->source);
1620         for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
1621                 netlog(s->p->f, Logtcp,
1622                            "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
1623                            segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
1624                            lp->lport, version, lp->version);
1625
1626                 if (lp->lport != segp->dest || lp->rport != segp->source
1627                         || lp->version != version)
1628                         continue;
1629                 if (ipcmp(lp->laddr, dst) != 0)
1630                         continue;
1631                 if (ipcmp(lp->raddr, src) != 0)
1632                         continue;
1633
1634                 /* we're assuming no data with the initial SYN */
1635                 if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
1636                         netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
1637                                    segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
1638                         lp = NULL;
1639                 } else {
1640                         tpriv->nlimbo--;
1641                         *l = lp->next;
1642                 }
1643                 break;
1644         }
1645         if (lp == NULL)
1646                 return NULL;
1647
1648         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1649         if (new == NULL)
1650                 return NULL;
1651
1652         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1653         tcb = (Tcpctl *) new->ptcl;
1654         tcb->flags &= ~CLONE;
1655         tcb->timer.arg = new;
1656         tcb->timer.state = TcptimerOFF;
1657         tcb->acktimer.arg = new;
1658         tcb->acktimer.state = TcptimerOFF;
1659         tcb->katimer.arg = new;
1660         tcb->katimer.state = TcptimerOFF;
1661         tcb->rtt_timer.arg = new;
1662         tcb->rtt_timer.state = TcptimerOFF;
1663
1664         tcb->irs = lp->irs;
1665         tcb->rcv.nxt = tcb->irs + 1;
1666         tcb->rcv.urg = tcb->rcv.nxt;
1667
1668         tcb->iss = lp->iss;
1669         tcb->rttseq = tcb->iss;
1670         tcb->snd.wl2 = tcb->iss;
1671         tcb->snd.una = tcb->iss + 1;
1672         tcb->snd.ptr = tcb->iss + 1;
1673         tcb->snd.nxt = tcb->iss + 1;
1674         tcb->flgcnt = 0;
1675         tcb->flags |= SYNACK;
1676
1677         /* our sending max segment size cannot be bigger than what he asked for */
1678         if (lp->mss != 0 && lp->mss < tcb->mss)
1679                 tcb->mss = lp->mss;
1680
1681         /* window scaling */
1682         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1683
1684         /* the congestion window always starts out as a single segment */
1685         tcb->snd.wnd = segp->wnd;
1686         tcb->cwind = tcb->mss;
1687
1688         /* set initial round trip time */
1689         tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
1690         tcpsynackrtt(new);
1691
1692         kfree(lp);
1693
1694         /* set up proto header */
1695         switch (version) {
1696                 case V4:
1697                         h4 = &tcb->protohdr.tcp4hdr;
1698                         memset(h4, 0, sizeof(*h4));
1699                         h4->proto = IP_TCPPROTO;
1700                         hnputs(h4->tcpsport, new->lport);
1701                         hnputs(h4->tcpdport, new->rport);
1702                         v6tov4(h4->tcpsrc, dst);
1703                         v6tov4(h4->tcpdst, src);
1704                         break;
1705                 case V6:
1706                         h6 = &tcb->protohdr.tcp6hdr;
1707                         memset(h6, 0, sizeof(*h6));
1708                         h6->proto = IP_TCPPROTO;
1709                         hnputs(h6->tcpsport, new->lport);
1710                         hnputs(h6->tcpdport, new->rport);
1711                         ipmove(h6->tcpsrc, dst);
1712                         ipmove(h6->tcpdst, src);
1713                         break;
1714                 default:
1715                         panic("tcpincoming: version %d", new->ipversion);
1716         }
1717
1718         tcpsetstate(new, Established);
1719
1720         iphtadd(&tpriv->ht, new);
1721
1722         return new;
1723 }
1724
/*
 *  Return 1 if x lies in the inclusive range [low, high] of 32-bit
 *  sequence space, 0 otherwise.  When low > high the range is taken to
 *  wrap around through 0.
 */
int seq_within(uint32_t x, uint32_t low, uint32_t high)
{
	/* ordinary, non-wrapping interval */
	if (low <= high)
		return low <= x && x <= high;

	/* wrapped interval: [low, 2^32 - 1] together with [0, high] */
	return x >= low || x <= high;
}
1736
/* Return nonzero if the 32-bit difference x - y, viewed as signed, is
 * negative. */
int seq_lt(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff < 0;
}
1741
/*
 *  Sequence-space "less than or equal": nonzero when the 32-bit
 *  difference x - y, reinterpreted as a signed int, is <= 0.
 */
int seq_le(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff <= 0;
}
1746
/*
 *  Sequence-space "greater than": nonzero when the 32-bit difference
 *  x - y, reinterpreted as a signed int, is positive.
 */
int seq_gt(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff > 0;
}
1751
/*
 *  Sequence-space "greater than or equal": nonzero when the 32-bit
 *  difference x - y, reinterpreted as a signed int, is >= 0.
 */
int seq_ge(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff >= 0;
}
1756
1757 /*
1758  *  use the time between the first SYN and it's ack as the
1759  *  initial round trip time
1760  */
1761 void tcpsynackrtt(struct conv *s)
1762 {
1763         Tcpctl *tcb;
1764         uint64_t delta;
1765         struct tcppriv *tpriv;
1766
1767         tcb = (Tcpctl *) s->ptcl;
1768         tpriv = s->p->priv;
1769
1770         delta = NOW - tcb->sndsyntime;
1771         tcb->srtt = delta << LOGAGAIN;
1772         tcb->mdev = delta << LOGDGAIN;
1773
1774         /* halt round trip timer */
1775         tcphalt(tpriv, &tcb->rtt_timer);
1776 }
1777
1778 void update(struct conv *s, Tcp * seg)
1779 {
1780         int rtt, delta;
1781         Tcpctl *tcb;
1782         uint32_t acked;
1783         uint32_t expand;
1784         struct tcppriv *tpriv;
1785
1786         tpriv = s->p->priv;
1787         tcb = (Tcpctl *) s->ptcl;
1788
1789         /* if everything has been acked, force output(?) */
1790         if (seq_gt(seg->ack, tcb->snd.nxt)) {
1791                 tcb->flags |= FORCE;
1792                 return;
1793         }
1794
1795         /* added by Dong Lin for fast retransmission */
1796         if (seg->ack == tcb->snd.una
1797                 && tcb->snd.una != tcb->snd.nxt
1798                 && seg->len == 0 && seg->wnd == tcb->snd.wnd) {
1799
1800                 /* this is a pure ack w/o window update */
1801                 netlog(s->p->f, Logtcprxmt, "dupack %lu ack %lu sndwnd %d advwin %d\n",
1802                            tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1803
1804                 if (++tcb->snd.dupacks == TCPREXMTTHRESH) {
1805                         /*
1806                          *  tahoe tcp rxt the packet, half sshthresh,
1807                          *  and set cwnd to one packet
1808                          */
1809                         tcb->snd.recovery = 1;
1810                         tcb->snd.rxt = tcb->snd.nxt;
1811                         netlog(s->p->f, Logtcprxmt, "fast rxt %lu, nxt %lu\n", tcb->snd.una,
1812                                    tcb->snd.nxt);
1813                         tcprxmit(s);
1814                 } else {
1815                         /* do reno tcp here. */
1816                 }
1817         }
1818
1819         /*
1820          *  update window
1821          */
1822         if (seq_gt(seg->ack, tcb->snd.wl2)
1823                 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
1824                 tcb->snd.wnd = seg->wnd;
1825                 tcb->snd.wl2 = seg->ack;
1826         }
1827
1828         if (!seq_gt(seg->ack, tcb->snd.una)) {
1829                 /*
1830                  *  don't let us hangup if sending into a closed window and
1831                  *  we're still getting acks
1832                  */
1833                 if ((tcb->flags & RETRAN) && tcb->snd.wnd == 0) {
1834                         tcb->backedoff = MAXBACKMS / 4;
1835                 }
1836                 return;
1837         }
1838
1839         /*
1840          *  any positive ack turns off fast rxt,
1841          *  (should we do new-reno on partial acks?)
1842          */
1843         if (!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1844                 tcb->snd.dupacks = 0;
1845                 tcb->snd.recovery = 0;
1846         } else
1847                 netlog(s->p->f, Logtcp, "rxt next %lu, cwin %u\n", seg->ack,
1848                            tcb->cwind);
1849
1850         /* Compute the new send window size */
1851         acked = seg->ack - tcb->snd.una;
1852
1853         /* avoid slow start and timers for SYN acks */
1854         if ((tcb->flags & SYNACK) == 0) {
1855                 tcb->flags |= SYNACK;
1856                 acked--;
1857                 tcb->flgcnt--;
1858                 goto done;
1859         }
1860
1861         /* slow start as long as we're not recovering from lost packets */
1862         if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
1863                 if (tcb->cwind < tcb->ssthresh) {
1864                         expand = tcb->mss;
1865                         if (acked < expand)
1866                                 expand = acked;
1867                 } else
1868                         expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1869
1870                 if (tcb->cwind + expand < tcb->cwind)
1871                         expand = tcb->snd.wnd - tcb->cwind;
1872                 if (tcb->cwind + expand > tcb->snd.wnd)
1873                         expand = tcb->snd.wnd - tcb->cwind;
1874                 tcb->cwind += expand;
1875         }
1876
1877         /* Adjust the timers according to the round trip time */
1878         if (tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1879                 tcphalt(tpriv, &tcb->rtt_timer);
1880                 if ((tcb->flags & RETRAN) == 0) {
1881                         tcb->backoff = 0;
1882                         tcb->backedoff = 0;
1883                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1884                         if (rtt == 0)
1885                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
1886                         rtt *= MSPTICK;
1887                         if (tcb->srtt == 0) {
1888                                 tcb->srtt = rtt << LOGAGAIN;
1889                                 tcb->mdev = rtt << LOGDGAIN;
1890                         } else {
1891                                 delta = rtt - (tcb->srtt >> LOGAGAIN);
1892                                 tcb->srtt += delta;
1893                                 if (tcb->srtt <= 0)
1894                                         tcb->srtt = 1;
1895
1896                                 delta = abs(delta) - (tcb->mdev >> LOGDGAIN);
1897                                 tcb->mdev += delta;
1898                                 if (tcb->mdev <= 0)
1899                                         tcb->mdev = 1;
1900                         }
1901                         tcpsettimer(tcb);
1902                 }
1903         }
1904
1905 done:
1906         if (qdiscard(s->wq, acked) < acked)
1907                 tcb->flgcnt--;
1908
1909         tcb->snd.una = seg->ack;
1910         if (seq_gt(seg->ack, tcb->snd.urg))
1911                 tcb->snd.urg = seg->ack;
1912
1913         if (tcb->snd.una != tcb->snd.nxt)
1914                 tcpgo(tpriv, &tcb->timer);
1915         else
1916                 tcphalt(tpriv, &tcb->timer);
1917
1918         if (seq_lt(tcb->snd.ptr, tcb->snd.una))
1919                 tcb->snd.ptr = tcb->snd.una;
1920
1921         tcb->flags &= ~RETRAN;
1922         tcb->backoff = 0;
1923         tcb->backedoff = 0;
1924 }
1925
1926 void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
1927 {
1928         ERRSTACK(1);
1929         Tcp seg;
1930         Tcp4hdr *h4;
1931         Tcp6hdr *h6;
1932         int hdrlen;
1933         Tcpctl *tcb;
1934         uint16_t length;
1935         uint8_t source[IPaddrlen], dest[IPaddrlen];
1936         struct conv *s;
1937         struct Fs *f;
1938         struct tcppriv *tpriv;
1939         uint8_t version;
1940
1941         f = tcp->f;
1942         tpriv = tcp->priv;
1943
1944         tpriv->stats[InSegs]++;
1945
1946         h4 = (Tcp4hdr *) (bp->rp);
1947         h6 = (Tcp6hdr *) (bp->rp);
1948
1949         if ((h4->vihl & 0xF0) == IP_VER4) {
1950                 version = V4;
1951                 length = nhgets(h4->length);
1952                 v4tov6(dest, h4->tcpdst);
1953                 v4tov6(source, h4->tcpsrc);
1954
1955                 h4->Unused = 0;
1956                 hnputs(h4->tcplen, length - TCP4_PKT);
1957                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1958                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
1959                         tpriv->stats[CsumErrs]++;
1960                         tpriv->stats[InErrs]++;
1961                         netlog(f, Logtcp, "bad tcp proto cksum\n");
1962                         freeblist(bp);
1963                         return;
1964                 }
1965
1966                 hdrlen = ntohtcp4(&seg, &bp);
1967                 if (hdrlen < 0) {
1968                         tpriv->stats[HlenErrs]++;
1969                         tpriv->stats[InErrs]++;
1970                         netlog(f, Logtcp, "bad tcp hdr len\n");
1971                         return;
1972                 }
1973
1974                 /* trim the packet to the size claimed by the datagram */
1975                 length -= hdrlen + TCP4_PKT;
1976                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
1977                 if (bp == NULL) {
1978                         tpriv->stats[LenErrs]++;
1979                         tpriv->stats[InErrs]++;
1980                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
1981                         return;
1982                 }
1983         } else {
1984                 int ttl = h6->ttl;
1985                 int proto = h6->proto;
1986
1987                 version = V6;
1988                 length = nhgets(h6->ploadlen);
1989                 ipmove(dest, h6->tcpdst);
1990                 ipmove(source, h6->tcpsrc);
1991
1992                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
1993                 h6->ttl = proto;
1994                 hnputl(h6->vcf, length);
1995                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
1996                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
1997                         tpriv->stats[CsumErrs]++;
1998                         tpriv->stats[InErrs]++;
1999                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2000                         freeblist(bp);
2001                         return;
2002                 }
2003                 h6->ttl = ttl;
2004                 h6->proto = proto;
2005                 hnputs(h6->ploadlen, length);
2006
2007                 hdrlen = ntohtcp6(&seg, &bp);
2008                 if (hdrlen < 0) {
2009                         tpriv->stats[HlenErrs]++;
2010                         tpriv->stats[InErrs]++;
2011                         netlog(f, Logtcp, "bad tcp hdr len\n");
2012                         return;
2013                 }
2014
2015                 /* trim the packet to the size claimed by the datagram */
2016                 length -= hdrlen;
2017                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2018                 if (bp == NULL) {
2019                         tpriv->stats[LenErrs]++;
2020                         tpriv->stats[InErrs]++;
2021                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2022                         return;
2023                 }
2024         }
2025
2026         /* lock protocol while searching for a conversation */
2027         qlock(&tcp->qlock);
2028
2029         /* Look for a matching conversation */
2030         s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2031         if (s == NULL) {
2032                 netlog(f, Logtcp, "iphtlook failed\n");
2033 reset:
2034                 qunlock(&tcp->qlock);
2035                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2036                 freeblist(bp);
2037                 return;
2038         }
2039
2040         /* if it's a listener, look for the right flags and get a new conv */
2041         tcb = (Tcpctl *) s->ptcl;
2042         if (tcb->state == Listen) {
2043                 if (seg.flags & RST) {
2044                         limborst(s, &seg, source, dest, version);
2045                         qunlock(&tcp->qlock);
2046                         freeblist(bp);
2047                         return;
2048                 }
2049
2050                 /* if this is a new SYN, put the call into limbo */
2051                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2052                         limbo(s, source, dest, &seg, version);
2053                         qunlock(&tcp->qlock);
2054                         freeblist(bp);
2055                         return;
2056                 }
2057
2058                 /*
2059                  *  if there's a matching call in limbo, tcpincoming will
2060                  *  return it in state Syn_received
2061                  */
2062                 s = tcpincoming(s, &seg, source, dest, version);
2063                 if (s == NULL)
2064                         goto reset;
2065         }
2066
2067         /* The rest of the input state machine is run with the control block
2068          * locked and implements the state machine directly out of the RFC.
2069          * Out-of-band data is ignored - it was always a bad idea.
2070          */
2071         tcb = (Tcpctl *) s->ptcl;
2072         if (waserror()) {
2073                 qunlock(&s->qlock);
2074                 nexterror();
2075         }
2076         qlock(&s->qlock);
2077         qunlock(&tcp->qlock);
2078
2079         /* fix up window */
2080         seg.wnd <<= tcb->rcv.scale;
2081
2082         /* every input packet in puts off the keep alive time out */
2083         tcpsetkacounter(tcb);
2084
2085         switch (tcb->state) {
2086                 case Closed:
2087                         sndrst(tcp, source, dest, length, &seg, version,
2088                                    "sending to Closed");
2089                         goto raise;
2090                 case Syn_sent:
2091                         if (seg.flags & ACK) {
2092                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2093                                         sndrst(tcp, source, dest, length, &seg, version,
2094                                                    "bad seq in Syn_sent");
2095                                         goto raise;
2096                                 }
2097                         }
2098                         if (seg.flags & RST) {
2099                                 if (seg.flags & ACK)
2100                                         localclose(s, errno_to_string(ECONNREFUSED));
2101                                 goto raise;
2102                         }
2103
2104                         if (seg.flags & SYN) {
2105                                 procsyn(s, &seg);
2106                                 if (seg.flags & ACK) {
2107                                         update(s, &seg);
2108                                         tcpsynackrtt(s);
2109                                         tcpsetstate(s, Established);
2110                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2111                                 } else {
2112                                         tcb->time = NOW;
2113                                         tcpsetstate(s, Syn_received);   /* DLP - shouldn't this be a reset? */
2114                                 }
2115
2116                                 if (length != 0 || (seg.flags & FIN))
2117                                         break;
2118
2119                                 freeblist(bp);
2120                                 goto output;
2121                         } else
2122                                 freeblist(bp);
2123
2124                         qunlock(&s->qlock);
2125                         poperror();
2126                         return;
2127                 case Syn_received:
2128                         /* doesn't matter if it's the correct ack, we're just trying to set timing */
2129                         if (seg.flags & ACK)
2130                                 tcpsynackrtt(s);
2131                         break;
2132         }
2133
2134         /*
2135          *  One DOS attack is to open connections to us and then forget about them,
2136          *  thereby tying up a conv at no long term cost to the attacker.
2137          *  This is an attempt to defeat these stateless DOS attacks.  See
2138          *  corresponding code in tcpsendka().
2139          */
2140         if (tcb->state != Syn_received && (seg.flags & RST) == 0) {
2141                 if (tcpporthogdefense
2142                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2143                                                   tcb->snd.una - (1 << 29))) {
2144                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2145                                    source, seg.source, dest, seg.dest, seg.flags,
2146                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2147                         localclose(s, "stateless hog");
2148                 }
2149         }
2150
2151         /* Cut the data to fit the receive window */
2152         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2153                 netlog(f, Logtcp, "tcp len < 0, %lu %d\n", seg.seq, length);
2154                 update(s, &seg);
2155                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2156                         tcphalt(tpriv, &tcb->rtt_timer);
2157                         tcphalt(tpriv, &tcb->acktimer);
2158                         tcphalt(tpriv, &tcb->katimer);
2159                         tcpsetstate(s, Time_wait);
2160                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2161                         tcpgo(tpriv, &tcb->timer);
2162                 }
2163                 if (!(seg.flags & RST)) {
2164                         tcb->flags |= FORCE;
2165                         goto output;
2166                 }
2167                 qunlock(&s->qlock);
2168                 poperror();
2169                 return;
2170         }
2171
2172         /* Cannot accept so answer with a rst */
2173         if (length && tcb->state == Closed) {
2174                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2175                 goto raise;
2176         }
2177
2178         /* The segment is beyond the current receive pointer so
2179          * queue the data in the resequence queue
2180          */
2181         if (seg.seq != tcb->rcv.nxt)
2182                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2183                         update(s, &seg);
2184                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2185                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2186                                            s->lport);
2187                         tcb->flags |= FORCE;
2188                         goto output;
2189                 }
2190
2191         /*
2192          *  keep looping till we've processed this packet plus any
2193          *  adjacent packets in the resequence queue
2194          */
2195         for (;;) {
2196                 if (seg.flags & RST) {
2197                         if (tcb->state == Established) {
2198                                 tpriv->stats[EstabResets]++;
2199                                 if (tcb->rcv.nxt != seg.seq)
2200                                         printd
2201                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2202                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2203                                                  seg.seq);
2204                         }
2205                         localclose(s, errno_to_string(ECONNREFUSED));
2206                         goto raise;
2207                 }
2208
2209                 if ((seg.flags & ACK) == 0)
2210                         goto raise;
2211
2212                 switch (tcb->state) {
2213                         case Syn_received:
2214                                 if (!seq_within(seg.ack, tcb->snd.una + 1, tcb->snd.nxt)) {
2215                                         sndrst(tcp, source, dest, length, &seg, version,
2216                                                    "bad seq in Syn_received");
2217                                         goto raise;
2218                                 }
2219                                 update(s, &seg);
2220                                 tcpsetstate(s, Established);
2221                         case Established:
2222                         case Close_wait:
2223                                 update(s, &seg);
2224                                 break;
2225                         case Finwait1:
2226                                 update(s, &seg);
2227                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2228                                         tcphalt(tpriv, &tcb->rtt_timer);
2229                                         tcphalt(tpriv, &tcb->acktimer);
2230                                         tcpsetkacounter(tcb);
2231                                         tcb->time = NOW;
2232                                         tcpsetstate(s, Finwait2);
2233                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2234                                         tcpgo(tpriv, &tcb->katimer);
2235                                 }
2236                                 break;
2237                         case Finwait2:
2238                                 update(s, &seg);
2239                                 break;
2240                         case Closing:
2241                                 update(s, &seg);
2242                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2243                                         tcphalt(tpriv, &tcb->rtt_timer);
2244                                         tcphalt(tpriv, &tcb->acktimer);
2245                                         tcphalt(tpriv, &tcb->katimer);
2246                                         tcpsetstate(s, Time_wait);
2247                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2248                                         tcpgo(tpriv, &tcb->timer);
2249                                 }
2250                                 break;
2251                         case Last_ack:
2252                                 update(s, &seg);
2253                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2254                                         localclose(s, NULL);
2255                                         goto raise;
2256                                 }
2257                         case Time_wait:
2258                                 tcb->flags |= FORCE;
2259                                 if (tcb->timer.state != TcptimerON)
2260                                         tcpgo(tpriv, &tcb->timer);
2261                 }
2262
2263                 if ((seg.flags & URG) && seg.urg) {
2264                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2265                                 tcb->rcv.urg = seg.urg + seg.seq;
2266                                 pullblock(&bp, seg.urg);
2267                         }
2268                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2269                         tcb->rcv.urg = tcb->rcv.nxt;
2270
2271                 if (length == 0) {
2272                         if (bp != NULL)
2273                                 freeblist(bp);
2274                 } else {
2275                         switch (tcb->state) {
2276                                 default:
2277                                         /* Ignore segment text */
2278                                         if (bp != NULL)
2279                                                 freeblist(bp);
2280                                         break;
2281
2282                                 case Syn_received:
2283                                 case Established:
2284                                 case Finwait1:
2285                                         /* If we still have some data place on
2286                                          * receive queue
2287                                          */
2288                                         if (bp) {
2289                                                 bp = packblock(bp);
2290                                                 if (bp == NULL)
2291                                                         panic("tcp packblock");
2292                                                 qpassnolim(s->rq, bp);
2293                                                 bp = NULL;
2294
2295                                                 /*
2296                                                  *  Force an ack every 2 data messages.  This is
2297                                                  *  a hack for rob to make his home system run
2298                                                  *  faster.
2299                                                  *
2300                                                  *  this also keeps the standard TCP congestion
2301                                                  *  control working since it needs an ack every
2302                                                  *  2 max segs worth.  This is not quite that,
2303                                                  *  but under a real stream is equivalent since
2304                                                  *  every packet has a max seg in it.
2305                                                  */
2306                                                 if (++(tcb->rcv.una) >= 2)
2307                                                         tcb->flags |= FORCE;
2308                                         }
2309                                         tcb->rcv.nxt += length;
2310
2311                                         /*
2312                                          *  update our rcv window
2313                                          */
2314                                         tcprcvwin(s);
2315
2316                                         /*
2317                                          *  turn on the acktimer if there's something
2318                                          *  to ack
2319                                          */
2320                                         if (tcb->acktimer.state != TcptimerON)
2321                                                 tcpgo(tpriv, &tcb->acktimer);
2322
2323                                         break;
2324                                 case Finwait2:
2325                                         /* no process to read the data, send a reset */
2326                                         if (bp != NULL)
2327                                                 freeblist(bp);
2328                                         sndrst(tcp, source, dest, length, &seg, version,
2329                                                    "send to Finwait2");
2330                                         qunlock(&s->qlock);
2331                                         poperror();
2332                                         return;
2333                         }
2334                 }
2335
2336                 if (seg.flags & FIN) {
2337                         tcb->flags |= FORCE;
2338
2339                         switch (tcb->state) {
2340                                 case Syn_received:
2341                                 case Established:
2342                                         tcb->rcv.nxt++;
2343                                         tcpsetstate(s, Close_wait);
2344                                         break;
2345                                 case Finwait1:
2346                                         tcb->rcv.nxt++;
2347                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2348                                                 tcphalt(tpriv, &tcb->rtt_timer);
2349                                                 tcphalt(tpriv, &tcb->acktimer);
2350                                                 tcphalt(tpriv, &tcb->katimer);
2351                                                 tcpsetstate(s, Time_wait);
2352                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2353                                                 tcpgo(tpriv, &tcb->timer);
2354                                         } else
2355                                                 tcpsetstate(s, Closing);
2356                                         break;
2357                                 case Finwait2:
2358                                         tcb->rcv.nxt++;
2359                                         tcphalt(tpriv, &tcb->rtt_timer);
2360                                         tcphalt(tpriv, &tcb->acktimer);
2361                                         tcphalt(tpriv, &tcb->katimer);
2362                                         tcpsetstate(s, Time_wait);
2363                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2364                                         tcpgo(tpriv, &tcb->timer);
2365                                         break;
2366                                 case Close_wait:
2367                                 case Closing:
2368                                 case Last_ack:
2369                                         break;
2370                                 case Time_wait:
2371                                         tcpgo(tpriv, &tcb->timer);
2372                                         break;
2373                         }
2374                 }
2375
2376                 /*
2377                  *  get next adjacent segment from the resequence queue.
2378                  *  dump/trim any overlapping segments
2379                  */
2380                 for (;;) {
2381                         if (tcb->reseq == NULL)
2382                                 goto output;
2383
2384                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2385                                 goto output;
2386
2387                         getreseq(tcb, &seg, &bp, &length);
2388
2389                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2390                                 break;
2391                 }
2392         }
2393 output:
2394         tcpoutput(s);
2395         qunlock(&s->qlock);
2396         poperror();
2397         return;
2398 raise:
2399         qunlock(&s->qlock);
2400         poperror();
2401         freeblist(bp);
2402         tcpkick(s);
2403 }
2404
2405 /*
2406  *  always enters and exits with the s locked.  We drop
2407  *  the lock to ipoput the packet so some care has to be
2408  *  taken by callers.
2409  */
2410 void tcpoutput(struct conv *s)
2411 {
2412         Tcp seg;
2413         int msgs;
2414         Tcpctl *tcb;
2415         struct block *hbp, *bp;
2416         int sndcnt, n;
2417         uint32_t ssize, dsize, usable, sent;
2418         struct Fs *f;
2419         struct tcppriv *tpriv;
2420         uint8_t version;
2421
2422         f = s->p->f;
2423         tpriv = s->p->priv;
2424         version = s->ipversion;
2425
2426         for (msgs = 0; msgs < 100; msgs++) {
2427                 tcb = (Tcpctl *) s->ptcl;
2428
2429                 switch (tcb->state) {
2430                         case Listen:
2431                         case Closed:
2432                         case Finwait2:
2433                                 return;
2434                 }
2435
2436                 /* force an ack when a window has opened up */
2437                 if (tcb->rcv.blocked && tcb->rcv.wnd > 0) {
2438                         tcb->rcv.blocked = 0;
2439                         tcb->flags |= FORCE;
2440                 }
2441
2442                 sndcnt = qlen(s->wq) + tcb->flgcnt;
2443                 sent = tcb->snd.ptr - tcb->snd.una;
2444
2445                 /* Don't send anything else until our SYN has been acked */
2446                 if (tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2447                         break;
2448
2449                 /* Compute usable segment based on offered window and limit
2450                  * window probes to one
2451                  */
2452                 if (tcb->snd.wnd == 0) {
2453                         if (sent != 0) {
2454                                 if ((tcb->flags & FORCE) == 0)
2455                                         break;
2456 //              tcb->snd.ptr = tcb->snd.una;
2457                         }
2458                         usable = 1;
2459                 } else {
2460                         usable = tcb->cwind;
2461                         if (tcb->snd.wnd < usable)
2462                                 usable = tcb->snd.wnd;
2463                         usable -= sent;
2464                 }
2465                 ssize = sndcnt - sent;
2466                 if (ssize && usable < 2)
2467                         netlog(s->p->f, Logtcp, "throttled snd.wnd %lu cwind %lu\n",
2468                                    tcb->snd.wnd, tcb->cwind);
2469                 if (usable < ssize)
2470                         ssize = usable;
2471                 if (ssize > tcb->mss) {
2472                         if ((tcb->flags & TSO) == 0) {
2473                                 ssize = tcb->mss;
2474                         } else {
2475                                 int segs, window;
2476
2477                                 /*  Don't send too much.  32K is arbitrary..
2478                                  */
2479                                 if (ssize > 32 * 1024)
2480                                         ssize = 32 * 1024;
2481
2482                                 /* Clamp xmit to an integral MSS to
2483                                  * avoid ragged tail segments causing
2484                                  * poor link utilization.  Also
2485                                  * account for each segment sent in
2486                                  * msg heuristic, and round up to the
2487                                  * next multiple of 4, to ensure we
2488                                  * still yeild.
2489                                  */
2490                                 segs = ssize / tcb->mss;
2491                                 ssize = segs * tcb->mss;
2492                                 msgs += segs;
2493                                 if (segs > 3)
2494                                         msgs = (msgs + 4) & ~3;
2495                         }
2496                 }
2497
2498                 dsize = ssize;
2499                 seg.urg = 0;
2500
2501                 if (ssize == 0)
2502                         if ((tcb->flags & FORCE) == 0)
2503                                 break;
2504
2505                 tcb->flags &= ~FORCE;
2506                 tcprcvwin(s);
2507
2508                 /* By default we will generate an ack */
2509                 tcphalt(tpriv, &tcb->acktimer);
2510                 tcb->rcv.una = 0;
2511                 seg.source = s->lport;
2512                 seg.dest = s->rport;
2513                 seg.flags = ACK;
2514                 seg.mss = 0;
2515                 seg.ws = 0;
2516                 switch (tcb->state) {
2517                         case Syn_sent:
2518                                 seg.flags = 0;
2519                                 if (tcb->snd.ptr == tcb->iss) {
2520                                         seg.flags |= SYN;
2521                                         dsize--;
2522                                         seg.mss = tcb->mss;
2523                                         seg.ws = tcb->scale;
2524                                 }
2525                                 break;
2526                         case Syn_received:
2527                                 /*
2528                                  *  don't send any data with a SYN/ACK packet
2529                                  *  because Linux rejects the packet in its
2530                                  *  attempt to solve the SYN attack problem
2531                                  */
2532                                 if (tcb->snd.ptr == tcb->iss) {
2533                                         seg.flags |= SYN;
2534                                         dsize = 0;
2535                                         ssize = 1;
2536                                         seg.mss = tcb->mss;
2537                                         seg.ws = tcb->scale;
2538                                 }
2539                                 break;
2540                 }
2541                 seg.seq = tcb->snd.ptr;
2542                 seg.ack = tcb->rcv.nxt;
2543                 seg.wnd = tcb->rcv.wnd;
2544
2545                 /* Pull out data to send */
2546                 bp = NULL;
2547                 if (dsize != 0) {
2548                         bp = qcopy(s->wq, dsize, sent);
2549                         if (BLEN(bp) != dsize) {
2550                                 seg.flags |= FIN;
2551                                 dsize--;
2552                         }
2553                         if (BLEN(bp) > tcb->mss) {
2554                                 bp->flag |= Btso;
2555                                 bp->mss = tcb->mss;
2556                         }
2557                 }
2558
2559                 if (sent + dsize == sndcnt)
2560                         seg.flags |= PSH;
2561
2562                 /* keep track of balance of resent data */
2563                 if (seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2564                         n = tcb->snd.nxt - tcb->snd.ptr;
2565                         if (ssize < n)
2566                                 n = ssize;
2567                         tcb->resent += n;
2568                         netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr 0x%lx nxt 0x%lx\n",
2569                                    s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr,
2570                                    tcb->snd.nxt);
2571                         tpriv->stats[RetransSegs]++;
2572                 }
2573
2574                 tcb->snd.ptr += ssize;
2575
2576                 /* Pull up the send pointer so we can accept acks
2577                  * for this window
2578                  */
2579                 if (seq_gt(tcb->snd.ptr, tcb->snd.nxt))
2580                         tcb->snd.nxt = tcb->snd.ptr;
2581
2582                 /* Build header, link data and compute cksum */
2583                 switch (version) {
2584                         case V4:
2585                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2586                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2587                                 if (hbp == NULL) {
2588                                         freeblist(bp);
2589                                         return;
2590                                 }
2591                                 break;
2592                         case V6:
2593                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2594                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2595                                 if (hbp == NULL) {
2596                                         freeblist(bp);
2597                                         return;
2598                                 }
2599                                 break;
2600                         default:
2601                                 hbp = NULL;     /* to suppress a warning */
2602                                 panic("tcpoutput: version %d", version);
2603                 }
2604
2605                 /* Start the transmission timers if there is new data and we
2606                  * expect acknowledges
2607                  */
2608                 if (ssize != 0) {
2609                         if (tcb->timer.state != TcptimerON)
2610                                 tcpgo(tpriv, &tcb->timer);
2611
2612                         /*  If round trip timer isn't running, start it.
2613                          *  measure the longest packet only in case the
2614                          *  transmission time dominates RTT
2615                          */
2616                         if (tcb->rtt_timer.state != TcptimerON)
2617                                 if (ssize == tcb->mss) {
2618                                         tcpgo(tpriv, &tcb->rtt_timer);
2619                                         tcb->rttseq = tcb->snd.ptr;
2620                                 }
2621                 }
2622
2623                 tpriv->stats[OutSegs]++;
2624
2625                 /* put off the next keep alive */
2626                 tcpgo(tpriv, &tcb->katimer);
2627
2628                 switch (version) {
2629                         case V4:
2630                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2631                                         /* a negative return means no route */
2632                                         localclose(s, "no route");
2633                                 }
2634                                 break;
2635                         case V6:
2636                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2637                                         /* a negative return means no route */
2638                                         localclose(s, "no route");
2639                                 }
2640                                 break;
2641                         default:
2642                                 panic("tcpoutput2: version %d", version);
2643                 }
2644                 if ((msgs % 4) == 1) {
2645                         qunlock(&s->qlock);
2646                         kthread_yield();
2647                         qlock(&s->qlock);
2648                 }
2649         }
2650 }
2651
2652 /*
2653  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
2654  */
2655 void tcpsendka(struct conv *s)
2656 {
2657         Tcp seg;
2658         Tcpctl *tcb;
2659         struct block *hbp, *dbp;
2660
2661         tcb = (Tcpctl *) s->ptcl;
2662
2663         dbp = NULL;
2664         seg.urg = 0;
2665         seg.source = s->lport;
2666         seg.dest = s->rport;
2667         seg.flags = ACK | PSH;
2668         seg.mss = 0;
2669         seg.ws = 0;
2670         if (tcpporthogdefense)
2671                 urandom_read(&seg.seq, sizeof(seg.seq));
2672         else
2673                 seg.seq = tcb->snd.una - 1;
2674         seg.ack = tcb->rcv.nxt;
2675         tcb->rcv.una = 0;
2676         seg.wnd = tcb->rcv.wnd;
2677         if (tcb->state == Finwait2) {
2678                 seg.flags |= FIN;
2679         } else {
2680                 dbp = allocb(1);
2681                 dbp->wp++;
2682         }
2683
2684         if (isv4(s->raddr)) {
2685                 /* Build header, link data and compute cksum */
2686                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2687                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2688                 if (hbp == NULL) {
2689                         freeblist(dbp);
2690                         return;
2691                 }
2692                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2693         } else {
2694                 /* Build header, link data and compute cksum */
2695                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2696                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2697                 if (hbp == NULL) {
2698                         freeblist(dbp);
2699                         return;
2700                 }
2701                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2702         }
2703 }
2704
2705 /*
2706  *  set connection to time out after 12 minutes
2707  */
2708 void tcpsetkacounter(Tcpctl * tcb)
2709 {
2710         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
2711         if (tcb->kacounter < 3)
2712                 tcb->kacounter = 3;
2713 }
2714
2715 /*
2716  *  if we've timed out, close the connection
2717  *  otherwise, send a keepalive and restart the timer
2718  */
2719 void tcpkeepalive(void *v)
2720 {
2721         ERRSTACK(1);
2722         Tcpctl *tcb;
2723         struct conv *s;
2724
2725         s = v;
2726         tcb = (Tcpctl *) s->ptcl;
2727         qlock(&s->qlock);
2728         if (waserror()) {
2729                 qunlock(&s->qlock);
2730                 nexterror();
2731         }
2732         if (tcb->state != Closed) {
2733                 if (--(tcb->kacounter) <= 0) {
2734                         localclose(s, errno_to_string(ETIMEDOUT));
2735                 } else {
2736                         tcpsendka(s);
2737                         tcpgo(s->p->priv, &tcb->katimer);
2738                 }
2739         }
2740         qunlock(&s->qlock);
2741         poperror();
2742 }
2743
2744 /*
2745  *  start keepalive timer
2746  */
2747 static void tcpstartka(struct conv *s, char **f, int n)
2748 {
2749         Tcpctl *tcb;
2750         int x;
2751
2752         tcb = (Tcpctl *) s->ptcl;
2753         if (tcb->state != Established)
2754                 error(ENOTCONN, "connection must be in Establised state");
2755         if (n > 1) {
2756                 x = atoi(f[1]);
2757                 if (x >= MSPTICK)
2758                         tcb->katimer.start = x / MSPTICK;
2759         }
2760         tcpsetkacounter(tcb);
2761         tcpgo(s->p->priv, &tcb->katimer);
2762 }
2763
2764 /*
2765  *  turn checksums on/off
2766  */
2767 static void tcpsetchecksum(struct conv *s, char **f, int unused)
2768 {
2769         Tcpctl *tcb;
2770
2771         tcb = (Tcpctl *) s->ptcl;
2772         tcb->nochecksum = !atoi(f[1]);
2773 }
2774
2775 void tcprxmit(struct conv *s)
2776 {
2777         Tcpctl *tcb;
2778
2779         tcb = (Tcpctl *) s->ptcl;
2780
2781         tcb->flags |= RETRAN | FORCE;
2782         tcb->snd.ptr = tcb->snd.una;
2783
2784         /*
2785          *  We should be halving the slow start threshhold (down to one
2786          *  mss) but leaving it at mss seems to work well enough
2787          */
2788         tcb->ssthresh = tcb->mss;
2789
2790         /*
2791          *  pull window down to a single packet
2792          */
2793         tcb->cwind = tcb->mss;
2794         tcpoutput(s);
2795 }
2796
2797 void tcptimeout(void *arg)
2798 {
2799         ERRSTACK(1);
2800         struct conv *s;
2801         Tcpctl *tcb;
2802         int maxback;
2803         struct tcppriv *tpriv;
2804
2805         s = (struct conv *)arg;
2806         tpriv = s->p->priv;
2807         tcb = (Tcpctl *) s->ptcl;
2808
2809         qlock(&s->qlock);
2810         if (waserror()) {
2811                 qunlock(&s->qlock);
2812                 nexterror();
2813         }
2814         switch (tcb->state) {
2815                 default:
2816                         tcb->backoff++;
2817                         if (tcb->state == Syn_sent)
2818                                 maxback = MAXBACKMS / 2;
2819                         else
2820                                 maxback = MAXBACKMS;
2821                         tcb->backedoff += tcb->timer.start * MSPTICK;
2822                         if (tcb->backedoff >= maxback) {
2823                                 localclose(s, errno_to_string(ETIMEDOUT));
2824                                 break;
2825                         }
2826                         netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lx %llu/%llu\n",
2827                                    tcb->snd.una, tcb->timer.start, NOW);
2828                         tcpsettimer(tcb);
2829                         tcprxmit(s);
2830                         tpriv->stats[RetransTimeouts]++;
2831                         tcb->snd.dupacks = 0;
2832                         break;
2833                 case Time_wait:
2834                         localclose(s, NULL);
2835                         break;
2836                 case Closed:
2837                         break;
2838         }
2839         qunlock(&s->qlock);
2840         poperror();
2841 }
2842
2843 int inwindow(Tcpctl * tcb, int seq)
2844 {
2845         return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1);
2846 }
2847
2848 /*
2849  *  set up state for a received SYN (or SYN ACK) packet
2850  */
2851 void procsyn(struct conv *s, Tcp * seg)
2852 {
2853         Tcpctl *tcb;
2854
2855         tcb = (Tcpctl *) s->ptcl;
2856         tcb->flags |= FORCE;
2857
2858         tcb->rcv.nxt = seg->seq + 1;
2859         tcb->rcv.urg = tcb->rcv.nxt;
2860         tcb->irs = seg->seq;
2861
2862         /* our sending max segment size cannot be bigger than what he asked for */
2863         if (seg->mss != 0 && seg->mss < tcb->mss)
2864                 tcb->mss = seg->mss;
2865
2866         /* the congestion window always starts out as a single segment */
2867         tcb->snd.wnd = seg->wnd;
2868         tcb->cwind = tcb->mss;
2869 }
2870
2871 int
2872 addreseq(Tcpctl * tcb, struct tcppriv *tpriv, Tcp * seg,
2873                  struct block *bp, uint16_t length)
2874 {
2875         Reseq *rp, *rp1;
2876         int i, rqlen, qmax;
2877
2878         rp = kzmalloc(sizeof(Reseq), 0);
2879         if (rp == NULL) {
2880                 freeblist(bp);  /* bp always consumed by add_reseq */
2881                 return 0;
2882         }
2883
2884         rp->seg = *seg;
2885         rp->bp = bp;
2886         rp->length = length;
2887
2888         /* Place on reassembly list sorting by starting seq number */
2889         rp1 = tcb->reseq;
2890         if (rp1 == NULL || seq_lt(seg->seq, rp1->seg.seq)) {
2891                 rp->next = rp1;
2892                 tcb->reseq = rp;
2893                 if (rp->next != NULL)
2894                         tpriv->stats[OutOfOrder]++;
2895                 return 0;
2896         }
2897
2898         rqlen = 0;
2899         for (i = 0;; i++) {
2900                 rqlen += rp1->length;
2901                 if (rp1->next == NULL || seq_lt(seg->seq, rp1->next->seg.seq)) {
2902                         rp->next = rp1->next;
2903                         rp1->next = rp;
2904                         if (rp->next != NULL)
2905                                 tpriv->stats[OutOfOrder]++;
2906                         break;
2907                 }
2908                 rp1 = rp1->next;
2909         }
2910         qmax = QMAX << tcb->rcv.scale;
2911         if (rqlen > qmax) {
2912                 printd("resequence queue > window: %d > %d\n", rqlen, qmax);
2913                 i = 0;
2914                 for (rp1 = tcb->reseq; rp1 != NULL; rp1 = rp1->next) {
2915                         printd("0x%#lx 0x%#lx 0x%#x\n", rp1->seg.seq,
2916                                    rp1->seg.ack, rp1->seg.flags);
2917                         if (i++ > 10) {
2918                                 printd("...\n");
2919                                 break;
2920                         }
2921                 }
2922
2923                 // delete entire reassembly queue; wait for retransmit.
2924                 // - should we be smarter and only delete the tail?
2925                 for (rp = tcb->reseq; rp != NULL; rp = rp1) {
2926                         rp1 = rp->next;
2927                         freeblist(rp->bp);
2928                         kfree(rp);
2929                 }
2930                 tcb->reseq = NULL;
2931
2932                 return -1;
2933         }
2934         return 0;
2935 }
2936
2937 void getreseq(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2938 {
2939         Reseq *rp;
2940
2941         rp = tcb->reseq;
2942         if (rp == NULL)
2943                 return;
2944
2945         tcb->reseq = rp->next;
2946
2947         *seg = rp->seg;
2948         *bp = rp->bp;
2949         *length = rp->length;
2950
2951         kfree(rp);
2952 }
2953
2954 int tcptrim(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2955 {
2956         uint16_t len;
2957         uint8_t accept;
2958         int dupcnt, excess;
2959
2960         accept = 0;
2961         len = *length;
2962         if (seg->flags & SYN)
2963                 len++;
2964         if (seg->flags & FIN)
2965                 len++;
2966
2967         if (tcb->rcv.wnd == 0) {
2968                 if (len == 0 && seg->seq == tcb->rcv.nxt)
2969                         return 0;
2970         } else {
2971                 /* Some part of the segment should be in the window */
2972                 if (inwindow(tcb, seg->seq))
2973                         accept++;
2974                 else if (len != 0) {
2975                         if (inwindow(tcb, seg->seq + len - 1) ||
2976                                 seq_within(tcb->rcv.nxt, seg->seq, seg->seq + len - 1))
2977                                 accept++;
2978                 }
2979         }
2980         if (!accept) {
2981                 freeblist(*bp);
2982                 return -1;
2983         }
2984         dupcnt = tcb->rcv.nxt - seg->seq;
2985         if (dupcnt > 0) {
2986                 tcb->rerecv += dupcnt;
2987                 if (seg->flags & SYN) {
2988                         seg->flags &= ~SYN;
2989                         seg->seq++;
2990
2991                         if (seg->urg > 1)
2992                                 seg->urg--;
2993                         else
2994                                 seg->flags &= ~URG;
2995                         dupcnt--;
2996                 }
2997                 if (dupcnt > 0) {
2998                         pullblock(bp, (uint16_t) dupcnt);
2999                         seg->seq += dupcnt;
3000                         *length -= dupcnt;
3001
3002                         if (seg->urg > dupcnt)
3003                                 seg->urg -= dupcnt;
3004                         else {
3005                                 seg->flags &= ~URG;
3006                                 seg->urg = 0;
3007                         }
3008                 }
3009         }
3010         excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3011         if (excess > 0) {
3012                 tcb->rerecv += excess;
3013                 *length -= excess;
3014                 *bp = trimblock(*bp, 0, *length);
3015                 if (*bp == NULL)
3016                         panic("presotto is a boofhead");
3017                 seg->flags &= ~FIN;
3018         }
3019         return 0;
3020 }
3021
3022 void tcpadvise(struct Proto *tcp, struct block *bp, char *msg)
3023 {
3024         Tcp4hdr *h4;
3025         Tcp6hdr *h6;
3026         Tcpctl *tcb;
3027         uint8_t source[IPaddrlen];
3028         uint8_t dest[IPaddrlen];
3029         uint16_t psource, pdest;
3030         struct conv *s, **p;
3031
3032         h4 = (Tcp4hdr *) (bp->rp);
3033         h6 = (Tcp6hdr *) (bp->rp);
3034
3035         if ((h4->vihl & 0xF0) == IP_VER4) {
3036                 v4tov6(dest, h4->tcpdst);
3037                 v4tov6(source, h4->tcpsrc);
3038                 psource = nhgets(h4->tcpsport);
3039                 pdest = nhgets(h4->tcpdport);
3040         } else {
3041                 ipmove(dest, h6->tcpdst);
3042                 ipmove(source, h6->tcpsrc);
3043                 psource = nhgets(h6->tcpsport);
3044                 pdest = nhgets(h6->tcpdport);
3045         }
3046
3047         /* Look for a connection */
3048         qlock(&tcp->qlock);
3049         for (p = tcp->conv; *p; p++) {
3050                 s = *p;
3051                 tcb = (Tcpctl *) s->ptcl;
3052                 if (s->rport == pdest)
3053                         if (s->lport == psource)
3054                                 if (tcb->state != Closed)
3055                                         if (ipcmp(s->raddr, dest) == 0)
3056                                                 if (ipcmp(s->laddr, source) == 0) {
3057                                                         qlock(&s->qlock);
3058                                                         qunlock(&tcp->qlock);
3059                                                         switch (tcb->state) {
3060                                                                 case Syn_sent:
3061                                                                         localclose(s, msg);
3062                                                                         break;
3063                                                         }
3064                                                         qunlock(&s->qlock);
3065                                                         freeblist(bp);
3066                                                         return;
3067                                                 }
3068         }
3069         qunlock(&tcp->qlock);
3070         freeblist(bp);
3071 }
3072
3073 static void tcpporthogdefensectl(char *val)
3074 {
3075         if (strcmp(val, "on") == 0)
3076                 tcpporthogdefense = 1;
3077         else if (strcmp(val, "off") == 0)
3078                 tcpporthogdefense = 0;
3079         else
3080                 error(EINVAL, "unknown value for tcpporthogdefense");
3081 }
3082
3083 /* called with c qlocked */
3084 static void tcpctl(struct conv *c, char **f, int n)
3085 {
3086         if (n == 1 && strcmp(f[0], "hangup") == 0)
3087                 tcphangup(c);
3088         else if (n >= 1 && strcmp(f[0], "keepalive") == 0)
3089                 tcpstartka(c, f, n);
3090         else if (n >= 1 && strcmp(f[0], "checksum") == 0)
3091                 tcpsetchecksum(c, f, n);
3092         else if (n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3093                 tcpporthogdefensectl(f[1]);
3094         else
3095                 error(EINVAL, "unknown command to %s", __func__);
3096 }
3097
3098 int tcpstats(struct Proto *tcp, char *buf, int len)
3099 {
3100         struct tcppriv *priv;
3101         char *p, *e;
3102         int i;
3103
3104         priv = tcp->priv;
3105         p = buf;
3106         e = p + len;
3107         for (i = 0; i < Nstats; i++)
3108                 p = seprintf(p, e, "%s: %u\n", statnames[i], priv->stats[i]);
3109         return p - buf;
3110 }
3111
3112 /*
3113  *  garbage collect any stale conversations:
3114  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3115  *      - Finwait2 after 5 minutes
3116  *
3117  *  this is called whenever we run out of channels.  Both checks are
3118  *  of questionable validity so we try to use them only when we're
3119  *  up against the wall.
3120  */
3121 int tcpgc(struct Proto *tcp)
3122 {
3123         struct conv *c, **pp, **ep;
3124         int n;
3125         Tcpctl *tcb;
3126
3127         n = 0;
3128         ep = &tcp->conv[tcp->nc];
3129         for (pp = tcp->conv; pp < ep; pp++) {
3130                 c = *pp;
3131                 if (c == NULL)
3132                         break;
3133                 if (!canqlock(&c->qlock))
3134                         continue;
3135                 tcb = (Tcpctl *) c->ptcl;
3136                 switch (tcb->state) {
3137                         case Syn_received:
3138                                 if (NOW - tcb->time > 5000) {
3139                                         localclose(c, "timed out");
3140                                         n++;
3141                                 }
3142                                 break;
3143                         case Finwait2:
3144                                 if (NOW - tcb->time > 5 * 60 * 1000) {
3145                                         localclose(c, "timed out");
3146                                         n++;
3147                                 }
3148                                 break;
3149                 }
3150                 qunlock(&c->qlock);
3151         }
3152         return n;
3153 }
3154
3155 void tcpsettimer(Tcpctl * tcb)
3156 {
3157         int x;
3158
3159         /* round trip dependency */
3160         x = backoff(tcb->backoff) *
3161                 (tcb->mdev + (tcb->srtt >> LOGAGAIN) + MSPTICK) / MSPTICK;
3162
3163         /* bounded twixt 1/2 and 64 seconds */
3164         if (x < 500 / MSPTICK)
3165                 x = 500 / MSPTICK;
3166         else if (x > (64000 / MSPTICK))
3167                 x = 64000 / MSPTICK;
3168         tcb->timer.start = x;
3169 }
3170
3171 void tcpinit(struct Fs *fs)
3172 {
3173         struct Proto *tcp;
3174         struct tcppriv *tpriv;
3175
3176         tcp = kzmalloc(sizeof(struct Proto), 0);
3177         tpriv = tcp->priv = kzmalloc(sizeof(struct tcppriv), 0);
3178         qlock_init(&tpriv->tl);
3179         qlock_init(&tpriv->apl);
3180         tcp->name = "tcp";
3181         tcp->connect = tcpconnect;