Remove uses of errno_to_string()
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 #include <vfs.h>
44 #include <kfs.h>
45 #include <slab.h>
46 #include <kmalloc.h>
47 #include <kref.h>
48 #include <string.h>
49 #include <stdio.h>
50 #include <assert.h>
51 #include <error.h>
52 #include <cpio.h>
53 #include <pmap.h>
54 #include <smp.h>
55 #include <ip.h>
56
/*
 * Protocol-wide constants: header geometry, timer parameters, TCP header
 * flag bits, option codes, per-connection control flags and the connection
 * state numbers.
 */
enum {
	QMAX = 64 * 1024 - 1,	/* read queue limit and default receive window */
	IP_TCPPROTO = 6,	/* protocol number placed in the prototype headers */

	/* byte counts describing the IPv4 header layout (see Tcp4hdr) */
	TCP4_IPLEN = 8,
	TCP4_PHDRSIZE = 12,
	TCP4_HDRSIZE = 20,
	TCP4_TCBPHDRSZ = 40,
	TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE,

	/* byte counts describing the IPv6 header layout (see Tcp6hdr) */
	TCP6_IPLEN = 0,
	TCP6_PHDRSIZE = 40,
	TCP6_HDRSIZE = 20,
	TCP6_TCBPHDRSZ = 60,
	TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE,

	/* values of Tcptimer.state */
	TcptimerOFF = 0,
	TcptimerON = 1,
	TcptimerDONE = 2,
	MAX_TIME = (1 << 20),	/* timer reload value meaning "forever" */
	TCP_ACK = 50,		/* ack timer period, in ms */
	MAXBACKMS = 9 * 60 * 1000,	/* longest backoff time (ms) before hangup */

	/* flag bits of the TCP header */
	URG = 0x20,		/* urgent data present */
	ACK = 0x10,		/* acknowledgement field is valid */
	PSH = 0x08,		/* push */
	RST = 0x04,		/* reset the connection */
	SYN = 0x02,		/* synchronise sequence numbers */
	FIN = 0x01,		/* sender is closing */

	/* TCP option kinds and option lengths */
	EOLOPT = 0,
	NOOPOPT = 1,
	MSSOPT = 2,
	MSS_LENGTH = 4,		/* length of the maximum-segment-size option */
	WSOPT = 3,
	WS_LENGTH = 3,		/* length of the window-scale option */
	MSL2 = 10,
	MSPTICK = 50,		/* milliseconds per timer tick */
	DEF_MSS = 1460,		/* default maximum segment size */
	DEF_MSS6 = 1280,	/* default (minimum) maximum segment size for v6 */
	DEF_RTT = 500,		/* default round trip time */
	DEF_KAT = 120000,	/* default time (ms) between keep alives */
	TCP_LISTEN = 0,		/* tcpstart() mode: listening connection */
	TCP_CONNECT = 1,	/* tcpstart() mode: outgoing connection */
	SYNACK_RXTIMER = 250,	/* ms between SYNACK retransmits */

	TCPREXMTTHRESH = 3,	/* duplicate-ack threshold for retransmit */

	/* control flag bits (Tcpctl.flags and related flag bytes) */
	FORCE = 1,
	CLONE = 2,
	RETRAN = 4,
	ACTIVE = 8,
	SYNACK = 16,
	TSO = 32,

	/* shift counts used when smoothing the round trip estimate */
	LOGAGAIN = 3,
	LOGDGAIN = 2,

	/* connection states; tcpstates[] must list them in this order */
	Closed = 0,
	Listen = 1,
	Syn_sent = 2,
	Syn_received = 3,
	Established = 4,
	Finwait1 = 5,
	Finwait2 = 6,
	Close_wait = 7,
	Closing = 8,
	Last_ack = 9,
	Time_wait = 10,

	Maxlimbo = 1000,	/* maximum procs waiting for response to SYN ACK */
	NLHT = 256,		/* limbo hash table size, must be a power of 2 */
	LHTMASK = NLHT - 1,

	HaveWS = 1 << 8,	/* or'ed into a scale value by tcpmtu() */
};
133
/*
 * Printable name of every connection state, indexed by the state value.
 * The order must match the connection state constants (Closed == 0
 * through Time_wait == 10).
 */
char *tcpstates[] = {
	"Closed",		/* 0 */
	"Listen",		/* 1 */
	"Syn_sent",		/* 2 */
	"Syn_received",		/* 3 */
	"Established",		/* 4 */
	"Finwait1",		/* 5 */
	"Finwait2",		/* 6 */
	"Close_wait",		/* 7 */
	"Closing",		/* 8 */
	"Last_ack",		/* 9 */
	"Time_wait"		/* 10 */
};
140
/*
 * One countdown timer.  Running timers are linked through next/prev on the
 * per-protocol list (tcppriv.timers); tcpackproc() decrements count once per
 * tick and calls func(arg) when it reaches zero.
 */
typedef struct Tcptimer {
	struct Tcptimer *next;		/* next timer on the active list */
	struct Tcptimer *prev;		/* previous timer on the active list */
	struct Tcptimer *readynext;	/* chains timers that expired in one tick */
	int state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	uint64_t start;			/* reload value; copied to count by tcpgo() */
	uint64_t count;			/* ticks left until expiry */
	void (*func) (void *);		/* expiry handler */
	void *arg;			/* argument handed to func */
} Tcptimer;
152
/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 */

/*
 * IPv4 layout: 8 bytes of IP header fields (TCP4_IPLEN), the 12 byte
 * pseudo header (TCP4_PHDRSIZE), then the 20 byte TCP header
 * (TCP4_HDRSIZE) followed by the options.
 */
typedef struct Tcp4hdr {
	uint8_t vihl;		/* version and header length */
	uint8_t tos;		/* type of service */
	uint8_t length[2];	/* packet length */
	uint8_t id[2];		/* identification */
	uint8_t frag[2];	/* fragment information */
	uint8_t Unused;
	uint8_t proto;
	uint8_t tcplen[2];
	uint8_t tcpsrc[4];
	uint8_t tcpdst[4];
	uint8_t tcpsport[2];
	uint8_t tcpdport[2];
	uint8_t tcpseq[4];
	uint8_t tcpack[4];
	uint8_t tcpflag[2];
	uint8_t tcpwin[2];
	uint8_t tcpcksum[2];
	uint8_t tcpurg[2];
	uint8_t tcpopt[1];	/* first byte of the options segment */
} Tcp4hdr;
180
181 typedef struct Tcp6hdr Tcp6hdr;
182 struct Tcp6hdr {
183         uint8_t vcf[4];
184         uint8_t ploadlen[2];
185         uint8_t proto;
186         uint8_t ttl;
187         uint8_t tcpsrc[IPaddrlen];
188         uint8_t tcpdst[IPaddrlen];
189         uint8_t tcpsport[2];
190         uint8_t tcpdport[2];
191         uint8_t tcpseq[4];
192         uint8_t tcpack[4];
193         uint8_t tcpflag[2];
194         uint8_t tcpwin[2];
195         uint8_t tcpcksum[2];
196         uint8_t tcpurg[2];
197         /* Options segment */
198         uint8_t tcpopt[1];
199 };
200
/*
 *  Control information for a single packet, in host form.  It is
 *  extracted from a packet by ntohtcp{4,6}() and written into a
 *  packet by htontcp{4,6}().
 */
typedef struct Tcp {
	uint16_t source;	/* source port */
	uint16_t dest;		/* destination port */
	uint32_t seq;		/* sequence number */
	uint32_t ack;		/* acknowledgement number */
	uint8_t flags;		/* header flag bits (URG, ACK, ...) */
	uint16_t ws;		/* window scale option (if not zero) */
	uint32_t wnd;		/* window */
	uint16_t urg;		/* urgent pointer */
	uint16_t mss;		/* max segment size option (if not zero) */
	uint16_t len;		/* size of data */
} Tcp;
220
221 /*
222  *  this header is malloc'd to thread together fragments
223  *  waiting to be coalesced
224  */
225 typedef struct Reseq Reseq;
226 struct Reseq {
227         Reseq *next;
228         Tcp seg;
229         struct block *bp;
230         uint16_t length;
231 };
232
233 /*
234  *  the qlock in the Conv locks this structure
235  */
236 typedef struct Tcpctl Tcpctl;
237 struct Tcpctl {
238         uint8_t state;                          /* Connection state */
239         uint8_t type;                           /* Listening or active connection */
240         uint8_t code;                           /* Icmp code */
241         struct {
242                 uint32_t una;                   /* Unacked data pointer */
243                 uint32_t nxt;                   /* Next sequence expected */
244                 uint32_t ptr;                   /* Data pointer */
245                 uint32_t wnd;                   /* Tcp send window */
246                 uint32_t urg;                   /* Urgent data pointer */
247                 uint32_t wl2;
248                 int scale;                              /* how much to right shift window in xmitted packets */
249                 /* to implement tahoe and reno TCP */
250                 uint32_t dupacks;               /* number of duplicate acks rcvd */
251                 int recovery;                   /* loss recovery flag */
252                 uint32_t rxt;                   /* right window marker for recovery */
253         } snd;
254         struct {
255                 uint32_t nxt;                   /* Receive pointer to next uint8_t slot */
256                 uint32_t wnd;                   /* Receive window incoming */
257                 uint32_t urg;                   /* Urgent pointer */
258                 int blocked;
259                 int una;                                /* unacked data segs */
260                 int scale;                              /* how much to left shift window in rcved packets */
261         } rcv;
262         uint32_t iss;                           /* Initial sequence number */
263         int sawwsopt;                           /* true if we saw a wsopt on the incoming SYN */
264         uint32_t cwind;                         /* Congestion window */
265         int scale;                                      /* desired snd.scale */
266         uint16_t ssthresh;                      /* Slow start threshold */
267         int resent;                                     /* Bytes just resent */
268         int irs;                                        /* Initial received squence */
269         uint16_t mss;                           /* Mean segment size */
270         int rerecv;                                     /* Overlap of data rerecevived */
271         uint32_t window;                        /* Recevive window */
272         uint8_t backoff;                        /* Exponential backoff counter */
273         int backedoff;                          /* ms we've backed off for rexmits */
274         uint8_t flags;                          /* State flags */
275         Reseq *reseq;                           /* Resequencing queue */
276         Tcptimer timer;                         /* Activity timer */
277         Tcptimer acktimer;                      /* Acknowledge timer */
278         Tcptimer rtt_timer;                     /* Round trip timer */
279         Tcptimer katimer;                       /* keep alive timer */
280         uint32_t rttseq;                        /* Round trip sequence */
281         int srtt;                                       /* Shortened round trip */
282         int mdev;                                       /* Mean deviation of round trip */
283         int kacounter;                          /* count down for keep alive */
284         uint64_t sndsyntime;            /* time syn sent */
285         uint64_t time;                          /* time Finwait2 or Syn_received was sent */
286         int nochecksum;                         /* non-zero means don't send checksums */
287         int flgcnt;                                     /* number of flags in the sequence (FIN,SEQ) */
288
289         union {
290                 Tcp4hdr tcp4hdr;
291                 Tcp6hdr tcp6hdr;
292         } protohdr;                                     /* prototype header */
293 };
294
295 /*
296  *  New calls are put in limbo rather than having a conversation structure
297  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
298  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
299  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
300  *
301  *  In particular they aren't on a listener's queue so that they don't figure
302  *  in the input queue limit.
303  *
304  *  If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue
305  *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
306  *  there is no hashing of this list.
307  */
308 typedef struct Limbo Limbo;
309 struct Limbo {
310         Limbo *next;
311
312         uint8_t laddr[IPaddrlen];
313         uint8_t raddr[IPaddrlen];
314         uint16_t lport;
315         uint16_t rport;
316         uint32_t irs;                           /* initial received sequence */
317         uint32_t iss;                           /* initial sent sequence */
318         uint16_t mss;                           /* mss from the other end */
319         uint16_t rcvscale;                      /* how much to scale rcvd windows */
320         uint16_t sndscale;                      /* how much to scale sent windows */
321         uint64_t lastsend;                      /* last time we sent a synack */
322         uint8_t version;                        /* v4 or v6 */
323         uint8_t rexmits;                        /* number of retransmissions */
324 };
325
326 int tcp_irtt = DEF_RTT;                 /* Initial guess at round trip time */
327 uint16_t tcp_mss = DEF_MSS;             /* Maximum segment size to be sent */
328
/* Indices into tcppriv.stats[]; statnames[] holds the matching labels. */
enum {
	/* MIB stats */
	MaxConn = 0,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats			/* number of counters; keep last */
};
351
352 static char *statnames[] = {
353         [MaxConn] "MaxConn",
354         [ActiveOpens] "ActiveOpens",
355         [PassiveOpens] "PassiveOpens",
356         [EstabResets] "EstabResets",
357         [CurrEstab] "CurrEstab",
358         [InSegs] "InSegs",
359         [OutSegs] "OutSegs",
360         [RetransSegs] "RetransSegs",
361         [RetransTimeouts] "RetransTimeouts",
362         [InErrs] "InErrs",
363         [OutRsts] "OutRsts",
364         [CsumErrs] "CsumErrs",
365         [HlenErrs] "HlenErrs",
366         [LenErrs] "LenErrs",
367         [OutOfOrder] "OutOfOrder",
368 };
369
370 typedef struct Tcppriv Tcppriv;
371 struct tcppriv {
372         /* List of active timers */
373         qlock_t tl;
374         Tcptimer *timers;
375
376         /* hash table for matching conversations */
377         struct Ipht ht;
378
379         /* calls in limbo waiting for an ACK to our SYN ACK */
380         int nlimbo;
381         Limbo *lht[NLHT];
382
383         /* for keeping track of tcpackproc */
384         qlock_t apl;
385         int ackprocstarted;
386
387         uint32_t stats[Nstats];
388 };
389
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
400
401 int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, uint16_t);
402 void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
403 void localclose(struct conv *, char *unused_char_p_t);
404 void procsyn(struct conv *, Tcp *);
405 void tcpiput(struct Proto *, struct Ipifc *, struct block *);
406 void tcpoutput(struct conv *);
407 int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
408 void tcpstart(struct conv *, int);
409 void tcptimeout(void *);
410 void tcpsndsyn(struct conv *, Tcpctl *);
411 void tcprcvwin(struct conv *);
412 void tcpacktimer(void *);
413 void tcpkeepalive(void *);
414 void tcpsetkacounter(Tcpctl *);
415 void tcprxmit(struct conv *);
416 void tcpsettimer(Tcpctl *);
417 void tcpsynackrtt(struct conv *);
418 void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
419
420 static void limborexmit(struct Proto *);
421 static void limbo(struct conv *, uint8_t * unused_uint8_p_t, uint8_t *, Tcp *,
422                                   int);
423
424 void tcpsetstate(struct conv *s, uint8_t newstate)
425 {
426         Tcpctl *tcb;
427         uint8_t oldstate;
428         struct tcppriv *tpriv;
429
430         tpriv = s->p->priv;
431
432         tcb = (Tcpctl *) s->ptcl;
433
434         oldstate = tcb->state;
435         if (oldstate == newstate)
436                 return;
437
438         if (oldstate == Established)
439                 tpriv->stats[CurrEstab]--;
440         if (newstate == Established)
441                 tpriv->stats[CurrEstab]++;
442
443         /**
444         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
445                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
446         **/
447
448         switch (newstate) {
449                 case Closed:
450                         qclose(s->rq);
451                         qclose(s->wq);
452                         qclose(s->eq);
453                         break;
454
455                 case Close_wait:        /* Remote closes */
456                         qhangup(s->rq, NULL);
457                         break;
458         }
459
460         tcb->state = newstate;
461
462         if (oldstate == Syn_sent && newstate != Closed)
463                 Fsconnected(s, NULL);
464 }
465
466 static void tcpconnect(struct conv *c, char **argv, int argc)
467 {
468         Fsstdconnect(c, argv, argc);
469         tcpstart(c, TCP_CONNECT);
470 }
471
472 static int tcpstate(struct conv *c, char *state, int n)
473 {
474         Tcpctl *s;
475
476         s = (Tcpctl *) (c->ptcl);
477
478         return snprintf(state, n,
479                                         "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
480                                         tcpstates[s->state],
481                                         c->rq ? qlen(c->rq) : 0,
482                                         c->wq ? qlen(c->wq) : 0,
483                                         s->srtt, s->mdev,
484                                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
485                                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
486                                         s->katimer.start, s->katimer.count);
487 }
488
489 static int tcpinuse(struct conv *c)
490 {
491         Tcpctl *s;
492
493         s = (Tcpctl *) (c->ptcl);
494         return s->state != Closed;
495 }
496
497 static void tcpannounce(struct conv *c, char **argv, int argc)
498 {
499         Fsstdannounce(c, argv, argc);
500         tcpstart(c, TCP_LISTEN);
501         Fsconnected(c, NULL);
502 }
503
504 /*
505  *  tcpclose is always called with the q locked
506  */
507 static void tcpclose(struct conv *c)
508 {
509         Tcpctl *tcb;
510
511         tcb = (Tcpctl *) c->ptcl;
512
513         qhangup(c->rq, NULL);
514         qhangup(c->wq, NULL);
515         qhangup(c->eq, NULL);
516         qflush(c->rq);
517
518         switch (tcb->state) {
519                 case Listen:
520                         /*
521                          *  reset any incoming calls to this listener
522                          */
523                         Fsconnected(c, "Hangup");
524
525                         localclose(c, NULL);
526                         break;
527                 case Closed:
528                 case Syn_sent:
529                         localclose(c, NULL);
530                         break;
531                 case Syn_received:
532                 case Established:
533                         tcb->flgcnt++;
534                         tcb->snd.nxt++;
535                         tcpsetstate(c, Finwait1);
536                         tcpoutput(c);
537                         break;
538                 case Close_wait:
539                         tcb->flgcnt++;
540                         tcb->snd.nxt++;
541                         tcpsetstate(c, Last_ack);
542                         tcpoutput(c);
543                         break;
544         }
545 }
546
547 void tcpkick(void *x)
548 {
549         ERRSTACK(1);
550         struct conv *s = x;
551         Tcpctl *tcb;
552
553         tcb = (Tcpctl *) s->ptcl;
554
555         qlock(&s->qlock);
556         if (waserror()) {
557                 qunlock(&s->qlock);
558                 nexterror();
559         }
560
561         switch (tcb->state) {
562                 case Syn_sent:
563                 case Syn_received:
564                 case Established:
565                 case Close_wait:
566                         /*
567                          * Push data
568                          */
569                         tcprcvwin(s);
570                         tcpoutput(s);
571                         break;
572                 default:
573                         localclose(s, "Hangup");
574                         break;
575         }
576
577         qunlock(&s->qlock);
578         poperror();
579 }
580
581 void tcprcvwin(struct conv *s)
582 {       /* Call with tcb locked */
583         int w;
584         Tcpctl *tcb;
585
586         tcb = (Tcpctl *) s->ptcl;
587         w = tcb->window - qlen(s->rq);
588         if (w < 0)
589                 w = 0;
590         tcb->rcv.wnd = w;
591         if (w == 0)
592                 tcb->rcv.blocked = 1;
593 }
594
595 void tcpacktimer(void *v)
596 {
597         ERRSTACK(1);
598         Tcpctl *tcb;
599         struct conv *s;
600
601         s = v;
602         tcb = (Tcpctl *) s->ptcl;
603
604         qlock(&s->qlock);
605         if (waserror()) {
606                 qunlock(&s->qlock);
607                 nexterror();
608         }
609         if (tcb->state != Closed) {
610                 tcb->flags |= FORCE;
611                 tcprcvwin(s);
612                 tcpoutput(s);
613         }
614         qunlock(&s->qlock);
615         poperror();
616 }
617
618 static void tcpcreate(struct conv *c)
619 {
620         c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
621         c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
622 }
623
624 static void timerstate(struct tcppriv *priv, Tcptimer * t, int newstate)
625 {
626         if (newstate != TcptimerON) {
627                 if (t->state == TcptimerON) {
628                         // unchain
629                         if (priv->timers == t) {
630                                 priv->timers = t->next;
631                                 if (t->prev != NULL)
632                                         panic("timerstate1");
633                         }
634                         if (t->next)
635                                 t->next->prev = t->prev;
636                         if (t->prev)
637                                 t->prev->next = t->next;
638                         t->next = t->prev = NULL;
639                 }
640         } else {
641                 if (t->state != TcptimerON) {
642                         // chain
643                         if (t->prev != NULL || t->next != NULL)
644                                 panic("timerstate2");
645                         t->prev = NULL;
646                         t->next = priv->timers;
647                         if (t->next)
648                                 t->next->prev = t;
649                         priv->timers = t;
650                 }
651         }
652         t->state = newstate;
653 }
654
655 void tcpackproc(void *a)
656 {
657         ERRSTACK(1);
658         Tcptimer *t, *tp, *timeo;
659         struct Proto *tcp;
660         struct tcppriv *priv;
661         int loop;
662
663         tcp = a;
664         priv = tcp->priv;
665
666         for (;;) {
667                 kthread_usleep(MSPTICK * 1000);
668
669                 qlock(&priv->tl);
670                 timeo = NULL;
671                 loop = 0;
672                 for (t = priv->timers; t != NULL; t = tp) {
673                         if (loop++ > 10000)
674                                 panic("tcpackproc1");
675                         tp = t->next;
676                         if (t->state == TcptimerON) {
677                                 t->count--;
678                                 if (t->count == 0) {
679                                         timerstate(priv, t, TcptimerDONE);
680                                         t->readynext = timeo;
681                                         timeo = t;
682                                 }
683                         }
684                 }
685                 qunlock(&priv->tl);
686
687                 loop = 0;
688                 for (t = timeo; t != NULL; t = t->readynext) {
689                         if (loop++ > 10000)
690                                 panic("tcpackproc2");
691                         if (t->state == TcptimerDONE && t->func != NULL) {
692                                 /* discard error style */
693                                 if (!waserror())
694                                         (*t->func) (t->arg);
695                                 poperror();
696                         }
697                 }
698
699                 limborexmit(tcp);
700         }
701 }
702
703 void tcpgo(struct tcppriv *priv, Tcptimer * t)
704 {
705         if (t == NULL || t->start == 0)
706                 return;
707
708         qlock(&priv->tl);
709         t->count = t->start;
710         timerstate(priv, t, TcptimerON);
711         qunlock(&priv->tl);
712 }
713
714 void tcphalt(struct tcppriv *priv, Tcptimer * t)
715 {
716         if (t == NULL)
717                 return;
718
719         qlock(&priv->tl);
720         timerstate(priv, t, TcptimerOFF);
721         qunlock(&priv->tl);
722 }
723
/*
 * Return the exponential backoff multiplier 2^n.
 *
 * Shifting an int by a negative count, or far enough to reach the sign
 * bit, is undefined behaviour, so the exponent is clamped: a negative n
 * yields 1 and a large n saturates at the biggest power of two an int
 * can hold.
 */
int backoff(int n)
{
	/* highest exponent whose result still fits in a positive int */
	const int max_shift = (int)(sizeof(int) * 8) - 2;

	if (n < 0)
		n = 0;
	else if (n > max_shift)
		n = max_shift;

	return 1 << n;
}
728
729 void localclose(struct conv *s, char *reason)
730 {       /* called with tcb locked */
731         Tcpctl *tcb;
732         Reseq *rp, *rp1;
733         struct tcppriv *tpriv;
734
735         tpriv = s->p->priv;
736         tcb = (Tcpctl *) s->ptcl;
737
738         iphtrem(&tpriv->ht, s);
739
740         tcphalt(tpriv, &tcb->timer);
741         tcphalt(tpriv, &tcb->rtt_timer);
742         tcphalt(tpriv, &tcb->acktimer);
743         tcphalt(tpriv, &tcb->katimer);
744
745         /* Flush reassembly queue; nothing more can arrive */
746         for (rp = tcb->reseq; rp != NULL; rp = rp1) {
747                 rp1 = rp->next;
748                 freeblist(rp->bp);
749                 kfree(rp);
750         }
751         tcb->reseq = NULL;
752
753         if (tcb->state == Syn_sent)
754                 Fsconnected(s, reason);
755
756         qhangup(s->rq, reason);
757         qhangup(s->wq, reason);
758
759         tcpsetstate(s, Closed);
760
761         /* listener will check the rq state */
762         if (s->state == Announced)
763                 rendez_wakeup(&s->listenr);
764 }
765
766 /* mtu (- TCP + IP hdr len) of 1st hop */
767 int tcpmtu(struct Proto *tcp, uint8_t * addr, int version, int *scale,
768            uint8_t *flags)
769 {
770         struct Ipifc *ifc;
771         int mtu;
772
773         ifc = findipifc(tcp->f, addr, 0);
774         switch (version) {
775                 default:
776                 case V4:
777                         mtu = DEF_MSS;
778                         if (ifc != NULL)
779                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
780                         break;
781                 case V6:
782                         mtu = DEF_MSS6;
783                         if (ifc != NULL)
784                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
785                         break;
786         }
787         *flags &= ~TSO;
788
789         if (ifc != NULL) {
790                 if (ifc->mbps > 100)
791                         *scale = HaveWS | 3;
792                 else if (ifc->mbps > 10)
793                         *scale = HaveWS | 1;
794                 else
795                         *scale = HaveWS | 0;
796                 if (ifc->feat & NETF_TSO)
797                         *flags |= TSO;
798         } else
799                 *scale = HaveWS | 0;
800
801         return mtu;
802 }
803
804 void inittcpctl(struct conv *s, int mode)
805 {
806         Tcpctl *tcb;
807         Tcp4hdr *h4;
808         Tcp6hdr *h6;
809         int mss;
810
811         tcb = (Tcpctl *) s->ptcl;
812
813         memset(tcb, 0, sizeof(Tcpctl));
814
815         tcb->ssthresh = 65535;
816         tcb->srtt = tcp_irtt << LOGAGAIN;
817         tcb->mdev = 0;
818
819         /* setup timers */
820         tcb->timer.start = tcp_irtt / MSPTICK;
821         tcb->timer.func = tcptimeout;
822         tcb->timer.arg = s;
823         tcb->rtt_timer.start = MAX_TIME;
824         tcb->acktimer.start = TCP_ACK / MSPTICK;
825         tcb->acktimer.func = tcpacktimer;
826         tcb->acktimer.arg = s;
827         tcb->katimer.start = DEF_KAT / MSPTICK;
828         tcb->katimer.func = tcpkeepalive;
829         tcb->katimer.arg = s;
830
831         mss = DEF_MSS;
832
833         /* create a prototype(pseudo) header */
834         if (mode != TCP_LISTEN) {
835                 if (ipcmp(s->laddr, IPnoaddr) == 0)
836                         findlocalip(s->p->f, s->laddr, s->raddr);
837
838                 switch (s->ipversion) {
839                         case V4:
840                                 h4 = &tcb->protohdr.tcp4hdr;
841                                 memset(h4, 0, sizeof(*h4));
842                                 h4->proto = IP_TCPPROTO;
843                                 hnputs(h4->tcpsport, s->lport);
844                                 hnputs(h4->tcpdport, s->rport);
845                                 v6tov4(h4->tcpsrc, s->laddr);
846                                 v6tov4(h4->tcpdst, s->raddr);
847                                 break;
848                         case V6:
849                                 h6 = &tcb->protohdr.tcp6hdr;
850                                 memset(h6, 0, sizeof(*h6));
851                                 h6->proto = IP_TCPPROTO;
852                                 hnputs(h6->tcpsport, s->lport);
853                                 hnputs(h6->tcpdport, s->rport);
854                                 ipmove(h6->tcpsrc, s->laddr);
855                                 ipmove(h6->tcpdst, s->raddr);
856                                 mss = DEF_MSS6;
857                                 break;
858                         default:
859                                 panic("inittcpctl: version %d", s->ipversion);
860                 }
861         }
862
863         tcb->mss = tcb->cwind = mss;
864
865         /* default is no window scaling */
866         tcb->window = QMAX;
867         tcb->rcv.wnd = QMAX;
868         tcb->rcv.scale = 0;
869         tcb->snd.scale = 0;
870         qsetlimit(s->rq, QMAX);
871 }
872
873 /*
874  *  called with s qlocked
875  */
876 void tcpstart(struct conv *s, int mode)
877 {
878         Tcpctl *tcb;
879         struct tcppriv *tpriv;
880         /* tcpackproc needs to free this if it ever exits */
881         char *kpname = kmalloc(KNAMELEN, KMALLOC_WAIT);
882
883         tpriv = s->p->priv;
884
885         if (tpriv->ackprocstarted == 0) {
886                 qlock(&tpriv->apl);
887                 if (tpriv->ackprocstarted == 0) {
888                         snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
889                         ktask(kpname, tcpackproc, s->p);
890                         tpriv->ackprocstarted = 1;
891                 }
892                 qunlock(&tpriv->apl);
893         }
894
895         tcb = (Tcpctl *) s->ptcl;
896
897         inittcpctl(s, mode);
898
899         iphtadd(&tpriv->ht, s);
900         switch (mode) {
901                 case TCP_LISTEN:
902                         tpriv->stats[PassiveOpens]++;
903                         tcb->flags |= CLONE;
904                         tcpsetstate(s, Listen);
905                         break;
906
907                 case TCP_CONNECT:
908                         tpriv->stats[ActiveOpens]++;
909                         tcb->flags |= ACTIVE;
910                         tcpsndsyn(s, tcb);
911                         tcpsetstate(s, Syn_sent);
912                         tcpoutput(s);
913                         break;
914         }
915 }
916
917 static char *tcpflag(uint16_t flag)
918 {
919         static char buf[128];
920
921         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
922         if (flag & URG)
923                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
924         if (flag & ACK)
925                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
926         if (flag & PSH)
927                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
928         if (flag & RST)
929                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
930         if (flag & SYN)
931                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
932         if (flag & FIN)
933                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
934
935         return buf;
936 }
937
938 struct block *htontcp6(Tcp * tcph, struct block *data, Tcp6hdr * ph,
939                                            Tcpctl * tcb)
940 {
941         int dlen;
942         Tcp6hdr *h;
943         uint16_t csum;
944         uint16_t hdrlen, optpad = 0;
945         uint8_t *opt;
946
947         hdrlen = TCP6_HDRSIZE;
948         if (tcph->flags & SYN) {
949                 if (tcph->mss)
950                         hdrlen += MSS_LENGTH;
951                 if (tcph->ws)
952                         hdrlen += WS_LENGTH;
953                 optpad = hdrlen & 3;
954                 if (optpad)
955                         optpad = 4 - optpad;
956                 hdrlen += optpad;
957         }
958
959         if (data) {
960                 dlen = blocklen(data);
961                 data = padblock(data, hdrlen + TCP6_PKT);
962                 if (data == NULL)
963                         return NULL;
964         } else {
965                 dlen = 0;
966                 data = allocb(hdrlen + TCP6_PKT + 64);  /* the 64 pad is to meet mintu's */
967                 if (data == NULL)
968                         return NULL;
969                 data->wp += hdrlen + TCP6_PKT;
970         }
971
972         /* copy in pseudo ip header plus port numbers */
973         h = (Tcp6hdr *) (data->rp);
974         memmove(h, ph, TCP6_TCBPHDRSZ);
975
976         /* compose pseudo tcp header, do cksum calculation */
977         hnputl(h->vcf, hdrlen + dlen);
978         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
979         h->ttl = ph->proto;
980
981         /* copy in variable bits */
982         hnputl(h->tcpseq, tcph->seq);
983         hnputl(h->tcpack, tcph->ack);
984         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
985         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
986         hnputs(h->tcpurg, tcph->urg);
987
988         if (tcph->flags & SYN) {
989                 opt = h->tcpopt;
990                 if (tcph->mss != 0) {
991                         *opt++ = MSSOPT;
992                         *opt++ = MSS_LENGTH;
993                         hnputs(opt, tcph->mss);
994                         opt += 2;
995                 }
996                 if (tcph->ws != 0) {
997                         *opt++ = WSOPT;
998                         *opt++ = WS_LENGTH;
999                         *opt++ = tcph->ws;
1000                 }
1001                 while (optpad-- > 0)
1002                         *opt++ = NOOPOPT;
1003         }
1004
1005         if (tcb != NULL && tcb->nochecksum) {
1006                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1007         } else {
1008                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
1009                 hnputs(h->tcpcksum, csum);
1010         }
1011
1012         /* move from pseudo header back to normal ip header */
1013         memset(h->vcf, 0, 4);
1014         h->vcf[0] = IP_VER6;
1015         hnputs(h->ploadlen, hdrlen + dlen);
1016         h->proto = ph->proto;
1017
1018         return data;
1019 }
1020
1021 struct block *htontcp4(Tcp * tcph, struct block *data, Tcp4hdr * ph,
1022                                            Tcpctl * tcb)
1023 {
1024         int dlen;
1025         Tcp4hdr *h;
1026         uint16_t csum;
1027         uint16_t hdrlen, optpad = 0;
1028         uint8_t *opt;
1029
1030         hdrlen = TCP4_HDRSIZE;
1031         if (tcph->flags & SYN) {
1032                 if (tcph->mss)
1033                         hdrlen += MSS_LENGTH;
1034                 if (tcph->ws)
1035                         hdrlen += WS_LENGTH;
1036                 optpad = hdrlen & 3;
1037                 if (optpad)
1038                         optpad = 4 - optpad;
1039                 hdrlen += optpad;
1040         }
1041
1042         if (data) {
1043                 dlen = blocklen(data);
1044                 data = padblock(data, hdrlen + TCP4_PKT);
1045                 if (data == NULL)
1046                         return NULL;
1047         } else {
1048                 dlen = 0;
1049                 data = allocb(hdrlen + TCP4_PKT + 64);  /* the 64 pad is to meet mintu's */
1050                 if (data == NULL)
1051                         return NULL;
1052                 data->wp += hdrlen + TCP4_PKT;
1053         }
1054
1055         /* copy in pseudo ip header plus port numbers */
1056         h = (Tcp4hdr *) (data->rp);
1057         memmove(h, ph, TCP4_TCBPHDRSZ);
1058
1059         /* copy in variable bits */
1060         hnputs(h->tcplen, hdrlen + dlen);
1061         hnputl(h->tcpseq, tcph->seq);
1062         hnputl(h->tcpack, tcph->ack);
1063         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1064         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1065         hnputs(h->tcpurg, tcph->urg);
1066
1067         if (tcph->flags & SYN) {
1068                 opt = h->tcpopt;
1069                 if (tcph->mss != 0) {
1070                         *opt++ = MSSOPT;
1071                         *opt++ = MSS_LENGTH;
1072                         hnputs(opt, tcph->mss);
1073                         opt += 2;
1074                 }
1075                 if (tcph->ws != 0) {
1076                         *opt++ = WSOPT;
1077                         *opt++ = WS_LENGTH;
1078                         *opt++ = tcph->ws;
1079                 }
1080                 while (optpad-- > 0)
1081                         *opt++ = NOOPOPT;
1082         }
1083
1084         if (tcb != NULL && tcb->nochecksum) {
1085                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1086         } else {
1087                 csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
1088                 hnputs(h->tcpcksum, csum);
1089                 data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
1090                 data->checksum_offset = ph->tcpcksum - ph->tcpsport;
1091                 data->flag |= Btcpck;
1092         }
1093
1094         return data;
1095 }
1096
1097 int ntohtcp6(Tcp * tcph, struct block **bpp)
1098 {
1099         Tcp6hdr *h;
1100         uint8_t *optr;
1101         uint16_t hdrlen;
1102         uint16_t optlen;
1103         int n;
1104
1105         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
1106         if (*bpp == NULL)
1107                 return -1;
1108
1109         h = (Tcp6hdr *) ((*bpp)->rp);
1110         tcph->source = nhgets(h->tcpsport);
1111         tcph->dest = nhgets(h->tcpdport);
1112         tcph->seq = nhgetl(h->tcpseq);
1113         tcph->ack = nhgetl(h->tcpack);
1114         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1115         if (hdrlen < TCP6_HDRSIZE) {
1116                 freeblist(*bpp);
1117                 return -1;
1118         }
1119
1120         tcph->flags = h->tcpflag[1];
1121         tcph->wnd = nhgets(h->tcpwin);
1122         tcph->urg = nhgets(h->tcpurg);
1123         tcph->mss = 0;
1124         tcph->ws = 0;
1125         tcph->len = nhgets(h->ploadlen) - hdrlen;
1126
1127         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
1128         if (*bpp == NULL)
1129                 return -1;
1130
1131         optr = h->tcpopt;
1132         n = hdrlen - TCP6_HDRSIZE;
1133         while (n > 0 && *optr != EOLOPT) {
1134                 if (*optr == NOOPOPT) {
1135                         n--;
1136                         optr++;
1137                         continue;
1138                 }
1139                 optlen = optr[1];
1140                 if (optlen < 2 || optlen > n)
1141                         break;
1142                 switch (*optr) {
1143                         case MSSOPT:
1144                                 if (optlen == MSS_LENGTH)
1145                                         tcph->mss = nhgets(optr + 2);
1146                                 break;
1147                         case WSOPT:
1148                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1149                                         tcph->ws = HaveWS | *(optr + 2);
1150                                 break;
1151                 }
1152                 n -= optlen;
1153                 optr += optlen;
1154         }
1155         return hdrlen;
1156 }
1157
1158 int ntohtcp4(Tcp * tcph, struct block **bpp)
1159 {
1160         Tcp4hdr *h;
1161         uint8_t *optr;
1162         uint16_t hdrlen;
1163         uint16_t optlen;
1164         int n;
1165
1166         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
1167         if (*bpp == NULL)
1168                 return -1;
1169
1170         h = (Tcp4hdr *) ((*bpp)->rp);
1171         tcph->source = nhgets(h->tcpsport);
1172         tcph->dest = nhgets(h->tcpdport);
1173         tcph->seq = nhgetl(h->tcpseq);
1174         tcph->ack = nhgetl(h->tcpack);
1175
1176         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1177         if (hdrlen < TCP4_HDRSIZE) {
1178                 freeblist(*bpp);
1179                 return -1;
1180         }
1181
1182         tcph->flags = h->tcpflag[1];
1183         tcph->wnd = nhgets(h->tcpwin);
1184         tcph->urg = nhgets(h->tcpurg);
1185         tcph->mss = 0;
1186         tcph->ws = 0;
1187         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1188
1189         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1190         if (*bpp == NULL)
1191                 return -1;
1192
1193         optr = h->tcpopt;
1194         n = hdrlen - TCP4_HDRSIZE;
1195         while (n > 0 && *optr != EOLOPT) {
1196                 if (*optr == NOOPOPT) {
1197                         n--;
1198                         optr++;
1199                         continue;
1200                 }
1201                 optlen = optr[1];
1202                 if (optlen < 2 || optlen > n)
1203                         break;
1204                 switch (*optr) {
1205                         case MSSOPT:
1206                                 if (optlen == MSS_LENGTH)
1207                                         tcph->mss = nhgets(optr + 2);
1208                                 break;
1209                         case WSOPT:
1210                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1211                                         tcph->ws = HaveWS | *(optr + 2);
1212                                 break;
1213                 }
1214                 n -= optlen;
1215                 optr += optlen;
1216         }
1217         return hdrlen;
1218 }
1219
1220 /*
1221  *  For outgiing calls, generate an initial sequence
1222  *  number and put a SYN on the send queue
1223  */
1224 void tcpsndsyn(struct conv *s, Tcpctl * tcb)
1225 {
1226         urandom_read(&tcb->iss, sizeof(tcb->iss));
1227         tcb->rttseq = tcb->iss;
1228         tcb->snd.wl2 = tcb->iss;
1229         tcb->snd.una = tcb->iss;
1230         tcb->snd.ptr = tcb->rttseq;
1231         tcb->snd.nxt = tcb->rttseq;
1232         tcb->flgcnt++;
1233         tcb->flags |= FORCE;
1234         tcb->sndsyntime = NOW;
1235
1236         /* set desired mss and scale */
1237         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale,
1238                           &tcb->flags);
1239 }
1240
1241 void
1242 sndrst(struct Proto *tcp, uint8_t * source, uint8_t * dest,
1243            uint16_t length, Tcp * seg, uint8_t version, char *reason)
1244 {
1245         struct block *hbp;
1246         uint8_t rflags;
1247         struct tcppriv *tpriv;
1248         Tcp4hdr ph4;
1249         Tcp6hdr ph6;
1250
1251         netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1252
1253         tpriv = tcp->priv;
1254
1255         if (seg->flags & RST)
1256                 return;
1257
1258         /* make pseudo header */
1259         switch (version) {
1260                 case V4:
1261                         memset(&ph4, 0, sizeof(ph4));
1262                         ph4.vihl = IP_VER4;
1263                         v6tov4(ph4.tcpsrc, dest);
1264                         v6tov4(ph4.tcpdst, source);
1265                         ph4.proto = IP_TCPPROTO;
1266                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1267                         hnputs(ph4.tcpsport, seg->dest);
1268                         hnputs(ph4.tcpdport, seg->source);
1269                         break;
1270                 case V6:
1271                         memset(&ph6, 0, sizeof(ph6));
1272                         ph6.vcf[0] = IP_VER6;
1273                         ipmove(ph6.tcpsrc, dest);
1274                         ipmove(ph6.tcpdst, source);
1275                         ph6.proto = IP_TCPPROTO;
1276                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1277                         hnputs(ph6.tcpsport, seg->dest);
1278                         hnputs(ph6.tcpdport, seg->source);
1279                         break;
1280                 default:
1281                         panic("sndrst: version %d", version);
1282         }
1283
1284         tpriv->stats[OutRsts]++;
1285         rflags = RST;
1286
1287         /* convince the other end that this reset is in band */
1288         if (seg->flags & ACK) {
1289                 seg->seq = seg->ack;
1290                 seg->ack = 0;
1291         } else {
1292                 rflags |= ACK;
1293                 seg->ack = seg->seq;
1294                 seg->seq = 0;
1295                 if (seg->flags & SYN)
1296                         seg->ack++;
1297                 seg->ack += length;
1298                 if (seg->flags & FIN)
1299                         seg->ack++;
1300         }
1301         seg->flags = rflags;
1302         seg->wnd = 0;
1303         seg->urg = 0;
1304         seg->mss = 0;
1305         seg->ws = 0;
1306         switch (version) {
1307                 case V4:
1308                         hbp = htontcp4(seg, NULL, &ph4, NULL);
1309                         if (hbp == NULL)
1310                                 return;
1311                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1312                         break;
1313                 case V6:
1314                         hbp = htontcp6(seg, NULL, &ph6, NULL);
1315                         if (hbp == NULL)
1316                                 return;
1317                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1318                         break;
1319                 default:
1320                         panic("sndrst2: version %d", version);
1321         }
1322 }
1323
1324 /*
1325  *  send a reset to the remote side and close the conversation
1326  *  called with s qlocked
1327  */
1328 static void tcphangup(struct conv *s)
1329 {
1330         ERRSTACK(1);
1331         Tcp seg;
1332         Tcpctl *tcb;
1333         struct block *hbp;
1334
1335         tcb = (Tcpctl *) s->ptcl;
1336         if (ipcmp(s->raddr, IPnoaddr)) {
1337                 /* discard error style, poperror regardless */
1338                 if (!waserror()) {
1339                         seg.flags = RST | ACK;
1340                         seg.ack = tcb->rcv.nxt;
1341                         tcb->rcv.una = 0;
1342                         seg.seq = tcb->snd.ptr;
1343                         seg.wnd = 0;
1344                         seg.urg = 0;
1345                         seg.mss = 0;
1346                         seg.ws = 0;
1347                         switch (s->ipversion) {
1348                                 case V4:
1349                                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1350                                         hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
1351                                         ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1352                                         break;
1353                                 case V6:
1354                                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1355                                         hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
1356                                         ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1357                                         break;
1358                                 default:
1359                                         panic("tcphangup: version %d", s->ipversion);
1360                         }
1361                 }
1362                 poperror();
1363         }
1364         localclose(s, NULL);
1365 }
1366
1367 /*
1368  *  (re)send a SYN ACK
1369  */
1370 int sndsynack(struct Proto *tcp, Limbo * lp)
1371 {
1372         struct block *hbp;
1373         Tcp4hdr ph4;
1374         Tcp6hdr ph6;
1375         Tcp seg;
1376         int scale;
1377         uint8_t flag = 0;
1378
1379         /* make pseudo header */
1380         switch (lp->version) {
1381                 case V4:
1382                         memset(&ph4, 0, sizeof(ph4));
1383                         ph4.vihl = IP_VER4;
1384                         v6tov4(ph4.tcpsrc, lp->laddr);
1385                         v6tov4(ph4.tcpdst, lp->raddr);
1386                         ph4.proto = IP_TCPPROTO;
1387                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1388                         hnputs(ph4.tcpsport, lp->lport);
1389                         hnputs(ph4.tcpdport, lp->rport);
1390                         break;
1391                 case V6:
1392                         memset(&ph6, 0, sizeof(ph6));
1393                         ph6.vcf[0] = IP_VER6;
1394                         ipmove(ph6.tcpsrc, lp->laddr);
1395                         ipmove(ph6.tcpdst, lp->raddr);
1396                         ph6.proto = IP_TCPPROTO;
1397                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1398                         hnputs(ph6.tcpsport, lp->lport);
1399                         hnputs(ph6.tcpdport, lp->rport);
1400                         break;
1401                 default:
1402                         panic("sndrst: version %d", lp->version);
1403         }
1404
1405         seg.seq = lp->iss;
1406         seg.ack = lp->irs + 1;
1407         seg.flags = SYN | ACK;
1408         seg.urg = 0;
1409         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag);
1410         seg.wnd = QMAX;
1411
1412         /* if the other side set scale, we should too */
1413         if (lp->rcvscale) {
1414                 seg.ws = scale;
1415                 lp->sndscale = scale;
1416         } else {
1417                 seg.ws = 0;
1418                 lp->sndscale = 0;
1419         }
1420
1421         switch (lp->version) {
1422                 case V4:
1423                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1424                         if (hbp == NULL)
1425                                 return -1;
1426                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1427                         break;
1428                 case V6:
1429                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1430                         if (hbp == NULL)
1431                                 return -1;
1432                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1433                         break;
1434                 default:
1435                         panic("sndsnack: version %d", lp->version);
1436         }
1437         lp->lastsend = NOW;
1438         return 0;
1439 }
1440
/*
 * Hash bucket index from the last two bytes of IP address a plus port p.
 * Every argument is parenthesized so expressions such as (x | y) can be
 * passed for p without changing the result.
 */
#define hashipa(a, p) (((a)[IPaddrlen - 2] + (a)[IPaddrlen - 1] + (p)) & LHTMASK)
1442
1443 /*
1444  *  put a call into limbo and respond with a SYN ACK
1445  *
1446  *  called with proto locked
1447  */
1448 static void
1449 limbo(struct conv *s, uint8_t * source, uint8_t * dest, Tcp * seg, int version)
1450 {
1451         Limbo *lp, **l;
1452         struct tcppriv *tpriv;
1453         int h;
1454
1455         tpriv = s->p->priv;
1456         h = hashipa(source, seg->source);
1457
1458         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1459                 lp = *l;
1460                 if (lp->lport != seg->dest || lp->rport != seg->source
1461                         || lp->version != version)
1462                         continue;
1463                 if (ipcmp(lp->raddr, source) != 0)
1464                         continue;
1465                 if (ipcmp(lp->laddr, dest) != 0)
1466                         continue;
1467
1468                 /* each new SYN restarts the retransmits */
1469                 lp->irs = seg->seq;
1470                 break;
1471         }
1472         lp = *l;
1473         if (lp == NULL) {
1474                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1475                         lp = tpriv->lht[h];
1476                         tpriv->lht[h] = lp->next;
1477                         lp->next = NULL;
1478                 } else {
1479                         lp = kzmalloc(sizeof(*lp), 0);
1480                         if (lp == NULL)
1481                                 return;
1482                         tpriv->nlimbo++;
1483                 }
1484                 *l = lp;
1485                 lp->version = version;
1486                 ipmove(lp->laddr, dest);
1487                 ipmove(lp->raddr, source);
1488                 lp->lport = seg->dest;
1489                 lp->rport = seg->source;
1490                 lp->mss = seg->mss;
1491                 lp->rcvscale = seg->ws;
1492                 lp->irs = seg->seq;
1493                 urandom_read(&lp->iss, sizeof(lp->iss));
1494         }
1495
1496         if (sndsynack(s->p, lp) < 0) {
1497                 *l = lp->next;
1498                 tpriv->nlimbo--;
1499                 kfree(lp);
1500         }
1501 }
1502
1503 /*
1504  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1505  */
1506 static void limborexmit(struct Proto *tcp)
1507 {
1508         struct tcppriv *tpriv;
1509         Limbo **l, *lp;
1510         int h;
1511         int seen;
1512         uint64_t now;
1513
1514         tpriv = tcp->priv;
1515
1516         if (!canqlock(&tcp->qlock))
1517                 return;
1518         seen = 0;
1519         now = NOW;
1520         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1521                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1522                         lp = *l;
1523                         seen++;
1524                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1525                                 continue;
1526
1527                         /* time it out after 1 second */
1528                         if (++(lp->rexmits) > 5) {
1529                                 tpriv->nlimbo--;
1530                                 *l = lp->next;
1531                                 kfree(lp);
1532                                 continue;
1533                         }
1534
1535                         /* if we're being attacked, don't bother resending SYN ACK's */
1536                         if (tpriv->nlimbo > 100)
1537                                 continue;
1538
1539                         if (sndsynack(tcp, lp) < 0) {
1540                                 tpriv->nlimbo--;
1541                                 *l = lp->next;
1542                                 kfree(lp);
1543                                 continue;
1544                         }
1545
1546                         l = &lp->next;
1547                 }
1548         }
1549         qunlock(&tcp->qlock);
1550 }
1551
1552 /*
1553  *  lookup call in limbo.  if found, throw it out.
1554  *
1555  *  called with proto locked
1556  */
1557 static void
1558 limborst(struct conv *s, Tcp * segp, uint8_t * src, uint8_t * dst,
1559                  uint8_t version)
1560 {
1561         Limbo *lp, **l;
1562         int h;
1563         struct tcppriv *tpriv;
1564
1565         tpriv = s->p->priv;
1566
1567         /* find a call in limbo */
1568         h = hashipa(src, segp->source);
1569         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1570                 lp = *l;
1571                 if (lp->lport != segp->dest || lp->rport != segp->source
1572                         || lp->version != version)
1573                         continue;
1574                 if (ipcmp(lp->laddr, dst) != 0)
1575                         continue;
1576                 if (ipcmp(lp->raddr, src) != 0)
1577                         continue;
1578
1579                 /* RST can only follow the SYN */
1580                 if (segp->seq == lp->irs + 1) {
1581                         tpriv->nlimbo--;
1582                         *l = lp->next;
1583                         kfree(lp);
1584                 }
1585                 break;
1586         }
1587 }
1588
1589 /*
1590  *  come here when we finally get an ACK to our SYN-ACK.
1591  *  lookup call in limbo.  if found, create a new conversation
1592  *
1593  *  called with proto locked
1594  */
1595 static struct conv *tcpincoming(struct conv *s, Tcp * segp, uint8_t * src,
1596                                                                 uint8_t * dst, uint8_t version)
1597 {
1598         struct conv *new;
1599         Tcpctl *tcb;
1600         struct tcppriv *tpriv;
1601         Tcp4hdr *h4;
1602         Tcp6hdr *h6;
1603         Limbo *lp, **l;
1604         int h;
1605
1606         /* unless it's just an ack, it can't be someone coming out of limbo */
1607         if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
1608                 return NULL;
1609
1610         tpriv = s->p->priv;
1611
1612         /* find a call in limbo */
1613         h = hashipa(src, segp->source);
1614         for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
1615                 netlog(s->p->f, Logtcp,
1616                            "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
1617                            segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
1618                            lp->lport, version, lp->version);
1619
1620                 if (lp->lport != segp->dest || lp->rport != segp->source
1621                         || lp->version != version)
1622                         continue;
1623                 if (ipcmp(lp->laddr, dst) != 0)
1624                         continue;
1625                 if (ipcmp(lp->raddr, src) != 0)
1626                         continue;
1627
1628                 /* we're assuming no data with the initial SYN */
1629                 if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
1630                         netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
1631                                    segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
1632                         lp = NULL;
1633                 } else {
1634                         tpriv->nlimbo--;
1635                         *l = lp->next;
1636                 }
1637                 break;
1638         }
1639         if (lp == NULL)
1640                 return NULL;
1641
1642         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1643         if (new == NULL)
1644                 return NULL;
1645
1646         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1647         tcb = (Tcpctl *) new->ptcl;
1648         tcb->flags &= ~CLONE;
1649         tcb->timer.arg = new;
1650         tcb->timer.state = TcptimerOFF;
1651         tcb->acktimer.arg = new;
1652         tcb->acktimer.state = TcptimerOFF;
1653         tcb->katimer.arg = new;
1654         tcb->katimer.state = TcptimerOFF;
1655         tcb->rtt_timer.arg = new;
1656         tcb->rtt_timer.state = TcptimerOFF;
1657
1658         tcb->irs = lp->irs;
1659         tcb->rcv.nxt = tcb->irs + 1;
1660         tcb->rcv.urg = tcb->rcv.nxt;
1661
1662         tcb->iss = lp->iss;
1663         tcb->rttseq = tcb->iss;
1664         tcb->snd.wl2 = tcb->iss;
1665         tcb->snd.una = tcb->iss + 1;
1666         tcb->snd.ptr = tcb->iss + 1;
1667         tcb->snd.nxt = tcb->iss + 1;
1668         tcb->flgcnt = 0;
1669         tcb->flags |= SYNACK;
1670
1671         /* our sending max segment size cannot be bigger than what he asked for */
1672         if (lp->mss != 0 && lp->mss < tcb->mss)
1673                 tcb->mss = lp->mss;
1674
1675         /* window scaling */
1676         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1677
1678         /* the congestion window always starts out as a single segment */
1679         tcb->snd.wnd = segp->wnd;
1680         tcb->cwind = tcb->mss;
1681
1682         /* set initial round trip time */
1683         tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
1684         tcpsynackrtt(new);
1685
1686         kfree(lp);
1687
1688         /* set up proto header */
1689         switch (version) {
1690                 case V4:
1691                         h4 = &tcb->protohdr.tcp4hdr;
1692                         memset(h4, 0, sizeof(*h4));
1693                         h4->proto = IP_TCPPROTO;
1694                         hnputs(h4->tcpsport, new->lport);
1695                         hnputs(h4->tcpdport, new->rport);
1696                         v6tov4(h4->tcpsrc, dst);
1697                         v6tov4(h4->tcpdst, src);
1698                         break;
1699                 case V6:
1700                         h6 = &tcb->protohdr.tcp6hdr;
1701                         memset(h6, 0, sizeof(*h6));
1702                         h6->proto = IP_TCPPROTO;
1703                         hnputs(h6->tcpsport, new->lport);
1704                         hnputs(h6->tcpdport, new->rport);
1705                         ipmove(h6->tcpsrc, dst);
1706                         ipmove(h6->tcpdst, src);
1707                         break;
1708                 default:
1709                         panic("tcpincoming: version %d", new->ipversion);
1710         }
1711
1712         tcpsetstate(new, Established);
1713
1714         iphtadd(&tpriv->ht, new);
1715
1716         return new;
1717 }
1718
/*
 * Return 1 when x lies in the closed sequence range [low, high], 0
 * otherwise.  When low > high the range is taken to wrap through
 * 0xffffffff back to 0.
 */
int seq_within(uint32_t x, uint32_t low, uint32_t high)
{
	/* wrapped range: x is in it if it is in either piece */
	if (low > high)
		return (x >= low || x <= high) ? 1 : 0;

	return (low <= x && x <= high) ? 1 : 0;
}
1730
/* Sequence-space "x < y": true when the wrapped difference x - y is negative. */
int seq_lt(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff < 0;
}
1735
/* Sequence-space "x <= y": true when the wrapped difference x - y is not positive. */
int seq_le(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff <= 0;
}
1740
/*
 *  Sequence-space "greater than": nonzero when the 32-bit difference
 *  x - y, viewed as a signed int, is positive.
 */
int seq_gt(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff > 0;
}
1745
/*
 *  Sequence-space "greater than or equal": nonzero when the 32-bit
 *  difference x - y, viewed as a signed int, is zero or positive.
 */
int seq_ge(uint32_t x, uint32_t y)
{
	int diff = (int)(x - y);

	return diff >= 0;
}
1750
1751 /*
1752  *  use the time between the first SYN and it's ack as the
1753  *  initial round trip time
1754  */
1755 void tcpsynackrtt(struct conv *s)
1756 {
1757         Tcpctl *tcb;
1758         uint64_t delta;
1759         struct tcppriv *tpriv;
1760
1761         tcb = (Tcpctl *) s->ptcl;
1762         tpriv = s->p->priv;
1763
1764         delta = NOW - tcb->sndsyntime;
1765         tcb->srtt = delta << LOGAGAIN;
1766         tcb->mdev = delta << LOGDGAIN;
1767
1768         /* halt round trip timer */
1769         tcphalt(tpriv, &tcb->rtt_timer);
1770 }
1771
1772 void update(struct conv *s, Tcp * seg)
1773 {
1774         int rtt, delta;
1775         Tcpctl *tcb;
1776         uint32_t acked;
1777         uint32_t expand;
1778         struct tcppriv *tpriv;
1779
1780         tpriv = s->p->priv;
1781         tcb = (Tcpctl *) s->ptcl;
1782
1783         /* if everything has been acked, force output(?) */
1784         if (seq_gt(seg->ack, tcb->snd.nxt)) {
1785                 tcb->flags |= FORCE;
1786                 return;
1787         }
1788
1789         /* added by Dong Lin for fast retransmission */
1790         if (seg->ack == tcb->snd.una
1791                 && tcb->snd.una != tcb->snd.nxt
1792                 && seg->len == 0 && seg->wnd == tcb->snd.wnd) {
1793
1794                 /* this is a pure ack w/o window update */
1795                 netlog(s->p->f, Logtcprxmt, "dupack %lu ack %lu sndwnd %d advwin %d\n",
1796                            tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1797
1798                 if (++tcb->snd.dupacks == TCPREXMTTHRESH) {
1799                         /*
1800                          *  tahoe tcp rxt the packet, half sshthresh,
1801                          *  and set cwnd to one packet
1802                          */
1803                         tcb->snd.recovery = 1;
1804                         tcb->snd.rxt = tcb->snd.nxt;
1805                         netlog(s->p->f, Logtcprxmt, "fast rxt %lu, nxt %lu\n", tcb->snd.una,
1806                                    tcb->snd.nxt);
1807                         tcprxmit(s);
1808                 } else {
1809                         /* do reno tcp here. */
1810                 }
1811         }
1812
1813         /*
1814          *  update window
1815          */
1816         if (seq_gt(seg->ack, tcb->snd.wl2)
1817                 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
1818                 tcb->snd.wnd = seg->wnd;
1819                 tcb->snd.wl2 = seg->ack;
1820         }
1821
1822         if (!seq_gt(seg->ack, tcb->snd.una)) {
1823                 /*
1824                  *  don't let us hangup if sending into a closed window and
1825                  *  we're still getting acks
1826                  */
1827                 if ((tcb->flags & RETRAN) && tcb->snd.wnd == 0) {
1828                         tcb->backedoff = MAXBACKMS / 4;
1829                 }
1830                 return;
1831         }
1832
1833         /*
1834          *  any positive ack turns off fast rxt,
1835          *  (should we do new-reno on partial acks?)
1836          */
1837         if (!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1838                 tcb->snd.dupacks = 0;
1839                 tcb->snd.recovery = 0;
1840         } else
1841                 netlog(s->p->f, Logtcp, "rxt next %lu, cwin %u\n", seg->ack,
1842                            tcb->cwind);
1843
1844         /* Compute the new send window size */
1845         acked = seg->ack - tcb->snd.una;
1846
1847         /* avoid slow start and timers for SYN acks */
1848         if ((tcb->flags & SYNACK) == 0) {
1849                 tcb->flags |= SYNACK;
1850                 acked--;
1851                 tcb->flgcnt--;
1852                 goto done;
1853         }
1854
1855         /* slow start as long as we're not recovering from lost packets */
1856         if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
1857                 if (tcb->cwind < tcb->ssthresh) {
1858                         expand = tcb->mss;
1859                         if (acked < expand)
1860                                 expand = acked;
1861                 } else
1862                         expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1863
1864                 if (tcb->cwind + expand < tcb->cwind)
1865                         expand = tcb->snd.wnd - tcb->cwind;
1866                 if (tcb->cwind + expand > tcb->snd.wnd)
1867                         expand = tcb->snd.wnd - tcb->cwind;
1868                 tcb->cwind += expand;
1869         }
1870
1871         /* Adjust the timers according to the round trip time */
1872         if (tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1873                 tcphalt(tpriv, &tcb->rtt_timer);
1874                 if ((tcb->flags & RETRAN) == 0) {
1875                         tcb->backoff = 0;
1876                         tcb->backedoff = 0;
1877                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1878                         if (rtt == 0)
1879                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
1880                         rtt *= MSPTICK;
1881                         if (tcb->srtt == 0) {
1882                                 tcb->srtt = rtt << LOGAGAIN;
1883                                 tcb->mdev = rtt << LOGDGAIN;
1884                         } else {
1885                                 delta = rtt - (tcb->srtt >> LOGAGAIN);
1886                                 tcb->srtt += delta;
1887                                 if (tcb->srtt <= 0)
1888                                         tcb->srtt = 1;
1889
1890                                 delta = abs(delta) - (tcb->mdev >> LOGDGAIN);
1891                                 tcb->mdev += delta;
1892                                 if (tcb->mdev <= 0)
1893                                         tcb->mdev = 1;
1894                         }
1895                         tcpsettimer(tcb);
1896                 }
1897         }
1898
1899 done:
1900         if (qdiscard(s->wq, acked) < acked)
1901                 tcb->flgcnt--;
1902
1903         tcb->snd.una = seg->ack;
1904         if (seq_gt(seg->ack, tcb->snd.urg))
1905                 tcb->snd.urg = seg->ack;
1906
1907         if (tcb->snd.una != tcb->snd.nxt)
1908                 tcpgo(tpriv, &tcb->timer);
1909         else
1910                 tcphalt(tpriv, &tcb->timer);
1911
1912         if (seq_lt(tcb->snd.ptr, tcb->snd.una))
1913                 tcb->snd.ptr = tcb->snd.una;
1914
1915         tcb->flags &= ~RETRAN;
1916         tcb->backoff = 0;
1917         tcb->backedoff = 0;
1918 }
1919
1920 void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
1921 {
1922         ERRSTACK(1);
1923         Tcp seg;
1924         Tcp4hdr *h4;
1925         Tcp6hdr *h6;
1926         int hdrlen;
1927         Tcpctl *tcb;
1928         uint16_t length;
1929         uint8_t source[IPaddrlen], dest[IPaddrlen];
1930         struct conv *s;
1931         struct Fs *f;
1932         struct tcppriv *tpriv;
1933         uint8_t version;
1934
1935         f = tcp->f;
1936         tpriv = tcp->priv;
1937
1938         tpriv->stats[InSegs]++;
1939
1940         h4 = (Tcp4hdr *) (bp->rp);
1941         h6 = (Tcp6hdr *) (bp->rp);
1942
1943         if ((h4->vihl & 0xF0) == IP_VER4) {
1944                 version = V4;
1945                 length = nhgets(h4->length);
1946                 v4tov6(dest, h4->tcpdst);
1947                 v4tov6(source, h4->tcpsrc);
1948
1949                 h4->Unused = 0;
1950                 hnputs(h4->tcplen, length - TCP4_PKT);
1951                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1952                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
1953                         tpriv->stats[CsumErrs]++;
1954                         tpriv->stats[InErrs]++;
1955                         netlog(f, Logtcp, "bad tcp proto cksum\n");
1956                         freeblist(bp);
1957                         return;
1958                 }
1959
1960                 hdrlen = ntohtcp4(&seg, &bp);
1961                 if (hdrlen < 0) {
1962                         tpriv->stats[HlenErrs]++;
1963                         tpriv->stats[InErrs]++;
1964                         netlog(f, Logtcp, "bad tcp hdr len\n");
1965                         return;
1966                 }
1967
1968                 /* trim the packet to the size claimed by the datagram */
1969                 length -= hdrlen + TCP4_PKT;
1970                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
1971                 if (bp == NULL) {
1972                         tpriv->stats[LenErrs]++;
1973                         tpriv->stats[InErrs]++;
1974                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
1975                         return;
1976                 }
1977         } else {
1978                 int ttl = h6->ttl;
1979                 int proto = h6->proto;
1980
1981                 version = V6;
1982                 length = nhgets(h6->ploadlen);
1983                 ipmove(dest, h6->tcpdst);
1984                 ipmove(source, h6->tcpsrc);
1985
1986                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
1987                 h6->ttl = proto;
1988                 hnputl(h6->vcf, length);
1989                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
1990                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
1991                         tpriv->stats[CsumErrs]++;
1992                         tpriv->stats[InErrs]++;
1993                         netlog(f, Logtcp, "bad tcp proto cksum\n");
1994                         freeblist(bp);
1995                         return;
1996                 }
1997                 h6->ttl = ttl;
1998                 h6->proto = proto;
1999                 hnputs(h6->ploadlen, length);
2000
2001                 hdrlen = ntohtcp6(&seg, &bp);
2002                 if (hdrlen < 0) {
2003                         tpriv->stats[HlenErrs]++;
2004                         tpriv->stats[InErrs]++;
2005                         netlog(f, Logtcp, "bad tcp hdr len\n");
2006                         return;
2007                 }
2008
2009                 /* trim the packet to the size claimed by the datagram */
2010                 length -= hdrlen;
2011                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2012                 if (bp == NULL) {
2013                         tpriv->stats[LenErrs]++;
2014                         tpriv->stats[InErrs]++;
2015                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2016                         return;
2017                 }
2018         }
2019
2020         /* lock protocol while searching for a conversation */
2021         qlock(&tcp->qlock);
2022
2023         /* Look for a matching conversation */
2024         s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2025         if (s == NULL) {
2026                 netlog(f, Logtcp, "iphtlook failed\n");
2027 reset:
2028                 qunlock(&tcp->qlock);
2029                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2030                 freeblist(bp);
2031                 return;
2032         }
2033
2034         /* if it's a listener, look for the right flags and get a new conv */
2035         tcb = (Tcpctl *) s->ptcl;
2036         if (tcb->state == Listen) {
2037                 if (seg.flags & RST) {
2038                         limborst(s, &seg, source, dest, version);
2039                         qunlock(&tcp->qlock);
2040                         freeblist(bp);
2041                         return;
2042                 }
2043
2044                 /* if this is a new SYN, put the call into limbo */
2045                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2046                         limbo(s, source, dest, &seg, version);
2047                         qunlock(&tcp->qlock);
2048                         freeblist(bp);
2049                         return;
2050                 }
2051
2052                 /*
2053                  *  if there's a matching call in limbo, tcpincoming will
2054                  *  return it in state Syn_received
2055                  */
2056                 s = tcpincoming(s, &seg, source, dest, version);
2057                 if (s == NULL)
2058                         goto reset;
2059         }
2060
2061         /* The rest of the input state machine is run with the control block
2062          * locked and implements the state machine directly out of the RFC.
2063          * Out-of-band data is ignored - it was always a bad idea.
2064          */
2065         tcb = (Tcpctl *) s->ptcl;
2066         if (waserror()) {
2067                 qunlock(&s->qlock);
2068                 nexterror();
2069         }
2070         qlock(&s->qlock);
2071         qunlock(&tcp->qlock);
2072
2073         /* fix up window */
2074         seg.wnd <<= tcb->rcv.scale;
2075
2076         /* every input packet in puts off the keep alive time out */
2077         tcpsetkacounter(tcb);
2078
2079         switch (tcb->state) {
2080                 case Closed:
2081                         sndrst(tcp, source, dest, length, &seg, version,
2082                                    "sending to Closed");
2083                         goto raise;
2084                 case Syn_sent:
2085                         if (seg.flags & ACK) {
2086                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2087                                         sndrst(tcp, source, dest, length, &seg, version,
2088                                                    "bad seq in Syn_sent");
2089                                         goto raise;
2090                                 }
2091                         }
2092                         if (seg.flags & RST) {
2093                                 if (seg.flags & ACK)
2094                                         localclose(s, "connection refused");
2095                                 goto raise;
2096                         }
2097
2098                         if (seg.flags & SYN) {
2099                                 procsyn(s, &seg);
2100                                 if (seg.flags & ACK) {
2101                                         update(s, &seg);
2102                                         tcpsynackrtt(s);
2103                                         tcpsetstate(s, Established);
2104                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2105                                 } else {
2106                                         tcb->time = NOW;
2107                                         tcpsetstate(s, Syn_received);   /* DLP - shouldn't this be a reset? */
2108                                 }
2109
2110                                 if (length != 0 || (seg.flags & FIN))
2111                                         break;
2112
2113                                 freeblist(bp);
2114                                 goto output;
2115                         } else
2116                                 freeblist(bp);
2117
2118                         qunlock(&s->qlock);
2119                         poperror();
2120                         return;
2121                 case Syn_received:
2122                         /* doesn't matter if it's the correct ack, we're just trying to set timing */
2123                         if (seg.flags & ACK)
2124                                 tcpsynackrtt(s);
2125                         break;
2126         }
2127
2128         /*
2129          *  One DOS attack is to open connections to us and then forget about them,
2130          *  thereby tying up a conv at no long term cost to the attacker.
2131          *  This is an attempt to defeat these stateless DOS attacks.  See
2132          *  corresponding code in tcpsendka().
2133          */
2134         if (tcb->state != Syn_received && (seg.flags & RST) == 0) {
2135                 if (tcpporthogdefense
2136                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2137                                                   tcb->snd.una - (1 << 29))) {
2138                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2139                                    source, seg.source, dest, seg.dest, seg.flags,
2140                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2141                         localclose(s, "stateless hog");
2142                 }
2143         }
2144
2145         /* Cut the data to fit the receive window */
2146         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2147                 netlog(f, Logtcp, "tcp len < 0, %lu %d\n", seg.seq, length);
2148                 update(s, &seg);
2149                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2150                         tcphalt(tpriv, &tcb->rtt_timer);
2151                         tcphalt(tpriv, &tcb->acktimer);
2152                         tcphalt(tpriv, &tcb->katimer);
2153                         tcpsetstate(s, Time_wait);
2154                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2155                         tcpgo(tpriv, &tcb->timer);
2156                 }
2157                 if (!(seg.flags & RST)) {
2158                         tcb->flags |= FORCE;
2159                         goto output;
2160                 }
2161                 qunlock(&s->qlock);
2162                 poperror();
2163                 return;
2164         }
2165
2166         /* Cannot accept so answer with a rst */
2167         if (length && tcb->state == Closed) {
2168                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2169                 goto raise;
2170         }
2171
2172         /* The segment is beyond the current receive pointer so
2173          * queue the data in the resequence queue
2174          */
2175         if (seg.seq != tcb->rcv.nxt)
2176                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2177                         update(s, &seg);
2178                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2179                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2180                                            s->lport);
2181                         tcb->flags |= FORCE;
2182                         goto output;
2183                 }
2184
2185         /*
2186          *  keep looping till we've processed this packet plus any
2187          *  adjacent packets in the resequence queue
2188          */
2189         for (;;) {
2190                 if (seg.flags & RST) {
2191                         if (tcb->state == Established) {
2192                                 tpriv->stats[EstabResets]++;
2193                                 if (tcb->rcv.nxt != seg.seq)
2194                                         printd
2195                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2196                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2197                                                  seg.seq);
2198                         }
2199                         localclose(s, "connection refused");
2200                         goto raise;
2201                 }
2202
2203                 if ((seg.flags & ACK) == 0)
2204                         goto raise;
2205
2206                 switch (tcb->state) {
2207                         case Syn_received:
2208                                 if (!seq_within(seg.ack, tcb->snd.una + 1, tcb->snd.nxt)) {
2209                                         sndrst(tcp, source, dest, length, &seg, version,
2210                                                    "bad seq in Syn_received");
2211                                         goto raise;
2212                                 }
2213                                 update(s, &seg);
2214                                 tcpsetstate(s, Established);
2215                         case Established:
2216                         case Close_wait:
2217                                 update(s, &seg);
2218                                 break;
2219                         case Finwait1:
2220                                 update(s, &seg);
2221                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2222                                         tcphalt(tpriv, &tcb->rtt_timer);
2223                                         tcphalt(tpriv, &tcb->acktimer);
2224                                         tcpsetkacounter(tcb);
2225                                         tcb->time = NOW;
2226                                         tcpsetstate(s, Finwait2);
2227                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2228                                         tcpgo(tpriv, &tcb->katimer);
2229                                 }
2230                                 break;
2231                         case Finwait2:
2232                                 update(s, &seg);
2233                                 break;
2234                         case Closing:
2235                                 update(s, &seg);
2236                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2237                                         tcphalt(tpriv, &tcb->rtt_timer);
2238                                         tcphalt(tpriv, &tcb->acktimer);
2239                                         tcphalt(tpriv, &tcb->katimer);
2240                                         tcpsetstate(s, Time_wait);
2241                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2242                                         tcpgo(tpriv, &tcb->timer);
2243                                 }
2244                                 break;
2245                         case Last_ack:
2246                                 update(s, &seg);
2247                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2248                                         localclose(s, NULL);
2249                                         goto raise;
2250                                 }
2251                         case Time_wait:
2252                                 tcb->flags |= FORCE;
2253                                 if (tcb->timer.state != TcptimerON)
2254                                         tcpgo(tpriv, &tcb->timer);
2255                 }
2256
2257                 if ((seg.flags & URG) && seg.urg) {
2258                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2259                                 tcb->rcv.urg = seg.urg + seg.seq;
2260                                 pullblock(&bp, seg.urg);
2261                         }
2262                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2263                         tcb->rcv.urg = tcb->rcv.nxt;
2264
2265                 if (length == 0) {
2266                         if (bp != NULL)
2267                                 freeblist(bp);
2268                 } else {
2269                         switch (tcb->state) {
2270                                 default:
2271                                         /* Ignore segment text */
2272                                         if (bp != NULL)
2273                                                 freeblist(bp);
2274                                         break;
2275
2276                                 case Syn_received:
2277                                 case Established:
2278                                 case Finwait1:
2279                                         /* If we still have some data place on
2280                                          * receive queue
2281                                          */
2282                                         if (bp) {
2283                                                 bp = packblock(bp);
2284                                                 if (bp == NULL)
2285                                                         panic("tcp packblock");
2286                                                 qpassnolim(s->rq, bp);
2287                                                 bp = NULL;
2288
2289                                                 /*
2290                                                  *  Force an ack every 2 data messages.  This is
2291                                                  *  a hack for rob to make his home system run
2292                                                  *  faster.
2293                                                  *
2294                                                  *  this also keeps the standard TCP congestion
2295                                                  *  control working since it needs an ack every
2296                                                  *  2 max segs worth.  This is not quite that,
2297                                                  *  but under a real stream is equivalent since
2298                                                  *  every packet has a max seg in it.
2299                                                  */
2300                                                 if (++(tcb->rcv.una) >= 2)
2301                                                         tcb->flags |= FORCE;
2302                                         }
2303                                         tcb->rcv.nxt += length;
2304
2305                                         /*
2306                                          *  update our rcv window
2307                                          */
2308                                         tcprcvwin(s);
2309
2310                                         /*
2311                                          *  turn on the acktimer if there's something
2312                                          *  to ack
2313                                          */
2314                                         if (tcb->acktimer.state != TcptimerON)
2315                                                 tcpgo(tpriv, &tcb->acktimer);
2316
2317                                         break;
2318                                 case Finwait2:
2319                                         /* no process to read the data, send a reset */
2320                                         if (bp != NULL)
2321                                                 freeblist(bp);
2322                                         sndrst(tcp, source, dest, length, &seg, version,
2323                                                    "send to Finwait2");
2324                                         qunlock(&s->qlock);
2325                                         poperror();
2326                                         return;
2327                         }
2328                 }
2329
2330                 if (seg.flags & FIN) {
2331                         tcb->flags |= FORCE;
2332
2333                         switch (tcb->state) {
2334                                 case Syn_received:
2335                                 case Established:
2336                                         tcb->rcv.nxt++;
2337                                         tcpsetstate(s, Close_wait);
2338                                         break;
2339                                 case Finwait1:
2340                                         tcb->rcv.nxt++;
2341                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2342                                                 tcphalt(tpriv, &tcb->rtt_timer);
2343                                                 tcphalt(tpriv, &tcb->acktimer);
2344                                                 tcphalt(tpriv, &tcb->katimer);
2345                                                 tcpsetstate(s, Time_wait);
2346                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2347                                                 tcpgo(tpriv, &tcb->timer);
2348                                         } else
2349                                                 tcpsetstate(s, Closing);
2350                                         break;
2351                                 case Finwait2:
2352                                         tcb->rcv.nxt++;
2353                                         tcphalt(tpriv, &tcb->rtt_timer);
2354                                         tcphalt(tpriv, &tcb->acktimer);
2355                                         tcphalt(tpriv, &tcb->katimer);
2356                                         tcpsetstate(s, Time_wait);
2357                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2358                                         tcpgo(tpriv, &tcb->timer);
2359                                         break;
2360                                 case Close_wait:
2361                                 case Closing:
2362                                 case Last_ack:
2363                                         break;
2364                                 case Time_wait:
2365                                         tcpgo(tpriv, &tcb->timer);
2366                                         break;
2367                         }
2368                 }
2369
2370                 /*
2371                  *  get next adjacent segment from the resequence queue.
2372                  *  dump/trim any overlapping segments
2373                  */
2374                 for (;;) {
2375                         if (tcb->reseq == NULL)
2376                                 goto output;
2377
2378                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2379                                 goto output;
2380
2381                         getreseq(tcb, &seg, &bp, &length);
2382
2383                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2384                                 break;
2385                 }
2386         }
2387 output:
2388         tcpoutput(s);
2389         qunlock(&s->qlock);
2390         poperror();
2391         return;
2392 raise:
2393         qunlock(&s->qlock);
2394         poperror();
2395         freeblist(bp);
2396         tcpkick(s);
2397 }
2398
2399 /*
2400  *  always enters and exits with the s locked.  We drop
2401  *  the lock to ipoput the packet so some care has to be
2402  *  taken by callers.
2403  */
2404 void tcpoutput(struct conv *s)
2405 {
2406         Tcp seg;
2407         int msgs;
2408         Tcpctl *tcb;
2409         struct block *hbp, *bp;
2410         int sndcnt, n;
2411         uint32_t ssize, dsize, usable, sent;
2412         struct Fs *f;
2413         struct tcppriv *tpriv;
2414         uint8_t version;
2415
2416         f = s->p->f;
2417         tpriv = s->p->priv;
2418         version = s->ipversion;
2419
2420         for (msgs = 0; msgs < 100; msgs++) {
2421                 tcb = (Tcpctl *) s->ptcl;
2422
2423                 switch (tcb->state) {
2424                         case Listen:
2425                         case Closed:
2426                         case Finwait2:
2427                                 return;
2428                 }
2429
2430                 /* force an ack when a window has opened up */
2431                 if (tcb->rcv.blocked && tcb->rcv.wnd > 0) {
2432                         tcb->rcv.blocked = 0;
2433                         tcb->flags |= FORCE;
2434                 }
2435
2436                 sndcnt = qlen(s->wq) + tcb->flgcnt;
2437                 sent = tcb->snd.ptr - tcb->snd.una;
2438
2439                 /* Don't send anything else until our SYN has been acked */
2440                 if (tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2441                         break;
2442
2443                 /* Compute usable segment based on offered window and limit
2444                  * window probes to one
2445                  */
2446                 if (tcb->snd.wnd == 0) {
2447                         if (sent != 0) {
2448                                 if ((tcb->flags & FORCE) == 0)
2449                                         break;
2450 //              tcb->snd.ptr = tcb->snd.una;
2451                         }
2452                         usable = 1;
2453                 } else {
2454                         usable = tcb->cwind;
2455                         if (tcb->snd.wnd < usable)
2456                                 usable = tcb->snd.wnd;
2457                         usable -= sent;
2458                 }
2459                 ssize = sndcnt - sent;
2460                 if (ssize && usable < 2)
2461                         netlog(s->p->f, Logtcp, "throttled snd.wnd %lu cwind %lu\n",
2462                                    tcb->snd.wnd, tcb->cwind);
2463                 if (usable < ssize)
2464                         ssize = usable;
2465                 if (ssize > tcb->mss) {
2466                         if ((tcb->flags & TSO) == 0) {
2467                                 ssize = tcb->mss;
2468                         } else {
2469                                 int segs, window;
2470
2471                                 /*  Don't send too much.  32K is arbitrary..
2472                                  */
2473                                 if (ssize > 32 * 1024)
2474                                         ssize = 32 * 1024;
2475
2476                                 /* Clamp xmit to an integral MSS to
2477                                  * avoid ragged tail segments causing
2478                                  * poor link utilization.  Also
2479                                  * account for each segment sent in
2480                                  * msg heuristic, and round up to the
2481                                  * next multiple of 4, to ensure we
2482                                  * still yeild.
2483                                  */
2484                                 segs = ssize / tcb->mss;
2485                                 ssize = segs * tcb->mss;
2486                                 msgs += segs;
2487                                 if (segs > 3)
2488                                         msgs = (msgs + 4) & ~3;
2489                         }
2490                 }
2491
2492                 dsize = ssize;
2493                 seg.urg = 0;
2494
2495                 if (ssize == 0)
2496                         if ((tcb->flags & FORCE) == 0)
2497                                 break;
2498
2499                 tcb->flags &= ~FORCE;
2500                 tcprcvwin(s);
2501
2502                 /* By default we will generate an ack */
2503                 tcphalt(tpriv, &tcb->acktimer);
2504                 tcb->rcv.una = 0;
2505                 seg.source = s->lport;
2506                 seg.dest = s->rport;
2507                 seg.flags = ACK;
2508                 seg.mss = 0;
2509                 seg.ws = 0;
2510                 switch (tcb->state) {
2511                         case Syn_sent:
2512                                 seg.flags = 0;
2513                                 if (tcb->snd.ptr == tcb->iss) {
2514                                         seg.flags |= SYN;
2515                                         dsize--;
2516                                         seg.mss = tcb->mss;
2517                                         seg.ws = tcb->scale;
2518                                 }
2519                                 break;
2520                         case Syn_received:
2521                                 /*
2522                                  *  don't send any data with a SYN/ACK packet
2523                                  *  because Linux rejects the packet in its
2524                                  *  attempt to solve the SYN attack problem
2525                                  */
2526                                 if (tcb->snd.ptr == tcb->iss) {
2527                                         seg.flags |= SYN;
2528                                         dsize = 0;
2529                                         ssize = 1;
2530                                         seg.mss = tcb->mss;
2531                                         seg.ws = tcb->scale;
2532                                 }
2533                                 break;
2534                 }
2535                 seg.seq = tcb->snd.ptr;
2536                 seg.ack = tcb->rcv.nxt;
2537                 seg.wnd = tcb->rcv.wnd;
2538
2539                 /* Pull out data to send */
2540                 bp = NULL;
2541                 if (dsize != 0) {
2542                         bp = qcopy(s->wq, dsize, sent);
2543                         if (BLEN(bp) != dsize) {
2544                                 seg.flags |= FIN;
2545                                 dsize--;
2546                         }
2547                         if (BLEN(bp) > tcb->mss) {
2548                                 bp->flag |= Btso;
2549                                 bp->mss = tcb->mss;
2550                         }
2551                 }
2552
2553                 if (sent + dsize == sndcnt)
2554                         seg.flags |= PSH;
2555
2556                 /* keep track of balance of resent data */
2557                 if (seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2558                         n = tcb->snd.nxt - tcb->snd.ptr;
2559                         if (ssize < n)
2560                                 n = ssize;
2561                         tcb->resent += n;
2562                         netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr 0x%lx nxt 0x%lx\n",
2563                                    s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr,
2564                                    tcb->snd.nxt);
2565                         tpriv->stats[RetransSegs]++;
2566                 }
2567
2568                 tcb->snd.ptr += ssize;
2569
2570                 /* Pull up the send pointer so we can accept acks
2571                  * for this window
2572                  */
2573                 if (seq_gt(tcb->snd.ptr, tcb->snd.nxt))
2574                         tcb->snd.nxt = tcb->snd.ptr;
2575
2576                 /* Build header, link data and compute cksum */
2577                 switch (version) {
2578                         case V4:
2579                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2580                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2581                                 if (hbp == NULL) {
2582                                         freeblist(bp);
2583                                         return;
2584                                 }
2585                                 break;
2586                         case V6:
2587                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2588                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2589                                 if (hbp == NULL) {
2590                                         freeblist(bp);
2591                                         return;
2592                                 }
2593                                 break;
2594                         default:
2595                                 hbp = NULL;     /* to suppress a warning */
2596                                 panic("tcpoutput: version %d", version);
2597                 }
2598
2599                 /* Start the transmission timers if there is new data and we
2600                  * expect acknowledges
2601                  */
2602                 if (ssize != 0) {
2603                         if (tcb->timer.state != TcptimerON)
2604                                 tcpgo(tpriv, &tcb->timer);
2605
2606                         /*  If round trip timer isn't running, start it.
2607                          *  measure the longest packet only in case the
2608                          *  transmission time dominates RTT
2609                          */
2610                         if (tcb->rtt_timer.state != TcptimerON)
2611                                 if (ssize == tcb->mss) {
2612                                         tcpgo(tpriv, &tcb->rtt_timer);
2613                                         tcb->rttseq = tcb->snd.ptr;
2614                                 }
2615                 }
2616
2617                 tpriv->stats[OutSegs]++;
2618
2619                 /* put off the next keep alive */
2620                 tcpgo(tpriv, &tcb->katimer);
2621
2622                 switch (version) {
2623                         case V4:
2624                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2625                                         /* a negative return means no route */
2626                                         localclose(s, "no route");
2627                                 }
2628                                 break;
2629                         case V6:
2630                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2631                                         /* a negative return means no route */
2632                                         localclose(s, "no route");
2633                                 }
2634                                 break;
2635                         default:
2636                                 panic("tcpoutput2: version %d", version);
2637                 }
2638                 if ((msgs % 4) == 1) {
2639                         qunlock(&s->qlock);
2640                         kthread_yield();
2641                         qlock(&s->qlock);
2642                 }
2643         }
2644 }
2645
2646 /*
2647  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
2648  */
2649 void tcpsendka(struct conv *s)
2650 {
2651         Tcp seg;
2652         Tcpctl *tcb;
2653         struct block *hbp, *dbp;
2654
2655         tcb = (Tcpctl *) s->ptcl;
2656
2657         dbp = NULL;
2658         seg.urg = 0;
2659         seg.source = s->lport;
2660         seg.dest = s->rport;
2661         seg.flags = ACK | PSH;
2662         seg.mss = 0;
2663         seg.ws = 0;
2664         if (tcpporthogdefense)
2665                 urandom_read(&seg.seq, sizeof(seg.seq));
2666         else
2667                 seg.seq = tcb->snd.una - 1;
2668         seg.ack = tcb->rcv.nxt;
2669         tcb->rcv.una = 0;
2670         seg.wnd = tcb->rcv.wnd;
2671         if (tcb->state == Finwait2) {
2672                 seg.flags |= FIN;
2673         } else {
2674                 dbp = allocb(1);
2675                 dbp->wp++;
2676         }
2677
2678         if (isv4(s->raddr)) {
2679                 /* Build header, link data and compute cksum */
2680                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2681                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2682                 if (hbp == NULL) {
2683                         freeblist(dbp);
2684                         return;
2685                 }
2686                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2687         } else {
2688                 /* Build header, link data and compute cksum */
2689                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2690                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2691                 if (hbp == NULL) {
2692                         freeblist(dbp);
2693                         return;
2694                 }
2695                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2696         }
2697 }
2698
2699 /*
2700  *  set connection to time out after 12 minutes
2701  */
2702 void tcpsetkacounter(Tcpctl * tcb)
2703 {
2704         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
2705         if (tcb->kacounter < 3)
2706                 tcb->kacounter = 3;
2707 }
2708
2709 /*
2710  *  if we've timed out, close the connection
2711  *  otherwise, send a keepalive and restart the timer
2712  */
2713 void tcpkeepalive(void *v)
2714 {
2715         ERRSTACK(1);
2716         Tcpctl *tcb;
2717         struct conv *s;
2718
2719         s = v;
2720         tcb = (Tcpctl *) s->ptcl;
2721         qlock(&s->qlock);
2722         if (waserror()) {
2723                 qunlock(&s->qlock);
2724                 nexterror();
2725         }
2726         if (tcb->state != Closed) {
2727                 if (--(tcb->kacounter) <= 0) {
2728                         localclose(s, "connection timed out");
2729                 } else {
2730                         tcpsendka(s);
2731                         tcpgo(s->p->priv, &tcb->katimer);
2732                 }
2733         }
2734         qunlock(&s->qlock);
2735         poperror();
2736 }
2737
2738 /*
2739  *  start keepalive timer
2740  */
2741 static void tcpstartka(struct conv *s, char **f, int n)
2742 {
2743         Tcpctl *tcb;
2744         int x;
2745
2746         tcb = (Tcpctl *) s->ptcl;
2747         if (tcb->state != Established)
2748                 error(ENOTCONN, "connection must be in Establised state");
2749         if (n > 1) {
2750                 x = atoi(f[1]);
2751                 if (x >= MSPTICK)
2752                         tcb->katimer.start = x / MSPTICK;
2753         }
2754         tcpsetkacounter(tcb);
2755         tcpgo(s->p->priv, &tcb->katimer);
2756 }
2757
2758 /*
2759  *  turn checksums on/off
2760  */
2761 static void tcpsetchecksum(struct conv *s, char **f, int unused)
2762 {
2763         Tcpctl *tcb;
2764
2765         tcb = (Tcpctl *) s->ptcl;
2766         tcb->nochecksum = !atoi(f[1]);
2767 }
2768
2769 void tcprxmit(struct conv *s)
2770 {
2771         Tcpctl *tcb;
2772
2773         tcb = (Tcpctl *) s->ptcl;
2774
2775         tcb->flags |= RETRAN | FORCE;
2776         tcb->snd.ptr = tcb->snd.una;
2777
2778         /*
2779          *  We should be halving the slow start threshhold (down to one
2780          *  mss) but leaving it at mss seems to work well enough
2781          */
2782         tcb->ssthresh = tcb->mss;
2783
2784         /*
2785          *  pull window down to a single packet
2786          */
2787         tcb->cwind = tcb->mss;
2788         tcpoutput(s);
2789 }
2790
2791 void tcptimeout(void *arg)
2792 {
2793         ERRSTACK(1);
2794         struct conv *s;
2795         Tcpctl *tcb;
2796         int maxback;
2797         struct tcppriv *tpriv;
2798
2799         s = (struct conv *)arg;
2800         tpriv = s->p->priv;
2801         tcb = (Tcpctl *) s->ptcl;
2802
2803         qlock(&s->qlock);
2804         if (waserror()) {
2805                 qunlock(&s->qlock);
2806                 nexterror();
2807         }
2808         switch (tcb->state) {
2809                 default:
2810                         tcb->backoff++;
2811                         if (tcb->state == Syn_sent)
2812                                 maxback = MAXBACKMS / 2;
2813                         else
2814                                 maxback = MAXBACKMS;
2815                         tcb->backedoff += tcb->timer.start * MSPTICK;
2816                         if (tcb->backedoff >= maxback) {
2817                                 localclose(s, "connection timed out");
2818                                 break;
2819                         }
2820                         netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lx %llu/%llu\n",
2821                                    tcb->snd.una, tcb->timer.start, NOW);
2822                         tcpsettimer(tcb);
2823                         tcprxmit(s);
2824                         tpriv->stats[RetransTimeouts]++;
2825                         tcb->snd.dupacks = 0;
2826                         break;
2827                 case Time_wait:
2828                         localclose(s, NULL);
2829                         break;
2830                 case Closed:
2831                         break;
2832         }
2833         qunlock(&s->qlock);
2834         poperror();
2835 }
2836
2837 int inwindow(Tcpctl * tcb, int seq)
2838 {
2839         return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1);
2840 }
2841
2842 /*
2843  *  set up state for a received SYN (or SYN ACK) packet
2844  */
2845 void procsyn(struct conv *s, Tcp * seg)
2846 {
2847         Tcpctl *tcb;
2848
2849         tcb = (Tcpctl *) s->ptcl;
2850         tcb->flags |= FORCE;
2851
2852         tcb->rcv.nxt = seg->seq + 1;
2853         tcb->rcv.urg = tcb->rcv.nxt;
2854         tcb->irs = seg->seq;
2855
2856         /* our sending max segment size cannot be bigger than what he asked for */
2857         if (seg->mss != 0 && seg->mss < tcb->mss)
2858                 tcb->mss = seg->mss;
2859
2860         /* the congestion window always starts out as a single segment */
2861         tcb->snd.wnd = seg->wnd;
2862         tcb->cwind = tcb->mss;
2863 }
2864
2865 int
2866 addreseq(Tcpctl * tcb, struct tcppriv *tpriv, Tcp * seg,
2867                  struct block *bp, uint16_t length)
2868 {
2869         Reseq *rp, *rp1;
2870         int i, rqlen, qmax;
2871
2872         rp = kzmalloc(sizeof(Reseq), 0);
2873         if (rp == NULL) {
2874                 freeblist(bp);  /* bp always consumed by add_reseq */
2875                 return 0;
2876         }
2877
2878         rp->seg = *seg;
2879         rp->bp = bp;
2880         rp->length = length;
2881
2882         /* Place on reassembly list sorting by starting seq number */
2883         rp1 = tcb->reseq;
2884         if (rp1 == NULL || seq_lt(seg->seq, rp1->seg.seq)) {
2885                 rp->next = rp1;
2886                 tcb->reseq = rp;
2887                 if (rp->next != NULL)
2888                         tpriv->stats[OutOfOrder]++;
2889                 return 0;
2890         }
2891
2892         rqlen = 0;
2893         for (i = 0;; i++) {
2894                 rqlen += rp1->length;
2895                 if (rp1->next == NULL || seq_lt(seg->seq, rp1->next->seg.seq)) {
2896                         rp->next = rp1->next;
2897                         rp1->next = rp;
2898                         if (rp->next != NULL)
2899                                 tpriv->stats[OutOfOrder]++;
2900                         break;
2901                 }
2902                 rp1 = rp1->next;
2903         }
2904         qmax = QMAX << tcb->rcv.scale;
2905         if (rqlen > qmax) {
2906                 printd("resequence queue > window: %d > %d\n", rqlen, qmax);
2907                 i = 0;
2908                 for (rp1 = tcb->reseq; rp1 != NULL; rp1 = rp1->next) {
2909                         printd("0x%#lx 0x%#lx 0x%#x\n", rp1->seg.seq,
2910                                    rp1->seg.ack, rp1->seg.flags);
2911                         if (i++ > 10) {
2912                                 printd("...\n");
2913                                 break;
2914                         }
2915                 }
2916
2917                 // delete entire reassembly queue; wait for retransmit.
2918                 // - should we be smarter and only delete the tail?
2919                 for (rp = tcb->reseq; rp != NULL; rp = rp1) {
2920                         rp1 = rp->next;
2921                         freeblist(rp->bp);
2922                         kfree(rp);
2923                 }
2924                 tcb->reseq = NULL;
2925
2926                 return -1;
2927         }
2928         return 0;
2929 }
2930
2931 void getreseq(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2932 {
2933         Reseq *rp;
2934
2935         rp = tcb->reseq;
2936         if (rp == NULL)
2937                 return;
2938
2939         tcb->reseq = rp->next;
2940
2941         *seg = rp->seg;
2942         *bp = rp->bp;
2943         *length = rp->length;
2944
2945         kfree(rp);
2946 }
2947
2948 int tcptrim(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2949 {
2950         uint16_t len;
2951         uint8_t accept;
2952         int dupcnt, excess;
2953
2954         accept = 0;
2955         len = *length;
2956         if (seg->flags & SYN)
2957                 len++;
2958         if (seg->flags & FIN)
2959                 len++;
2960
2961         if (tcb->rcv.wnd == 0) {
2962                 if (len == 0 && seg->seq == tcb->rcv.nxt)
2963                         return 0;
2964         } else {
2965                 /* Some part of the segment should be in the window */
2966                 if (inwindow(tcb, seg->seq))
2967                         accept++;
2968                 else if (len != 0) {
2969                         if (inwindow(tcb, seg->seq + len - 1) ||
2970                                 seq_within(tcb->rcv.nxt, seg->seq, seg->seq + len - 1))
2971                                 accept++;
2972                 }
2973         }
2974         if (!accept) {
2975                 freeblist(*bp);
2976                 return -1;
2977         }
2978         dupcnt = tcb->rcv.nxt - seg->seq;
2979         if (dupcnt > 0) {
2980                 tcb->rerecv += dupcnt;
2981                 if (seg->flags & SYN) {
2982                         seg->flags &= ~SYN;
2983                         seg->seq++;
2984
2985                         if (seg->urg > 1)
2986                                 seg->urg--;
2987                         else
2988                                 seg->flags &= ~URG;
2989                         dupcnt--;
2990                 }
2991                 if (dupcnt > 0) {
2992                         pullblock(bp, (uint16_t) dupcnt);
2993                         seg->seq += dupcnt;
2994                         *length -= dupcnt;
2995
2996                         if (seg->urg > dupcnt)
2997                                 seg->urg -= dupcnt;
2998                         else {
2999                                 seg->flags &= ~URG;
3000                                 seg->urg = 0;
3001                         }
3002                 }
3003         }
3004         excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3005         if (excess > 0) {
3006                 tcb->rerecv += excess;
3007                 *length -= excess;
3008                 *bp = trimblock(*bp, 0, *length);
3009                 if (*bp == NULL)
3010                         panic("presotto is a boofhead");
3011                 seg->flags &= ~FIN;
3012         }
3013         return 0;
3014 }
3015
3016 void tcpadvise(struct Proto *tcp, struct block *bp, char *msg)
3017 {
3018         Tcp4hdr *h4;
3019         Tcp6hdr *h6;
3020         Tcpctl *tcb;
3021         uint8_t source[IPaddrlen];
3022         uint8_t dest[IPaddrlen];
3023         uint16_t psource, pdest;
3024         struct conv *s, **p;
3025
3026         h4 = (Tcp4hdr *) (bp->rp);
3027         h6 = (Tcp6hdr *) (bp->rp);
3028
3029         if ((h4->vihl & 0xF0) == IP_VER4) {
3030                 v4tov6(dest, h4->tcpdst);
3031                 v4tov6(source, h4->tcpsrc);
3032                 psource = nhgets(h4->tcpsport);
3033                 pdest = nhgets(h4->tcpdport);
3034         } else {
3035                 ipmove(dest, h6->tcpdst);
3036                 ipmove(source, h6->tcpsrc);
3037                 psource = nhgets(h6->tcpsport);
3038                 pdest = nhgets(h6->tcpdport);
3039         }
3040
3041         /* Look for a connection */
3042         qlock(&tcp->qlock);
3043         for (p = tcp->conv; *p; p++) {
3044                 s = *p;
3045                 tcb = (Tcpctl *) s->ptcl;
3046                 if (s->rport == pdest)
3047                         if (s->lport == psource)
3048                                 if (tcb->state != Closed)
3049                                         if (ipcmp(s->raddr, dest) == 0)
3050                                                 if (ipcmp(s->laddr, source) == 0) {
3051                                                         qlock(&s->qlock);
3052                                                         qunlock(&tcp->qlock);
3053                                                         switch (tcb->state) {
3054                                                                 case Syn_sent:
3055                                                                         localclose(s, msg);
3056                                                                         break;
3057                                                         }
3058                                                         qunlock(&s->qlock);
3059                                                         freeblist(bp);
3060                                                         return;
3061                                                 }
3062         }
3063         qunlock(&tcp->qlock);
3064         freeblist(bp);
3065 }
3066
3067 static void tcpporthogdefensectl(char *val)
3068 {
3069         if (strcmp(val, "on") == 0)
3070                 tcpporthogdefense = 1;
3071         else if (strcmp(val, "off") == 0)
3072                 tcpporthogdefense = 0;
3073         else
3074                 error(EINVAL, "unknown value for tcpporthogdefense");
3075 }
3076
3077 /* called with c qlocked */
3078 static void tcpctl(struct conv *c, char **f, int n)
3079 {
3080         if (n == 1 && strcmp(f[0], "hangup") == 0)
3081                 tcphangup(c);
3082         else if (n >= 1 && strcmp(f[0], "keepalive") == 0)
3083                 tcpstartka(c, f, n);
3084         else if (n >= 1 && strcmp(f[0], "checksum") == 0)
3085                 tcpsetchecksum(c, f, n);
3086         else if (n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3087                 tcpporthogdefensectl(f[1]);
3088         else
3089                 error(EINVAL, "unknown command to %s", __func__);
3090 }
3091
3092 int tcpstats(struct Proto *tcp, char *buf, int len)
3093 {
3094         struct tcppriv *priv;
3095         char *p, *e;
3096         int i;
3097
3098         priv = tcp->priv;
3099         p = buf;
3100         e = p + len;
3101         for (i = 0; i < Nstats; i++)
3102                 p = seprintf(p, e, "%s: %u\n", statnames[i], priv->stats[i]);
3103         return p - buf;
3104 }
3105
3106 /*
3107  *  garbage collect any stale conversations:
3108  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3109  *      - Finwait2 after 5 minutes
3110  *
3111  *  this is called whenever we run out of channels.  Both checks are
3112  *  of questionable validity so we try to use them only when we're
3113  *  up against the wall.
3114  */
3115 int tcpgc(struct Proto *tcp)
3116 {
3117         struct conv *c, **pp, **ep;
3118         int n;
3119         Tcpctl *tcb;
3120
3121         n = 0;
3122         ep = &tcp->conv[tcp->nc];
3123         for (pp = tcp->conv; pp < ep; pp++) {
3124                 c = *pp;
3125                 if (c == NULL)
3126                         break;
3127                 if (!canqlock(&c->qlock))
3128                         continue;
3129                 tcb = (Tcpctl *) c->ptcl;
3130                 switch (tcb->state) {
3131                         case Syn_received:
3132                                 if (NOW - tcb->time > 5000) {
3133                                         localclose(c, "timed out");
3134                                         n++;
3135                                 }
3136                                 break;
3137                         case Finwait2:
3138                                 if (NOW - tcb->time > 5 * 60 * 1000) {
3139                                         localclose(c, "timed out");
3140                                         n++;
3141                                 }
3142                                 break;
3143                 }
3144                 qunlock(&c->qlock);
3145         }
3146         return n;
3147 }
3148
3149 void tcpsettimer(Tcpctl * tcb)
3150 {
3151         int x;
3152
3153         /* round trip dependency */
3154         x = backoff(tcb->backoff) *
3155                 (tcb->mdev + (tcb->srtt >> LOGAGAIN) + MSPTICK) / MSPTICK;
3156
3157         /* bounded twixt 1/2 and 64 seconds */
3158         if (x < 500 / MSPTICK)
3159                 x = 500 / MSPTICK;
3160         else if (x > (64000 / MSPTICK))
3161                 x = 64000 / MSPTICK;
3162         tcb->timer.start = x;
3163 }
3164
3165 void tcpinit(struct Fs *fs)
3166 {
3167         struct Proto *tcp;
3168         struct tcppriv *tpriv;
3169
3170         tcp = kzmalloc(sizeof(struct Proto), 0);
3171         tpriv = tcp->priv = kzmalloc(sizeof(struct tcppriv), 0);
3172         qlock_init(&tpriv->tl);
3173         qlock_init(&tpriv->apl);
3174         tcp->name = "tcp";
3175         tcp->connect = tcpconnect;
3176         tcp->announce = tcpannounce;
3177         tcp->ctl = tcpctl;