net: Add accounting to help TSO/LSO/GSO
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 #include <vfs.h>
44 #include <kfs.h>
45 #include <slab.h>
46 #include <kmalloc.h>
47 #include <kref.h>
48 #include <string.h>
49 #include <stdio.h>
50 #include <assert.h>
51 #include <error.h>
52 #include <cpio.h>
53 #include <pmap.h>
54 #include <smp.h>
55 #include <ip.h>
56
/* Protocol-wide constants: queue/header sizes, timer values, TCP header flag
 * bits, option codes, defaults, tcb flag bits and the connection states. */
57 enum {
58         QMAX = 64 * 1024 - 1,   /* rq limit and default unscaled window (see tcpcreate(), inittcpctl()) */
59         IP_TCPPROTO = 6,
60
61         TCP4_IPLEN = 8,
62         TCP4_PHDRSIZE = 12,
63         TCP4_HDRSIZE = 20,
64         TCP4_TCBPHDRSZ = 40,
65         TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE,
66
67         TCP6_IPLEN = 0,
68         TCP6_PHDRSIZE = 40,
69         TCP6_HDRSIZE = 20,
70         TCP6_TCBPHDRSZ = 60,
71         TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE,
72
73         TcptimerOFF = 0,        /* Tcptimer.state values */
74         TcptimerON = 1,
75         TcptimerDONE = 2,
76         MAX_TIME = (1 << 20),   /* Forever */
77         TCP_ACK = 50,   /* Timed ack sequence in ms */
78         MAXBACKMS = 9 * 60 * 1000,      /* longest backoff time (ms) before hangup */
79
80         URG = 0x20,     /* Data marked urgent */
81         ACK = 0x10,     /* Acknowledge is valid */
82         PSH = 0x08,     /* Whole data pipe is pushed */
83         RST = 0x04,     /* Reset connection */
84         SYN = 0x02,     /* Synchronise sequence numbers */
85         FIN = 0x01,     /* Start close down */
86
87         EOLOPT = 0,
88         NOOPOPT = 1,
89         MSSOPT = 2,
90         MSS_LENGTH = 4, /* MSS (maximum segment size) option; presumably its length in bytes */
91         WSOPT = 3,
92         WS_LENGTH = 3,  /* Window scale option (bits to scale window size by) */
93         MSL2 = 10,
94         MSPTICK = 50,   /* Milliseconds per timer tick */
95         DEF_MSS = 1460, /* Default maximum segment size */
96         DEF_MSS6 = 1280,        /* Default maximum segment size (min) for v6 */
97         DEF_RTT = 500,  /* Default round trip (ms) */
98         DEF_KAT = 120000,       /* Default time (ms) between keep alives */
99         TCP_LISTEN = 0, /* Listen connection */
100         TCP_CONNECT = 1,        /* Outgoing connection */
101         SYNACK_RXTIMER = 250,   /* ms between SYNACK retransmits */
102
103         TCPREXMTTHRESH = 3,     /* dupack threshold for rxt */
104
105         FORCE = 1,      /* flag bits; FORCE, CLONE, ACTIVE and TSO are or'd into Tcpctl.flags */
106         CLONE = 2,
107         RETRAN = 4,
108         ACTIVE = 8,
109         SYNACK = 16,
110         TSO = 32,       /* set by tcpmtu() when the interface advertises NETF_TSO */
111
112         LOGAGAIN = 3,   /* srtt is kept scaled: inittcpctl() stores tcp_irtt << LOGAGAIN */
113         LOGDGAIN = 2,
114
115         Closed = 0,     /* Connection states */
116         Listen,
117         Syn_sent,
118         Syn_received,
119         Established,
120         Finwait1,
121         Finwait2,
122         Close_wait,
123         Closing,
124         Last_ack,
125         Time_wait,
126
127         Maxlimbo = 1000,        /* maximum procs waiting for response to SYN ACK */
128         NLHT = 256,     /* hash table size, must be a power of 2 */
129         LHTMASK = NLHT - 1,
130
131         HaveWS = 1 << 8,        /* or'd into the scale value produced by tcpmtu() */
132 };
133
134 /* Must correspond to the enumeration above */
135 char *tcpstates[] = {
136         "Closed", "Listen", "Syn_sent", "Syn_received",
137         "Established", "Finwait1", "Finwait2", "Close_wait",
138         "Closing", "Last_ack", "Time_wait"
139 };
140
/* One countdown timer.  Running timers are chained on the doubly linked list
 * headed by tcppriv.timers; tcpackproc() decrements count once per pass, and it
 * sleeps MSPTICK ms between passes. */
141 typedef struct Tcptimer Tcptimer;
142 struct Tcptimer {
143         Tcptimer *next;         /* timer list links; both NULL while unchained (see timerstate()) */
144         Tcptimer *prev;
145         Tcptimer *readynext;    /* chains timers that expired in the same tcpackproc() pass */
146         int state;              /* TcptimerOFF, TcptimerON or TcptimerDONE */
147         uint64_t start;         /* reload value in ticks; tcpgo() ignores a timer whose start is 0 */
148         uint64_t count;         /* ticks remaining; loaded from start by tcpgo() */
149         void (*func) (void *);  /* called with arg by tcpackproc() when count reaches 0 (may be NULL) */
150         void *arg;
151 };
152
153 /*
154  *  v4 and v6 pseudo headers used for
155  *  checksumming tcp
156  *
 *  Field sizes line up with the TCP4_* constants: the first five fields are
 *  TCP4_IPLEN (8) bytes, Unused..tcpdst are TCP4_PHDRSIZE (12) bytes and
 *  tcpsport..tcpurg are TCP4_HDRSIZE (20) bytes.
 */
157 typedef struct Tcp4hdr Tcp4hdr;
158 struct Tcp4hdr {
159         uint8_t vihl;                           /* Version and header length */
160         uint8_t tos;                            /* Type of service */
161         uint8_t length[2];                      /* packet length */
162         uint8_t id[2];                          /* Identification */
163         uint8_t frag[2];                        /* Fragment information */
164         uint8_t Unused;
165         uint8_t proto;                          /* inittcpctl() stores IP_TCPPROTO here */
166         uint8_t tcplen[2];
167         uint8_t tcpsrc[4];
168         uint8_t tcpdst[4];
169         uint8_t tcpsport[2];
170         uint8_t tcpdport[2];
171         uint8_t tcpseq[4];
172         uint8_t tcpack[4];
173         uint8_t tcpflag[2];
174         uint8_t tcpwin[2];
175         uint8_t tcpcksum[2];
176         uint8_t tcpurg[2];
177         /* Options segment */
178         uint8_t tcpopt[1];
179 };
180
/* v6 counterpart of Tcp4hdr.  tcpsport..tcpurg are TCP6_HDRSIZE (20) bytes;
 * the fields before them add up to TCP6_PHDRSIZE (40) assuming IPaddrlen is 16
 * -- TODO confirm against ip.h. */
181 typedef struct Tcp6hdr Tcp6hdr;
182 struct Tcp6hdr {
183         uint8_t vcf[4];
184         uint8_t ploadlen[2];
185         uint8_t proto;                          /* inittcpctl() stores IP_TCPPROTO here */
186         uint8_t ttl;
187         uint8_t tcpsrc[IPaddrlen];
188         uint8_t tcpdst[IPaddrlen];
189         uint8_t tcpsport[2];
190         uint8_t tcpdport[2];
191         uint8_t tcpseq[4];
192         uint8_t tcpack[4];
193         uint8_t tcpflag[2];
194         uint8_t tcpwin[2];
195         uint8_t tcpcksum[2];
196         uint8_t tcpurg[2];
197         /* Options segment */
198         uint8_t tcpopt[1];
199 };
200
201 /*
202  *  This represents the control info
203  *  for a single packet.  It is derived from
204  *  a packet in ntohtcp{4,6}() and stuck into
205  *  a packet in htontcp{4,6}().
206  */
207 typedef struct Tcp Tcp;
208 struct Tcp {
209         uint16_t source;                        /* presumably the source port -- confirm in ntohtcp{4,6}() */
210         uint16_t dest;                          /* presumably the destination port */
211         uint32_t seq;
212         uint32_t ack;
213         uint8_t flags;                          /* presumably the URG/ACK/PSH/RST/SYN/FIN bits */
214         uint16_t ws;                            /* window scale option (if not zero) */
215         uint32_t wnd;
216         uint16_t urg;
217         uint16_t mss;                           /* max segment size option (if not zero) */
218         uint16_t len;                           /* size of data */
219 };
220
221 /*
222  *  This header is malloc'd to thread together fragments
223  *  waiting to be coalesced.  The list hangs off Tcpctl.reseq;
 *  localclose() releases each entry with freeblist(bp) and kfree().
224  */
225 typedef struct Reseq Reseq;
226 struct Reseq {
227         Reseq *next;
228         Tcp seg;
229         struct block *bp;
230         uint16_t length;
231 };
232
233 /*
234  *  The TCP control block of one conversation (reached via conv->ptcl).
 *  The qlock in the Conv locks this structure.
235  */
236 typedef struct Tcpctl Tcpctl;
237 struct Tcpctl {
238         uint8_t state;                          /* Connection state */
239         uint8_t type;                           /* Listening or active connection */
240         uint8_t code;                           /* Icmp code */
241         struct {
242                 uint32_t una;                   /* Unacked data pointer */
243                 uint32_t nxt;                   /* Next sequence expected */
244                 uint32_t ptr;                   /* Data pointer */
245                 uint32_t wnd;                   /* Tcp send window */
246                 uint32_t urg;                   /* Urgent data pointer */
247                 uint32_t wl2;
248                 int scale;                              /* how much to right shift window in xmitted packets */
249                 /* to implement tahoe and reno TCP */
250                 uint32_t dupacks;               /* number of duplicate acks rcvd */
251                 int recovery;                   /* loss recovery flag */
252                 uint32_t rxt;                   /* right window marker for recovery */
253         } snd;
254         struct {
255                 uint32_t nxt;                   /* Receive pointer to next uint8_t slot */
256                 uint32_t wnd;                   /* Receive window incoming */
257                 uint32_t urg;                   /* Urgent pointer */
258                 int blocked;                    /* set by tcprcvwin() when the window has closed to 0 */
259                 int una;                                /* unacked data segs */
260                 int scale;                              /* how much to left shift window in rcved packets */
261         } rcv;
262         uint32_t iss;                           /* Initial sequence number */
263         int sawwsopt;                           /* true if we saw a wsopt on the incoming SYN */
264         uint32_t cwind;                         /* Congestion window */
265         int scale;                                      /* desired snd.scale */
266         uint16_t ssthresh;                      /* Slow start threshold */
267         int resent;                                     /* Bytes just resent */
268         int irs;                                        /* Initial received sequence */
269         uint16_t mss;                           /* Maximum segment size */
270         int rerecv;                                     /* Overlap of data re-received */
271         uint32_t window;                        /* Receive window */
272         uint8_t backoff;                        /* Exponential backoff counter */
273         int backedoff;                          /* ms we've backed off for rexmits */
274         uint8_t flags;                          /* State flags */
275         Reseq *reseq;                           /* Resequencing queue */
276         Tcptimer timer;                         /* Activity timer */
277         Tcptimer acktimer;                      /* Acknowledge timer */
278         Tcptimer rtt_timer;                     /* Round trip timer */
279         Tcptimer katimer;                       /* keep alive timer */
280         uint32_t rttseq;                        /* Round trip sequence */
281         int srtt;                                       /* Smoothed round trip, stored << LOGAGAIN (see inittcpctl()) */
282         int mdev;                                       /* Mean deviation of round trip */
283         int kacounter;                          /* count down for keep alive */
284         uint64_t sndsyntime;            /* time syn sent */
285         uint64_t time;                          /* time Finwait2 or Syn_received was sent */
286         int nochecksum;                         /* non-zero means don't send checksums */
287         int flgcnt;                                     /* number of flags in the sequence (FIN; NOTE(review): "SEQ" in the old comment presumably meant SYN) */
288
289         union {
290                 Tcp4hdr tcp4hdr;
291                 Tcp6hdr tcp6hdr;
292         } protohdr;                                     /* prototype header */
293 };
294
295 /*
296  *  New calls are put in limbo rather than having a conversation structure
297  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
298  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
299  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
300  *
301  *  In particular they aren't on a listener's queue so that they don't figure
302  *  in the input queue limit.
303  *
304  *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
305  *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
306  *  there is no hashing of this list.
307  */
308 typedef struct Limbo Limbo;
309 struct Limbo {
310         Limbo *next;                            /* next entry on the same tcppriv.lht[] chain */
311
312         uint8_t laddr[IPaddrlen];
313         uint8_t raddr[IPaddrlen];
314         uint16_t lport;
315         uint16_t rport;
316         uint32_t irs;                           /* initial received sequence */
317         uint32_t iss;                           /* initial sent sequence */
318         uint16_t mss;                           /* mss from the other end */
319         uint16_t rcvscale;                      /* how much to scale rcvd windows */
320         uint16_t sndscale;                      /* how much to scale sent windows */
321         uint64_t lastsend;                      /* last time we sent a synack */
322         uint8_t version;                        /* v4 or v6 */
323         uint8_t rexmits;                        /* number of retransmissions */
324 };
325
/* Tunable defaults.  tcp_irtt is in ms: inittcpctl() divides it by MSPTICK
 * (ms per tick) to get the retransmit timer's tick count. */
326 int tcp_irtt = DEF_RTT;                 /* Initial guess at round trip time */
327 uint16_t tcp_mss = DEF_MSS;             /* Maximum segment size to be sent */
328
/* Indices into tcppriv.stats[] and statnames[]; Nstats is the array size and
 * must stay last. */
329 enum {
330         /* MIB stats */
331         MaxConn,
332         ActiveOpens,            /* bumped by tcpstart() for TCP_CONNECT */
333         PassiveOpens,           /* bumped by tcpstart() for TCP_LISTEN */
334         EstabResets,
335         CurrEstab,              /* gauge maintained by tcpsetstate() */
336         InSegs,
337         OutSegs,
338         RetransSegs,
339         RetransTimeouts,
340         InErrs,
341         OutRsts,
342
343         /* non-MIB stats */
344         CsumErrs,
345         HlenErrs,
346         LenErrs,
347         OutOfOrder,
348
349         Nstats
350 };
351
352 static char *statnames[] = {
353         [MaxConn] "MaxConn",
354         [ActiveOpens] "ActiveOpens",
355         [PassiveOpens] "PassiveOpens",
356         [EstabResets] "EstabResets",
357         [CurrEstab] "CurrEstab",
358         [InSegs] "InSegs",
359         [OutSegs] "OutSegs",
360         [RetransSegs] "RetransSegs",
361         [RetransTimeouts] "RetransTimeouts",
362         [InErrs] "InErrs",
363         [OutRsts] "OutRsts",
364         [CsumErrs] "CsumErrs",
365         [HlenErrs] "HlenErrs",
366         [LenErrs] "LenErrs",
367         [OutOfOrder] "OutOfOrder",
368 };
369
370 typedef struct Tcppriv Tcppriv;
371 struct tcppriv {
372         /* List of active timers */
373         qlock_t tl;
374         Tcptimer *timers;
375
376         /* hash table for matching conversations */
377         struct Ipht ht;
378
379         /* calls in limbo waiting for an ACK to our SYN ACK */
380         int nlimbo;
381         Limbo *lht[NLHT];
382
383         /* for keeping track of tcpackproc */
384         qlock_t apl;
385         int ackprocstarted;
386
387         uint32_t stats[Nstats];
388 };
389
390 /*
391  *  Setting tcpporthogdefense to non-zero enables Dong Lin's
392  *  solution to hijacked systems staking out ports as a form
393  *  of DoS attack.
394  *
395  *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
396  *  that number gets acked by the other end, we shut down the connection.
397  *  Look for tcpporthogdefense in the code.
398  */
399 int tcpporthogdefense = 0;
400
/* Forward declarations.  Several of these (e.g. localclose, tcprcvwin,
 * tcpacktimer, tcpstart) are defined further down in this file. */
401 int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, uint16_t);
402 void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
403 void localclose(struct conv *, char *unused_char_p_t);
404 void procsyn(struct conv *, Tcp *);
405 void tcpiput(struct Proto *, struct Ipifc *, struct block *);
406 void tcpoutput(struct conv *);
407 int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
408 void tcpstart(struct conv *, int);
409 void tcptimeout(void *);
410 void tcpsndsyn(struct conv *, Tcpctl *);
411 void tcprcvwin(struct conv *);
412 void tcpacktimer(void *);
413 void tcpkeepalive(void *);
414 void tcpsetkacounter(Tcpctl *);
415 void tcprxmit(struct conv *);
416 void tcpsettimer(Tcpctl *);
417 void tcpsynackrtt(struct conv *);
418 void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
419
420 static void limborexmit(struct Proto *);
421 static void limbo(struct conv *, uint8_t * unused_uint8_p_t, uint8_t *, Tcp *,
422                                   int);
423
424 void tcpsetstate(struct conv *s, uint8_t newstate)
425 {
426         Tcpctl *tcb;
427         uint8_t oldstate;
428         struct tcppriv *tpriv;
429
430         tpriv = s->p->priv;
431
432         tcb = (Tcpctl *) s->ptcl;
433
434         oldstate = tcb->state;
435         if (oldstate == newstate)
436                 return;
437
438         if (oldstate == Established)
439                 tpriv->stats[CurrEstab]--;
440         if (newstate == Established)
441                 tpriv->stats[CurrEstab]++;
442
443         /**
444         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
445                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
446         **/
447
448         switch (newstate) {
449                 case Closed:
450                         qclose(s->rq);
451                         qclose(s->wq);
452                         qclose(s->eq);
453                         break;
454
455                 case Close_wait:        /* Remote closes */
456                         qhangup(s->rq, NULL);
457                         break;
458         }
459
460         tcb->state = newstate;
461
462         if (oldstate == Syn_sent && newstate != Closed)
463                 Fsconnected(s, NULL);
464 }
465
/* Active open: hand argv to Fsstdconnect(), then start the conversation in
 * TCP_CONNECT mode. */
466 static void tcpconnect(struct conv *c, char **argv, int argc)
467 {
468         Fsstdconnect(c, argv, argc);
469         tcpstart(c, TCP_CONNECT);       /* for TCP_CONNECT tcpstart() sends the SYN and enters Syn_sent */
470 }
471
472 static int tcpstate(struct conv *c, char *state, int n)
473 {
474         Tcpctl *s;
475
476         s = (Tcpctl *) (c->ptcl);
477
478         return snprintf(state, n,
479                                         "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
480                                         tcpstates[s->state],
481                                         c->rq ? qlen(c->rq) : 0,
482                                         c->wq ? qlen(c->wq) : 0,
483                                         s->srtt, s->mdev,
484                                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
485                                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
486                                         s->katimer.start, s->katimer.count);
487 }
488
489 static int tcpinuse(struct conv *c)
490 {
491         Tcpctl *s;
492
493         s = (Tcpctl *) (c->ptcl);
494         return s->state != Closed;
495 }
496
/* Passive open: hand argv to Fsstdannounce(), start the conversation in
 * TCP_LISTEN mode and report it connected with no error string. */
497 static void tcpannounce(struct conv *c, char **argv, int argc)
498 {
499         Fsstdannounce(c, argv, argc);
500         tcpstart(c, TCP_LISTEN);        /* tcpstart() puts the tcb into Listen for TCP_LISTEN */
501         Fsconnected(c, NULL);
502 }
503
/* Hand argv to Fsstdbypass(), then add cv to the protocol's conversation hash
 * table.  Unlike tcpconnect()/tcpannounce(), tcpstart() is not called. */
504 static void tcpbypass(struct conv *cv, char **argv, int argc)
505 {
506         struct tcppriv *tpriv = cv->p->priv;
507
508         Fsstdbypass(cv, argv, argc);
509         iphtadd(&tpriv->ht, cv);
510 }
511
512 static void tcpshutdown(struct conv *c, int how)
513 {
514         Tcpctl *tcb = (Tcpctl*)c->ptcl;
515
516         /* Do nothing for the read side */
517         if (how == SHUT_RD)
518                 return;
519         /* Sends a FIN.  If we're in another state (like Listen), we'll run into
520          * issues, since we'll never send the FIN.  We'll be shutdown on our end,
521          * but we'll never tell the distant end.  Might just be an app issue. */
522         switch (tcb->state) {
523         case Syn_received:
524         case Established:
525                 tcb->flgcnt++;
526                 tcb->snd.nxt++;
527                 tcpsetstate(c, Finwait1);
528                 tcpoutput(c);
529                 break;
530         }
531 }
532
533 /*
534  *  tcpclose is always called with the q locked
535  */
536 static void tcpclose(struct conv *c)
537 {
538         Tcpctl *tcb;
539
540         tcb = (Tcpctl *) c->ptcl;
541
542         qhangup(c->rq, NULL);
543         qhangup(c->wq, NULL);
544         qhangup(c->eq, NULL);
545         qflush(c->rq);
546
547         switch (tcb->state) {
548                 case Listen:
549                         /*
550                          *  reset any incoming calls to this listener
551                          */
552                         Fsconnected(c, "Hangup");
553
554                         localclose(c, NULL);
555                         break;
556                 case Closed:
557                 case Syn_sent:
558                         localclose(c, NULL);
559                         break;
560                 case Syn_received:
561                 case Established:
562                         tcb->flgcnt++;
563                         tcb->snd.nxt++;
564                         tcpsetstate(c, Finwait1);
565                         tcpoutput(c);
566                         break;
567                 case Close_wait:
568                         tcb->flgcnt++;
569                         tcb->snd.nxt++;
570                         tcpsetstate(c, Last_ack);
571                         tcpoutput(c);
572                         break;
573         }
574 }
575
576 void tcpkick(void *x)
577 {
578         ERRSTACK(1);
579         struct conv *s = x;
580         Tcpctl *tcb;
581
582         tcb = (Tcpctl *) s->ptcl;
583
584         qlock(&s->qlock);
585         if (waserror()) {
586                 qunlock(&s->qlock);
587                 nexterror();
588         }
589
590         switch (tcb->state) {
591                 case Syn_sent:
592                 case Syn_received:
593                 case Established:
594                 case Close_wait:
595                         /*
596                          * Push data
597                          */
598                         tcprcvwin(s);
599                         tcpoutput(s);
600                         break;
601                 default:
602                         localclose(s, "Hangup");
603                         break;
604         }
605
606         qunlock(&s->qlock);
607         poperror();
608 }
609
610 void tcprcvwin(struct conv *s)
611 {       /* Call with tcb locked */
612         int w;
613         Tcpctl *tcb;
614
615         tcb = (Tcpctl *) s->ptcl;
616         w = tcb->window - qlen(s->rq);
617         if (w < 0)
618                 w = 0;
619         tcb->rcv.wnd = w;
620         if (w == 0)
621                 tcb->rcv.blocked = 1;
622 }
623
624 void tcpacktimer(void *v)
625 {
626         ERRSTACK(1);
627         Tcpctl *tcb;
628         struct conv *s;
629
630         s = v;
631         tcb = (Tcpctl *) s->ptcl;
632
633         qlock(&s->qlock);
634         if (waserror()) {
635                 qunlock(&s->qlock);
636                 nexterror();
637         }
638         if (tcb->state != Closed) {
639                 tcb->flags |= FORCE;
640                 tcprcvwin(s);
641                 tcpoutput(s);
642         }
643         qunlock(&s->qlock);
644         poperror();
645 }
646
/* Create the read and write queues of a new conversation c. */
647 static void tcpcreate(struct conv *c)
648 {
649         c->rq = qopen(QMAX, Qcoalesce, 0, 0);  /* read queue limited to QMAX bytes */
650         c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);    /* tcpkick(c) is the queue's kick routine */
651 }
652
653 static void timerstate(struct tcppriv *priv, Tcptimer * t, int newstate)
654 {
655         if (newstate != TcptimerON) {
656                 if (t->state == TcptimerON) {
657                         // unchain
658                         if (priv->timers == t) {
659                                 priv->timers = t->next;
660                                 if (t->prev != NULL)
661                                         panic("timerstate1");
662                         }
663                         if (t->next)
664                                 t->next->prev = t->prev;
665                         if (t->prev)
666                                 t->prev->next = t->next;
667                         t->next = t->prev = NULL;
668                 }
669         } else {
670                 if (t->state != TcptimerON) {
671                         // chain
672                         if (t->prev != NULL || t->next != NULL)
673                                 panic("timerstate2");
674                         t->prev = NULL;
675                         t->next = priv->timers;
676                         if (t->next)
677                                 t->next->prev = t;
678                         priv->timers = t;
679                 }
680         }
681         t->state = newstate;
682 }
683
684 void tcpackproc(void *a)
685 {
686         ERRSTACK(1);
687         Tcptimer *t, *tp, *timeo;
688         struct Proto *tcp;
689         struct tcppriv *priv;
690         int loop;
691
692         tcp = a;
693         priv = tcp->priv;
694
695         for (;;) {
696                 kthread_usleep(MSPTICK * 1000);
697
698                 qlock(&priv->tl);
699                 timeo = NULL;
700                 loop = 0;
701                 for (t = priv->timers; t != NULL; t = tp) {
702                         if (loop++ > 10000)
703                                 panic("tcpackproc1");
704                         tp = t->next;
705                         if (t->state == TcptimerON) {
706                                 t->count--;
707                                 if (t->count == 0) {
708                                         timerstate(priv, t, TcptimerDONE);
709                                         t->readynext = timeo;
710                                         timeo = t;
711                                 }
712                         }
713                 }
714                 qunlock(&priv->tl);
715
716                 loop = 0;
717                 for (t = timeo; t != NULL; t = t->readynext) {
718                         if (loop++ > 10000)
719                                 panic("tcpackproc2");
720                         if (t->state == TcptimerDONE && t->func != NULL) {
721                                 /* discard error style */
722                                 if (!waserror())
723                                         (*t->func) (t->arg);
724                                 poperror();
725                         }
726                 }
727
728                 limborexmit(tcp);
729         }
730 }
731
732 void tcpgo(struct tcppriv *priv, Tcptimer * t)
733 {
734         if (t == NULL || t->start == 0)
735                 return;
736
737         qlock(&priv->tl);
738         t->count = t->start;
739         timerstate(priv, t, TcptimerON);
740         qunlock(&priv->tl);
741 }
742
743 void tcphalt(struct tcppriv *priv, Tcptimer * t)
744 {
745         if (t == NULL)
746                 return;
747
748         qlock(&priv->tl);
749         timerstate(priv, t, TcptimerOFF);
750         qunlock(&priv->tl);
751 }
752
/* Exponential backoff multiplier: returns 2 to the power n, computed as a left
 * shift of the int 1. */
int backoff(int n)
{
	int factor = 1;

	factor <<= n;
	return factor;
}
757
758 void localclose(struct conv *s, char *reason)
759 {       /* called with tcb locked */
760         Tcpctl *tcb;
761         Reseq *rp, *rp1;
762         struct tcppriv *tpriv;
763
764         tpriv = s->p->priv;
765         tcb = (Tcpctl *) s->ptcl;
766
767         iphtrem(&tpriv->ht, s);
768
769         tcphalt(tpriv, &tcb->timer);
770         tcphalt(tpriv, &tcb->rtt_timer);
771         tcphalt(tpriv, &tcb->acktimer);
772         tcphalt(tpriv, &tcb->katimer);
773
774         /* Flush reassembly queue; nothing more can arrive */
775         for (rp = tcb->reseq; rp != NULL; rp = rp1) {
776                 rp1 = rp->next;
777                 freeblist(rp->bp);
778                 kfree(rp);
779         }
780         tcb->reseq = NULL;
781
782         if (tcb->state == Syn_sent)
783                 Fsconnected(s, reason);
784
785         qhangup(s->rq, reason);
786         qhangup(s->wq, reason);
787
788         tcpsetstate(s, Closed);
789
790         /* listener will check the rq state */
791         if (s->state == Announced)
792                 rendez_wakeup(&s->listenr);
793 }
794
795 /* mtu (- TCP + IP hdr len) of 1st hop */
796 int tcpmtu(struct Proto *tcp, uint8_t * addr, int version, int *scale,
797            uint8_t *flags)
798 {
799         struct Ipifc *ifc;
800         int mtu;
801
802         ifc = findipifc(tcp->f, addr, 0);
803         switch (version) {
804                 default:
805                 case V4:
806                         mtu = DEF_MSS;
807                         if (ifc != NULL)
808                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
809                         break;
810                 case V6:
811                         mtu = DEF_MSS6;
812                         if (ifc != NULL)
813                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
814                         break;
815         }
816         *flags &= ~TSO;
817
818         if (ifc != NULL) {
819                 if (ifc->mbps > 100)
820                         *scale = HaveWS | 3;
821                 else if (ifc->mbps > 10)
822                         *scale = HaveWS | 1;
823                 else
824                         *scale = HaveWS | 0;
825                 if (ifc->feat & NETF_TSO)
826                         *flags |= TSO;
827         } else
828                 *scale = HaveWS | 0;
829
830         return mtu;
831 }
832
833 void inittcpctl(struct conv *s, int mode)
834 {
835         Tcpctl *tcb;
836         Tcp4hdr *h4;
837         Tcp6hdr *h6;
838         int mss;
839
840         tcb = (Tcpctl *) s->ptcl;
841
842         memset(tcb, 0, sizeof(Tcpctl));
843
844         tcb->ssthresh = 65535;
845         tcb->srtt = tcp_irtt << LOGAGAIN;
846         tcb->mdev = 0;
847
848         /* setup timers */
849         tcb->timer.start = tcp_irtt / MSPTICK;
850         tcb->timer.func = tcptimeout;
851         tcb->timer.arg = s;
852         tcb->rtt_timer.start = MAX_TIME;
853         tcb->acktimer.start = TCP_ACK / MSPTICK;
854         tcb->acktimer.func = tcpacktimer;
855         tcb->acktimer.arg = s;
856         tcb->katimer.start = DEF_KAT / MSPTICK;
857         tcb->katimer.func = tcpkeepalive;
858         tcb->katimer.arg = s;
859
860         mss = DEF_MSS;
861
862         /* create a prototype(pseudo) header */
863         if (mode != TCP_LISTEN) {
864                 if (ipcmp(s->laddr, IPnoaddr) == 0)
865                         findlocalip(s->p->f, s->laddr, s->raddr);
866
867                 switch (s->ipversion) {
868                         case V4:
869                                 h4 = &tcb->protohdr.tcp4hdr;
870                                 memset(h4, 0, sizeof(*h4));
871                                 h4->proto = IP_TCPPROTO;
872                                 hnputs(h4->tcpsport, s->lport);
873                                 hnputs(h4->tcpdport, s->rport);
874                                 v6tov4(h4->tcpsrc, s->laddr);
875                                 v6tov4(h4->tcpdst, s->raddr);
876                                 break;
877                         case V6:
878                                 h6 = &tcb->protohdr.tcp6hdr;
879                                 memset(h6, 0, sizeof(*h6));
880                                 h6->proto = IP_TCPPROTO;
881                                 hnputs(h6->tcpsport, s->lport);
882                                 hnputs(h6->tcpdport, s->rport);
883                                 ipmove(h6->tcpsrc, s->laddr);
884                                 ipmove(h6->tcpdst, s->raddr);
885                                 mss = DEF_MSS6;
886                                 break;
887                         default:
888                                 panic("inittcpctl: version %d", s->ipversion);
889                 }
890         }
891
892         tcb->mss = tcb->cwind = mss;
893
894         /* default is no window scaling */
895         tcb->window = QMAX;
896         tcb->rcv.wnd = QMAX;
897         tcb->rcv.scale = 0;
898         tcb->snd.scale = 0;
899         qsetlimit(s->rq, QMAX);
900 }
901
902 /*
903  *  called with s qlocked
904  */
905 void tcpstart(struct conv *s, int mode)
906 {
907         Tcpctl *tcb;
908         struct tcppriv *tpriv;
909         /* tcpackproc needs to free this if it ever exits */
910         char *kpname = kmalloc(KNAMELEN, MEM_WAIT);
911
912         tpriv = s->p->priv;
913
914         if (tpriv->ackprocstarted == 0) {
915                 qlock(&tpriv->apl);
916                 if (tpriv->ackprocstarted == 0) {
917                         snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
918                         ktask(kpname, tcpackproc, s->p);
919                         tpriv->ackprocstarted = 1;
920                 }
921                 qunlock(&tpriv->apl);
922         }
923
924         tcb = (Tcpctl *) s->ptcl;
925
926         inittcpctl(s, mode);
927
928         iphtadd(&tpriv->ht, s);
929         switch (mode) {
930                 case TCP_LISTEN:
931                         tpriv->stats[PassiveOpens]++;
932                         tcb->flags |= CLONE;
933                         tcpsetstate(s, Listen);
934                         break;
935
936                 case TCP_CONNECT:
937                         tpriv->stats[ActiveOpens]++;
938                         tcb->flags |= ACTIVE;
939                         tcpsndsyn(s, tcb);
940                         tcpsetstate(s, Syn_sent);
941                         tcpoutput(s);
942                         break;
943         }
944 }
945
946 static char *tcpflag(uint16_t flag)
947 {
948         static char buf[128];
949
950         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
951         if (flag & URG)
952                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
953         if (flag & ACK)
954                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
955         if (flag & PSH)
956                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
957         if (flag & RST)
958                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
959         if (flag & SYN)
960                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
961         if (flag & FIN)
962                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
963
964         return buf;
965 }
966
967 struct block *htontcp6(Tcp * tcph, struct block *data, Tcp6hdr * ph,
968                                            Tcpctl * tcb)
969 {
970         int dlen;
971         Tcp6hdr *h;
972         uint16_t csum;
973         uint16_t hdrlen, optpad = 0;
974         uint8_t *opt;
975
976         hdrlen = TCP6_HDRSIZE;
977         if (tcph->flags & SYN) {
978                 if (tcph->mss)
979                         hdrlen += MSS_LENGTH;
980                 if (tcph->ws)
981                         hdrlen += WS_LENGTH;
982                 optpad = hdrlen & 3;
983                 if (optpad)
984                         optpad = 4 - optpad;
985                 hdrlen += optpad;
986         }
987
988         if (data) {
989                 dlen = blocklen(data);
990                 data = padblock(data, hdrlen + TCP6_PKT);
991                 if (data == NULL)
992                         return NULL;
993         } else {
994                 dlen = 0;
995                 /* the 64 pad is to meet mintu's */
996                 data = block_alloc(hdrlen + TCP6_PKT + 64, MEM_WAIT);
997                 if (data == NULL)
998                         return NULL;
999                 data->wp += hdrlen + TCP6_PKT;
1000         }
1001         /* relative to the block start (bp->rp) */
1002         data->transport_header_end = hdrlen + TCP4_PKT;
1003
1004         /* copy in pseudo ip header plus port numbers */
1005         h = (Tcp6hdr *) (data->rp);
1006         memmove(h, ph, TCP6_TCBPHDRSZ);
1007
1008         /* compose pseudo tcp header, do cksum calculation */
1009         hnputl(h->vcf, hdrlen + dlen);
1010         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
1011         h->ttl = ph->proto;
1012
1013         /* copy in variable bits */
1014         hnputl(h->tcpseq, tcph->seq);
1015         hnputl(h->tcpack, tcph->ack);
1016         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1017         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1018         hnputs(h->tcpurg, tcph->urg);
1019
1020         if (tcph->flags & SYN) {
1021                 opt = h->tcpopt;
1022                 if (tcph->mss != 0) {
1023                         *opt++ = MSSOPT;
1024                         *opt++ = MSS_LENGTH;
1025                         hnputs(opt, tcph->mss);
1026                         opt += 2;
1027                 }
1028                 if (tcph->ws != 0) {
1029                         *opt++ = WSOPT;
1030                         *opt++ = WS_LENGTH;
1031                         *opt++ = tcph->ws;
1032                 }
1033                 while (optpad-- > 0)
1034                         *opt++ = NOOPOPT;
1035         }
1036
1037         if (tcb != NULL && tcb->nochecksum) {
1038                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1039         } else {
1040                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
1041                 hnputs(h->tcpcksum, csum);
1042         }
1043
1044         /* move from pseudo header back to normal ip header */
1045         memset(h->vcf, 0, 4);
1046         h->vcf[0] = IP_VER6;
1047         hnputs(h->ploadlen, hdrlen + dlen);
1048         h->proto = ph->proto;
1049
1050         return data;
1051 }
1052
1053 struct block *htontcp4(Tcp * tcph, struct block *data, Tcp4hdr * ph,
1054                                            Tcpctl * tcb)
1055 {
1056         int dlen;
1057         Tcp4hdr *h;
1058         uint16_t csum;
1059         uint16_t hdrlen, optpad = 0;
1060         uint8_t *opt;
1061
1062         hdrlen = TCP4_HDRSIZE;
1063         if (tcph->flags & SYN) {
1064                 if (tcph->mss)
1065                         hdrlen += MSS_LENGTH;
1066                 if (tcph->ws)
1067                         hdrlen += WS_LENGTH;
1068                 optpad = hdrlen & 3;
1069                 if (optpad)
1070                         optpad = 4 - optpad;
1071                 hdrlen += optpad;
1072         }
1073
1074         if (data) {
1075                 dlen = blocklen(data);
1076                 data = padblock(data, hdrlen + TCP4_PKT);
1077                 if (data == NULL)
1078                         return NULL;
1079         } else {
1080                 dlen = 0;
1081                 /* the 64 pad is to meet mintu's */
1082                 data = block_alloc(hdrlen + TCP4_PKT + 64, MEM_WAIT);
1083                 if (data == NULL)
1084                         return NULL;
1085                 data->wp += hdrlen + TCP4_PKT;
1086         }
1087         /* relative to the block start (bp->rp) */
1088         data->transport_header_end = hdrlen + TCP4_PKT;
1089
1090         /* copy in pseudo ip header plus port numbers */
1091         h = (Tcp4hdr *) (data->rp);
1092         memmove(h, ph, TCP4_TCBPHDRSZ);
1093
1094         /* copy in variable bits */
1095         hnputs(h->tcplen, hdrlen + dlen);
1096         hnputl(h->tcpseq, tcph->seq);
1097         hnputl(h->tcpack, tcph->ack);
1098         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1099         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1100         hnputs(h->tcpurg, tcph->urg);
1101
1102         if (tcph->flags & SYN) {
1103                 opt = h->tcpopt;
1104                 if (tcph->mss != 0) {
1105                         *opt++ = MSSOPT;
1106                         *opt++ = MSS_LENGTH;
1107                         hnputs(opt, tcph->mss);
1108                         opt += 2;
1109                 }
1110                 if (tcph->ws != 0) {
1111                         *opt++ = WSOPT;
1112                         *opt++ = WS_LENGTH;
1113                         *opt++ = tcph->ws;
1114                 }
1115                 while (optpad-- > 0)
1116                         *opt++ = NOOPOPT;
1117         }
1118
1119         if (tcb != NULL && tcb->nochecksum) {
1120                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1121         } else {
1122                 csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
1123                 hnputs(h->tcpcksum, csum);
1124                 data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
1125                 data->checksum_offset = ph->tcpcksum - ph->tcpsport;
1126                 data->flag |= Btcpck;
1127         }
1128
1129         return data;
1130 }
1131
1132 int ntohtcp6(Tcp * tcph, struct block **bpp)
1133 {
1134         Tcp6hdr *h;
1135         uint8_t *optr;
1136         uint16_t hdrlen;
1137         uint16_t optlen;
1138         int n;
1139
1140         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
1141         if (*bpp == NULL)
1142                 return -1;
1143
1144         h = (Tcp6hdr *) ((*bpp)->rp);
1145         tcph->source = nhgets(h->tcpsport);
1146         tcph->dest = nhgets(h->tcpdport);
1147         tcph->seq = nhgetl(h->tcpseq);
1148         tcph->ack = nhgetl(h->tcpack);
1149         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1150         if (hdrlen < TCP6_HDRSIZE) {
1151                 freeblist(*bpp);
1152                 return -1;
1153         }
1154
1155         tcph->flags = h->tcpflag[1];
1156         tcph->wnd = nhgets(h->tcpwin);
1157         tcph->urg = nhgets(h->tcpurg);
1158         tcph->mss = 0;
1159         tcph->ws = 0;
1160         tcph->len = nhgets(h->ploadlen) - hdrlen;
1161
1162         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
1163         if (*bpp == NULL)
1164                 return -1;
1165
1166         optr = h->tcpopt;
1167         n = hdrlen - TCP6_HDRSIZE;
1168         while (n > 0 && *optr != EOLOPT) {
1169                 if (*optr == NOOPOPT) {
1170                         n--;
1171                         optr++;
1172                         continue;
1173                 }
1174                 optlen = optr[1];
1175                 if (optlen < 2 || optlen > n)
1176                         break;
1177                 switch (*optr) {
1178                         case MSSOPT:
1179                                 if (optlen == MSS_LENGTH)
1180                                         tcph->mss = nhgets(optr + 2);
1181                                 break;
1182                         case WSOPT:
1183                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1184                                         tcph->ws = HaveWS | *(optr + 2);
1185                                 break;
1186                 }
1187                 n -= optlen;
1188                 optr += optlen;
1189         }
1190         return hdrlen;
1191 }
1192
1193 int ntohtcp4(Tcp * tcph, struct block **bpp)
1194 {
1195         Tcp4hdr *h;
1196         uint8_t *optr;
1197         uint16_t hdrlen;
1198         uint16_t optlen;
1199         int n;
1200
1201         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
1202         if (*bpp == NULL)
1203                 return -1;
1204
1205         h = (Tcp4hdr *) ((*bpp)->rp);
1206         tcph->source = nhgets(h->tcpsport);
1207         tcph->dest = nhgets(h->tcpdport);
1208         tcph->seq = nhgetl(h->tcpseq);
1209         tcph->ack = nhgetl(h->tcpack);
1210
1211         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1212         if (hdrlen < TCP4_HDRSIZE) {
1213                 freeblist(*bpp);
1214                 return -1;
1215         }
1216
1217         tcph->flags = h->tcpflag[1];
1218         tcph->wnd = nhgets(h->tcpwin);
1219         tcph->urg = nhgets(h->tcpurg);
1220         tcph->mss = 0;
1221         tcph->ws = 0;
1222         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1223
1224         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1225         if (*bpp == NULL)
1226                 return -1;
1227
1228         optr = h->tcpopt;
1229         n = hdrlen - TCP4_HDRSIZE;
1230         while (n > 0 && *optr != EOLOPT) {
1231                 if (*optr == NOOPOPT) {
1232                         n--;
1233                         optr++;
1234                         continue;
1235                 }
1236                 optlen = optr[1];
1237                 if (optlen < 2 || optlen > n)
1238                         break;
1239                 switch (*optr) {
1240                         case MSSOPT:
1241                                 if (optlen == MSS_LENGTH)
1242                                         tcph->mss = nhgets(optr + 2);
1243                                 break;
1244                         case WSOPT:
1245                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1246                                         tcph->ws = HaveWS | *(optr + 2);
1247                                 break;
1248                 }
1249                 n -= optlen;
1250                 optr += optlen;
1251         }
1252         return hdrlen;
1253 }
1254
1255 /*
1256  *  For outgiing calls, generate an initial sequence
1257  *  number and put a SYN on the send queue
1258  */
1259 void tcpsndsyn(struct conv *s, Tcpctl * tcb)
1260 {
1261         urandom_read(&tcb->iss, sizeof(tcb->iss));
1262         tcb->rttseq = tcb->iss;
1263         tcb->snd.wl2 = tcb->iss;
1264         tcb->snd.una = tcb->iss;
1265         tcb->snd.ptr = tcb->rttseq;
1266         tcb->snd.nxt = tcb->rttseq;
1267         tcb->flgcnt++;
1268         tcb->flags |= FORCE;
1269         tcb->sndsyntime = NOW;
1270
1271         /* set desired mss and scale */
1272         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale,
1273                           &tcb->flags);
1274 }
1275
1276 void
1277 sndrst(struct Proto *tcp, uint8_t * source, uint8_t * dest,
1278            uint16_t length, Tcp * seg, uint8_t version, char *reason)
1279 {
1280         struct block *hbp;
1281         uint8_t rflags;
1282         struct tcppriv *tpriv;
1283         Tcp4hdr ph4;
1284         Tcp6hdr ph6;
1285
1286         netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1287
1288         tpriv = tcp->priv;
1289
1290         if (seg->flags & RST)
1291                 return;
1292
1293         /* make pseudo header */
1294         switch (version) {
1295                 case V4:
1296                         memset(&ph4, 0, sizeof(ph4));
1297                         ph4.vihl = IP_VER4;
1298                         v6tov4(ph4.tcpsrc, dest);
1299                         v6tov4(ph4.tcpdst, source);
1300                         ph4.proto = IP_TCPPROTO;
1301                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1302                         hnputs(ph4.tcpsport, seg->dest);
1303                         hnputs(ph4.tcpdport, seg->source);
1304                         break;
1305                 case V6:
1306                         memset(&ph6, 0, sizeof(ph6));
1307                         ph6.vcf[0] = IP_VER6;
1308                         ipmove(ph6.tcpsrc, dest);
1309                         ipmove(ph6.tcpdst, source);
1310                         ph6.proto = IP_TCPPROTO;
1311                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1312                         hnputs(ph6.tcpsport, seg->dest);
1313                         hnputs(ph6.tcpdport, seg->source);
1314                         break;
1315                 default:
1316                         panic("sndrst: version %d", version);
1317         }
1318
1319         tpriv->stats[OutRsts]++;
1320         rflags = RST;
1321
1322         /* convince the other end that this reset is in band */
1323         if (seg->flags & ACK) {
1324                 seg->seq = seg->ack;
1325                 seg->ack = 0;
1326         } else {
1327                 rflags |= ACK;
1328                 seg->ack = seg->seq;
1329                 seg->seq = 0;
1330                 if (seg->flags & SYN)
1331                         seg->ack++;
1332                 seg->ack += length;
1333                 if (seg->flags & FIN)
1334                         seg->ack++;
1335         }
1336         seg->flags = rflags;
1337         seg->wnd = 0;
1338         seg->urg = 0;
1339         seg->mss = 0;
1340         seg->ws = 0;
1341         switch (version) {
1342                 case V4:
1343                         hbp = htontcp4(seg, NULL, &ph4, NULL);
1344                         if (hbp == NULL)
1345                                 return;
1346                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1347                         break;
1348                 case V6:
1349                         hbp = htontcp6(seg, NULL, &ph6, NULL);
1350                         if (hbp == NULL)
1351                                 return;
1352                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1353                         break;
1354                 default:
1355                         panic("sndrst2: version %d", version);
1356         }
1357 }
1358
1359 /*
1360  *  send a reset to the remote side and close the conversation
1361  *  called with s qlocked
1362  */
1363 static void tcphangup(struct conv *s)
1364 {
1365         ERRSTACK(1);
1366         Tcp seg;
1367         Tcpctl *tcb;
1368         struct block *hbp;
1369
1370         tcb = (Tcpctl *) s->ptcl;
1371         if (ipcmp(s->raddr, IPnoaddr)) {
1372                 /* discard error style, poperror regardless */
1373                 if (!waserror()) {
1374                         seg.flags = RST | ACK;
1375                         seg.ack = tcb->rcv.nxt;
1376                         tcb->rcv.una = 0;
1377                         seg.seq = tcb->snd.ptr;
1378                         seg.wnd = 0;
1379                         seg.urg = 0;
1380                         seg.mss = 0;
1381                         seg.ws = 0;
1382                         switch (s->ipversion) {
1383                                 case V4:
1384                                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1385                                         hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
1386                                         ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1387                                         break;
1388                                 case V6:
1389                                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1390                                         hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
1391                                         ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1392                                         break;
1393                                 default:
1394                                         panic("tcphangup: version %d", s->ipversion);
1395                         }
1396                 }
1397                 poperror();
1398         }
1399         localclose(s, NULL);
1400 }
1401
1402 /*
1403  *  (re)send a SYN ACK
1404  */
1405 int sndsynack(struct Proto *tcp, Limbo * lp)
1406 {
1407         struct block *hbp;
1408         Tcp4hdr ph4;
1409         Tcp6hdr ph6;
1410         Tcp seg;
1411         int scale;
1412         uint8_t flag = 0;
1413
1414         /* make pseudo header */
1415         switch (lp->version) {
1416                 case V4:
1417                         memset(&ph4, 0, sizeof(ph4));
1418                         ph4.vihl = IP_VER4;
1419                         v6tov4(ph4.tcpsrc, lp->laddr);
1420                         v6tov4(ph4.tcpdst, lp->raddr);
1421                         ph4.proto = IP_TCPPROTO;
1422                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1423                         hnputs(ph4.tcpsport, lp->lport);
1424                         hnputs(ph4.tcpdport, lp->rport);
1425                         break;
1426                 case V6:
1427                         memset(&ph6, 0, sizeof(ph6));
1428                         ph6.vcf[0] = IP_VER6;
1429                         ipmove(ph6.tcpsrc, lp->laddr);
1430                         ipmove(ph6.tcpdst, lp->raddr);
1431                         ph6.proto = IP_TCPPROTO;
1432                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1433                         hnputs(ph6.tcpsport, lp->lport);
1434                         hnputs(ph6.tcpdport, lp->rport);
1435                         break;
1436                 default:
1437                         panic("sndrst: version %d", lp->version);
1438         }
1439
1440         seg.seq = lp->iss;
1441         seg.ack = lp->irs + 1;
1442         seg.flags = SYN | ACK;
1443         seg.urg = 0;
1444         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag);
1445         seg.wnd = QMAX;
1446
1447         /* if the other side set scale, we should too */
1448         if (lp->rcvscale) {
1449                 seg.ws = scale;
1450                 lp->sndscale = scale;
1451         } else {
1452                 seg.ws = 0;
1453                 lp->sndscale = 0;
1454         }
1455
1456         switch (lp->version) {
1457                 case V4:
1458                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1459                         if (hbp == NULL)
1460                                 return -1;
1461                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1462                         break;
1463                 case V6:
1464                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1465                         if (hbp == NULL)
1466                                 return -1;
1467                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1468                         break;
1469                 default:
1470                         panic("sndsnack: version %d", lp->version);
1471         }
1472         lp->lastsend = NOW;
1473         return 0;
1474 }
1475
/* Limbo hash bucket for the address a and port p: the sum of the last two
 * address bytes and the port, masked with LHTMASK.  p is parenthesized so
 * that an expression argument cannot rebind against the additions. */
#define hashipa(a, p) (((a)[IPaddrlen - 2] + (a)[IPaddrlen - 1] + (p)) & LHTMASK)
1477
1478 /*
1479  *  put a call into limbo and respond with a SYN ACK
1480  *
1481  *  called with proto locked
1482  */
1483 static void
1484 limbo(struct conv *s, uint8_t * source, uint8_t * dest, Tcp * seg, int version)
1485 {
1486         Limbo *lp, **l;
1487         struct tcppriv *tpriv;
1488         int h;
1489
1490         tpriv = s->p->priv;
1491         h = hashipa(source, seg->source);
1492
1493         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1494                 lp = *l;
1495                 if (lp->lport != seg->dest || lp->rport != seg->source
1496                         || lp->version != version)
1497                         continue;
1498                 if (ipcmp(lp->raddr, source) != 0)
1499                         continue;
1500                 if (ipcmp(lp->laddr, dest) != 0)
1501                         continue;
1502
1503                 /* each new SYN restarts the retransmits */
1504                 lp->irs = seg->seq;
1505                 break;
1506         }
1507         lp = *l;
1508         if (lp == NULL) {
1509                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1510                         lp = tpriv->lht[h];
1511                         tpriv->lht[h] = lp->next;
1512                         lp->next = NULL;
1513                 } else {
1514                         lp = kzmalloc(sizeof(*lp), 0);
1515                         if (lp == NULL)
1516                                 return;
1517                         tpriv->nlimbo++;
1518                 }
1519                 *l = lp;
1520                 lp->version = version;
1521                 ipmove(lp->laddr, dest);
1522                 ipmove(lp->raddr, source);
1523                 lp->lport = seg->dest;
1524                 lp->rport = seg->source;
1525                 lp->mss = seg->mss;
1526                 lp->rcvscale = seg->ws;
1527                 lp->irs = seg->seq;
1528                 urandom_read(&lp->iss, sizeof(lp->iss));
1529         }
1530
1531         if (sndsynack(s->p, lp) < 0) {
1532                 *l = lp->next;
1533                 tpriv->nlimbo--;
1534                 kfree(lp);
1535         }
1536 }
1537
1538 /*
1539  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1540  */
1541 static void limborexmit(struct Proto *tcp)
1542 {
1543         struct tcppriv *tpriv;
1544         Limbo **l, *lp;
1545         int h;
1546         int seen;
1547         uint64_t now;
1548
1549         tpriv = tcp->priv;
1550
1551         if (!canqlock(&tcp->qlock))
1552                 return;
1553         seen = 0;
1554         now = NOW;
1555         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1556                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1557                         lp = *l;
1558                         seen++;
1559                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1560                                 continue;
1561
1562                         /* time it out after 1 second */
1563                         if (++(lp->rexmits) > 5) {
1564                                 tpriv->nlimbo--;
1565                                 *l = lp->next;
1566                                 kfree(lp);
1567                                 continue;
1568                         }
1569
1570                         /* if we're being attacked, don't bother resending SYN ACK's */
1571                         if (tpriv->nlimbo > 100)
1572                                 continue;
1573
1574                         if (sndsynack(tcp, lp) < 0) {
1575                                 tpriv->nlimbo--;
1576                                 *l = lp->next;
1577                                 kfree(lp);
1578                                 continue;
1579                         }
1580
1581                         l = &lp->next;
1582                 }
1583         }
1584         qunlock(&tcp->qlock);
1585 }
1586
1587 /*
1588  *  lookup call in limbo.  if found, throw it out.
1589  *
1590  *  called with proto locked
1591  */
1592 static void
1593 limborst(struct conv *s, Tcp * segp, uint8_t * src, uint8_t * dst,
1594                  uint8_t version)
1595 {
1596         Limbo *lp, **l;
1597         int h;
1598         struct tcppriv *tpriv;
1599
1600         tpriv = s->p->priv;
1601
1602         /* find a call in limbo */
1603         h = hashipa(src, segp->source);
1604         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1605                 lp = *l;
1606                 if (lp->lport != segp->dest || lp->rport != segp->source
1607                         || lp->version != version)
1608                         continue;
1609                 if (ipcmp(lp->laddr, dst) != 0)
1610                         continue;
1611                 if (ipcmp(lp->raddr, src) != 0)
1612                         continue;
1613
1614                 /* RST can only follow the SYN */
1615                 if (segp->seq == lp->irs + 1) {
1616                         tpriv->nlimbo--;
1617                         *l = lp->next;
1618                         kfree(lp);
1619                 }
1620                 break;
1621         }
1622 }
1623
1624 /*
1625  *  come here when we finally get an ACK to our SYN-ACK.
1626  *  lookup call in limbo.  if found, create a new conversation
1627  *
1628  *  called with proto locked
1629  */
1630 static struct conv *tcpincoming(struct conv *s, Tcp * segp, uint8_t * src,
1631                                                                 uint8_t * dst, uint8_t version)
1632 {
1633         struct conv *new;
1634         Tcpctl *tcb;
1635         struct tcppriv *tpriv;
1636         Tcp4hdr *h4;
1637         Tcp6hdr *h6;
1638         Limbo *lp, **l;
1639         int h;
1640
1641         /* unless it's just an ack, it can't be someone coming out of limbo */
1642         if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
1643                 return NULL;
1644
1645         tpriv = s->p->priv;
1646
1647         /* find a call in limbo */
1648         h = hashipa(src, segp->source);
1649         for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
1650                 netlog(s->p->f, Logtcp,
1651                            "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
1652                            segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
1653                            lp->lport, version, lp->version);
1654
1655                 if (lp->lport != segp->dest || lp->rport != segp->source
1656                         || lp->version != version)
1657                         continue;
1658                 if (ipcmp(lp->laddr, dst) != 0)
1659                         continue;
1660                 if (ipcmp(lp->raddr, src) != 0)
1661                         continue;
1662
1663                 /* we're assuming no data with the initial SYN */
1664                 if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
1665                         netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
1666                                    segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
1667                         lp = NULL;
1668                 } else {
1669                         tpriv->nlimbo--;
1670                         *l = lp->next;
1671                 }
1672                 break;
1673         }
1674         if (lp == NULL)
1675                 return NULL;
1676
1677         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1678         if (new == NULL)
1679                 return NULL;
1680
1681         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1682         tcb = (Tcpctl *) new->ptcl;
1683         tcb->flags &= ~CLONE;
1684         tcb->timer.arg = new;
1685         tcb->timer.state = TcptimerOFF;
1686         tcb->acktimer.arg = new;
1687         tcb->acktimer.state = TcptimerOFF;
1688         tcb->katimer.arg = new;
1689         tcb->katimer.state = TcptimerOFF;
1690         tcb->rtt_timer.arg = new;
1691         tcb->rtt_timer.state = TcptimerOFF;
1692
1693         tcb->irs = lp->irs;
1694         tcb->rcv.nxt = tcb->irs + 1;
1695         tcb->rcv.urg = tcb->rcv.nxt;
1696
1697         tcb->iss = lp->iss;
1698         tcb->rttseq = tcb->iss;
1699         tcb->snd.wl2 = tcb->iss;
1700         tcb->snd.una = tcb->iss + 1;
1701         tcb->snd.ptr = tcb->iss + 1;
1702         tcb->snd.nxt = tcb->iss + 1;
1703         tcb->flgcnt = 0;
1704         tcb->flags |= SYNACK;
1705
1706         /* our sending max segment size cannot be bigger than what he asked for */
1707         if (lp->mss != 0 && lp->mss < tcb->mss)
1708                 tcb->mss = lp->mss;
1709
1710         /* window scaling */
1711         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1712
1713         /* the congestion window always starts out as a single segment */
1714         tcb->snd.wnd = segp->wnd;
1715         tcb->cwind = tcb->mss;
1716
1717         /* set initial round trip time */
1718         tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
1719         tcpsynackrtt(new);
1720
1721         kfree(lp);
1722
1723         /* set up proto header */
1724         switch (version) {
1725                 case V4:
1726                         h4 = &tcb->protohdr.tcp4hdr;
1727                         memset(h4, 0, sizeof(*h4));
1728                         h4->proto = IP_TCPPROTO;
1729                         hnputs(h4->tcpsport, new->lport);
1730                         hnputs(h4->tcpdport, new->rport);
1731                         v6tov4(h4->tcpsrc, dst);
1732                         v6tov4(h4->tcpdst, src);
1733                         break;
1734                 case V6:
1735                         h6 = &tcb->protohdr.tcp6hdr;
1736                         memset(h6, 0, sizeof(*h6));
1737                         h6->proto = IP_TCPPROTO;
1738                         hnputs(h6->tcpsport, new->lport);
1739                         hnputs(h6->tcpdport, new->rport);
1740                         ipmove(h6->tcpsrc, dst);
1741                         ipmove(h6->tcpdst, src);
1742                         break;
1743                 default:
1744                         panic("tcpincoming: version %d", new->ipversion);
1745         }
1746
1747         tcpsetstate(new, Established);
1748
1749         iphtadd(&tpriv->ht, new);
1750
1751         return new;
1752 }
1753
/*
 * seq_within - test whether sequence number x lies in the closed interval
 * [low, high], where the interval may wrap around the 32-bit number space.
 *
 * When low <= high the interval is an ordinary range.  When low > high the
 * interval runs from low up to the maximum value and continues from zero up
 * to high.  Returns 1 if x is inside the interval, 0 otherwise.
 */
int seq_within(uint32_t x, uint32_t low, uint32_t high)
{
        /* wrapped interval: x is inside if it is on either side of the seam */
        if (low > high)
                return (x >= low || x <= high) ? 1 : 0;

        /* ordinary interval */
        return (low <= x && x <= high) ? 1 : 0;
}
1765
/*
 * seq_lt - wraparound-aware "x is before y" test for 32-bit sequence numbers.
 *
 * Returns 1 when the difference x - y, taken modulo 2^32 and viewed as a
 * signed 32-bit value, is negative; 0 otherwise.  The difference is reduced
 * to uint32_t and cast to int32_t (not plain int) so the result does not
 * depend on the width of int on the target.
 */
int seq_lt(uint32_t x, uint32_t y)
{
        return (int32_t)(uint32_t)(x - y) < 0;
}
1770
/*
 * seq_le - wraparound-aware "x is before or equal to y" test for 32-bit
 * sequence numbers.
 *
 * Returns 1 when the difference x - y, taken modulo 2^32 and viewed as a
 * signed 32-bit value, is zero or negative; 0 otherwise.  The cast goes
 * through int32_t (not plain int) so the result does not depend on the
 * width of int on the target.
 */
int seq_le(uint32_t x, uint32_t y)
{
        return (int32_t)(uint32_t)(x - y) <= 0;
}
1775
/*
 * seq_gt - wraparound-aware "x is after y" test for 32-bit sequence numbers.
 *
 * Returns 1 when the difference x - y, taken modulo 2^32 and viewed as a
 * signed 32-bit value, is strictly positive; 0 otherwise.  The cast goes
 * through int32_t (not plain int) so the result does not depend on the
 * width of int on the target.
 */
int seq_gt(uint32_t x, uint32_t y)
{
        return (int32_t)(uint32_t)(x - y) > 0;
}
1780
/*
 * seq_ge - wraparound-aware "x is after or equal to y" test for 32-bit
 * sequence numbers.
 *
 * Returns 1 when the difference x - y, taken modulo 2^32 and viewed as a
 * signed 32-bit value, is zero or positive; 0 otherwise.  The cast goes
 * through int32_t (not plain int) so the result does not depend on the
 * width of int on the target.
 */
int seq_ge(uint32_t x, uint32_t y)
{
        return (int32_t)(uint32_t)(x - y) >= 0;
}
1785
1786 /*
1787  *  use the time between the first SYN and it's ack as the
1788  *  initial round trip time
1789  */
1790 void tcpsynackrtt(struct conv *s)
1791 {
1792         Tcpctl *tcb;
1793         uint64_t delta;
1794         struct tcppriv *tpriv;
1795
1796         tcb = (Tcpctl *) s->ptcl;
1797         tpriv = s->p->priv;
1798
1799         delta = NOW - tcb->sndsyntime;
1800         tcb->srtt = delta << LOGAGAIN;
1801         tcb->mdev = delta << LOGDGAIN;
1802
1803         /* halt round trip timer */
1804         tcphalt(tpriv, &tcb->rtt_timer);
1805 }
1806
/*
 * update - apply the acknowledgment carried by an incoming segment to the
 * send side of conversation s.
 *
 * s is the conversation; seg is the parsed segment, of which the ack, wnd
 * and len fields are read.  tcpiput() calls this with s->qlock held.
 *
 * In order, the function:
 *  - sets FORCE and returns if the ack is beyond snd.nxt;
 *  - counts duplicate acks and calls tcprxmit() when the count reaches
 *    TCPREXMTTHRESH;
 *  - updates snd.wnd and snd.wl2 from the segment;
 *  - returns early if the ack does not advance snd.una;
 *  - grows cwind (bounded by snd.wnd) unless in recovery;
 *  - refreshes srtt/mdev from rtt_timer when the ack covers rttseq;
 *  - discards the acked bytes from s->wq, advances snd.una (and snd.urg and
 *    snd.ptr if they fell behind), starts or halts the retransmit timer and
 *    clears RETRAN and the backoff state.
 *
 * There is no return value.
 */
1807 void update(struct conv *s, Tcp * seg)
1808 {
1809         int rtt, delta;
1810         Tcpctl *tcb;
1811         uint32_t acked;
1812         uint32_t expand;
1813         struct tcppriv *tpriv;
1814
1815         tpriv = s->p->priv;
1816         tcb = (Tcpctl *) s->ptcl;
1817
1818         /* the ack is beyond snd.nxt: flag FORCE and ignore the ack */
1819         if (seq_gt(seg->ack, tcb->snd.nxt)) {
1820                 tcb->flags |= FORCE;
1821                 return;
1822         }
1823
1824         /* added by Dong Lin for fast retransmission */
             /* duplicate ack: same ack as snd.una, data outstanding, no payload,
              * and the advertised window is unchanged */
1825         if (seg->ack == tcb->snd.una
1826                 && tcb->snd.una != tcb->snd.nxt
1827                 && seg->len == 0 && seg->wnd == tcb->snd.wnd) {
1828
1829                 /* this is a pure ack w/o window update */
1830                 netlog(s->p->f, Logtcprxmt, "dupack %lu ack %lu sndwnd %d advwin %d\n",
1831                            tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1832
1833                 if (++tcb->snd.dupacks == TCPREXMTTHRESH) {
1834                         /*
1835                          *  tahoe tcp rxt the packet, half sshthresh,
1836                          *  and set cwnd to one packet
1837                          */
                             /* snd.rxt records snd.nxt so the recovery test further
                              * down can tell when everything sent so far is acked */
1838                         tcb->snd.recovery = 1;
1839                         tcb->snd.rxt = tcb->snd.nxt;
1840                         netlog(s->p->f, Logtcprxmt, "fast rxt %lu, nxt %lu\n", tcb->snd.una,
1841                                    tcb->snd.nxt);
1842                         tcprxmit(s);
1843                 } else {
1844                         /* do reno tcp here. */
1845                 }
1846         }
1847
1848         /*
1849          *  update window: taken when the ack is newer than snd.wl2, or
1850          *  when it equals snd.wl2 and advertises a larger window
1851          */
1851         if (seq_gt(seg->ack, tcb->snd.wl2)
1852                 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
1853                 tcb->snd.wnd = seg->wnd;
1854                 tcb->snd.wl2 = seg->ack;
1855         }
1856
             /* nothing new is acknowledged: only the zero-window case below applies */
1857         if (!seq_gt(seg->ack, tcb->snd.una)) {
1858                 /*
1859                  *  don't let us hangup if sending into a closed window and
1860                  *  we're still getting acks
1861                  */
1862                 if ((tcb->flags & RETRAN) && tcb->snd.wnd == 0) {
1863                         tcb->backedoff = MAXBACKMS / 4;
1864                 }
1865                 return;
1866         }
1867
1868         /*
1869          *  any positive ack turns off fast rxt,
1870          *  (should we do new-reno on partial acks?)
1871          */
1872         if (!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1873                 tcb->snd.dupacks = 0;
1874                 tcb->snd.recovery = 0;
1875         } else
1876                 netlog(s->p->f, Logtcp, "rxt next %lu, cwin %u\n", seg->ack,
1877                            tcb->cwind);
1878
1879         /* Compute the new send window size */
1880         acked = seg->ack - tcb->snd.una;
1881
1882         /* avoid slow start and timers for SYN acks */
             /* SYNACK not yet set: presumably this ack covers our SYN, whose
              * sequence number has no byte in s->wq -- TODO confirm */
1883         if ((tcb->flags & SYNACK) == 0) {
1884                 tcb->flags |= SYNACK;
1885                 acked--;
1886                 tcb->flgcnt--;
1887                 goto done;
1888         }
1889
1890         /* slow start as long as we're not recovering from lost packets */
1891         if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
                     /* below ssthresh grow by min(mss, acked); otherwise by
                      * mss * mss / cwind */
1892                 if (tcb->cwind < tcb->ssthresh) {
1893                         expand = tcb->mss;
1894                         if (acked < expand)
1895                                 expand = acked;
1896                 } else
1897                         expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1898
                     /* clamp so cwind never exceeds snd.wnd; the first test
                      * catches wraparound of the sum */
1899                 if (tcb->cwind + expand < tcb->cwind)
1900                         expand = tcb->snd.wnd - tcb->cwind;
1901                 if (tcb->cwind + expand > tcb->snd.wnd)
1902                         expand = tcb->snd.wnd - tcb->cwind;
1903                 tcb->cwind += expand;
1904         }
1905
1906         /* Adjust the timers according to the round trip time */
1907         if (tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1908                 tcphalt(tpriv, &tcb->rtt_timer);
                     /* the sample is only used when RETRAN is clear */
1909                 if ((tcb->flags & RETRAN) == 0) {
1910                         tcb->backoff = 0;
1911                         tcb->backedoff = 0;
                             /* timer ticks used so far, scaled by MSPTICK below */
1912                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1913                         if (rtt == 0)
1914                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
1915                         rtt *= MSPTICK;
1916                         if (tcb->srtt == 0) {
                                     /* first sample: seed both scaled estimators */
1917                                 tcb->srtt = rtt << LOGAGAIN;
1918                                 tcb->mdev = rtt << LOGDGAIN;
1919                         } else {
                                     /* srtt and mdev are stored shifted left by
                                      * LOGAGAIN / LOGDGAIN; fold in the new error */
1920                                 delta = rtt - (tcb->srtt >> LOGAGAIN);
1921                                 tcb->srtt += delta;
1922                                 if (tcb->srtt <= 0)
1923                                         tcb->srtt = 1;
1924
1925                                 delta = abs(delta) - (tcb->mdev >> LOGDGAIN);
1926                                 tcb->mdev += delta;
1927                                 if (tcb->mdev <= 0)
1928                                         tcb->mdev = 1;
1929                         }
1930                         tcpsettimer(tcb);
1931                 }
1932         }
1933
1934 done:
             /* drop the acked bytes from the write queue; if fewer were queued
              * than acked, presumably the remainder acknowledged a flag counted
              * in flgcnt -- TODO confirm */
1935         if (qdiscard(s->wq, acked) < acked)
1936                 tcb->flgcnt--;
1937
1938         tcb->snd.una = seg->ack;
1939         if (seq_gt(seg->ack, tcb->snd.urg))
1940                 tcb->snd.urg = seg->ack;
1941
             /* keep the retransmit timer going only while data is outstanding */
1942         if (tcb->snd.una != tcb->snd.nxt)
1943                 tcpgo(tpriv, &tcb->timer);
1944         else
1945                 tcphalt(tpriv, &tcb->timer);
1946
             /* never leave the send pointer behind the acked edge */
1947         if (seq_lt(tcb->snd.ptr, tcb->snd.una))
1948                 tcb->snd.ptr = tcb->snd.una;
1949
1950         tcb->flags &= ~RETRAN;
1951         tcb->backoff = 0;
1952         tcb->backedoff = 0;
1953 }
1954
/*
 * tcpiput - input routine for one received TCP segment.
 *
 * tcp is the protocol instance, the Ipifc argument is not used, and bp is
 * the received block, whose read pointer is interpreted as a Tcp4hdr or
 * Tcp6hdr depending on the version nibble.
 *
 * Outline:
 *  1. Verify the checksum, parse the header into seg (ntohtcp4/ntohtcp6),
 *     look up the conversation with iphtlook(), hand Bypass conversations to
 *     bypass_or_drop(), and trim the block to the claimed payload length.
 *  2. With no matching conversation, answer with sndrst() and drop.
 *  3. Under tcp->qlock, a conversation in Listen is handled by limborst(),
 *     limbo() or tcpincoming().
 *  4. Under s->qlock, run the per-state processing: ack handling through
 *     update(), data delivery to s->rq, FIN handling, and draining of
 *     adjacent segments from the resequence queue.
 *  5. Leave through the "output" label (tcpoutput()) or the "raise" label
 *     (free bp and tcpkick()); a few paths unlock and return directly.
 *
 * There is no return value.
 */
1955 void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
1956 {
1957         ERRSTACK(1);
1958         Tcp seg;
1959         Tcp4hdr *h4;
1960         Tcp6hdr *h6;
1961         int hdrlen;
1962         Tcpctl *tcb;
1963         uint16_t length;
1964         uint8_t source[IPaddrlen], dest[IPaddrlen];
1965         struct conv *s;
1966         struct Fs *f;
1967         struct tcppriv *tpriv;
1968         uint8_t version;
1969
1970         f = tcp->f;
1971         tpriv = tcp->priv;
1972
1973         tpriv->stats[InSegs]++;
1974
             /* both pointers alias the start of the block; the version test
              * below decides which view is used */
1975         h4 = (Tcp4hdr *) (bp->rp);
1976         h6 = (Tcp6hdr *) (bp->rp);
1977
1978         if ((h4->vihl & 0xF0) == IP_VER4) {
1979                 uint8_t ttl;
1980
1981                 version = V4;
1982                 length = nhgets(h4->length);
1983                 v4tov6(dest, h4->tcpdst);
1984                 v4tov6(source, h4->tcpsrc);
1985
1986                 /* ttl isn't part of the xsum pseudo header, but bypass needs it. */
1987                 ttl = h4->Unused;
1988                 h4->Unused = 0;
1989                 hnputs(h4->tcplen, length - TCP4_PKT);
                     /* verification is skipped when the block carries Btcpck or
                      * the checksum field is zero */
1990                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1991                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
1992                         tpriv->stats[CsumErrs]++;
1993                         tpriv->stats[InErrs]++;
1994                         netlog(f, Logtcp, "bad tcp proto cksum\n");
1995                         freeblist(bp);
1996                         return;
1997                 }
1998                 h4->Unused = ttl;
1999
2000                 hdrlen = ntohtcp4(&seg, &bp);
                     /* NOTE(review): bp is not freed on this path; presumably
                      * ntohtcp4() disposes of it on failure -- confirm */
2001                 if (hdrlen < 0) {
2002                         tpriv->stats[HlenErrs]++;
2003                         tpriv->stats[InErrs]++;
2004                         netlog(f, Logtcp, "bad tcp hdr len\n");
2005                         return;
2006                 }
2007
2008                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2009                 if (s && s->state == Bypass) {
2010                         bypass_or_drop(s, bp);
2011                         return;
2012                 }
2013
2014                 /* trim the packet to the size claimed by the datagram */
2015                 length -= hdrlen + TCP4_PKT;
2016                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
2017                 if (bp == NULL) {
2018                         tpriv->stats[LenErrs]++;
2019                         tpriv->stats[InErrs]++;
2020                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2021                         return;
2022                 }
2023         } else {
2024                 int ttl = h6->ttl;
2025                 int proto = h6->proto;
2026
2027                 version = V6;
2028                 length = nhgets(h6->ploadlen);
2029                 ipmove(dest, h6->tcpdst);
2030                 ipmove(source, h6->tcpsrc);
2031
                     /* rewrite the header for the checksum: ploadlen and proto
                      * are cleared, proto goes into the ttl slot and length into
                      * vcf; ttl, proto and ploadlen are restored afterwards
                      * (vcf is not restored here) */
2032                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2033                 h6->ttl = proto;
2034                 hnputl(h6->vcf, length);
                     /* NOTE(review): unlike the v4 path, this test does not look
                      * at bp->flag & Btcpck -- confirm that is intended */
2035                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2036                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
2037                         tpriv->stats[CsumErrs]++;
2038                         tpriv->stats[InErrs]++;
2039                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2040                         freeblist(bp);
2041                         return;
2042                 }
2043                 h6->ttl = ttl;
2044                 h6->proto = proto;
2045                 hnputs(h6->ploadlen, length);
2046
2047                 hdrlen = ntohtcp6(&seg, &bp);
                     /* NOTE(review): bp is not freed on this path; presumably
                      * ntohtcp6() disposes of it on failure -- confirm */
2048                 if (hdrlen < 0) {
2049                         tpriv->stats[HlenErrs]++;
2050                         tpriv->stats[InErrs]++;
2051                         netlog(f, Logtcp, "bad tcp hdr len\n");
2052                         return;
2053                 }
2054
2055                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2056                 if (s && s->state == Bypass) {
2057                         bypass_or_drop(s, bp);
2058                         return;
2059                 }
2060
2061                 /* trim the packet to the size claimed by the datagram */
2062                 length -= hdrlen;
2063                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2064                 if (bp == NULL) {
2065                         tpriv->stats[LenErrs]++;
2066                         tpriv->stats[InErrs]++;
2067                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2068                         return;
2069                 }
2070         }
2071
2072         /* s, the conv matching the n-tuple, was set above */
2073         if (s == NULL) {
2074                 netlog(f, Logtcp, "iphtlook failed: src %I:%u, dst %I:%u\n",
2075                        source, seg.source, dest, seg.dest);
             /* also entered by goto from the Listen path below when
              * tcpincoming() returns NULL; no lock is held here */
2076 reset:
2077                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2078                 freeblist(bp);
2079                 return;
2080         }
2081
2082         /* lock protocol for unstated Plan 9 invariants.  funcs like limbo or
2083          * incoming might rely on it. */
2084         qlock(&tcp->qlock);
2085
2086         /* if it's a listener, look for the right flags and get a new conv */
2087         tcb = (Tcpctl *) s->ptcl;
2088         if (tcb->state == Listen) {
2089                 if (seg.flags & RST) {
2090                         limborst(s, &seg, source, dest, version);
2091                         qunlock(&tcp->qlock);
2092                         freeblist(bp);
2093                         return;
2094                 }
2095
2096                 /* if this is a new SYN, put the call into limbo */
2097                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2098                         limbo(s, source, dest, &seg, version);
2099                         qunlock(&tcp->qlock);
2100                         freeblist(bp);
2101                         return;
2102                 }
2103
2104                 /*
2105                  *  if there's a matching call in limbo, tcpincoming will
2106                  *  return it in state Syn_received
2107                  */
2108                 s = tcpincoming(s, &seg, source, dest, version);
2109                 if (s == NULL) {
2110                         qunlock(&tcp->qlock);
2111                         goto reset;
2112                 }
2113         }
2114
2115         /* The rest of the input state machine is run with the control block
2116          * locked and implements the state machine directly out of the RFC.
2117          * Out-of-band data is ignored - it was always a bad idea.
2118          */
2119         tcb = (Tcpctl *) s->ptcl;
             /* error handler: release s->qlock and pass the error on */
2120         if (waserror()) {
2121                 qunlock(&s->qlock);
2122                 nexterror();
2123         }
             /* take the conversation lock before dropping the protocol lock */
2124         qlock(&s->qlock);
2125         qunlock(&tcp->qlock);
2126
2127         /* fix up window */
2128         seg.wnd <<= tcb->rcv.scale;
2129
2130         /* every input packet puts off the keep alive timeout */
2131         tcpsetkacounter(tcb);
2132
             /* early handling for Closed, Syn_sent and Syn_received; every other
              * state goes straight on to the common processing below */
2133         switch (tcb->state) {
2134                 case Closed:
2135                         sndrst(tcp, source, dest, length, &seg, version,
2136                                    "sending to Closed");
2137                         goto raise;
2138                 case Syn_sent:
2139                         if (seg.flags & ACK) {
2140                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2141                                         sndrst(tcp, source, dest, length, &seg, version,
2142                                                    "bad seq in Syn_sent");
2143                                         goto raise;
2144                                 }
2145                         }
2146                         if (seg.flags & RST) {
2147                                 if (seg.flags & ACK)
2148                                         localclose(s, "connection refused");
2149                                 goto raise;
2150                         }
2151
2152                         if (seg.flags & SYN) {
2153                                 procsyn(s, &seg);
2154                                 if (seg.flags & ACK) {
2155                                         update(s, &seg);
2156                                         tcpsynackrtt(s);
2157                                         tcpsetstate(s, Established);
2158                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2159                                 } else {
2160                                         tcb->time = NOW;
2161                                         tcpsetstate(s, Syn_received);   /* DLP - shouldn't this be a reset? */
2162                                 }
2163
                                     /* data or a FIN came with the SYN: continue
                                      * with the common processing below */
2164                                 if (length != 0 || (seg.flags & FIN))
2165                                         break;
2166
2167                                 freeblist(bp);
2168                                 goto output;
2169                         } else
2170                                 freeblist(bp);
2171
                             /* reached only without SYN: the segment was dropped above */
2172                         qunlock(&s->qlock);
2173                         poperror();
2174                         return;
2175                 case Syn_received:
2176                         /* doesn't matter if it's the correct ack, we're just trying to set timing */
2177                         if (seg.flags & ACK)
2178                                 tcpsynackrtt(s);
2179                         break;
2180         }
2181
2182         /*
2183          *  One DOS attack is to open connections to us and then forget about them,
2184          *  thereby tying up a conv at no long term cost to the attacker.
2185          *  This is an attempt to defeat these stateless DOS attacks.  See
2186          *  corresponding code in tcpsendka().
2187          */
             /* NOTE(review): processing of the segment continues after
              * localclose() below; there is no goto or return -- confirm
              * that is intended */
2188         if (tcb->state != Syn_received && (seg.flags & RST) == 0) {
2189                 if (tcpporthogdefense
2190                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2191                                                   tcb->snd.una - (1 << 29))) {
2192                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2193                                    source, seg.source, dest, seg.dest, seg.flags,
2194                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2195                         localclose(s, "stateless hog");
2196                 }
2197         }
2198
2199         /* Cut the data to fit the receive window */
             /* on -1 from tcptrim() the ack is still processed, and unless the
              * segment is a RST an output pass is forced */
2200         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2201                 netlog(f, Logtcp, "tcp len < 0, %lu %d\n", seg.seq, length);
2202                 update(s, &seg);
2203                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2204                         tcphalt(tpriv, &tcb->rtt_timer);
2205                         tcphalt(tpriv, &tcb->acktimer);
2206                         tcphalt(tpriv, &tcb->katimer);
2207                         tcpsetstate(s, Time_wait);
2208                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2209                         tcpgo(tpriv, &tcb->timer);
2210                 }
2211                 if (!(seg.flags & RST)) {
2212                         tcb->flags |= FORCE;
2213                         goto output;
2214                 }
2215                 qunlock(&s->qlock);
2216                 poperror();
2217                 return;
2218         }
2219
2220         /* Cannot accept so answer with a rst */
2221         if (length && tcb->state == Closed) {
2222                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2223                 goto raise;
2224         }
2225
2226         /* The segment is beyond the current receive pointer so
2227          * queue the data in the resequence queue
2228          */
2229         if (seg.seq != tcb->rcv.nxt)
2230                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2231                         update(s, &seg);
2232                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2233                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2234                                            s->lport);
2235                         tcb->flags |= FORCE;
2236                         goto output;
2237                 }
2238
2239         /*
2240          *  keep looping till we've processed this packet plus any
2241          *  adjacent packets in the resequence queue
2242          */
2243         for (;;) {
2244                 if (seg.flags & RST) {
2245                         if (tcb->state == Established) {
2246                                 tpriv->stats[EstabResets]++;
2247                                 if (tcb->rcv.nxt != seg.seq)
2248                                         printd
2249                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2250                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2251                                                  seg.seq);
2252                         }
2253                         localclose(s, "connection refused");
2254                         goto raise;
2255                 }
2256
                     /* segments without ACK are dropped at this point */
2257                 if ((seg.flags & ACK) == 0)
2258                         goto raise;
2259
                     /* per-state handling of the ack */
2260                 switch (tcb->state) {
2261                         case Syn_received:
2262                                 if (!seq_within(seg.ack, tcb->snd.una + 1, tcb->snd.nxt)) {
2263                                         sndrst(tcp, source, dest, length, &seg, version,
2264                                                    "bad seq in Syn_received");
2265                                         goto raise;
2266                                 }
2267                                 update(s, &seg);
2268                                 tcpsetstate(s, Established);
                                     /* falls through: update() is called a second
                                      * time by the Established case */
2269                         case Established:
2270                         case Close_wait:
2271                                 update(s, &seg);
2272                                 break;
2273                         case Finwait1:
2274                                 update(s, &seg);
                                     /* nothing left unacknowledged: move on to Finwait2 */
2275                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2276                                         tcphalt(tpriv, &tcb->rtt_timer);
2277                                         tcphalt(tpriv, &tcb->acktimer);
2278                                         tcpsetkacounter(tcb);
2279                                         tcb->time = NOW;
2280                                         tcpsetstate(s, Finwait2);
2281                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2282                                         tcpgo(tpriv, &tcb->katimer);
2283                                 }
2284                                 break;
2285                         case Finwait2:
2286                                 update(s, &seg);
2287                                 break;
2288                         case Closing:
2289                                 update(s, &seg);
                                     /* nothing left unacknowledged: enter Time_wait */
2290                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2291                                         tcphalt(tpriv, &tcb->rtt_timer);
2292                                         tcphalt(tpriv, &tcb->acktimer);
2293                                         tcphalt(tpriv, &tcb->katimer);
2294                                         tcpsetstate(s, Time_wait);
2295                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2296                                         tcpgo(tpriv, &tcb->timer);
2297                                 }
2298                                 break;
2299                         case Last_ack:
2300                                 update(s, &seg);
2301                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2302                                         localclose(s, NULL);
2303                                         goto raise;
2304                                 }
                                     /* falls through to Time_wait */
2305                         case Time_wait:
2306                                 tcb->flags |= FORCE;
2307                                 if (tcb->timer.state != TcptimerON)
2308                                         tcpgo(tpriv, &tcb->timer);
2309                 }
2310
                     /* track the urgent pointer; when it advances, seg.urg is
                      * passed to pullblock() on the block */
2311                 if ((seg.flags & URG) && seg.urg) {
2312                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2313                                 tcb->rcv.urg = seg.urg + seg.seq;
2314                                 pullblock(&bp, seg.urg);
2315                         }
2316                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2317                         tcb->rcv.urg = tcb->rcv.nxt;
2318
2319                 if (length == 0) {
2320                         if (bp != NULL)
2321                                 freeblist(bp);
2322                 } else {
2323                         switch (tcb->state) {
2324                                 default:
2325                                         /* Ignore segment text */
2326                                         if (bp != NULL)
2327                                                 freeblist(bp);
2328                                         break;
2329
2330                                 case Syn_received:
2331                                 case Established:
2332                                 case Finwait1:
2333                                         /* If we still have some data place on
2334                                          * receive queue
2335                                          */
2336                                         if (bp) {
2337                                                 bp = packblock(bp);
2338                                                 if (bp == NULL)
2339                                                         panic("tcp packblock");
                                                     /* bp is handed to s->rq and
                                                      * must not be used again here */
2340                                                 qpassnolim(s->rq, bp);
2341                                                 bp = NULL;
2342
2343                                                 /*
2344                                                  *  Force an ack every 2 data messages.  This is
2345                                                  *  a hack for rob to make his home system run
2346                                                  *  faster.
2347                                                  *
2348                                                  *  this also keeps the standard TCP congestion
2349                                                  *  control working since it needs an ack every
2350                                                  *  2 max segs worth.  This is not quite that,
2351                                                  *  but under a real stream is equivalent since
2352                                                  *  every packet has a max seg in it.
2353                                                  */
2354                                                 if (++(tcb->rcv.una) >= 2)
2355                                                         tcb->flags |= FORCE;
2356                                         }
2357                                         tcb->rcv.nxt += length;
2358
2359                                         /*
2360                                          *  update our rcv window
2361                                          */
2362                                         tcprcvwin(s);
2363
2364                                         /*
2365                                          *  turn on the acktimer if there's something
2366                                          *  to ack
2367                                          */
2368                                         if (tcb->acktimer.state != TcptimerON)
2369                                                 tcpgo(tpriv, &tcb->acktimer);
2370
2371                                         break;
2372                                 case Finwait2:
2373                                         /* no process to read the data, send a reset */
2374                                         if (bp != NULL)
2375                                                 freeblist(bp);
2376                                         sndrst(tcp, source, dest, length, &seg, version,
2377                                                    "send to Finwait2");
2378                                         qunlock(&s->qlock);
2379                                         poperror();
2380                                         return;
2381                         }
2382                 }
2383
                     /* a FIN consumes one sequence number (rcv.nxt++) in the
                      * states that accept it, and always forces output */
2384                 if (seg.flags & FIN) {
2385                         tcb->flags |= FORCE;
2386
2387                         switch (tcb->state) {
2388                                 case Syn_received:
2389                                 case Established:
2390                                         tcb->rcv.nxt++;
2391                                         tcpsetstate(s, Close_wait);
2392                                         break;
2393                                 case Finwait1:
2394                                         tcb->rcv.nxt++;
2395                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2396                                                 tcphalt(tpriv, &tcb->rtt_timer);
2397                                                 tcphalt(tpriv, &tcb->acktimer);
2398                                                 tcphalt(tpriv, &tcb->katimer);
2399                                                 tcpsetstate(s, Time_wait);
2400                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2401                                                 tcpgo(tpriv, &tcb->timer);
2402                                         } else
2403                                                 tcpsetstate(s, Closing);
2404                                         break;
2405                                 case Finwait2:
2406                                         tcb->rcv.nxt++;
2407                                         tcphalt(tpriv, &tcb->rtt_timer);
2408                                         tcphalt(tpriv, &tcb->acktimer);
2409                                         tcphalt(tpriv, &tcb->katimer);
2410                                         tcpsetstate(s, Time_wait);
2411                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2412                                         tcpgo(tpriv, &tcb->timer);
2413                                         break;
2414                                 case Close_wait:
2415                                 case Closing:
2416                                 case Last_ack:
2417                                         break;
2418                                 case Time_wait:
2419                                         tcpgo(tpriv, &tcb->timer);
2420                                         break;
2421                         }
2422                 }
2423
2424                 /*
2425                  *  get next adjacent segment from the resequence queue.
2426                  *  dump/trim any overlapping segments
2427                  */
2428                 for (;;) {
2429                         if (tcb->reseq == NULL)
2430                                 goto output;
2431
                             /* the head of the queue still lies beyond rcv.nxt */
2432                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2433                                 goto output;
2434
2435                         getreseq(tcb, &seg, &bp, &length);
2436
                             /* a segment that survives trimming goes back through
                              * the outer loop; otherwise try the next one */
2437                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2438                                 break;
2439                 }
2440         }
             /* normal exit: run the output side, then release the conversation */
2441 output:
2442         tcpoutput(s);
2443         qunlock(&s->qlock);
2444         poperror();
2445         return;
             /* drop exit: release the conversation, free the block and kick s */
2446 raise:
2447         qunlock(&s->qlock);
2448         poperror();
2449         freeblist(bp);
2450         tcpkick(s);
2451 }
2452
2453 /*
2454  *  always enters and exits with the s locked.  We drop
2455  *  the lock to ipoput the packet so some care has to be
2456  *  taken by callers.
2457  */
2458 void tcpoutput(struct conv *s)
2459 {
2460         Tcp seg;
2461         int msgs;
2462         Tcpctl *tcb;
2463         struct block *hbp, *bp;
2464         int sndcnt, n;
2465         uint32_t ssize, dsize, usable, sent;
2466         struct Fs *f;
2467         struct tcppriv *tpriv;
2468         uint8_t version;
2469
2470         f = s->p->f;
2471         tpriv = s->p->priv;
2472         version = s->ipversion;
2473
2474         for (msgs = 0; msgs < 100; msgs++) {
2475                 tcb = (Tcpctl *) s->ptcl;
2476
2477                 switch (tcb->state) {
2478                         case Listen:
2479                         case Closed:
2480                         case Finwait2:
2481                                 return;
2482                 }
2483
2484                 /* force an ack when a window has opened up */
2485                 if (tcb->rcv.blocked && tcb->rcv.wnd > 0) {
2486                         tcb->rcv.blocked = 0;
2487                         tcb->flags |= FORCE;
2488                 }
2489
2490                 sndcnt = qlen(s->wq) + tcb->flgcnt;
2491                 sent = tcb->snd.ptr - tcb->snd.una;
2492
2493                 /* Don't send anything else until our SYN has been acked */
2494                 if (tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2495                         break;
2496
2497                 /* Compute usable segment based on offered window and limit
2498                  * window probes to one
2499                  */
2500                 if (tcb->snd.wnd == 0) {
2501                         if (sent != 0) {
2502                                 if ((tcb->flags & FORCE) == 0)
2503                                         break;
2504 //              tcb->snd.ptr = tcb->snd.una;
2505                         }
2506                         usable = 1;
2507                 } else {
2508                         usable = tcb->cwind;
2509                         if (tcb->snd.wnd < usable)
2510                                 usable = tcb->snd.wnd;
2511                         usable -= sent;
2512                 }
2513                 ssize = sndcnt - sent;
2514                 if (ssize && usable < 2)
2515                         netlog(s->p->f, Logtcp, "throttled snd.wnd %lu cwind %lu\n",
2516                                    tcb->snd.wnd, tcb->cwind);
2517                 if (usable < ssize)
2518                         ssize = usable;
2519                 if (ssize > tcb->mss) {
2520                         if ((tcb->flags & TSO) == 0) {
2521                                 ssize = tcb->mss;
2522                         } else {
2523                                 int segs, window;
2524
2525                                 /*  Don't send too much.  32K is arbitrary..
2526                                  */
2527                                 if (ssize > 32 * 1024)
2528                                         ssize = 32 * 1024;
2529
2530                                 /* Clamp xmit to an integral MSS to
2531                                  * avoid ragged tail segments causing
2532                                  * poor link utilization.  Also
2533                                  * account for each segment sent in
2534                                  * msg heuristic, and round up to the
2535                                  * next multiple of 4, to ensure we
2536                                  * still yeild.
2537                                  */
2538                                 segs = ssize / tcb->mss;
2539                                 ssize = segs * tcb->mss;
2540                                 msgs += segs;
2541                                 if (segs > 3)
2542                                         msgs = (msgs + 4) & ~3;
2543                         }
2544                 }
2545
2546                 dsize = ssize;
2547                 seg.urg = 0;
2548
2549                 if (ssize == 0)
2550                         if ((tcb->flags & FORCE) == 0)
2551                                 break;
2552
2553                 tcb->flags &= ~FORCE;
2554                 tcprcvwin(s);
2555
2556                 /* By default we will generate an ack */
2557                 tcphalt(tpriv, &tcb->acktimer);
2558                 tcb->rcv.una = 0;
2559                 seg.source = s->lport;
2560                 seg.dest = s->rport;
2561                 seg.flags = ACK;
2562                 seg.mss = 0;
2563                 seg.ws = 0;
2564                 switch (tcb->state) {
2565                         case Syn_sent:
2566                                 seg.flags = 0;
2567                                 if (tcb->snd.ptr == tcb->iss) {
2568                                         seg.flags |= SYN;
2569                                         dsize--;
2570                                         seg.mss = tcb->mss;
2571                                         seg.ws = tcb->scale;
2572                                 }
2573                                 break;
2574                         case Syn_received:
2575                                 /*
2576                                  *  don't send any data with a SYN/ACK packet
2577                                  *  because Linux rejects the packet in its
2578                                  *  attempt to solve the SYN attack problem
2579                                  */
2580                                 if (tcb->snd.ptr == tcb->iss) {
2581                                         seg.flags |= SYN;
2582                                         dsize = 0;
2583                                         ssize = 1;
2584                                         seg.mss = tcb->mss;
2585                                         seg.ws = tcb->scale;
2586                                 }
2587                                 break;
2588                 }
2589                 seg.seq = tcb->snd.ptr;
2590                 seg.ack = tcb->rcv.nxt;
2591                 seg.wnd = tcb->rcv.wnd;
2592
2593                 /* Pull out data to send */
2594                 bp = NULL;
2595                 if (dsize != 0) {
2596                         bp = qcopy(s->wq, dsize, sent);
2597                         if (BLEN(bp) != dsize) {
2598                                 seg.flags |= FIN;
2599                                 dsize--;
2600                         }
2601                         if (BLEN(bp) > tcb->mss) {
2602                                 bp->flag |= Btso;
2603                                 bp->mss = tcb->mss;
2604                         }
2605                 }
2606
2607                 if (sent + dsize == sndcnt)
2608                         seg.flags |= PSH;
2609
2610                 /* keep track of balance of resent data */
2611                 if (seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2612                         n = tcb->snd.nxt - tcb->snd.ptr;
2613                         if (ssize < n)
2614                                 n = ssize;
2615                         tcb->resent += n;
2616                         netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr 0x%lx nxt 0x%lx\n",
2617                                    s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr,
2618                                    tcb->snd.nxt);
2619                         tpriv->stats[RetransSegs]++;
2620                 }
2621
2622                 tcb->snd.ptr += ssize;
2623
2624                 /* Pull up the send pointer so we can accept acks
2625                  * for this window
2626                  */
2627                 if (seq_gt(tcb->snd.ptr, tcb->snd.nxt))
2628                         tcb->snd.nxt = tcb->snd.ptr;
2629
2630                 /* Build header, link data and compute cksum */
2631                 switch (version) {
2632                         case V4:
2633                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2634                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2635                                 if (hbp == NULL) {
2636                                         freeblist(bp);
2637                                         return;
2638                                 }
2639                                 break;
2640                         case V6:
2641                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2642                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2643                                 if (hbp == NULL) {
2644                                         freeblist(bp);
2645                                         return;
2646                                 }
2647                                 break;
2648                         default:
2649                                 hbp = NULL;     /* to suppress a warning */
2650                                 panic("tcpoutput: version %d", version);
2651                 }
2652
2653                 /* Start the transmission timers if there is new data and we
2654                  * expect acknowledges
2655                  */
2656                 if (ssize != 0) {
2657                         if (tcb->timer.state != TcptimerON)
2658                                 tcpgo(tpriv, &tcb->timer);
2659
2660                         /*  If round trip timer isn't running, start it.
2661                          *  measure the longest packet only in case the
2662                          *  transmission time dominates RTT
2663                          */
2664                         if (tcb->rtt_timer.state != TcptimerON)
2665                                 if (ssize == tcb->mss) {
2666                                         tcpgo(tpriv, &tcb->rtt_timer);
2667                                         tcb->rttseq = tcb->snd.ptr;
2668                                 }
2669                 }
2670
2671                 tpriv->stats[OutSegs]++;
2672
2673                 /* put off the next keep alive */
2674                 tcpgo(tpriv, &tcb->katimer);
2675
2676                 switch (version) {
2677                         case V4:
2678                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2679                                         /* a negative return means no route */
2680                                         localclose(s, "no route");
2681                                 }
2682                                 break;
2683                         case V6:
2684                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2685                                         /* a negative return means no route */
2686                                         localclose(s, "no route");
2687                                 }
2688                                 break;
2689                         default:
2690                                 panic("tcpoutput2: version %d", version);
2691                 }
2692                 if ((msgs % 4) == 1) {
2693                         qunlock(&s->qlock);
2694                         kthread_yield();
2695                         qlock(&s->qlock);
2696                 }
2697         }
2698 }
2699
2700 /*
2701  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
2702  */
2703 void tcpsendka(struct conv *s)
2704 {
2705         Tcp seg;
2706         Tcpctl *tcb;
2707         struct block *hbp, *dbp;
2708
2709         tcb = (Tcpctl *) s->ptcl;
2710
2711         dbp = NULL;
2712         seg.urg = 0;
2713         seg.source = s->lport;
2714         seg.dest = s->rport;
2715         seg.flags = ACK | PSH;
2716         seg.mss = 0;
2717         seg.ws = 0;
2718         if (tcpporthogdefense)
2719                 urandom_read(&seg.seq, sizeof(seg.seq));
2720         else
2721                 seg.seq = tcb->snd.una - 1;
2722         seg.ack = tcb->rcv.nxt;
2723         tcb->rcv.una = 0;
2724         seg.wnd = tcb->rcv.wnd;
2725         if (tcb->state == Finwait2) {
2726                 seg.flags |= FIN;
2727         } else {
2728                 dbp = block_alloc(1, MEM_WAIT);
2729                 dbp->wp++;
2730         }
2731
2732         if (isv4(s->raddr)) {
2733                 /* Build header, link data and compute cksum */
2734                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2735                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2736                 if (hbp == NULL) {
2737                         freeblist(dbp);
2738                         return;
2739                 }
2740                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2741         } else {
2742                 /* Build header, link data and compute cksum */
2743                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2744                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2745                 if (hbp == NULL) {
2746                         freeblist(dbp);
2747                         return;
2748                 }
2749                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2750         }
2751 }
2752
2753 /*
2754  *  set connection to time out after 12 minutes
2755  */
2756 void tcpsetkacounter(Tcpctl * tcb)
2757 {
2758         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
2759         if (tcb->kacounter < 3)
2760                 tcb->kacounter = 3;
2761 }
2762
2763 /*
2764  *  if we've timed out, close the connection
2765  *  otherwise, send a keepalive and restart the timer
2766  */
2767 void tcpkeepalive(void *v)
2768 {
2769         ERRSTACK(1);
2770         Tcpctl *tcb;
2771         struct conv *s;
2772
2773         s = v;
2774         tcb = (Tcpctl *) s->ptcl;
2775         qlock(&s->qlock);
2776         if (waserror()) {
2777                 qunlock(&s->qlock);
2778                 nexterror();
2779         }
2780         if (tcb->state != Closed) {
2781                 if (--(tcb->kacounter) <= 0) {
2782                         localclose(s, "connection timed out");
2783                 } else {
2784                         tcpsendka(s);
2785                         tcpgo(s->p->priv, &tcb->katimer);
2786                 }
2787         }
2788         qunlock(&s->qlock);
2789         poperror();
2790 }
2791
2792 /*
2793  *  start keepalive timer
2794  */
2795 static void tcpstartka(struct conv *s, char **f, int n)
2796 {
2797         Tcpctl *tcb;
2798         int x;
2799
2800         tcb = (Tcpctl *) s->ptcl;
2801         if (tcb->state != Established)
2802                 error(ENOTCONN, "connection must be in Establised state");
2803         if (n > 1) {
2804                 x = atoi(f[1]);
2805                 if (x >= MSPTICK)
2806                         tcb->katimer.start = x / MSPTICK;
2807         }
2808         tcpsetkacounter(tcb);
2809         tcpgo(s->p->priv, &tcb->katimer);
2810 }
2811
2812 /*
2813  *  turn checksums on/off
2814  */
2815 static void tcpsetchecksum(struct conv *s, char **f, int unused)
2816 {
2817         Tcpctl *tcb;
2818
2819         tcb = (Tcpctl *) s->ptcl;
2820         tcb->nochecksum = !atoi(f[1]);
2821 }
2822
2823 void tcprxmit(struct conv *s)
2824 {
2825         Tcpctl *tcb;
2826
2827         tcb = (Tcpctl *) s->ptcl;
2828
2829         tcb->flags |= RETRAN | FORCE;
2830         tcb->snd.ptr = tcb->snd.una;
2831
2832         /*
2833          *  We should be halving the slow start threshhold (down to one
2834          *  mss) but leaving it at mss seems to work well enough
2835          */
2836         tcb->ssthresh = tcb->mss;
2837
2838         /*
2839          *  pull window down to a single packet
2840          */
2841         tcb->cwind = tcb->mss;
2842         tcpoutput(s);
2843 }
2844
2845 void tcptimeout(void *arg)
2846 {
2847         ERRSTACK(1);
2848         struct conv *s;
2849         Tcpctl *tcb;
2850         int maxback;
2851         struct tcppriv *tpriv;
2852
2853         s = (struct conv *)arg;
2854         tpriv = s->p->priv;
2855         tcb = (Tcpctl *) s->ptcl;
2856
2857         qlock(&s->qlock);
2858         if (waserror()) {
2859                 qunlock(&s->qlock);
2860                 nexterror();
2861         }
2862         switch (tcb->state) {
2863                 default:
2864                         tcb->backoff++;
2865                         if (tcb->state == Syn_sent)
2866                                 maxback = MAXBACKMS / 2;
2867                         else
2868                                 maxback = MAXBACKMS;
2869                         tcb->backedoff += tcb->timer.start * MSPTICK;
2870                         if (tcb->backedoff >= maxback) {
2871                                 localclose(s, "connection timed out");
2872                                 break;
2873                         }
2874                         netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lx %llu/%llu\n",
2875                                    tcb->snd.una, tcb->timer.start, NOW);
2876                         tcpsettimer(tcb);
2877                         tcprxmit(s);
2878                         tpriv->stats[RetransTimeouts]++;
2879                         tcb->snd.dupacks = 0;
2880                         break;
2881                 case Time_wait:
2882                         localclose(s, NULL);
2883                         break;
2884                 case Closed:
2885                         break;
2886         }
2887         qunlock(&s->qlock);
2888         poperror();
2889 }
2890
2891 int inwindow(Tcpctl * tcb, int seq)
2892 {
2893         return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1);
2894 }
2895
2896 /*
2897  *  set up state for a received SYN (or SYN ACK) packet
2898  */
2899 void procsyn(struct conv *s, Tcp * seg)
2900 {
2901         Tcpctl *tcb;
2902
2903         tcb = (Tcpctl *) s->ptcl;
2904         tcb->flags |= FORCE;
2905
2906         tcb->rcv.nxt = seg->seq + 1;
2907         tcb->rcv.urg = tcb->rcv.nxt;
2908         tcb->irs = seg->seq;
2909
2910         /* our sending max segment size cannot be bigger than what he asked for */
2911         if (seg->mss != 0 && seg->mss < tcb->mss)
2912                 tcb->mss = seg->mss;
2913
2914         /* the congestion window always starts out as a single segment */
2915         tcb->snd.wnd = seg->wnd;
2916         tcb->cwind = tcb->mss;
2917 }
2918
2919 int
2920 addreseq(Tcpctl * tcb, struct tcppriv *tpriv, Tcp * seg,
2921                  struct block *bp, uint16_t length)
2922 {
2923         Reseq *rp, *rp1;
2924         int i, rqlen, qmax;
2925
2926         rp = kzmalloc(sizeof(Reseq), 0);
2927         if (rp == NULL) {
2928                 freeblist(bp);  /* bp always consumed by add_reseq */
2929                 return 0;
2930         }
2931
2932         rp->seg = *seg;
2933         rp->bp = bp;
2934         rp->length = length;
2935
2936         /* Place on reassembly list sorting by starting seq number */
2937         rp1 = tcb->reseq;
2938         if (rp1 == NULL || seq_lt(seg->seq, rp1->seg.seq)) {
2939                 rp->next = rp1;
2940                 tcb->reseq = rp;
2941                 if (rp->next != NULL)
2942                         tpriv->stats[OutOfOrder]++;
2943                 return 0;
2944         }
2945
2946         rqlen = 0;
2947         for (i = 0;; i++) {
2948                 rqlen += rp1->length;
2949                 if (rp1->next == NULL || seq_lt(seg->seq, rp1->next->seg.seq)) {
2950                         rp->next = rp1->next;
2951                         rp1->next = rp;
2952                         if (rp->next != NULL)
2953                                 tpriv->stats[OutOfOrder]++;
2954                         break;
2955                 }
2956                 rp1 = rp1->next;
2957         }
2958         qmax = QMAX << tcb->rcv.scale;
2959         if (rqlen > qmax) {
2960                 printd("resequence queue > window: %d > %d\n", rqlen, qmax);
2961                 i = 0;
2962                 for (rp1 = tcb->reseq; rp1 != NULL; rp1 = rp1->next) {
2963                         printd("0x%#lx 0x%#lx 0x%#x\n", rp1->seg.seq,
2964                                    rp1->seg.ack, rp1->seg.flags);
2965                         if (i++ > 10) {
2966                                 printd("...\n");
2967                                 break;
2968                         }
2969                 }
2970
2971                 // delete entire reassembly queue; wait for retransmit.
2972                 // - should we be smarter and only delete the tail?
2973                 for (rp = tcb->reseq; rp != NULL; rp = rp1) {
2974                         rp1 = rp->next;
2975                         freeblist(rp->bp);
2976                         kfree(rp);
2977                 }
2978                 tcb->reseq = NULL;
2979
2980                 return -1;
2981         }
2982         return 0;
2983 }
2984
2985 void getreseq(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
2986 {
2987         Reseq *rp;
2988
2989         rp = tcb->reseq;
2990         if (rp == NULL)
2991                 return;
2992
2993         tcb->reseq = rp->next;
2994
2995         *seg = rp->seg;
2996         *bp = rp->bp;
2997         *length = rp->length;
2998
2999         kfree(rp);
3000 }
3001
3002 int tcptrim(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
3003 {
3004         uint16_t len;
3005         uint8_t accept;
3006         int dupcnt, excess;
3007
3008         accept = 0;
3009         len = *length;
3010         if (seg->flags & SYN)
3011                 len++;
3012         if (seg->flags & FIN)
3013                 len++;
3014
3015         if (tcb->rcv.wnd == 0) {
3016                 if (len == 0 && seg->seq == tcb->rcv.nxt)
3017                         return 0;
3018         } else {
3019                 /* Some part of the segment should be in the window */
3020                 if (inwindow(tcb, seg->seq))
3021                         accept++;
3022                 else if (len != 0) {
3023                         if (inwindow(tcb, seg->seq + len - 1) ||
3024                                 seq_within(tcb->rcv.nxt, seg->seq, seg->seq + len - 1))
3025                                 accept++;
3026                 }
3027         }
3028         if (!accept) {
3029                 freeblist(*bp);
3030                 return -1;
3031         }
3032         dupcnt = tcb->rcv.nxt - seg->seq;
3033         if (dupcnt > 0) {
3034                 tcb->rerecv += dupcnt;
3035                 if (seg->flags & SYN) {
3036                         seg->flags &= ~SYN;
3037                         seg->seq++;
3038
3039                         if (seg->urg > 1)
3040                                 seg->urg--;
3041                         else
3042                                 seg->flags &= ~URG;
3043                         dupcnt--;
3044                 }
3045                 if (dupcnt > 0) {
3046                         pullblock(bp, (uint16_t) dupcnt);
3047                         seg->seq += dupcnt;
3048                         *length -= dupcnt;
3049
3050                         if (seg->urg > dupcnt)
3051                                 seg->urg -= dupcnt;
3052                         else {
3053                                 seg->flags &= ~URG;
3054                                 seg->urg = 0;
3055                         }
3056                 }
3057         }
3058         excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3059         if (excess > 0) {
3060                 tcb->rerecv += excess;
3061                 *length -= excess;
3062                 *bp = trimblock(*bp, 0, *length);
3063                 if (*bp == NULL)
3064                         panic("presotto is a boofhead");
3065                 seg->flags &= ~FIN;
3066         }
3067         return 0;
3068 }
3069
3070 void tcpadvise(struct Proto *tcp, struct block *bp, char *msg)
3071 {
3072         Tcp4hdr *h4;
3073         Tcp6hdr *h6;
3074         Tcpctl *tcb;
3075         uint8_t source[IPaddrlen];
3076         uint8_t dest[IPaddrlen];
3077         uint16_t psource, pdest;
3078         struct conv *s, **p;
3079
3080         h4 = (Tcp4hdr *) (bp->rp);
3081         h6 = (Tcp6hdr *) (bp->rp);
3082
3083         if ((h4->vihl & 0xF0) == IP_VER4) {
3084                 v4tov6(dest, h4->tcpdst);
3085                 v4tov6(source, h4->tcpsrc);
3086                 psource = nhgets(h4->tcpsport);
3087                 pdest = nhgets(h4->tcpdport);
3088         } else {
3089                 ipmove(dest, h6->tcpdst);
3090                 ipmove(source, h6->tcpsrc);
3091                 psource = nhgets(h6->tcpsport);
3092                 pdest = nhgets(h6->tcpdport);
3093         }
3094
3095         /* Look for a connection */
3096         for (p = tcp->conv; *p; p++) {
3097                 s = *p;
3098                 tcb = (Tcpctl *) s->ptcl;
3099                 if (s->rport == pdest)
3100                         if (s->lport == psource)
3101                                 if (tcb->state != Closed)
3102                                         if (ipcmp(s->raddr, dest) == 0)
3103                                                 if (ipcmp(s->laddr, source) == 0) {
3104                                                         qlock(&s->qlock);
3105                                                         switch (tcb->state) {
3106                                                                 case Syn_sent:
3107                                                                         localclose(s, msg);
3108                                                                         break;
3109                                                         }
3110                                                         qunlock(&s->qlock);
3111                                                         freeblist(bp);
3112                                                         return;
3113                                                 }
3114         }
3115         freeblist(bp);
3116 }
3117
3118 static void tcpporthogdefensectl(char *val)
3119 {
3120         if (strcmp(val, "on") == 0)
3121                 tcpporthogdefense = 1;
3122         else if (strcmp(val, "off") == 0)
3123                 tcpporthogdefense = 0;
3124         else
3125                 error(EINVAL, "unknown value for tcpporthogdefense");
3126 }
3127
3128 /* called with c qlocked */
3129 static void tcpctl(struct conv *c, char **f, int n)
3130 {
3131         if (n == 1 && strcmp(f[0], "hangup") == 0)
3132                 tcphangup(c);
3133         else if (n >= 1 && strcmp(f[0], "keepalive") == 0)
3134                 tcpstartka(c, f, n);
3135         else if (n >= 1 && strcmp(f[0], "checksum") == 0)
3136                 tcpsetchecksum(c, f, n);
3137         else if (n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3138                 tcpporthogdefensectl(f[1]);
3139         else
3140                 error(EINVAL, "unknown command to %s", __func__);
3141 }
3142
3143 int tcpstats(struct Proto *tcp, char *buf, int len)
3144 {
3145         struct tcppriv *priv;
3146         char *p, *e;
3147         int i;
3148
3149         priv = tcp->priv;
3150         p = buf;
3151         e = p + len;
3152         for (i = 0; i < Nstats; i++)
3153                 p = seprintf(p, e, "%s: %u\n", statnames[i], priv->stats[i]);
3154         return p - buf;
3155 }
3156
3157 /*
3158  *  garbage collect any stale conversations:
3159  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3160  *      - Finwait2 after 5 minutes
3161  *
3162  *  this is called whenever we run out of channels.  Both checks are
3163  *  of questionable validity so we try to use them only when we're
3164  *  up against the wall.
3165  */
3166 int tcpgc(struct Proto *tcp)
3167 {
3168         struct conv *c, **pp, **ep;
3169         int n;
3170         Tcpctl *tcb;
3171
3172         n = 0;
3173         ep = &tcp->conv[tcp->nc];
3174         for (pp = tcp->conv; pp < ep; pp++) {
3175                 c = *pp;
3176                 if (c == NULL)
3177                         break;
3178                 if (!canqlock(&c->qlock))
3179                         continue;
3180                 tcb = (Tcpctl *) c->ptcl;
3181                 switch (tcb->state) {
3182                         case Syn_received:
3183                                 if (NOW - tcb->time > 5000) {
3184                                         localclose(c, "timed out");
3185                                         n++;
3186                                 }
3187                                 break;
3188                         case Finwait2:
3189                                 if (NOW - tcb->time > 5 * 60 * 1000) {
3190                                         localclose(c, "timed out");
3191                                         n++;
3192                                 }
3193                                 break;
3194                 }
3195                 qunlock(&c->qlock);
3196         }
3197         return n;
3198 }
3199
3200 void tcpsettimer(Tcpctl * tcb)
3201 {
3202         int x;
3203
3204         /* round trip dependency */
3205         x = backoff(tcb->backoff) *
3206                 (tcb->mdev + (tcb->srtt >> LOGAGAIN) + MSPTICK) / MSPTICK;
3207
3208         /* bounded twixt 1/2 and 64 seconds */
3209         if (x < 500 / MSPTICK)
3210                 x = 500 / MSPTICK;
3211         else if (x > (64000 / MSPTICK))
3212                 x = 64000 / MSPTICK;
3213         tcb->timer.start = x;
3214 }
3215
3216 static struct tcppriv *debug_priv;
3217
3218 /* Kfunc this */
3219 int dump_tcp_ht(void)
3220 {
3221         if (!debug_priv)
3222                 return -1;
3223         dump_ipht(&debug_priv->ht);
3224         return 0;
3225 }
3226
3227 void tcpinit(struct Fs *fs)
3228 {
3229         struct Proto *tcp;
3230         struct tcppriv *tpriv;
3231
3232         tcp = kzmalloc(sizeof(struct Proto), 0);
3233         tpriv = tcp->priv = kzmalloc(sizeof(struct tcppriv), 0);
3234         debug_priv = tpriv;
3235         qlock_init(&tpriv->tl);
3236         qlock_init(&tpriv->apl);
3237         tcp->name = "tcp";
3238         tcp->connect = tcpconnect;
3239         tcp->announce = tcpannounce;
3240         tcp->bypass = tcpbypass;
3241         tcp->ctl = tcpctl;
3242         tcp->state = tcpstate;
3243         tcp->create = tcpcreate;
3244         tcp->close = tcpclose;
3245         tcp->shutdown = tcpshutdown;
3246         tcp->rcv = tcpiput;
3247         tcp->advise = tcpadvise;
3248         tcp->stats = tcpstats;
3249         tcp->inuse = tcpinuse;
3250         tcp->gc = tcpgc;
3251         tcp->ipproto = IP_TCPPROTO;
3252         tcp->nc = 4096;
3253         tcp->ptclsize = sizeof(Tcpctl);
3254         tpriv->stats[MaxConn] = tcp->nc;
3255
3256         Fsproto(fs, tcp);
3257 }
3258
3259 void
3260 tcpsetscale(struct conv *s, Tcpctl * tcb, uint16_t rcvscale, uint16_t sndscale)
3261 {
3262         if (rcvscale) {
3263                 tcb->rcv.scale = rcvscale & 0xff;
3264                 tcb->snd.scale = sndscale & 0xff;
3265                 tcb->window = QMAX << tcb->snd.scale;
3266                 qsetlimit(s->rq, tcb->window);
3267         } else {
3268                 tcb->rcv.scale = 0;
3269                 tcb->snd.scale = 0;
3270                 tcb->window = QMAX;
3271                 qsetlimit(s->rq, tcb->window);
3272         }
3273 }