net: tcp: Always set the retrans timer
[akaros.git] / kern / src / net / tcp.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 #include <vfs.h>
44 #include <kfs.h>
45 #include <slab.h>
46 #include <kmalloc.h>
47 #include <kref.h>
48 #include <string.h>
49 #include <stdio.h>
50 #include <assert.h>
51 #include <error.h>
52 #include <cpio.h>
53 #include <pmap.h>
54 #include <smp.h>
55 #include <ip.h>
56
/*
 *  Sizes, timer values, header flag bits, option codes and connection
 *  states used throughout the TCP implementation.
 */
enum {
	/* default queue limit / receive window, and the IP protocol number */
	QMAX = 64 * 1024 - 1,
	IP_TCPPROTO = 6,

	/* IPv4 header layout */
	TCP4_IPLEN = 8,
	TCP4_PHDRSIZE = 12,
	TCP4_HDRSIZE = 20,
	TCP4_TCBPHDRSZ = 40,
	TCP4_PKT = TCP4_IPLEN + TCP4_PHDRSIZE,

	/* IPv6 header layout */
	TCP6_IPLEN = 0,
	TCP6_PHDRSIZE = 40,
	TCP6_HDRSIZE = 20,
	TCP6_TCBPHDRSZ = 60,
	TCP6_PKT = TCP6_IPLEN + TCP6_PHDRSIZE,

	/* values of Tcptimer.state */
	TcptimerOFF = 0,
	TcptimerON = 1,
	TcptimerDONE = 2,

	MAX_TIME = (1 << 20),		/* Forever */
	TCP_ACK = 50,			/* Timed ack sequence in ms */
	MAXBACKMS = 9 * 60 * 1000,	/* longest backoff time (ms) before hangup */

	/* TCP header flag bits */
	URG = 0x20,			/* Data marked urgent */
	ACK = 0x10,			/* Acknowledge is valid */
	PSH = 0x08,			/* Whole data pipe is pushed */
	RST = 0x04,			/* Reset connection */
	SYN = 0x02,			/* Pkt. is synchronise */
	FIN = 0x01,			/* Start close down */

	/* TCP header options */
	EOLOPT = 0,
	NOOPOPT = 1,
	MSSOPT = 2,
	MSS_LENGTH = 4,			/* max segment size header option length */
	WSOPT = 3,
	WS_LENGTH = 3,			/* WS header option length */

	MSL2 = 10,
	MSPTICK = 50,			/* Milliseconds per timer tick */
	DEF_MSS = 1460,			/* Default mean segment */
	DEF_MSS6 = 1280,		/* Default mean segment (min) for v6 */
	DEF_RTT = 500,			/* Default round trip */
	DEF_KAT = 120000,		/* Default time (ms) between keep alives */

	/* 'mode' argument of tcpstart() and inittcpctl() */
	TCP_LISTEN = 0,			/* Listen connection */
	TCP_CONNECT = 1,		/* Outgoing connection */

	SYNACK_RXTIMER = 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH = 3,		/* dupack threshhold for rxt */
	CWIND_SCALE = 10,		/* initial CWIND will be MSS * this */

	/* flag bits (FORCE, CLONE, ACTIVE and TSO are or'ed into a flags byte) */
	FORCE = 1 << 0,
	CLONE = 1 << 1,
	RETRAN = 1 << 2,
	ACTIVE = 1 << 3,
	SYNACK = 1 << 4,
	TSO = 1 << 5,

	LOGAGAIN = 3,
	LOGDGAIN = 2,

	/* Connection states; tcpstates[] is indexed by these values */
	Closed = 0,
	Listen = 1,
	Syn_sent = 2,
	Syn_received = 3,
	Established = 4,
	Finwait1 = 5,
	Finwait2 = 6,
	Close_wait = 7,
	Closing = 8,
	Last_ack = 9,
	Time_wait = 10,

	Maxlimbo = 1000,		/* maximum procs waiting for response to SYN ACK */
	NLHT = 256,			/* hash table size, must be a power of 2 */
	LHTMASK = NLHT - 1,

	HaveWS = 1 << 8,
};
134
/*
 *  Printable connection-state names.  The order must match the
 *  Closed..Time_wait enumeration above; the array is indexed by state.
 */
char *tcpstates[] = {
	"Closed",
	"Listen",
	"Syn_sent",
	"Syn_received",
	"Established",
	"Finwait1",
	"Finwait2",
	"Close_wait",
	"Closing",
	"Last_ack",
	"Time_wait"
};
141
/*
 *  A tick-driven timer.  Running timers are linked on the per-protocol
 *  list and counted down once per tick by tcpackproc().
 */
typedef struct Tcptimer {
	struct Tcptimer *next;		/* active-timer list linkage */
	struct Tcptimer *prev;
	struct Tcptimer *readynext;	/* chain of timers that just expired */
	int state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	uint64_t start;			/* reload value, in ticks */
	uint64_t count;			/* ticks left until expiry */
	void (*func) (void *);		/* called with arg when the timer expires */
	void *arg;
} Tcptimer;
153
/*
 *  v4 and v6 pseudo headers used for checksumming tcp.
 *  This is the v4 layout.
 */
typedef struct Tcp4hdr {
	uint8_t vihl;		/* Version and header length */
	uint8_t tos;		/* Type of service */
	uint8_t length[2];	/* packet length */
	uint8_t id[2];		/* Identification */
	uint8_t frag[2];	/* Fragment information */
	uint8_t Unused;
	uint8_t proto;
	uint8_t tcplen[2];
	uint8_t tcpsrc[4];
	uint8_t tcpdst[4];
	uint8_t tcpsport[2];
	uint8_t tcpdport[2];
	uint8_t tcpseq[4];
	uint8_t tcpack[4];
	uint8_t tcpflag[2];
	uint8_t tcpwin[2];
	uint8_t tcpcksum[2];
	uint8_t tcpurg[2];
	uint8_t tcpopt[1];	/* Options segment */
} Tcp4hdr;
181
182 typedef struct Tcp6hdr Tcp6hdr;
183 struct Tcp6hdr {
184         uint8_t vcf[4];
185         uint8_t ploadlen[2];
186         uint8_t proto;
187         uint8_t ttl;
188         uint8_t tcpsrc[IPaddrlen];
189         uint8_t tcpdst[IPaddrlen];
190         uint8_t tcpsport[2];
191         uint8_t tcpdport[2];
192         uint8_t tcpseq[4];
193         uint8_t tcpack[4];
194         uint8_t tcpflag[2];
195         uint8_t tcpwin[2];
196         uint8_t tcpcksum[2];
197         uint8_t tcpurg[2];
198         /* Options segment */
199         uint8_t tcpopt[1];
200 };
201
/*
 *  Control information for a single packet, in host form.  It is
 *  extracted from a packet by ntohtcp{4,6}() and written back into
 *  a packet by htontcp{4,6}().
 */
typedef struct Tcp {
	uint16_t source;
	uint16_t dest;
	uint32_t seq;
	uint32_t ack;
	uint8_t flags;
	uint16_t ws;		/* window scale option (if not zero) */
	uint32_t wnd;
	uint16_t urg;
	uint16_t mss;		/* max segment size option (if not zero) */
	uint16_t len;		/* size of data */
} Tcp;
221
222 /*
223  *  this header is malloc'd to thread together fragments
224  *  waiting to be coalesced
225  */
226 typedef struct Reseq Reseq;
227 struct Reseq {
228         Reseq *next;
229         Tcp seg;
230         struct block *bp;
231         uint16_t length;
232 };
233
234 /*
235  *  the qlock in the Conv locks this structure
236  */
237 typedef struct Tcpctl Tcpctl;
238 struct Tcpctl {
239         uint8_t state;                          /* Connection state */
240         uint8_t type;                           /* Listening or active connection */
241         uint8_t code;                           /* Icmp code */
242         struct {
243                 uint32_t una;                   /* Unacked data pointer */
244                 uint32_t nxt;                   /* Next sequence expected */
245                 uint32_t ptr;                   /* Data pointer */
246                 uint32_t wnd;                   /* Tcp send window */
247                 uint32_t urg;                   /* Urgent data pointer */
248                 uint32_t wl2;
249                 int scale;                              /* how much to right shift window in xmitted packets */
250                 /* to implement tahoe and reno TCP */
251                 uint32_t dupacks;               /* number of duplicate acks rcvd */
252                 int recovery;                   /* loss recovery flag */
253                 uint32_t rxt;                   /* right window marker for recovery */
254         } snd;
255         struct {
256                 uint32_t nxt;                   /* Receive pointer to next uint8_t slot */
257                 uint32_t wnd;                   /* Receive window incoming */
258                 uint32_t urg;                   /* Urgent pointer */
259                 int blocked;
260                 int una;                                /* unacked data segs */
261                 int scale;                              /* how much to left shift window in rcved packets */
262         } rcv;
263         uint32_t iss;                           /* Initial sequence number */
264         int sawwsopt;                           /* true if we saw a wsopt on the incoming SYN */
265         uint32_t cwind;                         /* Congestion window */
266         int scale;                                      /* desired snd.scale */
267         uint32_t ssthresh;                      /* Slow start threshold */
268         int resent;                                     /* Bytes just resent */
269         int irs;                                        /* Initial received squence */
270         uint16_t mss;                           /* Mean segment size */
271         int rerecv;                                     /* Overlap of data rerecevived */
272         uint32_t window;                        /* Recevive window */
273         uint8_t backoff;                        /* Exponential backoff counter */
274         int backedoff;                          /* ms we've backed off for rexmits */
275         uint8_t flags;                          /* State flags */
276         Reseq *reseq;                           /* Resequencing queue */
277         Tcptimer timer;                         /* Activity timer */
278         Tcptimer acktimer;                      /* Acknowledge timer */
279         Tcptimer rtt_timer;                     /* Round trip timer */
280         Tcptimer katimer;                       /* keep alive timer */
281         uint32_t rttseq;                        /* Round trip sequence */
282         int srtt;                                       /* Shortened round trip */
283         int mdev;                                       /* Mean deviation of round trip */
284         int kacounter;                          /* count down for keep alive */
285         uint64_t sndsyntime;            /* time syn sent */
286         uint64_t time;                          /* time Finwait2 or Syn_received was sent */
287         int nochecksum;                         /* non-zero means don't send checksums */
288         int flgcnt;                                     /* number of flags in the sequence (FIN,SEQ) */
289
290         union {
291                 Tcp4hdr tcp4hdr;
292                 Tcp6hdr tcp6hdr;
293         } protohdr;                                     /* prototype header */
294 };
295
296 /*
297  *  New calls are put in limbo rather than having a conversation structure
298  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
299  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
300  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
301  *
302  *  In particular they aren't on a listener's queue so that they don't figure
303  *  in the input queue limit.
304  *
305  *  If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue
306  *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
307  *  there is no hashing of this list.
308  */
309 typedef struct Limbo Limbo;
310 struct Limbo {
311         Limbo *next;
312
313         uint8_t laddr[IPaddrlen];
314         uint8_t raddr[IPaddrlen];
315         uint16_t lport;
316         uint16_t rport;
317         uint32_t irs;                           /* initial received sequence */
318         uint32_t iss;                           /* initial sent sequence */
319         uint16_t mss;                           /* mss from the other end */
320         uint16_t rcvscale;                      /* how much to scale rcvd windows */
321         uint16_t sndscale;                      /* how much to scale sent windows */
322         uint64_t lastsend;                      /* last time we sent a synack */
323         uint8_t version;                        /* v4 or v6 */
324         uint8_t rexmits;                        /* number of retransmissions */
325 };
326
327 int tcp_irtt = DEF_RTT;                 /* Initial guess at round trip time */
328 uint16_t tcp_mss = DEF_MSS;             /* Maximum segment size to be sent */
329
/*
 *  Indices into tcppriv.stats[].  Nstats is the number of counters and
 *  must stay last.
 */
enum {
	/* MIB stats */
	MaxConn,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats
};

/*
 *  Printable name for each counter, indexed by the enumeration above.
 *  Uses standard C99 designated initializers ("[idx] = value") rather than
 *  the obsolete GNU "[idx] value" form.
 */
static char *statnames[] = {
	[MaxConn] = "MaxConn",
	[ActiveOpens] = "ActiveOpens",
	[PassiveOpens] = "PassiveOpens",
	[EstabResets] = "EstabResets",
	[CurrEstab] = "CurrEstab",
	[InSegs] = "InSegs",
	[OutSegs] = "OutSegs",
	[RetransSegs] = "RetransSegs",
	[RetransTimeouts] = "RetransTimeouts",
	[InErrs] = "InErrs",
	[OutRsts] = "OutRsts",
	[CsumErrs] = "CsumErrs",
	[HlenErrs] = "HlenErrs",
	[LenErrs] = "LenErrs",
	[OutOfOrder] = "OutOfOrder",
};
370
371 typedef struct Tcppriv Tcppriv;
372 struct tcppriv {
373         /* List of active timers */
374         qlock_t tl;
375         Tcptimer *timers;
376
377         /* hash table for matching conversations */
378         struct Ipht ht;
379
380         /* calls in limbo waiting for an ACK to our SYN ACK */
381         int nlimbo;
382         Limbo *lht[NLHT];
383
384         /* for keeping track of tcpackproc */
385         qlock_t apl;
386         int ackprocstarted;
387
388         uint32_t stats[Nstats];
389 };
390
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
401
402 int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *, uint16_t);
403 void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
404 void localclose(struct conv *, char *unused_char_p_t);
405 void procsyn(struct conv *, Tcp *);
406 void tcpiput(struct Proto *, struct Ipifc *, struct block *);
407 void tcpoutput(struct conv *);
408 int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
409 void tcpstart(struct conv *, int);
410 void tcptimeout(void *);
411 void tcpsndsyn(struct conv *, Tcpctl *);
412 void tcprcvwin(struct conv *);
413 void tcpacktimer(void *);
414 void tcpkeepalive(void *);
415 void tcpsetkacounter(Tcpctl *);
416 void tcprxmit(struct conv *);
417 void tcpsettimer(Tcpctl *);
418 void tcpsynackrtt(struct conv *);
419 void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
420
421 static void limborexmit(struct Proto *);
422 static void limbo(struct conv *, uint8_t * unused_uint8_p_t, uint8_t *, Tcp *,
423                                   int);
424
425 void tcpsetstate(struct conv *s, uint8_t newstate)
426 {
427         Tcpctl *tcb;
428         uint8_t oldstate;
429         struct tcppriv *tpriv;
430
431         tpriv = s->p->priv;
432
433         tcb = (Tcpctl *) s->ptcl;
434
435         oldstate = tcb->state;
436         if (oldstate == newstate)
437                 return;
438
439         if (oldstate == Established)
440                 tpriv->stats[CurrEstab]--;
441         if (newstate == Established)
442                 tpriv->stats[CurrEstab]++;
443
444         /**
445         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
446                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
447         **/
448
449         switch (newstate) {
450                 case Closed:
451                         qclose(s->rq);
452                         qclose(s->wq);
453                         qclose(s->eq);
454                         break;
455
456                 case Close_wait:        /* Remote closes */
457                         qhangup(s->rq, NULL);
458                         break;
459         }
460
461         tcb->state = newstate;
462
463         if (oldstate == Syn_sent && newstate != Closed)
464                 Fsconnected(s, NULL);
465 }
466
467 static void tcpconnect(struct conv *c, char **argv, int argc)
468 {
469         Fsstdconnect(c, argv, argc);
470         tcpstart(c, TCP_CONNECT);
471 }
472
473 static int tcpstate(struct conv *c, char *state, int n)
474 {
475         Tcpctl *s;
476
477         s = (Tcpctl *) (c->ptcl);
478
479         return snprintf(state, n,
480                                         "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
481                                         tcpstates[s->state],
482                                         c->rq ? qlen(c->rq) : 0,
483                                         c->wq ? qlen(c->wq) : 0,
484                                         s->srtt, s->mdev,
485                                         s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
486                                         s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
487                                         s->katimer.start, s->katimer.count);
488 }
489
490 static int tcpinuse(struct conv *c)
491 {
492         Tcpctl *s;
493
494         s = (Tcpctl *) (c->ptcl);
495         return s->state != Closed;
496 }
497
498 static void tcpannounce(struct conv *c, char **argv, int argc)
499 {
500         Fsstdannounce(c, argv, argc);
501         tcpstart(c, TCP_LISTEN);
502         Fsconnected(c, NULL);
503 }
504
505 static void tcpbypass(struct conv *cv, char **argv, int argc)
506 {
507         struct tcppriv *tpriv = cv->p->priv;
508
509         Fsstdbypass(cv, argv, argc);
510         iphtadd(&tpriv->ht, cv);
511 }
512
513 static void tcpshutdown(struct conv *c, int how)
514 {
515         Tcpctl *tcb = (Tcpctl*)c->ptcl;
516
517         /* Do nothing for the read side */
518         if (how == SHUT_RD)
519                 return;
520         /* Sends a FIN.  If we're in another state (like Listen), we'll run into
521          * issues, since we'll never send the FIN.  We'll be shutdown on our end,
522          * but we'll never tell the distant end.  Might just be an app issue. */
523         switch (tcb->state) {
524         case Syn_received:
525         case Established:
526                 tcb->flgcnt++;
527                 tcb->snd.nxt++;
528                 tcpsetstate(c, Finwait1);
529                 tcpoutput(c);
530                 break;
531         }
532 }
533
534 /*
535  *  tcpclose is always called with the q locked
536  */
537 static void tcpclose(struct conv *c)
538 {
539         Tcpctl *tcb;
540
541         tcb = (Tcpctl *) c->ptcl;
542
543         qhangup(c->rq, NULL);
544         qhangup(c->wq, NULL);
545         qhangup(c->eq, NULL);
546         qflush(c->rq);
547
548         switch (tcb->state) {
549                 case Listen:
550                         /*
551                          *  reset any incoming calls to this listener
552                          */
553                         Fsconnected(c, "Hangup");
554
555                         localclose(c, NULL);
556                         break;
557                 case Closed:
558                 case Syn_sent:
559                         localclose(c, NULL);
560                         break;
561                 case Syn_received:
562                 case Established:
563                         tcb->flgcnt++;
564                         tcb->snd.nxt++;
565                         tcpsetstate(c, Finwait1);
566                         tcpoutput(c);
567                         break;
568                 case Close_wait:
569                         tcb->flgcnt++;
570                         tcb->snd.nxt++;
571                         tcpsetstate(c, Last_ack);
572                         tcpoutput(c);
573                         break;
574         }
575 }
576
577 void tcpkick(void *x)
578 {
579         ERRSTACK(1);
580         struct conv *s = x;
581         Tcpctl *tcb;
582
583         tcb = (Tcpctl *) s->ptcl;
584
585         qlock(&s->qlock);
586         if (waserror()) {
587                 qunlock(&s->qlock);
588                 nexterror();
589         }
590
591         switch (tcb->state) {
592                 case Syn_sent:
593                 case Syn_received:
594                 case Established:
595                 case Close_wait:
596                         /*
597                          * Push data
598                          */
599                         tcprcvwin(s);
600                         tcpoutput(s);
601                         break;
602                 default:
603                         localclose(s, "Hangup");
604                         break;
605         }
606
607         qunlock(&s->qlock);
608         poperror();
609 }
610
611 void tcprcvwin(struct conv *s)
612 {       /* Call with tcb locked */
613         int w;
614         Tcpctl *tcb;
615
616         tcb = (Tcpctl *) s->ptcl;
617         w = tcb->window - qlen(s->rq);
618         if (w < 0)
619                 w = 0;
620         tcb->rcv.wnd = w;
621         if (w == 0)
622                 tcb->rcv.blocked = 1;
623 }
624
625 void tcpacktimer(void *v)
626 {
627         ERRSTACK(1);
628         Tcpctl *tcb;
629         struct conv *s;
630
631         s = v;
632         tcb = (Tcpctl *) s->ptcl;
633
634         qlock(&s->qlock);
635         if (waserror()) {
636                 qunlock(&s->qlock);
637                 nexterror();
638         }
639         if (tcb->state != Closed) {
640                 tcb->flags |= FORCE;
641                 tcprcvwin(s);
642                 tcpoutput(s);
643         }
644         qunlock(&s->qlock);
645         poperror();
646 }
647
648 static void tcpcreate(struct conv *c)
649 {
650         c->rq = qopen(QMAX, Qcoalesce, 0, 0);
651         c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
652 }
653
654 static void timerstate(struct tcppriv *priv, Tcptimer * t, int newstate)
655 {
656         if (newstate != TcptimerON) {
657                 if (t->state == TcptimerON) {
658                         // unchain
659                         if (priv->timers == t) {
660                                 priv->timers = t->next;
661                                 if (t->prev != NULL)
662                                         panic("timerstate1");
663                         }
664                         if (t->next)
665                                 t->next->prev = t->prev;
666                         if (t->prev)
667                                 t->prev->next = t->next;
668                         t->next = t->prev = NULL;
669                 }
670         } else {
671                 if (t->state != TcptimerON) {
672                         // chain
673                         if (t->prev != NULL || t->next != NULL)
674                                 panic("timerstate2");
675                         t->prev = NULL;
676                         t->next = priv->timers;
677                         if (t->next)
678                                 t->next->prev = t;
679                         priv->timers = t;
680                 }
681         }
682         t->state = newstate;
683 }
684
685 void tcpackproc(void *a)
686 {
687         ERRSTACK(1);
688         Tcptimer *t, *tp, *timeo;
689         struct Proto *tcp;
690         struct tcppriv *priv;
691         int loop;
692
693         tcp = a;
694         priv = tcp->priv;
695
696         for (;;) {
697                 kthread_usleep(MSPTICK * 1000);
698
699                 qlock(&priv->tl);
700                 timeo = NULL;
701                 loop = 0;
702                 for (t = priv->timers; t != NULL; t = tp) {
703                         if (loop++ > 10000)
704                                 panic("tcpackproc1");
705                         tp = t->next;
706                         if (t->state == TcptimerON) {
707                                 t->count--;
708                                 if (t->count == 0) {
709                                         timerstate(priv, t, TcptimerDONE);
710                                         t->readynext = timeo;
711                                         timeo = t;
712                                 }
713                         }
714                 }
715                 qunlock(&priv->tl);
716
717                 loop = 0;
718                 for (t = timeo; t != NULL; t = t->readynext) {
719                         if (loop++ > 10000)
720                                 panic("tcpackproc2");
721                         if (t->state == TcptimerDONE && t->func != NULL) {
722                                 /* discard error style */
723                                 if (!waserror())
724                                         (*t->func) (t->arg);
725                                 poperror();
726                         }
727                 }
728
729                 limborexmit(tcp);
730         }
731 }
732
733 void tcpgo(struct tcppriv *priv, Tcptimer * t)
734 {
735         if (t == NULL || t->start == 0)
736                 return;
737
738         qlock(&priv->tl);
739         t->count = t->start;
740         timerstate(priv, t, TcptimerON);
741         qunlock(&priv->tl);
742 }
743
744 void tcphalt(struct tcppriv *priv, Tcptimer * t)
745 {
746         if (t == NULL)
747                 return;
748
749         qlock(&priv->tl);
750         timerstate(priv, t, TcptimerOFF);
751         qunlock(&priv->tl);
752 }
753
/*
 *  Exponential backoff multiplier: returns 2^n.
 *
 *  Shifting a signed int by a negative count, or far enough to reach the
 *  sign bit, is undefined behavior in C.  The shift count is therefore
 *  clamped to [0, bits-in-int - 2]: negative n yields 1, and large n
 *  saturates at the largest power of two an int can hold.
 *
 *  NOTE(review): a caller that multiplies the result by another value
 *  should still clamp the product -- confirm at the call sites.
 */
int backoff(int n)
{
	const int maxshift = (int)(sizeof(int) * 8) - 2;

	if (n < 0)
		n = 0;
	else if (n > maxshift)
		n = maxshift;

	return 1 << n;
}
758
759 void localclose(struct conv *s, char *reason)
760 {       /* called with tcb locked */
761         Tcpctl *tcb;
762         Reseq *rp, *rp1;
763         struct tcppriv *tpriv;
764
765         tpriv = s->p->priv;
766         tcb = (Tcpctl *) s->ptcl;
767
768         iphtrem(&tpriv->ht, s);
769
770         tcphalt(tpriv, &tcb->timer);
771         tcphalt(tpriv, &tcb->rtt_timer);
772         tcphalt(tpriv, &tcb->acktimer);
773         tcphalt(tpriv, &tcb->katimer);
774
775         /* Flush reassembly queue; nothing more can arrive */
776         for (rp = tcb->reseq; rp != NULL; rp = rp1) {
777                 rp1 = rp->next;
778                 freeblist(rp->bp);
779                 kfree(rp);
780         }
781         tcb->reseq = NULL;
782
783         if (tcb->state == Syn_sent)
784                 Fsconnected(s, reason);
785
786         qhangup(s->rq, reason);
787         qhangup(s->wq, reason);
788
789         tcpsetstate(s, Closed);
790
791         /* listener will check the rq state */
792         if (s->state == Announced)
793                 rendez_wakeup(&s->listenr);
794 }
795
796 /* mtu (- TCP + IP hdr len) of 1st hop */
797 int tcpmtu(struct Proto *tcp, uint8_t * addr, int version, int *scale,
798            uint8_t *flags)
799 {
800         struct Ipifc *ifc;
801         int mtu;
802
803         ifc = findipifc(tcp->f, addr, 0);
804         switch (version) {
805                 default:
806                 case V4:
807                         mtu = DEF_MSS;
808                         if (ifc != NULL)
809                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
810                         break;
811                 case V6:
812                         mtu = DEF_MSS6;
813                         if (ifc != NULL)
814                                 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
815                         break;
816         }
817         *flags &= ~TSO;
818
819         if (ifc != NULL) {
820                 if (ifc->mbps > 100)
821                         *scale = HaveWS | 3;
822                 else if (ifc->mbps > 10)
823                         *scale = HaveWS | 1;
824                 else
825                         *scale = HaveWS | 0;
826                 if (ifc->feat & NETF_TSO)
827                         *flags |= TSO;
828         } else
829                 *scale = HaveWS | 0;
830
831         return mtu;
832 }
833
834 void inittcpctl(struct conv *s, int mode)
835 {
836         Tcpctl *tcb;
837         Tcp4hdr *h4;
838         Tcp6hdr *h6;
839         int mss;
840
841         tcb = (Tcpctl *) s->ptcl;
842
843         memset(tcb, 0, sizeof(Tcpctl));
844
845         tcb->ssthresh = UINT32_MAX;
846         tcb->srtt = tcp_irtt << LOGAGAIN;
847         tcb->mdev = 0;
848
849         /* setup timers */
850         tcb->timer.start = tcp_irtt / MSPTICK;
851         tcb->timer.func = tcptimeout;
852         tcb->timer.arg = s;
853         tcb->rtt_timer.start = MAX_TIME;
854         tcb->acktimer.start = TCP_ACK / MSPTICK;
855         tcb->acktimer.func = tcpacktimer;
856         tcb->acktimer.arg = s;
857         tcb->katimer.start = DEF_KAT / MSPTICK;
858         tcb->katimer.func = tcpkeepalive;
859         tcb->katimer.arg = s;
860
861         mss = DEF_MSS;
862
863         /* create a prototype(pseudo) header */
864         if (mode != TCP_LISTEN) {
865                 if (ipcmp(s->laddr, IPnoaddr) == 0)
866                         findlocalip(s->p->f, s->laddr, s->raddr);
867
868                 switch (s->ipversion) {
869                         case V4:
870                                 h4 = &tcb->protohdr.tcp4hdr;
871                                 memset(h4, 0, sizeof(*h4));
872                                 h4->proto = IP_TCPPROTO;
873                                 hnputs(h4->tcpsport, s->lport);
874                                 hnputs(h4->tcpdport, s->rport);
875                                 v6tov4(h4->tcpsrc, s->laddr);
876                                 v6tov4(h4->tcpdst, s->raddr);
877                                 break;
878                         case V6:
879                                 h6 = &tcb->protohdr.tcp6hdr;
880                                 memset(h6, 0, sizeof(*h6));
881                                 h6->proto = IP_TCPPROTO;
882                                 hnputs(h6->tcpsport, s->lport);
883                                 hnputs(h6->tcpdport, s->rport);
884                                 ipmove(h6->tcpsrc, s->laddr);
885                                 ipmove(h6->tcpdst, s->raddr);
886                                 mss = DEF_MSS6;
887                                 break;
888                         default:
889                                 panic("inittcpctl: version %d", s->ipversion);
890                 }
891         }
892
893         tcb->mss = mss;
894         tcb->cwind = mss * CWIND_SCALE;
895
896         /* default is no window scaling */
897         tcb->window = QMAX;
898         tcb->rcv.wnd = QMAX;
899         tcb->rcv.scale = 0;
900         tcb->snd.scale = 0;
901         qsetlimit(s->rq, QMAX);
902 }
903
904 /*
905  *  called with s qlocked
906  */
907 void tcpstart(struct conv *s, int mode)
908 {
909         Tcpctl *tcb;
910         struct tcppriv *tpriv;
911         /* tcpackproc needs to free this if it ever exits */
912         char *kpname = kmalloc(KNAMELEN, MEM_WAIT);
913
914         tpriv = s->p->priv;
915
916         if (tpriv->ackprocstarted == 0) {
917                 qlock(&tpriv->apl);
918                 if (tpriv->ackprocstarted == 0) {
919                         snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
920                         ktask(kpname, tcpackproc, s->p);
921                         tpriv->ackprocstarted = 1;
922                 }
923                 qunlock(&tpriv->apl);
924         }
925
926         tcb = (Tcpctl *) s->ptcl;
927
928         inittcpctl(s, mode);
929
930         iphtadd(&tpriv->ht, s);
931         switch (mode) {
932                 case TCP_LISTEN:
933                         tpriv->stats[PassiveOpens]++;
934                         tcb->flags |= CLONE;
935                         tcpsetstate(s, Listen);
936                         break;
937
938                 case TCP_CONNECT:
939                         tpriv->stats[ActiveOpens]++;
940                         tcb->flags |= ACTIVE;
941                         tcpsndsyn(s, tcb);
942                         tcpsetstate(s, Syn_sent);
943                         tcpoutput(s);
944                         break;
945         }
946 }
947
948 static char *tcpflag(uint16_t flag)
949 {
950         static char buf[128];
951
952         snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
953         if (flag & URG)
954                 snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
955         if (flag & ACK)
956                 snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
957         if (flag & PSH)
958                 snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
959         if (flag & RST)
960                 snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
961         if (flag & SYN)
962                 snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
963         if (flag & FIN)
964                 snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
965
966         return buf;
967 }
968
969 struct block *htontcp6(Tcp * tcph, struct block *data, Tcp6hdr * ph,
970                                            Tcpctl * tcb)
971 {
972         int dlen;
973         Tcp6hdr *h;
974         uint16_t csum;
975         uint16_t hdrlen, optpad = 0;
976         uint8_t *opt;
977
978         hdrlen = TCP6_HDRSIZE;
979         if (tcph->flags & SYN) {
980                 if (tcph->mss)
981                         hdrlen += MSS_LENGTH;
982                 if (tcph->ws)
983                         hdrlen += WS_LENGTH;
984                 optpad = hdrlen & 3;
985                 if (optpad)
986                         optpad = 4 - optpad;
987                 hdrlen += optpad;
988         }
989
990         if (data) {
991                 dlen = blocklen(data);
992                 data = padblock(data, hdrlen + TCP6_PKT);
993                 if (data == NULL)
994                         return NULL;
995         } else {
996                 dlen = 0;
997                 /* the 64 pad is to meet mintu's */
998                 data = block_alloc(hdrlen + TCP6_PKT + 64, MEM_WAIT);
999                 if (data == NULL)
1000                         return NULL;
1001                 data->wp += hdrlen + TCP6_PKT;
1002         }
1003         /* relative to the block start (bp->rp) */
1004         data->transport_header_end = hdrlen + TCP4_PKT;
1005
1006         /* copy in pseudo ip header plus port numbers */
1007         h = (Tcp6hdr *) (data->rp);
1008         memmove(h, ph, TCP6_TCBPHDRSZ);
1009
1010         /* compose pseudo tcp header, do cksum calculation */
1011         hnputl(h->vcf, hdrlen + dlen);
1012         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
1013         h->ttl = ph->proto;
1014
1015         /* copy in variable bits */
1016         hnputl(h->tcpseq, tcph->seq);
1017         hnputl(h->tcpack, tcph->ack);
1018         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1019         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1020         hnputs(h->tcpurg, tcph->urg);
1021
1022         if (tcph->flags & SYN) {
1023                 opt = h->tcpopt;
1024                 if (tcph->mss != 0) {
1025                         *opt++ = MSSOPT;
1026                         *opt++ = MSS_LENGTH;
1027                         hnputs(opt, tcph->mss);
1028                         opt += 2;
1029                 }
1030                 if (tcph->ws != 0) {
1031                         *opt++ = WSOPT;
1032                         *opt++ = WS_LENGTH;
1033                         *opt++ = tcph->ws;
1034                 }
1035                 while (optpad-- > 0)
1036                         *opt++ = NOOPOPT;
1037         }
1038
1039         if (tcb != NULL && tcb->nochecksum) {
1040                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1041         } else {
1042                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen + TCP6_PHDRSIZE);
1043                 hnputs(h->tcpcksum, csum);
1044         }
1045
1046         /* move from pseudo header back to normal ip header */
1047         memset(h->vcf, 0, 4);
1048         h->vcf[0] = IP_VER6;
1049         hnputs(h->ploadlen, hdrlen + dlen);
1050         h->proto = ph->proto;
1051
1052         return data;
1053 }
1054
1055 struct block *htontcp4(Tcp * tcph, struct block *data, Tcp4hdr * ph,
1056                                            Tcpctl * tcb)
1057 {
1058         int dlen;
1059         Tcp4hdr *h;
1060         uint16_t csum;
1061         uint16_t hdrlen, optpad = 0;
1062         uint8_t *opt;
1063
1064         hdrlen = TCP4_HDRSIZE;
1065         if (tcph->flags & SYN) {
1066                 if (tcph->mss)
1067                         hdrlen += MSS_LENGTH;
1068                 if (tcph->ws)
1069                         hdrlen += WS_LENGTH;
1070                 optpad = hdrlen & 3;
1071                 if (optpad)
1072                         optpad = 4 - optpad;
1073                 hdrlen += optpad;
1074         }
1075
1076         if (data) {
1077                 dlen = blocklen(data);
1078                 data = padblock(data, hdrlen + TCP4_PKT);
1079                 if (data == NULL)
1080                         return NULL;
1081         } else {
1082                 dlen = 0;
1083                 /* the 64 pad is to meet mintu's */
1084                 data = block_alloc(hdrlen + TCP4_PKT + 64, MEM_WAIT);
1085                 if (data == NULL)
1086                         return NULL;
1087                 data->wp += hdrlen + TCP4_PKT;
1088         }
1089         /* relative to the block start (bp->rp) */
1090         data->transport_header_end = hdrlen + TCP4_PKT;
1091
1092         /* copy in pseudo ip header plus port numbers */
1093         h = (Tcp4hdr *) (data->rp);
1094         memmove(h, ph, TCP4_TCBPHDRSZ);
1095
1096         /* copy in variable bits */
1097         hnputs(h->tcplen, hdrlen + dlen);
1098         hnputl(h->tcpseq, tcph->seq);
1099         hnputl(h->tcpack, tcph->ack);
1100         hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
1101         hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
1102         hnputs(h->tcpurg, tcph->urg);
1103
1104         if (tcph->flags & SYN) {
1105                 opt = h->tcpopt;
1106                 if (tcph->mss != 0) {
1107                         *opt++ = MSSOPT;
1108                         *opt++ = MSS_LENGTH;
1109                         hnputs(opt, tcph->mss);
1110                         opt += 2;
1111                 }
1112                 if (tcph->ws != 0) {
1113                         *opt++ = WSOPT;
1114                         *opt++ = WS_LENGTH;
1115                         *opt++ = tcph->ws;
1116                 }
1117                 while (optpad-- > 0)
1118                         *opt++ = NOOPOPT;
1119         }
1120
1121         if (tcb != NULL && tcb->nochecksum) {
1122                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1123         } else {
1124                 csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
1125                 hnputs(h->tcpcksum, csum);
1126                 data->checksum_start = TCP4_IPLEN + TCP4_PHDRSIZE;
1127                 data->checksum_offset = ph->tcpcksum - ph->tcpsport;
1128                 data->flag |= Btcpck;
1129         }
1130
1131         return data;
1132 }
1133
1134 int ntohtcp6(Tcp * tcph, struct block **bpp)
1135 {
1136         Tcp6hdr *h;
1137         uint8_t *optr;
1138         uint16_t hdrlen;
1139         uint16_t optlen;
1140         int n;
1141
1142         *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
1143         if (*bpp == NULL)
1144                 return -1;
1145
1146         h = (Tcp6hdr *) ((*bpp)->rp);
1147         tcph->source = nhgets(h->tcpsport);
1148         tcph->dest = nhgets(h->tcpdport);
1149         tcph->seq = nhgetl(h->tcpseq);
1150         tcph->ack = nhgetl(h->tcpack);
1151         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1152         if (hdrlen < TCP6_HDRSIZE) {
1153                 freeblist(*bpp);
1154                 return -1;
1155         }
1156
1157         tcph->flags = h->tcpflag[1];
1158         tcph->wnd = nhgets(h->tcpwin);
1159         tcph->urg = nhgets(h->tcpurg);
1160         tcph->mss = 0;
1161         tcph->ws = 0;
1162         tcph->len = nhgets(h->ploadlen) - hdrlen;
1163
1164         *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
1165         if (*bpp == NULL)
1166                 return -1;
1167
1168         optr = h->tcpopt;
1169         n = hdrlen - TCP6_HDRSIZE;
1170         while (n > 0 && *optr != EOLOPT) {
1171                 if (*optr == NOOPOPT) {
1172                         n--;
1173                         optr++;
1174                         continue;
1175                 }
1176                 optlen = optr[1];
1177                 if (optlen < 2 || optlen > n)
1178                         break;
1179                 switch (*optr) {
1180                         case MSSOPT:
1181                                 if (optlen == MSS_LENGTH)
1182                                         tcph->mss = nhgets(optr + 2);
1183                                 break;
1184                         case WSOPT:
1185                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1186                                         tcph->ws = HaveWS | *(optr + 2);
1187                                 break;
1188                 }
1189                 n -= optlen;
1190                 optr += optlen;
1191         }
1192         return hdrlen;
1193 }
1194
1195 int ntohtcp4(Tcp * tcph, struct block **bpp)
1196 {
1197         Tcp4hdr *h;
1198         uint8_t *optr;
1199         uint16_t hdrlen;
1200         uint16_t optlen;
1201         int n;
1202
1203         *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
1204         if (*bpp == NULL)
1205                 return -1;
1206
1207         h = (Tcp4hdr *) ((*bpp)->rp);
1208         tcph->source = nhgets(h->tcpsport);
1209         tcph->dest = nhgets(h->tcpdport);
1210         tcph->seq = nhgetl(h->tcpseq);
1211         tcph->ack = nhgetl(h->tcpack);
1212
1213         hdrlen = (h->tcpflag[0] >> 2) & ~3;
1214         if (hdrlen < TCP4_HDRSIZE) {
1215                 freeblist(*bpp);
1216                 return -1;
1217         }
1218
1219         tcph->flags = h->tcpflag[1];
1220         tcph->wnd = nhgets(h->tcpwin);
1221         tcph->urg = nhgets(h->tcpurg);
1222         tcph->mss = 0;
1223         tcph->ws = 0;
1224         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1225
1226         *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1227         if (*bpp == NULL)
1228                 return -1;
1229
1230         optr = h->tcpopt;
1231         n = hdrlen - TCP4_HDRSIZE;
1232         while (n > 0 && *optr != EOLOPT) {
1233                 if (*optr == NOOPOPT) {
1234                         n--;
1235                         optr++;
1236                         continue;
1237                 }
1238                 optlen = optr[1];
1239                 if (optlen < 2 || optlen > n)
1240                         break;
1241                 switch (*optr) {
1242                         case MSSOPT:
1243                                 if (optlen == MSS_LENGTH)
1244                                         tcph->mss = nhgets(optr + 2);
1245                                 break;
1246                         case WSOPT:
1247                                 if (optlen == WS_LENGTH && *(optr + 2) <= 14)
1248                                         tcph->ws = HaveWS | *(optr + 2);
1249                                 break;
1250                 }
1251                 n -= optlen;
1252                 optr += optlen;
1253         }
1254         return hdrlen;
1255 }
1256
1257 /*
1258  *  For outgiing calls, generate an initial sequence
1259  *  number and put a SYN on the send queue
1260  */
1261 void tcpsndsyn(struct conv *s, Tcpctl * tcb)
1262 {
1263         urandom_read(&tcb->iss, sizeof(tcb->iss));
1264         tcb->rttseq = tcb->iss;
1265         tcb->snd.wl2 = tcb->iss;
1266         tcb->snd.una = tcb->iss;
1267         tcb->snd.ptr = tcb->rttseq;
1268         tcb->snd.nxt = tcb->rttseq;
1269         tcb->flgcnt++;
1270         tcb->flags |= FORCE;
1271         tcb->sndsyntime = NOW;
1272
1273         /* set desired mss and scale */
1274         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale,
1275                           &tcb->flags);
1276 }
1277
1278 void
1279 sndrst(struct Proto *tcp, uint8_t * source, uint8_t * dest,
1280            uint16_t length, Tcp * seg, uint8_t version, char *reason)
1281 {
1282         struct block *hbp;
1283         uint8_t rflags;
1284         struct tcppriv *tpriv;
1285         Tcp4hdr ph4;
1286         Tcp6hdr ph6;
1287
1288         netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1289
1290         tpriv = tcp->priv;
1291
1292         if (seg->flags & RST)
1293                 return;
1294
1295         /* make pseudo header */
1296         switch (version) {
1297                 case V4:
1298                         memset(&ph4, 0, sizeof(ph4));
1299                         ph4.vihl = IP_VER4;
1300                         v6tov4(ph4.tcpsrc, dest);
1301                         v6tov4(ph4.tcpdst, source);
1302                         ph4.proto = IP_TCPPROTO;
1303                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1304                         hnputs(ph4.tcpsport, seg->dest);
1305                         hnputs(ph4.tcpdport, seg->source);
1306                         break;
1307                 case V6:
1308                         memset(&ph6, 0, sizeof(ph6));
1309                         ph6.vcf[0] = IP_VER6;
1310                         ipmove(ph6.tcpsrc, dest);
1311                         ipmove(ph6.tcpdst, source);
1312                         ph6.proto = IP_TCPPROTO;
1313                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1314                         hnputs(ph6.tcpsport, seg->dest);
1315                         hnputs(ph6.tcpdport, seg->source);
1316                         break;
1317                 default:
1318                         panic("sndrst: version %d", version);
1319         }
1320
1321         tpriv->stats[OutRsts]++;
1322         rflags = RST;
1323
1324         /* convince the other end that this reset is in band */
1325         if (seg->flags & ACK) {
1326                 seg->seq = seg->ack;
1327                 seg->ack = 0;
1328         } else {
1329                 rflags |= ACK;
1330                 seg->ack = seg->seq;
1331                 seg->seq = 0;
1332                 if (seg->flags & SYN)
1333                         seg->ack++;
1334                 seg->ack += length;
1335                 if (seg->flags & FIN)
1336                         seg->ack++;
1337         }
1338         seg->flags = rflags;
1339         seg->wnd = 0;
1340         seg->urg = 0;
1341         seg->mss = 0;
1342         seg->ws = 0;
1343         switch (version) {
1344                 case V4:
1345                         hbp = htontcp4(seg, NULL, &ph4, NULL);
1346                         if (hbp == NULL)
1347                                 return;
1348                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1349                         break;
1350                 case V6:
1351                         hbp = htontcp6(seg, NULL, &ph6, NULL);
1352                         if (hbp == NULL)
1353                                 return;
1354                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1355                         break;
1356                 default:
1357                         panic("sndrst2: version %d", version);
1358         }
1359 }
1360
1361 /*
1362  *  send a reset to the remote side and close the conversation
1363  *  called with s qlocked
1364  */
1365 static void tcphangup(struct conv *s)
1366 {
1367         ERRSTACK(1);
1368         Tcp seg;
1369         Tcpctl *tcb;
1370         struct block *hbp;
1371
1372         tcb = (Tcpctl *) s->ptcl;
1373         if (ipcmp(s->raddr, IPnoaddr)) {
1374                 /* discard error style, poperror regardless */
1375                 if (!waserror()) {
1376                         seg.flags = RST | ACK;
1377                         seg.ack = tcb->rcv.nxt;
1378                         tcb->rcv.una = 0;
1379                         seg.seq = tcb->snd.ptr;
1380                         seg.wnd = 0;
1381                         seg.urg = 0;
1382                         seg.mss = 0;
1383                         seg.ws = 0;
1384                         switch (s->ipversion) {
1385                                 case V4:
1386                                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1387                                         hbp = htontcp4(&seg, NULL, &tcb->protohdr.tcp4hdr, tcb);
1388                                         ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1389                                         break;
1390                                 case V6:
1391                                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1392                                         hbp = htontcp6(&seg, NULL, &tcb->protohdr.tcp6hdr, tcb);
1393                                         ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1394                                         break;
1395                                 default:
1396                                         panic("tcphangup: version %d", s->ipversion);
1397                         }
1398                 }
1399                 poperror();
1400         }
1401         localclose(s, NULL);
1402 }
1403
1404 /*
1405  *  (re)send a SYN ACK
1406  */
1407 int sndsynack(struct Proto *tcp, Limbo * lp)
1408 {
1409         struct block *hbp;
1410         Tcp4hdr ph4;
1411         Tcp6hdr ph6;
1412         Tcp seg;
1413         int scale;
1414         uint8_t flag = 0;
1415
1416         /* make pseudo header */
1417         switch (lp->version) {
1418                 case V4:
1419                         memset(&ph4, 0, sizeof(ph4));
1420                         ph4.vihl = IP_VER4;
1421                         v6tov4(ph4.tcpsrc, lp->laddr);
1422                         v6tov4(ph4.tcpdst, lp->raddr);
1423                         ph4.proto = IP_TCPPROTO;
1424                         hnputs(ph4.tcplen, TCP4_HDRSIZE);
1425                         hnputs(ph4.tcpsport, lp->lport);
1426                         hnputs(ph4.tcpdport, lp->rport);
1427                         break;
1428                 case V6:
1429                         memset(&ph6, 0, sizeof(ph6));
1430                         ph6.vcf[0] = IP_VER6;
1431                         ipmove(ph6.tcpsrc, lp->laddr);
1432                         ipmove(ph6.tcpdst, lp->raddr);
1433                         ph6.proto = IP_TCPPROTO;
1434                         hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1435                         hnputs(ph6.tcpsport, lp->lport);
1436                         hnputs(ph6.tcpdport, lp->rport);
1437                         break;
1438                 default:
1439                         panic("sndrst: version %d", lp->version);
1440         }
1441
1442         seg.seq = lp->iss;
1443         seg.ack = lp->irs + 1;
1444         seg.flags = SYN | ACK;
1445         seg.urg = 0;
1446         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale, &flag);
1447         seg.wnd = QMAX;
1448
1449         /* if the other side set scale, we should too */
1450         if (lp->rcvscale) {
1451                 seg.ws = scale;
1452                 lp->sndscale = scale;
1453         } else {
1454                 seg.ws = 0;
1455                 lp->sndscale = 0;
1456         }
1457
1458         switch (lp->version) {
1459                 case V4:
1460                         hbp = htontcp4(&seg, NULL, &ph4, NULL);
1461                         if (hbp == NULL)
1462                                 return -1;
1463                         ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1464                         break;
1465                 case V6:
1466                         hbp = htontcp6(&seg, NULL, &ph6, NULL);
1467                         if (hbp == NULL)
1468                                 return -1;
1469                         ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1470                         break;
1471                 default:
1472                         panic("sndsnack: version %d", lp->version);
1473         }
1474         lp->lastsend = NOW;
1475         return 0;
1476 }
1477
/* limbo hash: last two address bytes plus port, masked to the table size.
 * Both arguments are parenthesized so any expression can be passed. */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1479
1480 /*
1481  *  put a call into limbo and respond with a SYN ACK
1482  *
1483  *  called with proto locked
1484  */
1485 static void
1486 limbo(struct conv *s, uint8_t * source, uint8_t * dest, Tcp * seg, int version)
1487 {
1488         Limbo *lp, **l;
1489         struct tcppriv *tpriv;
1490         int h;
1491
1492         tpriv = s->p->priv;
1493         h = hashipa(source, seg->source);
1494
1495         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1496                 lp = *l;
1497                 if (lp->lport != seg->dest || lp->rport != seg->source
1498                         || lp->version != version)
1499                         continue;
1500                 if (ipcmp(lp->raddr, source) != 0)
1501                         continue;
1502                 if (ipcmp(lp->laddr, dest) != 0)
1503                         continue;
1504
1505                 /* each new SYN restarts the retransmits */
1506                 lp->irs = seg->seq;
1507                 break;
1508         }
1509         lp = *l;
1510         if (lp == NULL) {
1511                 if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
1512                         lp = tpriv->lht[h];
1513                         tpriv->lht[h] = lp->next;
1514                         lp->next = NULL;
1515                 } else {
1516                         lp = kzmalloc(sizeof(*lp), 0);
1517                         if (lp == NULL)
1518                                 return;
1519                         tpriv->nlimbo++;
1520                 }
1521                 *l = lp;
1522                 lp->version = version;
1523                 ipmove(lp->laddr, dest);
1524                 ipmove(lp->raddr, source);
1525                 lp->lport = seg->dest;
1526                 lp->rport = seg->source;
1527                 lp->mss = seg->mss;
1528                 lp->rcvscale = seg->ws;
1529                 lp->irs = seg->seq;
1530                 urandom_read(&lp->iss, sizeof(lp->iss));
1531         }
1532
1533         if (sndsynack(s->p, lp) < 0) {
1534                 *l = lp->next;
1535                 tpriv->nlimbo--;
1536                 kfree(lp);
1537         }
1538 }
1539
1540 /*
1541  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1542  */
1543 static void limborexmit(struct Proto *tcp)
1544 {
1545         struct tcppriv *tpriv;
1546         Limbo **l, *lp;
1547         int h;
1548         int seen;
1549         uint64_t now;
1550
1551         tpriv = tcp->priv;
1552
1553         if (!canqlock(&tcp->qlock))
1554                 return;
1555         seen = 0;
1556         now = NOW;
1557         for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
1558                 for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
1559                         lp = *l;
1560                         seen++;
1561                         if (now - lp->lastsend < (lp->rexmits + 1) * SYNACK_RXTIMER)
1562                                 continue;
1563
1564                         /* time it out after 1 second */
1565                         if (++(lp->rexmits) > 5) {
1566                                 tpriv->nlimbo--;
1567                                 *l = lp->next;
1568                                 kfree(lp);
1569                                 continue;
1570                         }
1571
1572                         /* if we're being attacked, don't bother resending SYN ACK's */
1573                         if (tpriv->nlimbo > 100)
1574                                 continue;
1575
1576                         if (sndsynack(tcp, lp) < 0) {
1577                                 tpriv->nlimbo--;
1578                                 *l = lp->next;
1579                                 kfree(lp);
1580                                 continue;
1581                         }
1582
1583                         l = &lp->next;
1584                 }
1585         }
1586         qunlock(&tcp->qlock);
1587 }
1588
1589 /*
1590  *  lookup call in limbo.  if found, throw it out.
1591  *
1592  *  called with proto locked
1593  */
1594 static void
1595 limborst(struct conv *s, Tcp * segp, uint8_t * src, uint8_t * dst,
1596                  uint8_t version)
1597 {
1598         Limbo *lp, **l;
1599         int h;
1600         struct tcppriv *tpriv;
1601
1602         tpriv = s->p->priv;
1603
1604         /* find a call in limbo */
1605         h = hashipa(src, segp->source);
1606         for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1607                 lp = *l;
1608                 if (lp->lport != segp->dest || lp->rport != segp->source
1609                         || lp->version != version)
1610                         continue;
1611                 if (ipcmp(lp->laddr, dst) != 0)
1612                         continue;
1613                 if (ipcmp(lp->raddr, src) != 0)
1614                         continue;
1615
1616                 /* RST can only follow the SYN */
1617                 if (segp->seq == lp->irs + 1) {
1618                         tpriv->nlimbo--;
1619                         *l = lp->next;
1620                         kfree(lp);
1621                 }
1622                 break;
1623         }
1624 }
1625
1626 /*
1627  *  come here when we finally get an ACK to our SYN-ACK.
1628  *  lookup call in limbo.  if found, create a new conversation
1629  *
1630  *  called with proto locked
1631  */
1632 static struct conv *tcpincoming(struct conv *s, Tcp * segp, uint8_t * src,
1633                                                                 uint8_t * dst, uint8_t version)
1634 {
1635         struct conv *new;
1636         Tcpctl *tcb;
1637         struct tcppriv *tpriv;
1638         Tcp4hdr *h4;
1639         Tcp6hdr *h6;
1640         Limbo *lp, **l;
1641         int h;
1642
1643         /* unless it's just an ack, it can't be someone coming out of limbo */
1644         if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
1645                 return NULL;
1646
1647         tpriv = s->p->priv;
1648
1649         /* find a call in limbo */
1650         h = hashipa(src, segp->source);
1651         for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
1652                 netlog(s->p->f, Logtcp,
1653                            "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n", src,
1654                            segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr,
1655                            lp->lport, version, lp->version);
1656
1657                 if (lp->lport != segp->dest || lp->rport != segp->source
1658                         || lp->version != version)
1659                         continue;
1660                 if (ipcmp(lp->laddr, dst) != 0)
1661                         continue;
1662                 if (ipcmp(lp->raddr, src) != 0)
1663                         continue;
1664
1665                 /* we're assuming no data with the initial SYN */
1666                 if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
1667                         netlog(s->p->f, Logtcp, "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
1668                                    segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
1669                         lp = NULL;
1670                 } else {
1671                         tpriv->nlimbo--;
1672                         *l = lp->next;
1673                 }
1674                 break;
1675         }
1676         if (lp == NULL)
1677                 return NULL;
1678
1679         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1680         if (new == NULL)
1681                 return NULL;
1682
1683         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1684         tcb = (Tcpctl *) new->ptcl;
1685         tcb->flags &= ~CLONE;
1686         tcb->timer.arg = new;
1687         tcb->timer.state = TcptimerOFF;
1688         tcb->acktimer.arg = new;
1689         tcb->acktimer.state = TcptimerOFF;
1690         tcb->katimer.arg = new;
1691         tcb->katimer.state = TcptimerOFF;
1692         tcb->rtt_timer.arg = new;
1693         tcb->rtt_timer.state = TcptimerOFF;
1694
1695         tcb->irs = lp->irs;
1696         tcb->rcv.nxt = tcb->irs + 1;
1697         tcb->rcv.urg = tcb->rcv.nxt;
1698
1699         tcb->iss = lp->iss;
1700         tcb->rttseq = tcb->iss;
1701         tcb->snd.wl2 = tcb->iss;
1702         tcb->snd.una = tcb->iss + 1;
1703         tcb->snd.ptr = tcb->iss + 1;
1704         tcb->snd.nxt = tcb->iss + 1;
1705         tcb->flgcnt = 0;
1706         tcb->flags |= SYNACK;
1707
1708         /* our sending max segment size cannot be bigger than what he asked for */
1709         if (lp->mss != 0 && lp->mss < tcb->mss)
1710                 tcb->mss = lp->mss;
1711
1712         /* window scaling */
1713         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1714
1715         tcb->snd.wnd = segp->wnd;
1716         tcb->cwind = tcb->mss * CWIND_SCALE;
1717
1718         /* set initial round trip time */
1719         tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
1720         tcpsynackrtt(new);
1721
1722         kfree(lp);
1723
1724         /* set up proto header */
1725         switch (version) {
1726                 case V4:
1727                         h4 = &tcb->protohdr.tcp4hdr;
1728                         memset(h4, 0, sizeof(*h4));
1729                         h4->proto = IP_TCPPROTO;
1730                         hnputs(h4->tcpsport, new->lport);
1731                         hnputs(h4->tcpdport, new->rport);
1732                         v6tov4(h4->tcpsrc, dst);
1733                         v6tov4(h4->tcpdst, src);
1734                         break;
1735                 case V6:
1736                         h6 = &tcb->protohdr.tcp6hdr;
1737                         memset(h6, 0, sizeof(*h6));
1738                         h6->proto = IP_TCPPROTO;
1739                         hnputs(h6->tcpsport, new->lport);
1740                         hnputs(h6->tcpdport, new->rport);
1741                         ipmove(h6->tcpsrc, dst);
1742                         ipmove(h6->tcpdst, src);
1743                         break;
1744                 default:
1745                         panic("tcpincoming: version %d", new->ipversion);
1746         }
1747
1748         tcpsetstate(new, Established);
1749
1750         iphtadd(&tpriv->ht, new);
1751
1752         return new;
1753 }
1754
/* Returns 1 if x lies in the inclusive range [low, high] of 32-bit sequence
 * numbers, 0 otherwise.  When low > high, the range is treated as wrapping
 * past the top of the 32-bit space: x matches if it is at or above low, or at
 * or below high. */
int seq_within(uint32_t x, uint32_t low, uint32_t high)
{
	int at_or_above_low = x >= low;
	int at_or_below_high = x <= high;

	/* wrapped range: either side of the wrap point qualifies */
	if (low > high)
		return at_or_above_low || at_or_below_high;
	/* ordinary range: must satisfy both bounds */
	return at_or_above_low && at_or_below_high;
}
1766
/* Nonzero iff x is before y in sequence space: the unsigned difference x - y,
 * reinterpreted as a signed int, is negative. */
int seq_lt(uint32_t x, uint32_t y)
{
	int dist = (int)(x - y);

	return dist < 0;
}
1771
/* Nonzero iff x is at or before y in sequence space: the unsigned difference
 * x - y, reinterpreted as a signed int, is zero or negative. */
int seq_le(uint32_t x, uint32_t y)
{
	int dist = (int)(x - y);

	return dist <= 0;
}
1776
/* Nonzero iff x is after y in sequence space: the unsigned difference x - y,
 * reinterpreted as a signed int, is positive. */
int seq_gt(uint32_t x, uint32_t y)
{
	int dist = (int)(x - y);

	return dist > 0;
}
1781
/* Nonzero iff x is at or after y in sequence space: the unsigned difference
 * x - y, reinterpreted as a signed int, is zero or positive. */
int seq_ge(uint32_t x, uint32_t y)
{
	int dist = (int)(x - y);

	return dist >= 0;
}
1786
1787 /*
1788  *  use the time between the first SYN and it's ack as the
1789  *  initial round trip time
1790  */
1791 void tcpsynackrtt(struct conv *s)
1792 {
1793         Tcpctl *tcb;
1794         uint64_t delta;
1795         struct tcppriv *tpriv;
1796
1797         tcb = (Tcpctl *) s->ptcl;
1798         tpriv = s->p->priv;
1799
1800         delta = NOW - tcb->sndsyntime;
1801         tcb->srtt = delta << LOGAGAIN;
1802         tcb->mdev = delta << LOGDGAIN;
1803
1804         /* halt round trip timer */
1805         tcphalt(tpriv, &tcb->rtt_timer);
1806 }
1807
1808 /* For LFNs (long/fat), our default tx queue doesn't hold enough data, and TCP
1809  * blocks on the application - even if the app already has the data ready to go.
1810  * We need to hold the sent, unacked data (1x cwnd), plus all the data we might
1811  * send next RTT (1x cwnd).  Note this is called after cwnd was expanded. */
1812 static void adjust_tx_qio_limit(struct conv *s)
1813 {
1814         Tcpctl *tcb = (Tcpctl *) s->ptcl;
1815         size_t ideal_limit = tcb->cwind * 2;
1816
1817         /* This is called for every ACK, and it's not entirely free to update the
1818          * limit (locks, CVs, taps).  Updating in chunks of mss seems reasonable.
1819          * During SS, we'll update this on most ACKs (given each ACK increased the
1820          * cwind by > MSS).
1821          *
1822          * We also don't want a lot of tiny blocks from the user, but the way qio
1823          * works, you can put in as much as you want (Maxatomic) and then get
1824          * flow-controlled. */
1825         if (qgetlimit(s->wq) + tcb->mss < ideal_limit)
1826                 qsetlimit(s->wq, ideal_limit);
1827         /* TODO: we could shrink the qio limit too, if we had a better idea what the
1828          * actual threshold was.  We want the limit to be the 'stable' cwnd * 2. */
1829 }
1830
1831 void update(struct conv *s, Tcp * seg)
1832 {
1833         int rtt, delta;
1834         Tcpctl *tcb;
1835         uint32_t acked;
1836         uint32_t expand;
1837         struct tcppriv *tpriv;
1838
1839         tpriv = s->p->priv;
1840         tcb = (Tcpctl *) s->ptcl;
1841
1842         /* if everything has been acked, force output(?) */
1843         if (seq_gt(seg->ack, tcb->snd.nxt)) {
1844                 tcb->flags |= FORCE;
1845                 return;
1846         }
1847
1848         /* added by Dong Lin for fast retransmission */
1849         if (seg->ack == tcb->snd.una
1850                 && tcb->snd.una != tcb->snd.nxt
1851                 && seg->len == 0 && seg->wnd == tcb->snd.wnd) {
1852
1853                 /* this is a pure ack w/o window update */
1854                 netlog(s->p->f, Logtcprxmt, "dupack %lu ack %lu sndwnd %d advwin %d\n",
1855                            tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1856
1857                 if (++tcb->snd.dupacks == TCPREXMTTHRESH) {
1858                         /*
1859                          *  tahoe tcp rxt the packet, half sshthresh,
1860                          *  and set cwnd to one packet
1861                          */
1862                         tcb->snd.recovery = 1;
1863                         tcb->snd.rxt = tcb->snd.nxt;
1864                         netlog(s->p->f, Logtcprxmt, "fast rxt %lu, nxt %lu\n", tcb->snd.una,
1865                                    tcb->snd.nxt);
1866                         tcprxmit(s);
1867                 } else {
1868                         /* do reno tcp here. */
1869                 }
1870         }
1871
1872         /*
1873          *  update window
1874          */
1875         if (seq_gt(seg->ack, tcb->snd.wl2)
1876                 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
1877                 tcb->snd.wnd = seg->wnd;
1878                 tcb->snd.wl2 = seg->ack;
1879         }
1880
1881         if (!seq_gt(seg->ack, tcb->snd.una)) {
1882                 /*
1883                  *  don't let us hangup if sending into a closed window and
1884                  *  we're still getting acks
1885                  */
1886                 if ((tcb->flags & RETRAN) && tcb->snd.wnd == 0) {
1887                         tcb->backedoff = MAXBACKMS / 4;
1888                 }
1889                 return;
1890         }
1891
1892         /*
1893          *  any positive ack turns off fast rxt,
1894          *  (should we do new-reno on partial acks?)
1895          */
1896         if (!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1897                 tcb->snd.dupacks = 0;
1898                 tcb->snd.recovery = 0;
1899         } else
1900                 netlog(s->p->f, Logtcp, "rxt next %lu, cwin %u\n", seg->ack,
1901                            tcb->cwind);
1902
1903         /* Compute the new send window size */
1904         acked = seg->ack - tcb->snd.una;
1905
1906         /* avoid slow start and timers for SYN acks */
1907         if ((tcb->flags & SYNACK) == 0) {
1908                 tcb->flags |= SYNACK;
1909                 acked--;
1910                 tcb->flgcnt--;
1911                 goto done;
1912         }
1913
1914         /* slow start as long as we're not recovering from lost packets */
1915         if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
1916                 if (tcb->cwind < tcb->ssthresh) {
1917                         /* We increase the cwind by every byte we receive.  We want to
1918                          * increase the cwind by one MSS for every MSS that gets ACKed.
1919                          * Note that multiple MSSs can be ACKed in a single ACK.  If we had
1920                          * a remainder of acked / MSS, we'd add just that remainder - not 0
1921                          * or 1 MSS. */
1922                         expand = acked;
1923                 } else {
1924                         /* Every RTT, which consists of CWND bytes, we're supposed to expand
1925                          * by MSS bytes.  The classic algorithm was
1926                          *              expand = (tcb->mss * tcb->mss) / tcb->cwind;
1927                          * which assumes the ACK was for MSS bytes.  Instead, for every
1928                          * 'acked' bytes, we increase the window by acked / CWND (in units
1929                          * of MSS). */
1930                         expand = MAX(acked, tcb->mss) * tcb->mss / tcb->cwind;
1931                 }
1932
1933                 if (tcb->cwind + expand < tcb->cwind)
1934                         expand = tcb->snd.wnd - tcb->cwind;
1935                 if (tcb->cwind + expand > tcb->snd.wnd)
1936                         expand = tcb->snd.wnd - tcb->cwind;
1937                 tcb->cwind += expand;
1938         }
1939         adjust_tx_qio_limit(s);
1940
1941         /* Adjust the timers according to the round trip time */
1942         if (tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1943                 tcphalt(tpriv, &tcb->rtt_timer);
1944                 if ((tcb->flags & RETRAN) == 0) {
1945                         tcb->backoff = 0;
1946                         tcb->backedoff = 0;
1947                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1948                         if (rtt == 0)
1949                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
1950                         rtt *= MSPTICK;
1951                         if (tcb->srtt == 0) {
1952                                 tcb->srtt = rtt << LOGAGAIN;
1953                                 tcb->mdev = rtt << LOGDGAIN;
1954                         } else {
1955                                 delta = rtt - (tcb->srtt >> LOGAGAIN);
1956                                 tcb->srtt += delta;
1957                                 if (tcb->srtt <= 0)
1958                                         tcb->srtt = 1;
1959
1960                                 delta = abs(delta) - (tcb->mdev >> LOGDGAIN);
1961                                 tcb->mdev += delta;
1962                                 if (tcb->mdev <= 0)
1963                                         tcb->mdev = 1;
1964                         }
1965                         tcpsettimer(tcb);
1966                 }
1967         }
1968
1969 done:
1970         if (qdiscard(s->wq, acked) < acked)
1971                 tcb->flgcnt--;
1972
1973         tcb->snd.una = seg->ack;
1974         if (seq_gt(seg->ack, tcb->snd.urg))
1975                 tcb->snd.urg = seg->ack;
1976
1977         if (tcb->snd.una != tcb->snd.nxt)
1978                 tcpgo(tpriv, &tcb->timer);
1979         else
1980                 tcphalt(tpriv, &tcb->timer);
1981
1982         if (seq_lt(tcb->snd.ptr, tcb->snd.una))
1983                 tcb->snd.ptr = tcb->snd.una;
1984
1985         tcb->flags &= ~RETRAN;
1986         tcb->backoff = 0;
1987         tcb->backedoff = 0;
1988 }
1989
1990 void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
1991 {
1992         ERRSTACK(1);
1993         Tcp seg;
1994         Tcp4hdr *h4;
1995         Tcp6hdr *h6;
1996         int hdrlen;
1997         Tcpctl *tcb;
1998         uint16_t length;
1999         uint8_t source[IPaddrlen], dest[IPaddrlen];
2000         struct conv *s;
2001         struct Fs *f;
2002         struct tcppriv *tpriv;
2003         uint8_t version;
2004
2005         f = tcp->f;
2006         tpriv = tcp->priv;
2007
2008         tpriv->stats[InSegs]++;
2009
2010         h4 = (Tcp4hdr *) (bp->rp);
2011         h6 = (Tcp6hdr *) (bp->rp);
2012
2013         if ((h4->vihl & 0xF0) == IP_VER4) {
2014                 uint8_t ttl;
2015
2016                 version = V4;
2017                 length = nhgets(h4->length);
2018                 v4tov6(dest, h4->tcpdst);
2019                 v4tov6(source, h4->tcpsrc);
2020
2021                 /* ttl isn't part of the xsum pseudo header, but bypass needs it. */
2022                 ttl = h4->Unused;
2023                 h4->Unused = 0;
2024                 hnputs(h4->tcplen, length - TCP4_PKT);
2025                 if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2026                         ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
2027                         tpriv->stats[CsumErrs]++;
2028                         tpriv->stats[InErrs]++;
2029                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2030                         freeblist(bp);
2031                         return;
2032                 }
2033                 h4->Unused = ttl;
2034
2035                 hdrlen = ntohtcp4(&seg, &bp);
2036                 if (hdrlen < 0) {
2037                         tpriv->stats[HlenErrs]++;
2038                         tpriv->stats[InErrs]++;
2039                         netlog(f, Logtcp, "bad tcp hdr len\n");
2040                         return;
2041                 }
2042
2043                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2044                 if (s && s->state == Bypass) {
2045                         bypass_or_drop(s, bp);
2046                         return;
2047                 }
2048
2049                 /* trim the packet to the size claimed by the datagram */
2050                 length -= hdrlen + TCP4_PKT;
2051                 bp = trimblock(bp, hdrlen + TCP4_PKT, length);
2052                 if (bp == NULL) {
2053                         tpriv->stats[LenErrs]++;
2054                         tpriv->stats[InErrs]++;
2055                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2056                         return;
2057                 }
2058         } else {
2059                 int ttl = h6->ttl;
2060                 int proto = h6->proto;
2061
2062                 version = V6;
2063                 length = nhgets(h6->ploadlen);
2064                 ipmove(dest, h6->tcpdst);
2065                 ipmove(source, h6->tcpsrc);
2066
2067                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2068                 h6->ttl = proto;
2069                 hnputl(h6->vcf, length);
2070                 if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2071                         ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
2072                         tpriv->stats[CsumErrs]++;
2073                         tpriv->stats[InErrs]++;
2074                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2075                         freeblist(bp);
2076                         return;
2077                 }
2078                 h6->ttl = ttl;
2079                 h6->proto = proto;
2080                 hnputs(h6->ploadlen, length);
2081
2082                 hdrlen = ntohtcp6(&seg, &bp);
2083                 if (hdrlen < 0) {
2084                         tpriv->stats[HlenErrs]++;
2085                         tpriv->stats[InErrs]++;
2086                         netlog(f, Logtcp, "bad tcp hdr len\n");
2087                         return;
2088                 }
2089
2090                 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2091                 if (s && s->state == Bypass) {
2092                         bypass_or_drop(s, bp);
2093                         return;
2094                 }
2095
2096                 /* trim the packet to the size claimed by the datagram */
2097                 length -= hdrlen;
2098                 bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2099                 if (bp == NULL) {
2100                         tpriv->stats[LenErrs]++;
2101                         tpriv->stats[InErrs]++;
2102                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2103                         return;
2104                 }
2105         }
2106
2107         /* s, the conv matching the n-tuple, was set above */
2108         if (s == NULL) {
2109                 netlog(f, Logtcp, "iphtlook failed: src %I:%u, dst %I:%u\n",
2110                        source, seg.source, dest, seg.dest);
2111 reset:
2112                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2113                 freeblist(bp);
2114                 return;
2115         }
2116
2117         /* lock protocol for unstate Plan 9 invariants.  funcs like limbo or
2118          * incoming might rely on it. */
2119         qlock(&tcp->qlock);
2120
2121         /* if it's a listener, look for the right flags and get a new conv */
2122         tcb = (Tcpctl *) s->ptcl;
2123         if (tcb->state == Listen) {
2124                 if (seg.flags & RST) {
2125                         limborst(s, &seg, source, dest, version);
2126                         qunlock(&tcp->qlock);
2127                         freeblist(bp);
2128                         return;
2129                 }
2130
2131                 /* if this is a new SYN, put the call into limbo */
2132                 if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2133                         limbo(s, source, dest, &seg, version);
2134                         qunlock(&tcp->qlock);
2135                         freeblist(bp);
2136                         return;
2137                 }
2138
2139                 /*
2140                  *  if there's a matching call in limbo, tcpincoming will
2141                  *  return it in state Syn_received
2142                  */
2143                 s = tcpincoming(s, &seg, source, dest, version);
2144                 if (s == NULL) {
2145                         qunlock(&tcp->qlock);
2146                         goto reset;
2147                 }
2148         }
2149
2150         /* The rest of the input state machine is run with the control block
2151          * locked and implements the state machine directly out of the RFC.
2152          * Out-of-band data is ignored - it was always a bad idea.
2153          */
2154         tcb = (Tcpctl *) s->ptcl;
2155         if (waserror()) {
2156                 qunlock(&s->qlock);
2157                 nexterror();
2158         }
2159         qlock(&s->qlock);
2160         qunlock(&tcp->qlock);
2161
2162         /* fix up window */
2163         seg.wnd <<= tcb->rcv.scale;
2164
2165         /* every input packet in puts off the keep alive time out */
2166         tcpsetkacounter(tcb);
2167
2168         switch (tcb->state) {
2169                 case Closed:
2170                         sndrst(tcp, source, dest, length, &seg, version,
2171                                    "sending to Closed");
2172                         goto raise;
2173                 case Syn_sent:
2174                         if (seg.flags & ACK) {
2175                                 if (!seq_within(seg.ack, tcb->iss + 1, tcb->snd.nxt)) {
2176                                         sndrst(tcp, source, dest, length, &seg, version,
2177                                                    "bad seq in Syn_sent");
2178                                         goto raise;
2179                                 }
2180                         }
2181                         if (seg.flags & RST) {
2182                                 if (seg.flags & ACK)
2183                                         localclose(s, "connection refused");
2184                                 goto raise;
2185                         }
2186
2187                         if (seg.flags & SYN) {
2188                                 procsyn(s, &seg);
2189                                 if (seg.flags & ACK) {
2190                                         update(s, &seg);
2191                                         tcpsynackrtt(s);
2192                                         tcpsetstate(s, Established);
2193                                         tcpsetscale(s, tcb, seg.ws, tcb->scale);
2194                                 } else {
2195                                         tcb->time = NOW;
2196                                         tcpsetstate(s, Syn_received);   /* DLP - shouldn't this be a reset? */
2197                                 }
2198
2199                                 if (length != 0 || (seg.flags & FIN))
2200                                         break;
2201
2202                                 freeblist(bp);
2203                                 goto output;
2204                         } else
2205                                 freeblist(bp);
2206
2207                         qunlock(&s->qlock);
2208                         poperror();
2209                         return;
2210                 case Syn_received:
2211                         /* doesn't matter if it's the correct ack, we're just trying to set timing */
2212                         if (seg.flags & ACK)
2213                                 tcpsynackrtt(s);
2214                         break;
2215         }
2216
2217         /*
2218          *  One DOS attack is to open connections to us and then forget about them,
2219          *  thereby tying up a conv at no long term cost to the attacker.
2220          *  This is an attempt to defeat these stateless DOS attacks.  See
2221          *  corresponding code in tcpsendka().
2222          */
2223         if (tcb->state != Syn_received && (seg.flags & RST) == 0) {
2224                 if (tcpporthogdefense
2225                         && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2226                                                   tcb->snd.una - (1 << 29))) {
2227                         printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2228                                    source, seg.source, dest, seg.dest, seg.flags,
2229                                    tcb->snd.una - (1 << 31), seg.ack, tcb->snd.una - (1 << 29));
2230                         localclose(s, "stateless hog");
2231                 }
2232         }
2233
2234         /* Cut the data to fit the receive window */
2235         if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2236                 netlog(f, Logtcp, "tcp len < 0, %lu %d\n", seg.seq, length);
2237                 update(s, &seg);
2238                 if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2239                         tcphalt(tpriv, &tcb->rtt_timer);
2240                         tcphalt(tpriv, &tcb->acktimer);
2241                         tcphalt(tpriv, &tcb->katimer);
2242                         tcpsetstate(s, Time_wait);
2243                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2244                         tcpgo(tpriv, &tcb->timer);
2245                 }
2246                 if (!(seg.flags & RST)) {
2247                         tcb->flags |= FORCE;
2248                         goto output;
2249                 }
2250                 qunlock(&s->qlock);
2251                 poperror();
2252                 return;
2253         }
2254
2255         /* Cannot accept so answer with a rst */
2256         if (length && tcb->state == Closed) {
2257                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2258                 goto raise;
2259         }
2260
2261         /* The segment is beyond the current receive pointer so
2262          * queue the data in the resequence queue
2263          */
2264         if (seg.seq != tcb->rcv.nxt)
2265                 if (length != 0 || (seg.flags & (SYN | FIN))) {
2266                         update(s, &seg);
2267                         if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2268                                 printd("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr,
2269                                            s->lport);
2270                         tcb->flags |= FORCE;
2271                         goto output;
2272                 }
2273
2274         /*
2275          *  keep looping till we've processed this packet plus any
2276          *  adjacent packets in the resequence queue
2277          */
2278         for (;;) {
2279                 if (seg.flags & RST) {
2280                         if (tcb->state == Established) {
2281                                 tpriv->stats[EstabResets]++;
2282                                 if (tcb->rcv.nxt != seg.seq)
2283                                         printd
2284                                                 ("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2285                                                  s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt,
2286                                                  seg.seq);
2287                         }
2288                         localclose(s, "connection refused");
2289                         goto raise;
2290                 }
2291
2292                 if ((seg.flags & ACK) == 0)
2293                         goto raise;
2294
2295                 switch (tcb->state) {
2296                         case Syn_received:
2297                                 if (!seq_within(seg.ack, tcb->snd.una + 1, tcb->snd.nxt)) {
2298                                         sndrst(tcp, source, dest, length, &seg, version,
2299                                                    "bad seq in Syn_received");
2300                                         goto raise;
2301                                 }
2302                                 update(s, &seg);
2303                                 tcpsetstate(s, Established);
2304                         case Established:
2305                         case Close_wait:
2306                                 update(s, &seg);
2307                                 break;
2308                         case Finwait1:
2309                                 update(s, &seg);
2310                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2311                                         tcphalt(tpriv, &tcb->rtt_timer);
2312                                         tcphalt(tpriv, &tcb->acktimer);
2313                                         tcpsetkacounter(tcb);
2314                                         tcb->time = NOW;
2315                                         tcpsetstate(s, Finwait2);
2316                                         tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2317                                         tcpgo(tpriv, &tcb->katimer);
2318                                 }
2319                                 break;
2320                         case Finwait2:
2321                                 update(s, &seg);
2322                                 break;
2323                         case Closing:
2324                                 update(s, &seg);
2325                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2326                                         tcphalt(tpriv, &tcb->rtt_timer);
2327                                         tcphalt(tpriv, &tcb->acktimer);
2328                                         tcphalt(tpriv, &tcb->katimer);
2329                                         tcpsetstate(s, Time_wait);
2330                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2331                                         tcpgo(tpriv, &tcb->timer);
2332                                 }
2333                                 break;
2334                         case Last_ack:
2335                                 update(s, &seg);
2336                                 if (qlen(s->wq) + tcb->flgcnt == 0) {
2337                                         localclose(s, NULL);
2338                                         goto raise;
2339                                 }
2340                         case Time_wait:
2341                                 tcb->flags |= FORCE;
2342                                 if (tcb->timer.state != TcptimerON)
2343                                         tcpgo(tpriv, &tcb->timer);
2344                 }
2345
2346                 if ((seg.flags & URG) && seg.urg) {
2347                         if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2348                                 tcb->rcv.urg = seg.urg + seg.seq;
2349                                 pullblock(&bp, seg.urg);
2350                         }
2351                 } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2352                         tcb->rcv.urg = tcb->rcv.nxt;
2353
2354                 if (length == 0) {
2355                         if (bp != NULL)
2356                                 freeblist(bp);
2357                 } else {
2358                         switch (tcb->state) {
2359                                 default:
2360                                         /* Ignore segment text */
2361                                         if (bp != NULL)
2362                                                 freeblist(bp);
2363                                         break;
2364
2365                                 case Syn_received:
2366                                 case Established:
2367                                 case Finwait1:
2368                                         /* If we still have some data place on
2369                                          * receive queue
2370                                          */
2371                                         if (bp) {
2372                                                 bp = packblock(bp);
2373                                                 if (bp == NULL)
2374                                                         panic("tcp packblock");
2375                                                 qpassnolim(s->rq, bp);
2376                                                 bp = NULL;
2377
2378                                                 /*
2379                                                  *  Force an ack every 2 data messages.  This is
2380                                                  *  a hack for rob to make his home system run
2381                                                  *  faster.
2382                                                  *
2383                                                  *  this also keeps the standard TCP congestion
2384                                                  *  control working since it needs an ack every
2385                                                  *  2 max segs worth.  This is not quite that,
2386                                                  *  but under a real stream is equivalent since
2387                                                  *  every packet has a max seg in it.
2388                                                  */
2389                                                 if (++(tcb->rcv.una) >= 2)
2390                                                         tcb->flags |= FORCE;
2391                                         }
2392                                         tcb->rcv.nxt += length;
2393
2394                                         /*
2395                                          *  update our rcv window
2396                                          */
2397                                         tcprcvwin(s);
2398
2399                                         /*
2400                                          *  turn on the acktimer if there's something
2401                                          *  to ack
2402                                          */
2403                                         if (tcb->acktimer.state != TcptimerON)
2404                                                 tcpgo(tpriv, &tcb->acktimer);
2405
2406                                         break;
2407                                 case Finwait2:
2408                                         /* no process to read the data, send a reset */
2409                                         if (bp != NULL)
2410                                                 freeblist(bp);
2411                                         sndrst(tcp, source, dest, length, &seg, version,
2412                                                    "send to Finwait2");
2413                                         qunlock(&s->qlock);
2414                                         poperror();
2415                                         return;
2416                         }
2417                 }
2418
2419                 if (seg.flags & FIN) {
2420                         tcb->flags |= FORCE;
2421
2422                         switch (tcb->state) {
2423                                 case Syn_received:
2424                                 case Established:
2425                                         tcb->rcv.nxt++;
2426                                         tcpsetstate(s, Close_wait);
2427                                         break;
2428                                 case Finwait1:
2429                                         tcb->rcv.nxt++;
2430                                         if (qlen(s->wq) + tcb->flgcnt == 0) {
2431                                                 tcphalt(tpriv, &tcb->rtt_timer);
2432                                                 tcphalt(tpriv, &tcb->acktimer);
2433                                                 tcphalt(tpriv, &tcb->katimer);
2434                                                 tcpsetstate(s, Time_wait);
2435                                                 tcb->timer.start = MSL2 * (1000 / MSPTICK);
2436                                                 tcpgo(tpriv, &tcb->timer);
2437                                         } else
2438                                                 tcpsetstate(s, Closing);
2439                                         break;
2440                                 case Finwait2:
2441                                         tcb->rcv.nxt++;
2442                                         tcphalt(tpriv, &tcb->rtt_timer);
2443                                         tcphalt(tpriv, &tcb->acktimer);
2444                                         tcphalt(tpriv, &tcb->katimer);
2445                                         tcpsetstate(s, Time_wait);
2446                                         tcb->timer.start = MSL2 * (1000 / MSPTICK);
2447                                         tcpgo(tpriv, &tcb->timer);
2448                                         break;
2449                                 case Close_wait:
2450                                 case Closing:
2451                                 case Last_ack:
2452                                         break;
2453                                 case Time_wait:
2454                                         tcpgo(tpriv, &tcb->timer);
2455                                         break;
2456                         }
2457                 }
2458
2459                 /*
2460                  *  get next adjacent segment from the resequence queue.
2461                  *  dump/trim any overlapping segments
2462                  */
2463                 for (;;) {
2464                         if (tcb->reseq == NULL)
2465                                 goto output;
2466
2467                         if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2468                                 goto output;
2469
2470                         getreseq(tcb, &seg, &bp, &length);
2471
2472                         if (tcptrim(tcb, &seg, &bp, &length) == 0)
2473                                 break;
2474                 }
2475         }
2476 output:
2477         tcpoutput(s);
2478         qunlock(&s->qlock);
2479         poperror();
2480         return;
2481 raise:
2482         qunlock(&s->qlock);
2483         poperror();
2484         freeblist(bp);
2485         tcpkick(s);
2486 }
2487
2488 /*
2489  *  always enters and exits with the s locked.  We drop
2490  *  the lock to ipoput the packet so some care has to be
2491  *  taken by callers.
2492  */
2493 void tcpoutput(struct conv *s)
2494 {
2495         Tcp seg;
2496         int msgs;
2497         Tcpctl *tcb;
2498         struct block *hbp, *bp;
2499         int sndcnt, n;
2500         uint32_t ssize, dsize, usable, sent;
2501         struct Fs *f;
2502         struct tcppriv *tpriv;
2503         uint8_t version;
2504
2505         f = s->p->f;
2506         tpriv = s->p->priv;
2507         version = s->ipversion;
2508
2509         for (msgs = 0; msgs < 100; msgs++) {
2510                 tcb = (Tcpctl *) s->ptcl;
2511
2512                 switch (tcb->state) {
2513                         case Listen:
2514                         case Closed:
2515                         case Finwait2:
2516                                 return;
2517                 }
2518
2519                 /* force an ack when a window has opened up */
2520                 if (tcb->rcv.blocked && tcb->rcv.wnd > 0) {
2521                         tcb->rcv.blocked = 0;
2522                         tcb->flags |= FORCE;
2523                 }
2524
2525                 sndcnt = qlen(s->wq) + tcb->flgcnt;
2526                 sent = tcb->snd.ptr - tcb->snd.una;
2527
2528                 /* Don't send anything else until our SYN has been acked */
2529                 if (tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2530                         break;
2531
2532                 /* Compute usable segment based on offered window and limit
2533                  * window probes to one
2534                  */
2535                 if (tcb->snd.wnd == 0) {
2536                         if (sent != 0) {
2537                                 if ((tcb->flags & FORCE) == 0)
2538                                         break;
2539 //              tcb->snd.ptr = tcb->snd.una;
2540                         }
2541                         usable = 1;
2542                 } else {
2543                         usable = tcb->cwind;
2544                         if (tcb->snd.wnd < usable)
2545                                 usable = tcb->snd.wnd;
2546                         usable -= sent;
2547                 }
2548                 ssize = sndcnt - sent;
2549                 if (ssize && usable < 2)
2550                         netlog(s->p->f, Logtcp, "throttled snd.wnd %lu cwind %lu\n",
2551                                    tcb->snd.wnd, tcb->cwind);
2552                 if (usable < ssize)
2553                         ssize = usable;
2554                 if (ssize > tcb->mss) {
2555                         if ((tcb->flags & TSO) == 0) {
2556                                 ssize = tcb->mss;
2557                         } else {
2558                                 int segs, window;
2559
2560                                 /*  Don't send too much.  32K is arbitrary..
2561                                  */
2562                                 if (ssize > 32 * 1024)
2563                                         ssize = 32 * 1024;
2564
2565                                 /* Clamp xmit to an integral MSS to
2566                                  * avoid ragged tail segments causing
2567                                  * poor link utilization.  Also
2568                                  * account for each segment sent in
2569                                  * msg heuristic, and round up to the
2570                                  * next multiple of 4, to ensure we
2571                                  * still yeild.
2572                                  */
2573                                 segs = ssize / tcb->mss;
2574                                 ssize = segs * tcb->mss;
2575                                 msgs += segs;
2576                                 if (segs > 3)
2577                                         msgs = (msgs + 4) & ~3;
2578                         }
2579                 }
2580
2581                 dsize = ssize;
2582                 seg.urg = 0;
2583
2584                 if (ssize == 0)
2585                         if ((tcb->flags & FORCE) == 0)
2586                                 break;
2587
2588                 tcb->flags &= ~FORCE;
2589                 tcprcvwin(s);
2590
2591                 /* By default we will generate an ack */
2592                 tcphalt(tpriv, &tcb->acktimer);
2593                 tcb->rcv.una = 0;
2594                 seg.source = s->lport;
2595                 seg.dest = s->rport;
2596                 seg.flags = ACK;
2597                 seg.mss = 0;
2598                 seg.ws = 0;
2599                 switch (tcb->state) {
2600                         case Syn_sent:
2601                                 seg.flags = 0;
2602                                 if (tcb->snd.ptr == tcb->iss) {
2603                                         seg.flags |= SYN;
2604                                         dsize--;
2605                                         seg.mss = tcb->mss;
2606                                         seg.ws = tcb->scale;
2607                                 }
2608                                 break;
2609                         case Syn_received:
2610                                 /*
2611                                  *  don't send any data with a SYN/ACK packet
2612                                  *  because Linux rejects the packet in its
2613                                  *  attempt to solve the SYN attack problem
2614                                  */
2615                                 if (tcb->snd.ptr == tcb->iss) {
2616                                         seg.flags |= SYN;
2617                                         dsize = 0;
2618                                         ssize = 1;
2619                                         seg.mss = tcb->mss;
2620                                         seg.ws = tcb->scale;
2621                                 }
2622                                 break;
2623                 }
2624                 seg.seq = tcb->snd.ptr;
2625                 seg.ack = tcb->rcv.nxt;
2626                 seg.wnd = tcb->rcv.wnd;
2627
2628                 /* Pull out data to send */
2629                 bp = NULL;
2630                 if (dsize != 0) {
2631                         bp = qcopy(s->wq, dsize, sent);
2632                         if (BLEN(bp) != dsize) {
2633                                 seg.flags |= FIN;
2634                                 dsize--;
2635                         }
2636                         if (BLEN(bp) > tcb->mss) {
2637                                 bp->flag |= Btso;
2638                                 bp->mss = tcb->mss;
2639                         }
2640                 }
2641
2642                 if (sent + dsize == sndcnt)
2643                         seg.flags |= PSH;
2644
2645                 /* keep track of balance of resent data */
2646                 if (seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2647                         n = tcb->snd.nxt - tcb->snd.ptr;
2648                         if (ssize < n)
2649                                 n = ssize;
2650                         tcb->resent += n;
2651                         netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr 0x%lx nxt 0x%lx\n",
2652                                    s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr,
2653                                    tcb->snd.nxt);
2654                         tpriv->stats[RetransSegs]++;
2655                 }
2656
2657                 tcb->snd.ptr += ssize;
2658
2659                 /* Pull up the send pointer so we can accept acks
2660                  * for this window
2661                  */
2662                 if (seq_gt(tcb->snd.ptr, tcb->snd.nxt))
2663                         tcb->snd.nxt = tcb->snd.ptr;
2664
2665                 /* Build header, link data and compute cksum */
2666                 switch (version) {
2667                         case V4:
2668                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2669                                 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2670                                 if (hbp == NULL) {
2671                                         freeblist(bp);
2672                                         return;
2673                                 }
2674                                 break;
2675                         case V6:
2676                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2677                                 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2678                                 if (hbp == NULL) {
2679                                         freeblist(bp);
2680                                         return;
2681                                 }
2682                                 break;
2683                         default:
2684                                 hbp = NULL;     /* to suppress a warning */
2685                                 panic("tcpoutput: version %d", version);
2686                 }
2687
2688                 /* Start the transmission timers if there is new data and we
2689                  * expect acknowledges
2690                  */
2691                 if (ssize != 0) {
2692                         if (tcb->timer.state != TcptimerON)
2693                                 tcpgo(tpriv, &tcb->timer);
2694
2695                         /* If round trip timer isn't running, start it. */
2696                         if (tcb->rtt_timer.state != TcptimerON) {
2697                                 tcpgo(tpriv, &tcb->rtt_timer);
2698                                 tcb->rttseq = tcb->snd.ptr;
2699                         }
2700                 }
2701
2702                 tpriv->stats[OutSegs]++;
2703
2704                 /* put off the next keep alive */
2705                 tcpgo(tpriv, &tcb->katimer);
2706
2707                 switch (version) {
2708                         case V4:
2709                                 if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2710                                         /* a negative return means no route */
2711                                         localclose(s, "no route");
2712                                 }
2713                                 break;
2714                         case V6:
2715                                 if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
2716                                         /* a negative return means no route */
2717                                         localclose(s, "no route");
2718                                 }
2719                                 break;
2720                         default:
2721                                 panic("tcpoutput2: version %d", version);
2722                 }
2723                 if ((msgs % 4) == 1) {
2724                         qunlock(&s->qlock);
2725                         kthread_yield();
2726                         qlock(&s->qlock);
2727                 }
2728         }
2729 }
2730
2731 /*
2732  *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
2733  */
2734 void tcpsendka(struct conv *s)
2735 {
2736         Tcp seg;
2737         Tcpctl *tcb;
2738         struct block *hbp, *dbp;
2739
2740         tcb = (Tcpctl *) s->ptcl;
2741
2742         dbp = NULL;
2743         seg.urg = 0;
2744         seg.source = s->lport;
2745         seg.dest = s->rport;
2746         seg.flags = ACK | PSH;
2747         seg.mss = 0;
2748         seg.ws = 0;
2749         if (tcpporthogdefense)
2750                 urandom_read(&seg.seq, sizeof(seg.seq));
2751         else
2752                 seg.seq = tcb->snd.una - 1;
2753         seg.ack = tcb->rcv.nxt;
2754         tcb->rcv.una = 0;
2755         seg.wnd = tcb->rcv.wnd;
2756         if (tcb->state == Finwait2) {
2757                 seg.flags |= FIN;
2758         } else {
2759                 dbp = block_alloc(1, MEM_WAIT);
2760                 dbp->wp++;
2761         }
2762
2763         if (isv4(s->raddr)) {
2764                 /* Build header, link data and compute cksum */
2765                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2766                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2767                 if (hbp == NULL) {
2768                         freeblist(dbp);
2769                         return;
2770                 }
2771                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2772         } else {
2773                 /* Build header, link data and compute cksum */
2774                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2775                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2776                 if (hbp == NULL) {
2777                         freeblist(dbp);
2778                         return;
2779                 }
2780                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2781         }
2782 }
2783
2784 /*
2785  *  set connection to time out after 12 minutes
2786  */
2787 void tcpsetkacounter(Tcpctl * tcb)
2788 {
2789         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
2790         if (tcb->kacounter < 3)
2791                 tcb->kacounter = 3;
2792 }
2793
2794 /*
2795  *  if we've timed out, close the connection
2796  *  otherwise, send a keepalive and restart the timer
2797  */
2798 void tcpkeepalive(void *v)
2799 {
2800         ERRSTACK(1);
2801         Tcpctl *tcb;
2802         struct conv *s;
2803
2804         s = v;
2805         tcb = (Tcpctl *) s->ptcl;
2806         qlock(&s->qlock);
2807         if (waserror()) {
2808                 qunlock(&s->qlock);
2809                 nexterror();
2810         }
2811         if (tcb->state != Closed) {
2812                 if (--(tcb->kacounter) <= 0) {
2813                         localclose(s, "connection timed out");
2814                 } else {
2815                         tcpsendka(s);
2816                         tcpgo(s->p->priv, &tcb->katimer);
2817                 }
2818         }
2819         qunlock(&s->qlock);
2820         poperror();
2821 }
2822
2823 /*
2824  *  start keepalive timer
2825  */
2826 static void tcpstartka(struct conv *s, char **f, int n)
2827 {
2828         Tcpctl *tcb;
2829         int x;
2830
2831         tcb = (Tcpctl *) s->ptcl;
2832         if (tcb->state != Established)
2833                 error(ENOTCONN, "connection must be in Establised state");
2834         if (n > 1) {
2835                 x = atoi(f[1]);
2836                 if (x >= MSPTICK)
2837                         tcb->katimer.start = x / MSPTICK;
2838         }
2839         tcpsetkacounter(tcb);
2840         tcpgo(s->p->priv, &tcb->katimer);
2841 }
2842
2843 /*
2844  *  turn checksums on/off
2845  */
2846 static void tcpsetchecksum(struct conv *s, char **f, int unused)
2847 {
2848         Tcpctl *tcb;
2849
2850         tcb = (Tcpctl *) s->ptcl;
2851         tcb->nochecksum = !atoi(f[1]);
2852 }
2853
2854 void tcprxmit(struct conv *s)
2855 {
2856         Tcpctl *tcb;
2857
2858         tcb = (Tcpctl *) s->ptcl;
2859
2860         tcb->flags |= RETRAN | FORCE;
2861         tcb->snd.ptr = tcb->snd.una;
2862
2863         /* Reno */
2864         tcb->ssthresh = tcb->cwind / 2;
2865         tcb->cwind = tcb->ssthresh;
2866         tcpoutput(s);
2867 }
2868
2869 void tcptimeout(void *arg)
2870 {
2871         ERRSTACK(1);
2872         struct conv *s;
2873         Tcpctl *tcb;
2874         int maxback;
2875         struct tcppriv *tpriv;
2876
2877         s = (struct conv *)arg;
2878         tpriv = s->p->priv;
2879         tcb = (Tcpctl *) s->ptcl;
2880
2881         qlock(&s->qlock);
2882         if (waserror()) {
2883                 qunlock(&s->qlock);
2884                 nexterror();
2885         }
2886         switch (tcb->state) {
2887                 default:
2888                         tcb->backoff++;
2889                         if (tcb->state == Syn_sent)
2890                                 maxback = MAXBACKMS / 2;
2891                         else
2892                                 maxback = MAXBACKMS;
2893                         tcb->backedoff += tcb->timer.start * MSPTICK;
2894                         if (tcb->backedoff >= maxback) {
2895                                 localclose(s, "connection timed out");
2896                                 break;
2897                         }
2898                         netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lx %llu/%llu\n",
2899                                    tcb->snd.una, tcb->timer.start, NOW);
2900                         tcpsettimer(tcb);
2901                         tcprxmit(s);
2902                         tpriv->stats[RetransTimeouts]++;
2903                         tcb->snd.dupacks = 0;
2904                         break;
2905                 case Time_wait:
2906                         localclose(s, NULL);
2907                         break;
2908                 case Closed:
2909                         break;
2910         }
2911         qunlock(&s->qlock);
2912         poperror();
2913 }
2914
2915 int inwindow(Tcpctl * tcb, int seq)
2916 {
2917         return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1);
2918 }
2919
2920 /*
2921  *  set up state for a received SYN (or SYN ACK) packet
2922  */
2923 void procsyn(struct conv *s, Tcp * seg)
2924 {
2925         Tcpctl *tcb;
2926
2927         tcb = (Tcpctl *) s->ptcl;
2928         tcb->flags |= FORCE;
2929
2930         tcb->rcv.nxt = seg->seq + 1;
2931         tcb->rcv.urg = tcb->rcv.nxt;
2932         tcb->irs = seg->seq;
2933
2934         /* our sending max segment size cannot be bigger than what he asked for */
2935         if (seg->mss != 0 && seg->mss < tcb->mss)
2936                 tcb->mss = seg->mss;
2937
2938         tcb->snd.wnd = seg->wnd;
2939         tcb->cwind = tcb->mss * CWIND_SCALE;
2940 }
2941
2942 int
2943 addreseq(Tcpctl * tcb, struct tcppriv *tpriv, Tcp * seg,
2944                  struct block *bp, uint16_t length)
2945 {
2946         Reseq *rp, *rp1;
2947         int i, rqlen, qmax;
2948
2949         rp = kzmalloc(sizeof(Reseq), 0);
2950         if (rp == NULL) {
2951                 freeblist(bp);  /* bp always consumed by add_reseq */
2952                 return 0;
2953         }
2954
2955         rp->seg = *seg;
2956         rp->bp = bp;
2957         rp->length = length;
2958
2959         /* Place on reassembly list sorting by starting seq number */
2960         rp1 = tcb->reseq;
2961         if (rp1 == NULL || seq_lt(seg->seq, rp1->seg.seq)) {
2962                 rp->next = rp1;
2963                 tcb->reseq = rp;
2964                 if (rp->next != NULL)
2965                         tpriv->stats[OutOfOrder]++;
2966                 return 0;
2967         }
2968
2969         rqlen = 0;
2970         for (i = 0;; i++) {
2971                 rqlen += rp1->length;
2972                 if (rp1->next == NULL || seq_lt(seg->seq, rp1->next->seg.seq)) {
2973                         rp->next = rp1->next;
2974                         rp1->next = rp;
2975                         if (rp->next != NULL)
2976                                 tpriv->stats[OutOfOrder]++;
2977                         break;
2978                 }
2979                 rp1 = rp1->next;
2980         }
2981         qmax = QMAX << tcb->rcv.scale;
2982         if (rqlen > qmax) {
2983                 printd("resequence queue > window: %d > %d\n", rqlen, qmax);
2984                 i = 0;
2985                 for (rp1 = tcb->reseq; rp1 != NULL; rp1 = rp1->next) {
2986                         printd("0x%#lx 0x%#lx 0x%#x\n", rp1->seg.seq,
2987                                    rp1->seg.ack, rp1->seg.flags);
2988                         if (i++ > 10) {
2989                                 printd("...\n");
2990                                 break;
2991                         }
2992                 }
2993
2994                 // delete entire reassembly queue; wait for retransmit.
2995                 // - should we be smarter and only delete the tail?
2996                 for (rp = tcb->reseq; rp != NULL; rp = rp1) {
2997                         rp1 = rp->next;
2998                         freeblist(rp->bp);
2999                         kfree(rp);
3000                 }
3001                 tcb->reseq = NULL;
3002
3003                 return -1;
3004         }
3005         return 0;
3006 }
3007
3008 void getreseq(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
3009 {
3010         Reseq *rp;
3011
3012         rp = tcb->reseq;
3013         if (rp == NULL)
3014                 return;
3015
3016         tcb->reseq = rp->next;
3017
3018         *seg = rp->seg;
3019         *bp = rp->bp;
3020         *length = rp->length;
3021
3022         kfree(rp);
3023 }
3024
3025 int tcptrim(Tcpctl * tcb, Tcp * seg, struct block **bp, uint16_t * length)
3026 {
3027         uint16_t len;
3028         uint8_t accept;
3029         int dupcnt, excess;
3030
3031         accept = 0;
3032         len = *length;
3033         if (seg->flags & SYN)
3034                 len++;
3035         if (seg->flags & FIN)
3036                 len++;
3037
3038         if (tcb->rcv.wnd == 0) {
3039                 if (len == 0 && seg->seq == tcb->rcv.nxt)
3040                         return 0;
3041         } else {
3042                 /* Some part of the segment should be in the window */
3043                 if (inwindow(tcb, seg->seq))
3044                         accept++;
3045                 else if (len != 0) {
3046                         if (inwindow(tcb, seg->seq + len - 1) ||
3047                                 seq_within(tcb->rcv.nxt, seg->seq, seg->seq + len - 1))
3048                                 accept++;
3049                 }
3050         }
3051         if (!accept) {
3052                 freeblist(*bp);
3053                 return -1;
3054         }
3055         dupcnt = tcb->rcv.nxt - seg->seq;
3056         if (dupcnt > 0) {
3057                 tcb->rerecv += dupcnt;
3058                 if (seg->flags & SYN) {
3059                         seg->flags &= ~SYN;
3060                         seg->seq++;
3061
3062                         if (seg->urg > 1)
3063                                 seg->urg--;
3064                         else
3065                                 seg->flags &= ~URG;
3066                         dupcnt--;
3067                 }
3068                 if (dupcnt > 0) {
3069                         pullblock(bp, (uint16_t) dupcnt);
3070                         seg->seq += dupcnt;
3071                         *length -= dupcnt;
3072
3073                         if (seg->urg > dupcnt)
3074                                 seg->urg -= dupcnt;
3075                         else {
3076                                 seg->flags &= ~URG;
3077                                 seg->urg = 0;
3078                         }
3079                 }
3080         }
3081         excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3082         if (excess > 0) {
3083                 tcb->rerecv += excess;
3084                 *length -= excess;
3085                 *bp = trimblock(*bp, 0, *length);
3086                 if (*bp == NULL)
3087                         panic("presotto is a boofhead");
3088                 seg->flags &= ~FIN;
3089         }
3090         return 0;
3091 }
3092
3093 void tcpadvise(struct Proto *tcp, struct block *bp, char *msg)
3094 {
3095         Tcp4hdr *h4;
3096         Tcp6hdr *h6;
3097         Tcpctl *tcb;
3098         uint8_t source[IPaddrlen];
3099         uint8_t dest[IPaddrlen];
3100         uint16_t psource, pdest;
3101         struct conv *s, **p;
3102
3103         h4 = (Tcp4hdr *) (bp->rp);
3104         h6 = (Tcp6hdr *) (bp->rp);
3105
3106         if ((h4->vihl & 0xF0) == IP_VER4) {
3107                 v4tov6(dest, h4->tcpdst);
3108                 v4tov6(source, h4->tcpsrc);
3109                 psource = nhgets(h4->tcpsport);
3110                 pdest = nhgets(h4->tcpdport);
3111         } else {
3112                 ipmove(dest, h6->tcpdst);
3113                 ipmove(source, h6->tcpsrc);
3114                 psource = nhgets(h6->tcpsport);
3115                 pdest = nhgets(h6->tcpdport);
3116         }
3117
3118         /* Look for a connection */
3119         for (p = tcp->conv; *p; p++) {
3120                 s = *p;
3121                 tcb = (Tcpctl *) s->ptcl;
3122                 if (s->rport == pdest)
3123                         if (s->lport == psource)
3124                                 if (tcb->state != Closed)
3125                                         if (ipcmp(s->raddr, dest) == 0)
3126                                                 if (ipcmp(s->laddr, source) == 0) {
3127                                                         qlock(&s->qlock);
3128                                                         switch (tcb->state) {
3129                                                                 case Syn_sent:
3130                                                                         localclose(s, msg);
3131                                                                         break;
3132                                                         }
3133                                                         qunlock(&s->qlock);
3134                                                         freeblist(bp);
3135                                                         return;
3136                                                 }
3137         }
3138         freeblist(bp);
3139 }
3140
3141 static void tcpporthogdefensectl(char *val)
3142 {
3143         if (strcmp(val, "on") == 0)
3144                 tcpporthogdefense = 1;
3145         else if (strcmp(val, "off") == 0)
3146                 tcpporthogdefense = 0;
3147         else
3148                 error(EINVAL, "unknown value for tcpporthogdefense");
3149 }
3150
3151 /* called with c qlocked */
3152 static void tcpctl(struct conv *c, char **f, int n)
3153 {
3154         if (n == 1 && strcmp(f[0], "hangup") == 0)
3155                 tcphangup(c);
3156         else if (n >= 1 && strcmp(f[0], "keepalive") == 0)
3157                 tcpstartka(c, f, n);
3158         else if (n >= 1 && strcmp(f[0], "checksum") == 0)
3159                 tcpsetchecksum(c, f, n);
3160         else if (n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3161                 tcpporthogdefensectl(f[1]);
3162         else
3163                 error(EINVAL, "unknown command to %s", __func__);
3164 }
3165
3166 int tcpstats(struct Proto *tcp, char *buf, int len)
3167 {
3168         struct tcppriv *priv;
3169         char *p, *e;
3170         int i;
3171
3172         priv = tcp->priv;
3173         p = buf;
3174         e = p + len;
3175         for (i = 0; i < Nstats; i++)
3176                 p = seprintf(p, e, "%s: %u\n", statnames[i], priv->stats[i]);
3177         return p - buf;
3178 }
3179
3180 /*
3181  *  garbage collect any stale conversations:
3182  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3183  *      - Finwait2 after 5 minutes
3184  *
3185  *  this is called whenever we run out of channels.  Both checks are
3186  *  of questionable validity so we try to use them only when we're
3187  *  up against the wall.
3188  */
3189 int tcpgc(struct Proto *tcp)
3190 {
3191         struct conv *c, **pp, **ep;
3192         int n;
3193         Tcpctl *tcb;
3194
3195         n = 0;
3196         ep = &tcp->conv[tcp->nc];
3197         for (pp = tcp->conv; pp < ep; pp++) {
3198                 c = *pp;
3199                 if (c == NULL)
3200                         break;
3201                 if (!canqlock(&c->qlock))
3202                         continue;
3203                 tcb = (Tcpctl *) c->ptcl;
3204                 switch (tcb->state) {
3205                         case Syn_received:
3206                                 if (NOW - tcb->time > 5000) {
3207                                         localclose(c, "timed out");
3208                                         n++;
3209                                 }
3210                                 break;
3211                         case Finwait2:
3212                                 if (NOW - tcb->time > 5 * 60 * 1000) {
3213                                         localclose(c, "timed out");
3214                                         n++;
3215                                 }
3216                                 break;
3217                 }
3218                 qunlock(&c->qlock);
3219         }
3220         return n;
3221 }
3222
3223 void tcpsettimer(Tcpctl * tcb)
3224 {
3225         int x;
3226
3227         /* round trip dependency */
3228         x = backoff(tcb->backoff) *
3229                 (tcb->mdev + (tcb->srtt >> LOGAGAIN) + MSPTICK) / MSPTICK;
3230
3231         /* bounded twixt 1/2 and 64 seconds */
3232         if (x < 500 / MSPTICK)
3233                 x = 500 / MSPTICK;
3234         else if (x > (64000 / MSPTICK))
3235                 x = 64000 / MSPTICK;
3236         tcb->timer.start = x;
3237 }
3238
3239 static struct tcppriv *debug_priv;
3240
3241 /* Kfunc this */
3242 int dump_tcp_ht(void)
3243 {
3244         if (!debug_priv)
3245                 return -1;
3246         dump_ipht(&debug_priv->ht);
3247         return 0;
3248 }
3249
3250 void tcpinit(struct Fs *fs)
3251 {
3252         struct Proto *tcp;
3253         struct tcppriv *tpriv;
3254
3255         tcp = kzmalloc(sizeof(struct Proto), 0);
3256         tpriv = tcp->priv = kzmalloc(sizeof(struct tcppriv), 0);
3257         debug_priv = tpriv;
3258         qlock_init(&tpriv->tl);
3259         qlock_init(&tpriv->apl);
3260         tcp->name = "tcp";
3261         tcp->connect = tcpconnect;
3262         tcp->announce = tcpannounce;
3263         tcp->bypass = tcpbypass;
3264         tcp->ctl = tcpctl;
3265         tcp->state = tcpstate;
3266         tcp->create = tcpcreate;
3267         tcp->close = tcpclose;
3268         tcp->shutdown = tcpshutdown;
3269         tcp->rcv = tcpiput;
3270         tcp->advise = tcpadvise;
3271         tcp->stats = tcpstats;
3272         tcp->inuse = tcpinuse;
3273         tcp->gc = tcpgc;
3274         tcp->ipproto = IP_TCPPROTO;
3275         tcp->nc = 4096;
3276         tcp->ptclsize = sizeof(Tcpctl);
3277         tpriv->stats[MaxConn] = tcp->nc;
3278
3279         Fsproto(fs, tcp);
3280 }
3281
3282 void
3283 tcpsetscale(struct conv *s, Tcpctl * tcb, uint16_t rcvscale, uint16_t sndscale)
3284 {
3285         if (rcvscale) {
3286                 tcb->rcv.scale = rcvscale & 0xff;
3287                 tcb->snd.scale = sndscale & 0xff;
3288                 tcb->window = QMAX << tcb->snd.scale;
3289                 qsetlimit(s->rq, tcb->window);
3290         } else {
3291                 tcb->rcv.scale = 0;
3292                 tcb->snd.scale = 0;
3293                 tcb->window = QMAX;
3294                 qsetlimit(s->rq, tcb->window);
3295         }
3296 }