/* akaros/kern/src/net/tcp.c */
   1/* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
   2 * Portions Copyright © 1997-1999 Vita Nuova Limited
   3 * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
   4 *                                (www.vitanuova.com)
   5 * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
   6 *
   7 * Modified for the Akaros operating system:
   8 * Copyright (c) 2013-2014 The Regents of the University of California
   9 * Copyright (c) 2013-2017 Google Inc.
  10 *
  11 * Permission is hereby granted, free of charge, to any person obtaining a copy
  12 * of this software and associated documentation files (the "Software"), to deal
  13 * in the Software without restriction, including without limitation the rights
  14 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  15 * copies of the Software, and to permit persons to whom the Software is
  16 * furnished to do so, subject to the following conditions:
  17 *
  18 * The above copyright notice and this permission notice shall be included in
  19 * all copies or substantial portions of the Software.
  20 *
  21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  22 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  23 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
  24 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  25 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  26 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  27 * SOFTWARE. */
  28
  29#include <slab.h>
  30#include <kmalloc.h>
  31#include <kref.h>
  32#include <string.h>
  33#include <stdio.h>
  34#include <assert.h>
  35#include <error.h>
  36#include <cpio.h>
  37#include <pmap.h>
  38#include <smp.h>
  39#include <net/ip.h>
  40#include <net/tcp.h>
  41
/* Human-readable connection state names, indexed by the TCP state value.
 * Must correspond to the enumeration in tcp.h */
static char *tcpstates[] = {
	"Closed", "Listen", "Syn_sent",
	"Established", "Finwait1", "Finwait2", "Close_wait",
	"Closing", "Last_ack", "Time_wait"
};
  48
/* Protocol-wide tunable defaults. */
static int tcp_irtt = DEF_RTT;		/* Initial guess at round trip time */
static uint16_t tcp_mss = DEF_MSS;	/* Maximum segment size to be sent */
  51
/* Names for the per-protocol statistics counters, indexed by the stats
 * enumeration.  Must correspond to the enumeration in tcp.h */
static char *statnames[] = {
	[MaxConn] "MaxConn",
	[ActiveOpens] "ActiveOpens",
	[PassiveOpens] "PassiveOpens",
	[EstabResets] "EstabResets",
	[CurrEstab] "CurrEstab",
	[InSegs] "InSegs",
	[OutSegs] "OutSegs",
	[RetransSegs] "RetransSegs",
	[RetransTimeouts] "RetransTimeouts",
	[InErrs] "InErrs",
	[OutRsts] "OutRsts",
	[CsumErrs] "CsumErrs",
	[HlenErrs] "HlenErrs",
	[LenErrs] "LenErrs",
	[OutOfOrder] "OutOfOrder",
};
  70
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
static int tcpporthogdefense = 0;
  81
  82static int addreseq(Tcpctl *, struct tcppriv *, Tcp *, struct block *,
  83                    uint16_t);
  84static void getreseq(Tcpctl *, Tcp *, struct block **, uint16_t *);
  85static void localclose(struct conv *, char *unused_char_p_t);
  86static void procsyn(struct conv *, Tcp *);
  87static void tcpiput(struct Proto *, struct Ipifc *, struct block *);
  88static void tcpoutput(struct conv *);
  89static int tcptrim(Tcpctl *, Tcp *, struct block **, uint16_t *);
  90static void tcpstart(struct conv *, int);
  91static void tcptimeout(void *);
  92static void tcpsndsyn(struct conv *, Tcpctl *);
  93static void tcprcvwin(struct conv *);
  94static void tcpacktimer(void *);
  95static void tcpkeepalive(void *);
  96static void tcpsetkacounter(Tcpctl *);
  97static void tcprxmit(struct conv *);
  98static void tcpsettimer(Tcpctl *);
  99static void tcpsynackrtt(struct conv *);
 100static void tcpsetscale(struct conv *, Tcpctl *, uint16_t, uint16_t);
 101static void tcp_loss_event(struct conv *s, Tcpctl *tcb);
 102static uint16_t derive_payload_mss(Tcpctl *tcb);
 103static void set_in_flight(Tcpctl *tcb);
 104
 105static void limborexmit(struct Proto *);
 106static void limbo(struct conv *, uint8_t *unused_uint8_p_t, uint8_t *, Tcp *,
 107                  int);
 108
 109static void tcpsetstate(struct conv *s, uint8_t newstate)
 110{
 111        Tcpctl *tcb;
 112        uint8_t oldstate;
 113        struct tcppriv *tpriv;
 114
 115        tpriv = s->p->priv;
 116
 117        tcb = (Tcpctl *) s->ptcl;
 118
 119        oldstate = tcb->state;
 120        if (oldstate == newstate)
 121                return;
 122
 123        if (oldstate == Established)
 124                tpriv->stats[CurrEstab]--;
 125        if (newstate == Established)
 126                tpriv->stats[CurrEstab]++;
 127
 128        /**
 129        print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
 130        tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
 131        **/
 132
 133        switch (newstate) {
 134        case Closed:
 135                qclose(s->rq);
 136                qclose(s->wq);
 137                qclose(s->eq);
 138                break;
 139
 140        case Close_wait:        /* Remote closes */
 141                qhangup(s->rq, NULL);
 142                break;
 143        }
 144
 145        tcb->state = newstate;
 146
 147        if (oldstate == Syn_sent && newstate != Closed)
 148                Fsconnected(s, NULL);
 149}
 150
/* "connect" control handler: parse the standard address arguments, then
 * start an active open. */
static void tcpconnect(struct conv *c, char **argv, int argc)
{
	Fsstdconnect(c, argv, argc);
	tcpstart(c, TCP_CONNECT);
}
 156
 157static int tcpstate(struct conv *c, char *state, int n)
 158{
 159        Tcpctl *s;
 160
 161        s = (Tcpctl *) (c->ptcl);
 162
 163        return snprintf(state, n,
 164                        "%s qin %d qout %d srtt %d mdev %d cwin %u swin %u>>%d rwin %u>>%d timer.start %llu timer.count %llu rerecv %d katimer.start %d katimer.count %d\n",
 165                        tcpstates[s->state],
 166                        c->rq ? qlen(c->rq) : 0,
 167                        c->wq ? qlen(c->wq) : 0,
 168                        s->srtt, s->mdev,
 169                        s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd,
 170                        s->snd.scale, s->timer.start, s->timer.count, s->rerecv,
 171                        s->katimer.start, s->katimer.count);
 172}
 173
 174static int tcpinuse(struct conv *c)
 175{
 176        Tcpctl *s;
 177
 178        s = (Tcpctl *) (c->ptcl);
 179        return s->state != Closed;
 180}
 181
/* "announce" control handler: parse the standard arguments, start a
 * passive open (listener), and report the announce as complete. */
static void tcpannounce(struct conv *c, char **argv, int argc)
{
	Fsstdannounce(c, argv, argc);
	tcpstart(c, TCP_LISTEN);
	Fsconnected(c, NULL);
}
 188
/* "bypass" control handler: set up the standard bypass, then add the conv
 * to the protocol hash table so inbound packets are demuxed to it. */
static void tcpbypass(struct conv *cv, char **argv, int argc)
{
	struct tcppriv *tpriv = cv->p->priv;

	Fsstdbypass(cv, argv, argc);
	iphtadd(&tpriv->ht, cv);
}
 196
 197static void tcpshutdown(struct conv *c, int how)
 198{
 199        Tcpctl *tcb = (Tcpctl*)c->ptcl;
 200
 201        /* Do nothing for the read side */
 202        if (how == SHUT_RD)
 203                return;
 204        /* Sends a FIN.  If we're in another state (like Listen), we'll run into
 205         * issues, since we'll never send the FIN.  We'll be shutdown on our
 206         * end, but we'll never tell the distant end.  Might just be an app
 207         * issue. */
 208        switch (tcb->state) {
 209        case Established:
 210                tcb->flgcnt++;
 211                tcpsetstate(c, Finwait1);
 212                tcpoutput(c);
 213                break;
 214        }
 215}
 216
/*
 *  tcpclose is always called with the q locked
 */
static void tcpclose(struct conv *c)
{
	Tcpctl *tcb;

	tcb = (Tcpctl *) c->ptcl;

	/* Stop further user I/O and drop any pending inbound data. */
	qhangup(c->rq, NULL);
	qhangup(c->wq, NULL);
	qhangup(c->eq, NULL);
	qflush(c->rq);

	switch (tcb->state) {
	case Listen:
		/*
		 *  reset any incoming calls to this listener
		 */
		Fsconnected(c, "Hangup");

		localclose(c, NULL);
		break;
	case Closed:
	case Syn_sent:
		/* No established peer to notify; tear down locally. */
		localclose(c, NULL);
		break;
	case Established:
		/* Active close: account for the FIN and send it. */
		tcb->flgcnt++;
		tcpsetstate(c, Finwait1);
		tcpoutput(c);
		break;
	case Close_wait:
		/* Remote already closed; our FIN takes us to Last_ack. */
		tcb->flgcnt++;
		tcpsetstate(c, Last_ack);
		tcpoutput(c);
		break;
	}
}
 256
/* Kick routine for the write queue (see tcpcreate): push out user data,
 * or hang up if the conversation is in a state that can't send. */
static void tcpkick(void *x)
{
	ERRSTACK(1);
	struct conv *s = x;
	Tcpctl *tcb;

	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	if (waserror()) {
		/* Drop the lock before propagating the error. */
		qunlock(&s->qlock);
		nexterror();
	}

	switch (tcb->state) {
	case Syn_sent:
	case Established:
	case Close_wait:
		/*
		 * Push data
		 */
		tcprcvwin(s);
		tcpoutput(s);
		break;
	default:
		/* Can't send in any other state; tear the conv down. */
		localclose(s, "Hangup");
		break;
	}

	qunlock(&s->qlock);
	poperror();
}
 289
 290static void tcprcvwin(struct conv *s)
 291{
 292        /* Call with tcb locked */
 293        int w;
 294        Tcpctl *tcb;
 295
 296        tcb = (Tcpctl *) s->ptcl;
 297        w = tcb->window - qlen(s->rq);
 298        if (w < 0)
 299                w = 0;
 300
 301        /* RFC 813: Avoid SWS.  We'll always reduce the window (because the qio
 302         * increased - that's legit), and we'll always advertise the window
 303         * increases (corresponding to qio drains) when those are greater than
 304         * MSS.  But we don't advertise increases less than MSS.
 305         *
 306         * Note we don't shrink the window at all - that'll result in tcptrim()
 307         * dropping packets that were sent before the sender gets our update. */
 308        if ((w < tcb->rcv.wnd) || (w >= tcb->mss))
 309                tcb->rcv.wnd = w;
 310        /* We've delayed sending an update to rcv.wnd, and we might never get
 311         * another ACK to drive the TCP stack after the qio is drained.  We
 312         * could replace this stuff with qio kicks or callbacks, but that might
 313         * be trickier with the MSS limitation.  (and 'edge' isn't empty or
 314         * not). */
 315        if (w < tcb->mss)
 316                tcb->rcv.blocked = 1;
 317}
 318
/* Delayed-ACK timer callback: if the conn is still live, force output to
 * run so a window update / ACK goes out. */
static void tcpacktimer(void *v)
{
	ERRSTACK(1);
	Tcpctl *tcb;
	struct conv *s;

	s = v;
	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	if (waserror()) {
		/* Drop the lock before propagating the error. */
		qunlock(&s->qlock);
		nexterror();
	}
	if (tcb->state != Closed) {
		/* FORCE requests a segment even with no new data to send. */
		tcb->flags |= FORCE;
		tcprcvwin(s);
		tcpoutput(s);
	}
	qunlock(&s->qlock);
	poperror();
}
 341
/* Allocate the read/write queues for a newly created conversation. */
static void tcpcreate(struct conv *c)
{
	/* We don't use qio limits.  Instead, TCP manages flow control on its
	 * own.  We only use qpassnolim().  Note for qio that 0 doesn't mean no
	 * limit. */
	c->rq = qopen(0, Qcoalesce, 0, 0);
	c->wq = qopen(8 * QMAX, Qkick, tcpkick, c);
}
 350
 351static void timerstate(struct tcppriv *priv, Tcptimer *t, int newstate)
 352{
 353        if (newstate != TcptimerON) {
 354                if (t->state == TcptimerON) {
 355                        // unchain
 356                        if (priv->timers == t) {
 357                                priv->timers = t->next;
 358                                if (t->prev != NULL)
 359                                        panic("timerstate1");
 360                        }
 361                        if (t->next)
 362                                t->next->prev = t->prev;
 363                        if (t->prev)
 364                                t->prev->next = t->next;
 365                        t->next = t->prev = NULL;
 366                }
 367        } else {
 368                if (t->state != TcptimerON) {
 369                        // chain
 370                        if (t->prev != NULL || t->next != NULL)
 371                                panic("timerstate2");
 372                        t->prev = NULL;
 373                        t->next = priv->timers;
 374                        if (t->next)
 375                                t->next->prev = t;
 376                        priv->timers = t;
 377                }
 378        }
 379        t->state = newstate;
 380}
 381
/* Per-protocol timer kproc: wakes every MSPTICK ms, decrements the active
 * timers, and runs the callbacks of those that expired. */
static void tcpackproc(void *a)
{
	ERRSTACK(1);
	Tcptimer *t, *tp, *timeo;
	struct Proto *tcp;
	struct tcppriv *priv;
	int loop;

	tcp = a;
	priv = tcp->priv;

	for (;;) {
		kthread_usleep(MSPTICK * 1000);

		/* Pass 1, under the timer lock: tick every active timer and
		 * collect the expired ones on a private list chained via
		 * readynext. */
		qlock(&priv->tl);
		timeo = NULL;
		loop = 0;
		for (t = priv->timers; t != NULL; t = tp) {
			if (loop++ > 10000)
				panic("tcpackproc1");
			/* Grab next now; timerstate() unchains 't'. */
			tp = t->next;
			if (t->state == TcptimerON) {
				t->count--;
				if (t->count == 0) {
					timerstate(priv, t, TcptimerDONE);
					t->readynext = timeo;
					timeo = t;
				}
			}
		}
		qunlock(&priv->tl);

		/* Pass 2, lock dropped: run the expired callbacks.  Errors
		 * thrown by a callback are swallowed. */
		loop = 0;
		for (t = timeo; t != NULL; t = t->readynext) {
			if (loop++ > 10000)
				panic("tcpackproc2");
			if (t->state == TcptimerDONE && t->func != NULL) {
				/* discard error style */
				if (!waserror())
					(*t->func) (t->arg);
				poperror();
			}
		}

		limborexmit(tcp);
	}
}
 429
/* Arm a timer: reload its count from 'start' and chain it onto the active
 * list.  Timers with no period configured (start == 0) are ignored. */
static void tcpgo(struct tcppriv *priv, Tcptimer *t)
{
	if (t == NULL || t->start == 0)
		return;

	qlock(&priv->tl);
	t->count = t->start;
	timerstate(priv, t, TcptimerON);
	qunlock(&priv->tl);
}
 440
/* Disarm a timer: unchain it from the active list and mark it OFF. */
static void tcphalt(struct tcppriv *priv, Tcptimer *t)
{
	if (t == NULL)
		return;

	qlock(&priv->tl);
	timerstate(priv, t, TcptimerOFF);
	qunlock(&priv->tl);
}
 450
 451static int backoff(int n)
 452{
 453        return 1 << n;
 454}
 455
/* Tear a connection down without sending anything on the wire: stop all
 * timers, free the reassembly queue, report 'reason' (an error string, or
 * NULL for a normal close) to the user side, and go to Closed. */
static void localclose(struct conv *s, char *reason)
{
	/* called with tcb locked */
	Tcpctl *tcb;
	Reseq *rp, *rp1;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	/* Remove from the demux hash table; no more inbound packets. */
	iphtrem(&tpriv->ht, s);

	tcphalt(tpriv, &tcb->timer);
	tcphalt(tpriv, &tcb->rtt_timer);
	tcphalt(tpriv, &tcb->acktimer);
	tcphalt(tpriv, &tcb->katimer);

	/* Flush reassembly queue; nothing more can arrive */
	for (rp = tcb->reseq; rp != NULL; rp = rp1) {
		rp1 = rp->next;
		freeblist(rp->bp);
		kfree(rp);
	}
	tcb->reseq = NULL;

	/* A dying active open must report its outcome to the connector. */
	if (tcb->state == Syn_sent)
		Fsconnected(s, reason);

	qhangup(s->rq, reason);
	qhangup(s->wq, reason);

	tcpsetstate(s, Closed);

	/* listener will check the rq state */
	if (s->state == Announced)
		rendez_wakeup(&s->listenr);
}
 493
 494/* mtu (- TCP + IP hdr len) of 1st hop */
 495static int tcpmtu(struct Ipifc *ifc, int version, int *scale)
 496{
 497        int mtu;
 498
 499        switch (version) {
 500        default:
 501        case V4:
 502                mtu = DEF_MSS;
 503                if (ifc != NULL)
 504                        mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT +
 505                                                            TCP4_HDRSIZE);
 506                break;
 507        case V6:
 508                mtu = DEF_MSS6;
 509                if (ifc != NULL)
 510                        mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT +
 511                                                            TCP6_HDRSIZE);
 512                break;
 513        }
 514        *scale = HaveWS | 7;
 515
 516        return mtu;
 517}
 518
 519static void tcb_check_tso(Tcpctl *tcb)
 520{
 521        /* This can happen if the netdev isn't up yet. */
 522        if (!tcb->ifc)
 523                return;
 524        if (tcb->ifc->feat & NETF_TSO)
 525                tcb->flags |= TSO;
 526        else
 527                tcb->flags &= ~TSO;
 528}
 529
/* Initialize a conversation's Tcpctl for a fresh connection in 'mode'
 * (TCP_LISTEN or TCP_CONNECT): zero everything, arm the timer templates,
 * build the prototype pseudo-header (active opens only), and set the
 * default MSS / windows. */
static void inittcpctl(struct conv *s, int mode)
{
	Tcpctl *tcb;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	int mss;

	tcb = (Tcpctl *) s->ptcl;

	memset(tcb, 0, sizeof(Tcpctl));

	/* Effectively "no threshold" until a loss event lowers it. */
	tcb->ssthresh = UINT32_MAX;
	tcb->srtt = tcp_irtt;
	tcb->mdev = 0;

	/* setup timers */
	tcb->timer.start = tcp_irtt / MSPTICK;
	tcb->timer.func = tcptimeout;
	tcb->timer.arg = s;
	tcb->rtt_timer.start = MAX_TIME;
	tcb->acktimer.start = TCP_ACK / MSPTICK;
	tcb->acktimer.func = tcpacktimer;
	tcb->acktimer.arg = s;
	tcb->katimer.start = DEF_KAT / MSPTICK;
	tcb->katimer.func = tcpkeepalive;
	tcb->katimer.arg = s;

	mss = DEF_MSS;

	/* create a prototype(pseudo) header */
	if (mode != TCP_LISTEN) {
		/* Pick a local address if the caller didn't bind one. */
		if (ipcmp(s->laddr, IPnoaddr) == 0)
			findlocalip(s->p->f, s->laddr, s->raddr);

		switch (s->ipversion) {
		case V4:
			h4 = &tcb->protohdr.tcp4hdr;
			memset(h4, 0, sizeof(*h4));
			h4->proto = IP_TCPPROTO;
			hnputs(h4->tcpsport, s->lport);
			hnputs(h4->tcpdport, s->rport);
			v6tov4(h4->tcpsrc, s->laddr);
			v6tov4(h4->tcpdst, s->raddr);
			break;
		case V6:
			h6 = &tcb->protohdr.tcp6hdr;
			memset(h6, 0, sizeof(*h6));
			h6->proto = IP_TCPPROTO;
			hnputs(h6->tcpsport, s->lport);
			hnputs(h6->tcpdport, s->rport);
			ipmove(h6->tcpsrc, s->laddr);
			ipmove(h6->tcpdst, s->raddr);
			mss = DEF_MSS6;
			break;
		default:
			panic("inittcpctl: version %d", s->ipversion);
		}
	}

	tcb->ifc = findipifc(s->p->f, s->laddr, 0);
	tcb->mss = mss;
	tcb->typical_mss = mss;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* default is no window scaling */
	tcb->window = QMAX;
	tcb->rcv.wnd = QMAX;
	tcb->rcv.scale = 0;
	tcb->snd.scale = 0;
	tcb_check_tso(tcb);
}
 601
/*
 *  called with s qlocked
 *
 *  Initialize the conversation and kick off either a passive (TCP_LISTEN)
 *  or active (TCP_CONNECT) open.
 */
static void tcpstart(struct conv *s, int mode)
{
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	char *kpname;

	tpriv = s->p->priv;

	/* Lazily start the protocol's timer kproc; the double check under
	 * apl ensures it is only started once. */
	if (tpriv->ackprocstarted == 0) {
		qlock(&tpriv->apl);
		if (tpriv->ackprocstarted == 0) {
			/* tcpackproc needs to free this if it ever exits */
			kpname = kmalloc(KNAMELEN, MEM_WAIT);
			snprintf(kpname, KNAMELEN, "#I%dtcpack", s->p->f->dev);
			ktask(kpname, tcpackproc, s->p);
			tpriv->ackprocstarted = 1;
		}
		qunlock(&tpriv->apl);
	}

	tcb = (Tcpctl *) s->ptcl;

	inittcpctl(s, mode);

	/* Make the conversation visible to inbound demux. */
	iphtadd(&tpriv->ht, s);
	switch (mode) {
	case TCP_LISTEN:
		tpriv->stats[PassiveOpens]++;
		tcb->flags |= CLONE;
		tcpsetstate(s, Listen);
		break;

	case TCP_CONNECT:
		tpriv->stats[ActiveOpens]++;
		tcb->flags |= ACTIVE;
		tcpsndsyn(s, tcb);
		tcpsetstate(s, Syn_sent);
		tcpoutput(s);
		break;
	}
}
 646
 647static char *tcpflag(uint16_t flag)
 648{
 649        static char buf[128];
 650
 651        snprintf(buf, sizeof(buf), "%d", flag >> 10);   /* Head len */
 652        if (flag & URG)
 653                snprintf(buf, sizeof(buf), "%s%s", buf, " URG");
 654        if (flag & ACK)
 655                snprintf(buf, sizeof(buf), "%s%s", buf, " ACK");
 656        if (flag & PSH)
 657                snprintf(buf, sizeof(buf), "%s%s", buf, " PSH");
 658        if (flag & RST)
 659                snprintf(buf, sizeof(buf), "%s%s", buf, " RST");
 660        if (flag & SYN)
 661                snprintf(buf, sizeof(buf), "%s%s", buf, " SYN");
 662        if (flag & FIN)
 663                snprintf(buf, sizeof(buf), "%s%s", buf, " FIN");
 664
 665        return buf;
 666}
 667
 668/* Helper, determine if we should send a TCP timestamp.  ts_val was the
 669 * timestamp from our distant end.  We'll also send a TS on SYN (no ACK). */
 670static bool tcp_seg_has_ts(Tcp *tcph)
 671{
 672        return tcph->ts_val || ((tcph->flags & SYN) && !(tcph->flags & ACK));
 673}
 674
 675/* Given a TCP header/segment and default header size (e.g. TCP4_HDRSIZE),
 676 * return the actual hdr_len and opt_pad */
 677static void compute_hdrlen_optpad(Tcp *tcph, uint16_t default_hdrlen,
 678                                  uint16_t *ret_hdrlen, uint16_t *ret_optpad,
 679                                  Tcpctl *tcb)
 680{
 681        uint16_t hdrlen = default_hdrlen;
 682        uint16_t optpad = 0;
 683
 684        if (tcph->flags & SYN) {
 685                if (tcph->mss)
 686                        hdrlen += MSS_LENGTH;
 687                if (tcph->ws)
 688                        hdrlen += WS_LENGTH;
 689                if (tcph->sack_ok)
 690                        hdrlen += SACK_OK_LENGTH;
 691        }
 692        if (tcp_seg_has_ts(tcph)) {
 693                hdrlen += TS_LENGTH;
 694                /* SYNs have other opts, don't do the PREPAD NOOP optimization.
 695                 */
 696                if (!(tcph->flags & SYN))
 697                        hdrlen += TS_SEND_PREPAD;
 698        }
 699        if (tcb && tcb->rcv.nr_sacks)
 700                hdrlen += 2 + tcb->rcv.nr_sacks * 8;
 701        optpad = hdrlen & 3;
 702        if (optpad)
 703                optpad = 4 - optpad;
 704        hdrlen += optpad;
 705        *ret_hdrlen = hdrlen;
 706        *ret_optpad = optpad;
 707}
 708
/* Writes the TCP options for tcph to opt.  The bytes emitted here must
 * match the sizes computed by compute_hdrlen_optpad(); 'optpad' NOOP bytes
 * fill the tail so the header length is a multiple of 4. */
static void write_opts(Tcp *tcph, uint8_t *opt, uint16_t optpad, Tcpctl *tcb)
{
	if (tcph->flags & SYN) {
		/* SYN-only options: MSS, window scale, SACK-permitted. */
		if (tcph->mss != 0) {
			*opt++ = MSSOPT;
			*opt++ = MSS_LENGTH;
			hnputs(opt, tcph->mss);
			opt += 2;
		}
		if (tcph->ws != 0) {
			*opt++ = WSOPT;
			*opt++ = WS_LENGTH;
			*opt++ = tcph->ws;
		}
		if (tcph->sack_ok) {
			*opt++ = SACK_OK_OPT;
			*opt++ = SACK_OK_LENGTH;
		}
	}
	if (tcp_seg_has_ts(tcph)) {
		/* Two leading NOOPs on non-SYN segments - the PREPAD
		 * accounted for in compute_hdrlen_optpad(). */
		if (!(tcph->flags & SYN)) {
			*opt++ = NOOPOPT;
			*opt++ = NOOPOPT;
		}
		*opt++ = TS_OPT;
		*opt++ = TS_LENGTH;
		/* Setting TSval, our time */
		hnputl(opt, milliseconds());
		opt += 4;
		/* Setting TSecr, the time we last saw from them, stored in
		 * ts_val */
		hnputl(opt, tcph->ts_val);
		opt += 4;
	}
	if (tcb && tcb->rcv.nr_sacks) {
		*opt++ = SACK_OPT;
		*opt++ = 2 + tcb->rcv.nr_sacks * 8;
		for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
			hnputl(opt, tcb->rcv.sacks[i].left);
			opt += 4;
			hnputl(opt, tcb->rcv.sacks[i].right);
			opt += 4;
		}
	}
	/* Pad to a 4-byte boundary with NOOPs. */
	while (optpad-- > 0)
		*opt++ = NOOPOPT;
}
 757
 758/* Given a data block (or NULL) returns a block with enough header room that we
 759 * can send out.  block->wp is set to the beginning of the payload.  Returns
 760 * NULL on some sort of error. */
 761static struct block *alloc_or_pad_block(struct block *data,
 762                                        uint16_t total_hdr_size)
 763{
 764        if (data) {
 765                data = padblock(data, total_hdr_size);
 766                if (data == NULL)
 767                        return NULL;
 768        } else {
 769                /* the 64 pad is to meet mintu's */
 770                data = block_alloc(total_hdr_size + 64, MEM_WAIT);
 771                if (data == NULL)
 772                        return NULL;
 773                data->wp += total_hdr_size;
 774        }
 775        return data;
 776}
 777
/* Build an outbound v6 TCP segment: prepend header room to 'data' (which
 * may be NULL), fill the header from tcph and the prototype header 'ph',
 * write options, and compute the checksum.  Returns the finished block or
 * NULL on allocation failure. */
static struct block *htontcp6(Tcp *tcph, struct block *data, Tcp6hdr *ph,
			      Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp6hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP6_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP6_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp).  Note TCP structs include IP.
	 */
	data->network_offset = 0;
	data->transport_offset = offsetof(Tcp6hdr, tcpsport);

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp6hdr *) (data->rp);
	memmove(h, ph, TCP6_TCBPHDRSZ);

	/* compose pseudo tcp header, do cksum calculation.  The IP fields
	 * are temporarily overwritten with pseudo-header values and restored
	 * after the checksum below. */
	hnputl(h->vcf, hdrlen + dlen);
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
	h->ttl = ph->proto;

	/* copy in variable bits */
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* Header length (in words) rides in the top bits of the flag word. */
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		csum = ptclcsum(data, TCP6_IPLEN, hdrlen + dlen +
				TCP6_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
	}

	/* move from pseudo header back to normal ip header */
	memset(h->vcf, 0, 4);
	h->vcf[0] = IP_VER6;
	hnputs(h->ploadlen, hdrlen + dlen);
	h->proto = ph->proto;

	return data;
}
 830
/* Build an outbound v4 TCP segment: prepend header room to 'data' (which
 * may be NULL), fill the header from tcph and the prototype header 'ph',
 * write options, and set up checksum offload.  Returns the finished block
 * or NULL on allocation failure. */
static struct block *htontcp4(Tcp *tcph, struct block *data, Tcp4hdr *ph,
			      Tcpctl *tcb)
{
	int dlen = blocklen(data);
	Tcp4hdr *h;
	uint16_t csum;
	uint16_t hdrlen, optpad;

	compute_hdrlen_optpad(tcph, TCP4_HDRSIZE, &hdrlen, &optpad, tcb);

	data = alloc_or_pad_block(data, hdrlen + TCP4_PKT);
	if (data == NULL)
		return NULL;
	/* relative to the block start (bp->rp).  Note TCP structs include IP.*/
	data->network_offset = 0;
	data->transport_offset = offsetof(Tcp4hdr, tcpsport);

	/* copy in pseudo ip header plus port numbers */
	h = (Tcp4hdr *) (data->rp);
	memmove(h, ph, TCP4_TCBPHDRSZ);

	/* copy in variable bits */
	hnputs(h->tcplen, hdrlen + dlen);
	hnputl(h->tcpseq, tcph->seq);
	hnputl(h->tcpack, tcph->ack);
	/* Header length (in words) rides in the top bits of the flag word. */
	hnputs(h->tcpflag, (hdrlen << 10) | tcph->flags);
	hnputs(h->tcpwin, tcph->wnd >> (tcb != NULL ? tcb->snd.scale : 0));
	hnputs(h->tcpurg, tcph->urg);

	write_opts(tcph, h->tcpopt, optpad, tcb);

	if (tcb != NULL && tcb->nochecksum) {
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
	} else {
		assert(data->transport_offset == TCP4_IPLEN + TCP4_PHDRSIZE);
		/* Only the pseudo-header is summed here; Btcpck and
		 * tx_csum_offset mark the block so the rest of the sum is
		 * presumably finished later (offload or finalize path) -
		 * confirm in the IP output code. */
		csum = ~ptclcsum(data, TCP4_IPLEN, TCP4_PHDRSIZE);
		hnputs(h->tcpcksum, csum);
		data->tx_csum_offset = ph->tcpcksum - ph->tcpsport;
		data->flag |= Btcpck;
	}

	return data;
}
 874
 875static void parse_inbound_sacks(Tcp *tcph, uint8_t *opt, uint16_t optlen)
 876{
 877        uint8_t nr_sacks;
 878        uint32_t left, right;
 879
 880        nr_sacks = (optlen - 2) / 8;
 881        if (nr_sacks > MAX_NR_SACKS_PER_PACKET)
 882                return;
 883        opt += 2;
 884        for (int i = 0; i < nr_sacks; i++, opt += 8) {
 885                left = nhgetl(opt);
 886                right = nhgetl(opt + 4);
 887                if (seq_ge(left, right)) {
 888                        /* bad / malicious SACK.  Skip it, and adjust. */
 889                        nr_sacks--;
 890                        i--;    /* stay on this array element next loop */
 891                        continue;
 892                }
 893                tcph->sacks[i].left = left;
 894                tcph->sacks[i].right = right;
 895        }
 896        tcph->nr_sacks = nr_sacks;
 897}
 898
 899static void parse_inbound_opts(Tcp *tcph, uint8_t *opt, uint16_t optsize)
 900{
 901        uint16_t optlen;
 902
 903        while (optsize > 0 && *opt != EOLOPT) {
 904                if (*opt == NOOPOPT) {
 905                        optsize--;
 906                        opt++;
 907                        continue;
 908                }
 909                optlen = opt[1];
 910                if (optlen < 2 || optlen > optsize)
 911                        break;
 912                switch (*opt) {
 913                case MSSOPT:
 914                        if (optlen == MSS_LENGTH)
 915                                tcph->mss = nhgets(opt + 2);
 916                        break;
 917                case WSOPT:
 918                        if (optlen == WS_LENGTH && *(opt + 2) <= MAX_WS_VALUE)
 919                                tcph->ws = HaveWS | *(opt + 2);
 920                        break;
 921                case SACK_OK_OPT:
 922                        if (optlen == SACK_OK_LENGTH)
 923                                tcph->sack_ok = TRUE;
 924                        break;
 925                case SACK_OPT:
 926                        parse_inbound_sacks(tcph, opt, optlen);
 927                        break;
 928                case TS_OPT:
 929                        if (optlen == TS_LENGTH) {
 930                                tcph->ts_val = nhgetl(opt + 2);
 931                                tcph->ts_ecr = nhgetl(opt + 6);
 932                        }
 933                        break;
 934                }
 935                optsize -= optlen;
 936                opt += optlen;
 937        }
 938}
 939
 940/* Helper, clears the opts.  We'll later set them with e.g. parse_inbound_opts,
 941 * set them manually, or something else. */
 942static void clear_tcph_opts(Tcp *tcph)
 943{
 944        tcph->mss = 0;
 945        tcph->ws = 0;
 946        tcph->sack_ok = FALSE;
 947        tcph->nr_sacks = 0;
 948        tcph->ts_val = 0;
 949        tcph->ts_ecr = 0;
 950}
 951
 952static int ntohtcp6(Tcp *tcph, struct block **bpp)
 953{
 954        Tcp6hdr *h;
 955        uint16_t hdrlen;
 956
 957        *bpp = pullupblock(*bpp, TCP6_PKT + TCP6_HDRSIZE);
 958        if (*bpp == NULL)
 959                return -1;
 960
 961        h = (Tcp6hdr *) ((*bpp)->rp);
 962        tcph->source = nhgets(h->tcpsport);
 963        tcph->dest = nhgets(h->tcpdport);
 964        tcph->seq = nhgetl(h->tcpseq);
 965        tcph->ack = nhgetl(h->tcpack);
 966        hdrlen = (h->tcpflag[0] >> 2) & ~3;
 967        if (hdrlen < TCP6_HDRSIZE) {
 968                freeblist(*bpp);
 969                return -1;
 970        }
 971
 972        tcph->flags = h->tcpflag[1];
 973        tcph->wnd = nhgets(h->tcpwin);
 974        tcph->urg = nhgets(h->tcpurg);
 975        clear_tcph_opts(tcph);
 976        tcph->len = nhgets(h->ploadlen) - hdrlen;
 977
 978        *bpp = pullupblock(*bpp, hdrlen + TCP6_PKT);
 979        if (*bpp == NULL)
 980                return -1;
 981        parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP6_HDRSIZE);
 982        return hdrlen;
 983}
 984
 985static int ntohtcp4(Tcp *tcph, struct block **bpp)
 986{
 987        Tcp4hdr *h;
 988        uint16_t hdrlen;
 989
 990        *bpp = pullupblock(*bpp, TCP4_PKT + TCP4_HDRSIZE);
 991        if (*bpp == NULL)
 992                return -1;
 993
 994        h = (Tcp4hdr *) ((*bpp)->rp);
 995        tcph->source = nhgets(h->tcpsport);
 996        tcph->dest = nhgets(h->tcpdport);
 997        tcph->seq = nhgetl(h->tcpseq);
 998        tcph->ack = nhgetl(h->tcpack);
 999
1000        hdrlen = (h->tcpflag[0] >> 2) & ~3;
1001        if (hdrlen < TCP4_HDRSIZE) {
1002                freeblist(*bpp);
1003                return -1;
1004        }
1005
1006        tcph->flags = h->tcpflag[1];
1007        tcph->wnd = nhgets(h->tcpwin);
1008        tcph->urg = nhgets(h->tcpurg);
1009        clear_tcph_opts(tcph);
1010        tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1011
1012        *bpp = pullupblock(*bpp, hdrlen + TCP4_PKT);
1013        if (*bpp == NULL)
1014                return -1;
1015        parse_inbound_opts(tcph, h->tcpopt, hdrlen - TCP4_HDRSIZE);
1016        return hdrlen;
1017}
1018
1019/*
1020 *  For outgoing calls, generate an initial sequence
1021 *  number and put a SYN on the send queue
1022 */
1023static void tcpsndsyn(struct conv *s, Tcpctl *tcb)
1024{
1025        urandom_read(&tcb->iss, sizeof(tcb->iss));
1026        tcb->rttseq = tcb->iss;
1027        tcb->snd.wl2 = tcb->iss;
1028        tcb->snd.una = tcb->iss;
1029        tcb->snd.rtx = tcb->rttseq;
1030        tcb->snd.nxt = tcb->rttseq;
1031        tcb->flgcnt++;
1032        tcb->flags |= FORCE;
1033        tcb->sndsyntime = NOW;
1034
1035        /* set desired mss and scale */
1036        tcb->mss = tcpmtu(tcb->ifc, s->ipversion, &tcb->scale);
1037}
1038
/* Send a RST in reply to an unacceptable inbound segment 'seg' (which arrived
 * from 'source' to 'dest' carrying 'length' payload bytes).  The reply swaps
 * source/dest and is built from a stack-local pseudo header - there is no
 * conversation for this segment.  'seg' is clobbered and reused as the
 * outgoing segment.  Per RFC 793, never send a RST in response to a RST. */
static void sndrst(struct Proto *tcp, uint8_t *source, uint8_t *dest,
                   uint16_t length, Tcp *seg, uint8_t version, char *reason)
{
	struct block *hbp;
	uint8_t rflags;
	struct tcppriv *tpriv;
	Tcp4hdr ph4;
	Tcp6hdr ph6;

	netlog(tcp->f, Logtcpreset, "sndrst: %s\n", reason);

	tpriv = tcp->priv;

	if (seg->flags & RST)
		return;

	/* make pseudo header */
	switch (version) {
	case V4:
		memset(&ph4, 0, sizeof(ph4));
		ph4.vihl = IP_VER4;
		/* replying, so our src is their dest and vice versa */
		v6tov4(ph4.tcpsrc, dest);
		v6tov4(ph4.tcpdst, source);
		ph4.proto = IP_TCPPROTO;
		hnputs(ph4.tcplen, TCP4_HDRSIZE);
		hnputs(ph4.tcpsport, seg->dest);
		hnputs(ph4.tcpdport, seg->source);
		break;
	case V6:
		memset(&ph6, 0, sizeof(ph6));
		ph6.vcf[0] = IP_VER6;
		ipmove(ph6.tcpsrc, dest);
		ipmove(ph6.tcpdst, source);
		ph6.proto = IP_TCPPROTO;
		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
		hnputs(ph6.tcpsport, seg->dest);
		hnputs(ph6.tcpdport, seg->source);
		break;
	default:
		panic("sndrst: version %d", version);
	}

	tpriv->stats[OutRsts]++;
	rflags = RST;

	/* convince the other end that this reset is in band */
	if (seg->flags & ACK) {
		/* RFC 793: if the offender carried an ACK, reset with
		 * seq = their ack, no ACK flag */
		seg->seq = seg->ack;
		seg->ack = 0;
	} else {
		/* otherwise seq = 0 and ack past everything they sent
		 * (SYN and FIN each consume one sequence number) */
		rflags |= ACK;
		seg->ack = seg->seq;
		seg->seq = 0;
		if (seg->flags & SYN)
			seg->ack++;
		seg->ack += length;
		if (seg->flags & FIN)
			seg->ack++;
	}
	/* strip all options/window state; this is a bare RST */
	seg->flags = rflags;
	seg->wnd = 0;
	seg->urg = 0;
	seg->mss = 0;
	seg->ws = 0;
	seg->sack_ok = FALSE;
	seg->nr_sacks = 0;
	/* seg->ts_val is already set with their timestamp */
	switch (version) {
	case V4:
		/* no TCB for this packet; htontcp* can fail on allocation */
		hbp = htontcp4(seg, NULL, &ph4, NULL);
		if (hbp == NULL)
			return;
		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
		break;
	case V6:
		hbp = htontcp6(seg, NULL, &ph6, NULL);
		if (hbp == NULL)
			return;
		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
		break;
	default:
		panic("sndrst2: version %d", version);
	}
}
1123
1124/*
1125 *  send a reset to the remote side and close the conversation
1126 *  called with s qlocked
1127 */
1128static void tcphangup(struct conv *s)
1129{
1130        ERRSTACK(1);
1131        Tcp seg;
1132        Tcpctl *tcb;
1133        struct block *hbp;
1134
1135        tcb = (Tcpctl *) s->ptcl;
1136        if (ipcmp(s->raddr, IPnoaddr)) {
1137                /* discard error style, poperror regardless */
1138                if (!waserror()) {
1139                        seg.flags = RST | ACK;
1140                        seg.ack = tcb->rcv.nxt;
1141                        tcb->last_ack_sent = seg.ack;
1142                        tcb->rcv.una = 0;
1143                        seg.seq = tcb->snd.nxt;
1144                        seg.wnd = 0;
1145                        seg.urg = 0;
1146                        seg.mss = 0;
1147                        seg.ws = 0;
1148                        seg.sack_ok = FALSE;
1149                        seg.nr_sacks = 0;
1150                        seg.ts_val = tcb->ts_recent;
1151                        switch (s->ipversion) {
1152                        case V4:
1153                                tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1154                                hbp = htontcp4(&seg, NULL,
1155                                               &tcb->protohdr.tcp4hdr, tcb);
1156                                ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1157                                break;
1158                        case V6:
1159                                tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1160                                hbp = htontcp6(&seg, NULL,
1161                                               &tcb->protohdr.tcp6hdr, tcb);
1162                                ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1163                                break;
1164                        default:
1165                                panic("tcphangup: version %d", s->ipversion);
1166                        }
1167                }
1168                poperror();
1169        }
1170        localclose(s, NULL);
1171}
1172
1173/*
1174 *  (re)send a SYN ACK
1175 */
1176static int sndsynack(struct Proto *tcp, Limbo *lp)
1177{
1178        struct block *hbp;
1179        Tcp4hdr ph4;
1180        Tcp6hdr ph6;
1181        Tcp seg;
1182        int scale;
1183        uint8_t flag = 0;
1184
1185        /* make pseudo header */
1186        switch (lp->version) {
1187        case V4:
1188                memset(&ph4, 0, sizeof(ph4));
1189                ph4.vihl = IP_VER4;
1190                v6tov4(ph4.tcpsrc, lp->laddr);
1191                v6tov4(ph4.tcpdst, lp->raddr);
1192                ph4.proto = IP_TCPPROTO;
1193                hnputs(ph4.tcplen, TCP4_HDRSIZE);
1194                hnputs(ph4.tcpsport, lp->lport);
1195                hnputs(ph4.tcpdport, lp->rport);
1196                break;
1197        case V6:
1198                memset(&ph6, 0, sizeof(ph6));
1199                ph6.vcf[0] = IP_VER6;
1200                ipmove(ph6.tcpsrc, lp->laddr);
1201                ipmove(ph6.tcpdst, lp->raddr);
1202                ph6.proto = IP_TCPPROTO;
1203                hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1204                hnputs(ph6.tcpsport, lp->lport);
1205                hnputs(ph6.tcpdport, lp->rport);
1206                break;
1207        default:
1208                panic("sndrst: version %d", lp->version);
1209        }
1210        lp->ifc = findipifc(tcp->f, lp->laddr, 0);
1211
1212        seg.seq = lp->iss;
1213        seg.ack = lp->irs + 1;
1214        seg.flags = SYN | ACK;
1215        seg.urg = 0;
1216        seg.mss = tcpmtu(lp->ifc, lp->version, &scale);
1217        seg.wnd = QMAX;
1218        seg.ts_val = lp->ts_val;
1219        seg.nr_sacks = 0;
1220
1221        /* if the other side set scale, we should too */
1222        if (lp->rcvscale) {
1223                seg.ws = scale;
1224                lp->sndscale = scale;
1225        } else {
1226                seg.ws = 0;
1227                lp->sndscale = 0;
1228        }
1229        if (SACK_SUPPORTED)
1230                seg.sack_ok = lp->sack_ok;
1231        else
1232                seg.sack_ok = FALSE;
1233
1234        switch (lp->version) {
1235        case V4:
1236                hbp = htontcp4(&seg, NULL, &ph4, NULL);
1237                if (hbp == NULL)
1238                        return -1;
1239                ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1240                break;
1241        case V6:
1242                hbp = htontcp6(&seg, NULL, &ph6, NULL);
1243                if (hbp == NULL)
1244                        return -1;
1245                ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, NULL);
1246                break;
1247        default:
1248                panic("sndsnack: version %d", lp->version);
1249        }
1250        lp->lastsend = NOW;
1251        return 0;
1252}
1253
1254#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
1255
1256/*
1257 *  put a call into limbo and respond with a SYN ACK
1258 *
1259 *  called with proto locked
1260 */
/* Record an inbound SYN in the limbo hash table (or refresh an existing
 * entry for a retransmitted SYN), then answer with a SYN ACK.  No conv/TCB is
 * allocated until the handshake completes in tcpincoming(). */
static void limbo(struct conv *s, uint8_t *source, uint8_t *dest, Tcp *seg,
                  int version)
{
	Limbo *lp, **l;
	struct tcppriv *tpriv;
	int h;

	tpriv = s->p->priv;
	h = hashipa(source, seg->source);

	/* look for an existing entry for this 4-tuple; l is left pointing at
	 * the matching link, or at the terminating NULL link if none */
	for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
		lp = *l;
		if (lp->lport != seg->dest || lp->rport != seg->source
			|| lp->version != version)
			continue;
		if (ipcmp(lp->raddr, source) != 0)
			continue;
		if (ipcmp(lp->laddr, dest) != 0)
			continue;

		/* each new SYN restarts the retransmits */
		lp->irs = seg->seq;
		break;
	}
	lp = *l;
	if (lp == NULL) {
		/* at the limbo cap, evict the oldest entry in this bucket and
		 * reuse it rather than allocating */
		if (tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]) {
			lp = tpriv->lht[h];
			tpriv->lht[h] = lp->next;
			lp->next = NULL;
		} else {
			lp = kzmalloc(sizeof(*lp), 0);
			if (lp == NULL)
				return;
			tpriv->nlimbo++;
		}
		/* NOTE(review): in the eviction path, l can point at the
		 * next link of the very entry just popped off this bucket
		 * (e.g. a single-element chain), in which case *l = lp links
		 * lp to itself while the bucket head stays NULL - entry
		 * leaked.  TODO confirm and fix upstream. */
		*l = lp;
		lp->version = version;
		ipmove(lp->laddr, dest);
		ipmove(lp->raddr, source);
		lp->lport = seg->dest;
		lp->rport = seg->source;
		lp->mss = seg->mss;
		lp->rcvscale = seg->ws;
		lp->sack_ok = seg->sack_ok;
		lp->irs = seg->seq;
		lp->ts_val = seg->ts_val;
		/* random initial send sequence for the eventual connection */
		urandom_read(&lp->iss, sizeof(lp->iss));
	}

	/* couldn't build/send the SYN ACK: drop the entry */
	if (sndsynack(s->p, lp) < 0) {
		*l = lp->next;
		tpriv->nlimbo--;
		kfree(lp);
	}
}
1317
1318/*
1319 *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1320 */
static void limborexmit(struct Proto *tcp)
{
	struct tcppriv *tpriv;
	Limbo **l, *lp;
	int h;
	int seen;
	uint64_t now;

	tpriv = tcp->priv;

	/* best effort: if someone else holds the proto lock, skip this pass */
	if (!canqlock(&tcp->qlock))
		return;
	seen = 0;
	now = NOW;
	/* 'seen' bounds the total work to nlimbo entries across all buckets */
	for (h = 0; h < NLHT && seen < tpriv->nlimbo; h++) {
		for (l = &tpriv->lht[h]; *l != NULL && seen < tpriv->nlimbo;) {
			lp = *l;
			seen++;
			/* backoff: wait (rexmits + 1) timer periods between
			 * sends.  NOTE(review): this 'continue' does not
			 * advance l, so the same entry is re-examined (and
			 * later entries in this bucket skipped) until 'seen'
			 * hits nlimbo - looks inherited from the original
			 * code; confirm whether intentional. */
			if (now - lp->lastsend <
			    (lp->rexmits + 1) * SYNACK_RXTIMER)
				continue;

			/* time it out after 1 second */
			if (++(lp->rexmits) > 5) {
				tpriv->nlimbo--;
				*l = lp->next;
				kfree(lp);
				continue;
			}

			/* if we're being attacked, don't bother resending SYN
			 * ACK's */
			if (tpriv->nlimbo > 100)
				continue;

			if (sndsynack(tcp, lp) < 0) {
				tpriv->nlimbo--;
				*l = lp->next;
				kfree(lp);
				continue;
			}

			/* entry kept: advance to the next link */
			l = &lp->next;
		}
	}
	qunlock(&tcp->qlock);
}
1368
1369/*
1370 *  lookup call in limbo.  if found, throw it out.
1371 *
1372 *  called with proto locked
1373 */
1374static void limborst(struct conv *s, Tcp *segp, uint8_t *src, uint8_t *dst,
1375                     uint8_t version)
1376{
1377        Limbo *lp, **l;
1378        int h;
1379        struct tcppriv *tpriv;
1380
1381        tpriv = s->p->priv;
1382
1383        /* find a call in limbo */
1384        h = hashipa(src, segp->source);
1385        for (l = &tpriv->lht[h]; *l != NULL; l = &lp->next) {
1386                lp = *l;
1387                if (lp->lport != segp->dest || lp->rport != segp->source
1388                        || lp->version != version)
1389                        continue;
1390                if (ipcmp(lp->laddr, dst) != 0)
1391                        continue;
1392                if (ipcmp(lp->raddr, src) != 0)
1393                        continue;
1394
1395                /* RST can only follow the SYN */
1396                if (segp->seq == lp->irs + 1) {
1397                        tpriv->nlimbo--;
1398                        *l = lp->next;
1399                        kfree(lp);
1400                }
1401                break;
1402        }
1403}
1404
1405/* The advertised MSS (e.g. 1460) includes any per-packet TCP options, such as
1406 * TCP timestamps.  A given packet will contain mss bytes, but only typical_mss
1407 * bytes of *data*.  If we know we'll use those options, we should adjust our
1408 * typical_mss, which will affect the cwnd. */
1409static void adjust_typical_mss_for_opts(Tcp *tcph, Tcpctl *tcb)
1410{
1411        uint16_t opt_size = 0;
1412
1413        if (tcph->ts_val)
1414                opt_size += TS_LENGTH + TS_SEND_PREPAD;
1415        opt_size = ROUNDUP(opt_size, 4);
1416        tcb->typical_mss -= opt_size;
1417}
1418
1419/*
1420 *  come here when we finally get an ACK to our SYN-ACK.
1421 *  lookup call in limbo.  if found, create a new conversation
1422 *
1423 *  called with proto locked
1424 */
static struct conv *tcpincoming(struct conv *s, Tcp *segp, uint8_t *src,
								uint8_t *dst, uint8_t version)
{
	struct conv *new;
	Tcpctl *tcb;
	struct tcppriv *tpriv;
	Tcp4hdr *h4;
	Tcp6hdr *h6;
	Limbo *lp, **l;
	int h;

	/* unless it's just an ack, it can't be someone coming out of limbo */
	if ((segp->flags & SYN) || (segp->flags & ACK) == 0)
		return NULL;

	tpriv = s->p->priv;

	/* find a call in limbo */
	h = hashipa(src, segp->source);
	for (l = &tpriv->lht[h]; (lp = *l) != NULL; l = &lp->next) {
		netlog(s->p->f, Logtcp,
			   "tcpincoming s %I!%d/%I!%d d %I!%d/%I!%d v %d/%d\n",
			   src, segp->source, lp->raddr, lp->rport, dst,
			   segp->dest, lp->laddr, lp->lport, version,
			   lp->version);

		if (lp->lport != segp->dest || lp->rport != segp->source
			|| lp->version != version)
			continue;
		if (ipcmp(lp->laddr, dst) != 0)
			continue;
		if (ipcmp(lp->raddr, src) != 0)
			continue;

		/* we're assuming no data with the initial SYN */
		if (segp->seq != lp->irs + 1 || segp->ack != lp->iss + 1) {
			netlog(s->p->f, Logtcp,
			       "tcpincoming s 0x%lx/0x%lx a 0x%lx 0x%lx\n",
			       segp->seq, lp->irs + 1, segp->ack, lp->iss + 1);
			/* wrong seq/ack: not our handshake; NULL signals
			 * "no incoming call" below */
			lp = NULL;
		} else {
			/* unlink the limbo entry; it's consumed below */
			tpriv->nlimbo--;
			*l = lp->next;
		}
		break;
	}
	if (lp == NULL)
		return NULL;

	/* create the new conversation for the accepted connection */
	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
	if (new == NULL)
		return NULL;

	/* clone the listener's TCB, then detach the timers from it */
	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
	tcb = (Tcpctl *) new->ptcl;
	tcb->flags &= ~CLONE;
	tcb->timer.arg = new;
	tcb->timer.state = TcptimerOFF;
	tcb->acktimer.arg = new;
	tcb->acktimer.state = TcptimerOFF;
	tcb->katimer.arg = new;
	tcb->katimer.state = TcptimerOFF;
	tcb->rtt_timer.arg = new;
	tcb->rtt_timer.state = TcptimerOFF;

	/* receive side: their SYN consumed one sequence number */
	tcb->irs = lp->irs;
	tcb->rcv.nxt = tcb->irs + 1;
	tcb->rcv.urg = tcb->rcv.nxt;

	/* send side: our SYN (acked by this segment) consumed one too */
	tcb->iss = lp->iss;
	tcb->rttseq = tcb->iss;
	tcb->snd.wl2 = tcb->iss;
	tcb->snd.una = tcb->iss + 1;
	tcb->snd.rtx = tcb->iss + 1;
	tcb->snd.nxt = tcb->iss + 1;
	tcb->flgcnt = 0;
	tcb->flags |= SYNACK;

	/* our sending max segment size cannot be bigger than what he asked for
	 */
	if (lp->mss != 0 && lp->mss < tcb->mss) {
		tcb->mss = lp->mss;
		tcb->typical_mss = tcb->mss;
	}
	adjust_typical_mss_for_opts(segp, tcb);

	/* Here's where we record the previously-decided header options.  They
	 * were actually decided on when we agreed to them in the SYNACK we
	 * sent.  We didn't create an actual TCB until now, so we can copy those
	 * decisions out of the limbo tracker and into the TCB. */
	tcb->ifc = lp->ifc;
	tcb->sack_ok = lp->sack_ok;
	/* window scaling */
	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
	tcb_check_tso(tcb);

	tcb->snd.wnd = segp->wnd;
	tcb->cwind = tcb->typical_mss * CWIND_SCALE;

	/* set initial round trip time */
	tcb->sndsyntime = lp->lastsend + lp->rexmits * SYNACK_RXTIMER;
	tcpsynackrtt(new);

	kfree(lp);

	/* set up proto header */
	switch (version) {
	case V4:
		h4 = &tcb->protohdr.tcp4hdr;
		memset(h4, 0, sizeof(*h4));
		h4->proto = IP_TCPPROTO;
		hnputs(h4->tcpsport, new->lport);
		hnputs(h4->tcpdport, new->rport);
		v6tov4(h4->tcpsrc, dst);
		v6tov4(h4->tcpdst, src);
		break;
	case V6:
		h6 = &tcb->protohdr.tcp6hdr;
		memset(h6, 0, sizeof(*h6));
		h6->proto = IP_TCPPROTO;
		hnputs(h6->tcpsport, new->lport);
		hnputs(h6->tcpdport, new->rport);
		ipmove(h6->tcpsrc, dst);
		ipmove(h6->tcpdst, src);
		break;
	default:
		panic("tcpincoming: version %d", new->ipversion);
	}

	tcpsetstate(new, Established);

	/* make the new conv findable by inbound packets */
	iphtadd(&tpriv->ht, new);

	return new;
}
1560
1561/*
1562 *  use the time between the first SYN and it's ack as the
1563 *  initial round trip time
1564 */
1565static void tcpsynackrtt(struct conv *s)
1566{
1567        Tcpctl *tcb;
1568        uint64_t delta;
1569        struct tcppriv *tpriv;
1570
1571        tcb = (Tcpctl *) s->ptcl;
1572        tpriv = s->p->priv;
1573
1574        delta = NOW - tcb->sndsyntime;
1575        tcb->srtt = delta;
1576        tcb->mdev = delta / 2;
1577
1578        /* halt round trip timer */
1579        tcphalt(tpriv, &tcb->rtt_timer);
1580}
1581
1582/* For LFNs (long/fat), our default tx queue doesn't hold enough data, and TCP
1583 * blocks on the application - even if the app already has the data ready to go.
1584 * We need to hold the sent, unacked data (1x cwnd), plus all the data we might
1585 * send next RTT (1x cwnd).  Note this is called after cwnd was expanded. */
1586static void adjust_tx_qio_limit(struct conv *s)
1587{
1588        Tcpctl *tcb = (Tcpctl *) s->ptcl;
1589        size_t ideal_limit = tcb->cwind * 2;
1590
1591        /* This is called for every ACK, and it's not entirely free to update
1592         * the limit (locks, CVs, taps).  Updating in chunks of mss seems
1593         * reasonable.  During SS, we'll update this on most ACKs (given each
1594         * ACK increased the cwind by > MSS).
1595         *
1596         * We also don't want a lot of tiny blocks from the user, but the way
1597         * qio works, you can put in as much as you want (Maxatomic) and then
1598         * get flow-controlled. */
1599        if (qgetlimit(s->wq) + tcb->typical_mss < ideal_limit)
1600                qsetlimit(s->wq, ideal_limit);
1601        /* TODO: we could shrink the qio limit too, if we had a better idea what
1602         * the actual threshold was.  We want the limit to be the 'stable' cwnd
1603         * times 2. */
1604}
1605
1606/* Attempts to merge later sacks into sack 'into' (index in the array) */
/* Attempts to merge later sacks into sack 'into' (index in the array) */
static void merge_sacks_into(Tcpctl *tcb, int into)
{
	struct sack_block *into_sack = &tcb->snd.sacks[into];
	struct sack_block *tcb_sack;
	int shift = 0;

	/* the array is kept sorted by left edge, so absorb successors until
	 * one starts beyond into_sack's right edge */
	for (int i = into + 1; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(into_sack->right, tcb_sack->left))
			break;
		/* overlapping/adjacent: extend into_sack if needed */
		if (seq_gt(tcb_sack->right, into_sack->right))
			into_sack->right = tcb_sack->right;
		shift++;
	}
	if (shift) {
		/* close the gap left by the 'shift' absorbed entries */
		memmove(tcb->snd.sacks + into + 1,
			tcb->snd.sacks + into + 1 + shift,
			sizeof(struct sack_block) * (tcb->snd.nr_sacks - into -
						     1 - shift));
		tcb->snd.nr_sacks -= shift;
	}
}
1629
1630/* If we update a sack, it means they received a packet (possibly out of order),
1631 * but they have not received earlier packets.  Otherwise, they would do a full
1632 * ACK.
1633 *
1634 * The trick is in knowing whether the reception growing this sack is due to a
1635 * retrans or due to packets from before our last loss event.  The rightmost
1636 * sack tends to grow a lot with packets we sent before the loss.  However,
1637 * intermediate sacks that grow are signs of a loss, since they only grow as a
1638 * result of retrans.
1639 *
1640 * This is only true for the first time through a retrans.  After we've gone
1641 * through a full retrans blast, the sack that hinted at the retrans loss (and
1642 * there could be multiple of them!) will continue to grow.  We could come up
1643 * with some tracking for this, but instead we'll just do a one-time deal.  You
1644 * can recover from one detected sack retrans loss.  After that, you'll have to
1645 * use the RTO.
1646 *
1647 * This won't catch some things, like a sack that grew and merged with the
1648 * rightmost sack.  This also won't work if you have a single sack.  We can't
1649 * tell where the retrans ends and the sending begins. */
1650static bool sack_hints_at_loss(Tcpctl *tcb, struct sack_block *tcb_sack)
1651{
1652        if (tcb->snd.recovery != SACK_RETRANS_RECOVERY)
1653                return FALSE;
1654        return &tcb->snd.sacks[tcb->snd.nr_sacks - 1] != tcb_sack;
1655}
1656
1657static bool sack_contains(struct sack_block *tcb_sack, uint32_t seq)
1658{
1659        return seq_le(tcb_sack->left, seq) && seq_lt(seq, tcb_sack->right);
1660}
1661
1662/* Debugging helper! */
1663static void sack_asserter(Tcpctl *tcb, char *str)
1664{
1665        struct sack_block *tcb_sack;
1666
1667        for (int i = 0; i < tcb->snd.nr_sacks; i++) {
1668                tcb_sack = &tcb->snd.sacks[i];
1669                /* Checking invariants: snd.rtx is never inside a sack, sacks
1670                 * are always mutually exclusive. */
1671                if (sack_contains(tcb_sack, tcb->snd.rtx) ||
1672                    ((i + 1 < tcb->snd.nr_sacks) &&
1673                     seq_ge(tcb_sack->right, (tcb_sack + 1)->left))) {
1674                        printk("SACK ASSERT ERROR at %s\n", str);
1675                        printk("rtx %u una %u nxt %u, sack [%u, %u)\n",
1676                               tcb->snd.rtx, tcb->snd.una, tcb->snd.nxt,
1677                               tcb_sack->left, tcb_sack->right);
1678                        for (int i = 0; i < tcb->snd.nr_sacks; i++)
1679                                printk("\t %d: [%u, %u)\n", i,
1680                                       tcb->snd.sacks[i].left,
1681                                       tcb->snd.sacks[i].right);
1682                        backtrace();
1683                        panic("");
1684                }
1685        }
1686}
1687
1688/* Updates bookkeeping whenever a sack is added or updated */
/* Updates bookkeeping whenever a sack is added or updated */
static void sack_has_changed(struct conv *s, Tcpctl *tcb,
                             struct sack_block *tcb_sack)
{
	/* Due to the change, snd.rtx might be in the middle of this sack.
	 * Advance it to the right edge. */
	if (sack_contains(tcb_sack, tcb->snd.rtx))
		tcb->snd.rtx = tcb_sack->right;

	/* This is a sack for something we retransed and we think it means there
	 * was another loss.  Instead of waiting for the RTO, we can take
	 * action. */
	if (sack_hints_at_loss(tcb, tcb_sack)) {
		/* require TCPREXMTTHRESH hints before acting, like dupack
		 * fast retransmit */
		if (++tcb->snd.sack_loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: sack rxmit loss: snd.rtx %u, sack [%u,%u), una %u, recovery_pt %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.rtx, tcb_sack->left, tcb_sack->right,
			       tcb->snd.una, tcb->snd.recovery_pt);
			/* Redo retrans, but keep the sacks and recovery point*/
			tcp_loss_event(s, tcb);
			tcb->snd.rtx = tcb->snd.una;
			tcb->snd.sack_loss_hint = 0;
			/* Act like an RTO.  We just detected it earlier.  This
			 * prevents us from getting another sack hint loss this
			 * recovery period and from advancing the opportunistic
			 * right edge. */
			tcb->snd.recovery = RTO_RETRANS_RECOVERY;
			/* We didn't actually time out yet and we expect to keep
			 * getting sacks, so we don't want to flush or worry
			 * about in_flight.  If we messed something up, the RTO
			 * will still fire. */
			set_in_flight(tcb);
		}
	}
}
1724
1725/* Advances tcb_sack's right edge, if new_right is farther, and updates the
1726 * bookkeeping due to the change. */
1727static void update_right_edge(struct conv *s, Tcpctl *tcb,
1728                              struct sack_block *tcb_sack, uint32_t new_right)
1729{
1730        if (seq_le(new_right, tcb_sack->right))
1731                return;
1732        tcb_sack->right = new_right;
1733        merge_sacks_into(tcb, tcb_sack - tcb->snd.sacks);
1734        sack_has_changed(s, tcb, tcb_sack);
1735}
1736
/* Applies one sack from an incoming segment (seg_sack) to the sorted array of
 * snd-side sacks in the TCB.  Possible outcomes:
 * - seg_sack is adjacent to / overlaps an existing sack: grow its right edge.
 * - seg_sack shares a left edge with an existing sack: grow its right edge.
 * - seg_sack sorts before some existing sack: insert it there, shifting the
 *   rest right, if there is room.
 * - No room: extend or replace the rightmost sack, since we must always
 *   maintain the max sack (see update_sacks()). */
static void update_or_insert_sack(struct conv *s, Tcpctl *tcb,
                                  struct sack_block *seg_sack)
{
	struct sack_block *tcb_sack;

	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb_sack->left, seg_sack->left)) {
			/* This includes adjacent (which I've seen!) and
			 * overlap. */
			if (seq_le(seg_sack->left, tcb_sack->right)) {
				update_right_edge(s, tcb, tcb_sack,
						  seg_sack->right);
				return;
			}
			/* seg_sack is entirely right of tcb_sack; keep
			 * scanning. */
			continue;
		}
		/* Update existing sack */
		if (tcb_sack->left == seg_sack->left) {
			update_right_edge(s, tcb, tcb_sack, seg_sack->right);
			return;
		}
		/* Found our slot */
		if (seq_gt(tcb_sack->left, seg_sack->left)) {
			if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
				/* Out of room, but it is possible this sack
				 * overlaps later sacks, including the max
				 * sack's right edge. */
				if (seq_ge(seg_sack->right, tcb_sack->left)) {
					/* Take over the sack */
					tcb_sack->left = seg_sack->left;
					update_right_edge(s, tcb, tcb_sack,
							  seg_sack->right);
				}
				return;
			}
			/* O/W, it's our slot and we have room (at least one
			 * spot). */
			memmove(&tcb->snd.sacks[i + 1], &tcb->snd.sacks[i],
				sizeof(struct sack_block) * (tcb->snd.nr_sacks -
							     i));
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			tcb->snd.nr_sacks++;
			merge_sacks_into(tcb, i);
			sack_has_changed(s, tcb, tcb_sack);
			return;
		}
	}
	/* seg_sack sorts after every existing sack. */
	if (tcb->snd.nr_sacks == MAX_NR_SND_SACKS) {
		/* We didn't find space in the sack array. */
		tcb_sack = &tcb->snd.sacks[MAX_NR_SND_SACKS - 1];
		/* Need to always maintain the rightmost sack, discarding the
		 * prev */
		if (seq_gt(seg_sack->right, tcb_sack->right)) {
			tcb_sack->left = seg_sack->left;
			tcb_sack->right = seg_sack->right;
			sack_has_changed(s, tcb, tcb_sack);
		}
		return;
	}
	/* Append into the free slot at the end. */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks];
	tcb->snd.nr_sacks++;
	tcb_sack->left = seg_sack->left;
	tcb_sack->right = seg_sack->right;
	sack_has_changed(s, tcb, tcb_sack);
}
1804
1805/* Given the packet seg, track the sacks in TCB.  There are a few things: if seg
1806 * acks new data, some sacks might no longer be needed.  Some sacks might grow,
1807 * we might add new sacks, either of which can cause a merger.
1808 *
1809 * The important thing is that we always have the max sack entry: it must be
1810 * inserted for sure and findable.  We need that for our measurement of what
1811 * packets are in the network.
1812 *
1813 * Note that we keep sacks that are below snd.rtx (and above
1814 * seg.ack/tcb->snd.una) as best we can - we don't prune them.  We'll need those
1815 * for the in_flight estimate.
1816 *
1817 * When we run out of room, we'll have to throw away a sack.  Anything we throw
1818 * away below snd.rtx will be counted as 'in flight', even though it isn't.  If
1819 * we throw away something greater than snd.rtx, we'll also retrans it.  For
1820 * simplicity, we throw-away / replace the rightmost sack, since we're always
1821 * maintaining a highest sack. */
1822static void update_sacks(struct conv *s, Tcpctl *tcb, Tcp *seg)
1823{
1824        int prune = 0;
1825        struct sack_block *tcb_sack;
1826
1827        for (int i = 0; i < tcb->snd.nr_sacks; i++) {
1828                tcb_sack = &tcb->snd.sacks[i];
1829                /* For the equality case, if they acked up to, but not including
1830                 * an old sack, they must have reneged it.  Otherwise they would
1831                 * have acked beyond the sack. */
1832                if (seq_lt(seg->ack, tcb_sack->left))
1833                        break;
1834                prune++;
1835        }
1836        if (prune) {
1837                memmove(tcb->snd.sacks, tcb->snd.sacks + prune,
1838                        sizeof(struct sack_block) * (tcb->snd.nr_sacks -
1839                                                     prune));
1840                tcb->snd.nr_sacks -= prune;
1841        }
1842        for (int i = 0; i < seg->nr_sacks; i++) {
1843                /* old sacks */
1844                if (seq_lt(seg->sacks[i].left, seg->ack))
1845                        continue;
1846                /* buggy sack: out of range */
1847                if (seq_gt(seg->sacks[i].right, tcb->snd.nxt))
1848                        continue;
1849                update_or_insert_sack(s, tcb, &seg->sacks[i]);
1850        }
1851}
1852
/* This is a little bit of an under estimate, since we assume a packet is lost
 * once we have any sacks above it.  Overall, it's at most 2 * MSS of an
 * overestimate.
 *
 * If we have no sacks (either reneged or never used) we'll assume all packets
 * above snd.rtx are lost.  This will be the case for sackless fast rxmit
 * (Dong's stuff) or for a timeout.  In the former case, this is probably not
 * true, and in_flight should be higher, but we have no knowledge without the
 * sacks. */
static void set_in_flight(Tcpctl *tcb)
{
	struct sack_block *tcb_sack;
	uint32_t in_flight = 0;
	uint32_t from;

	/* No sacks: everything from una up to rtx counts as in flight. */
	if (!tcb->snd.nr_sacks) {
		tcb->snd.in_flight = tcb->snd.rtx - tcb->snd.una;
		return;
	}

	/* Everything to the right of the unsacked */
	tcb_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
	in_flight += tcb->snd.nxt - tcb_sack->right;

	/* Everything retransed (from una to snd.rtx, minus sacked regions.
	 * Note we only retrans at most the last sack's left edge.  snd.rtx will
	 * be advanced to the right edge of some sack (possibly the last one).
	 * */
	from = tcb->snd.una;
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_ge(tcb_sack->left, tcb->snd.rtx))
			break;
		/* sack_has_changed() pushes rtx to a sack's right edge, so rtx
		 * should never sit inside a sack at this point. */
		assert(seq_ge(tcb->snd.rtx, tcb_sack->right));
		/* Count the unsacked gap before this sack, then skip over the
		 * sacked bytes. */
		in_flight += tcb_sack->left - from;
		from = tcb_sack->right;
	}
	/* Trailing unsacked region below rtx. */
	in_flight += tcb->snd.rtx - from;

	tcb->snd.in_flight = in_flight;
}
1894
1895static void reset_recovery(struct conv *s, Tcpctl *tcb)
1896{
1897        netlog(s->p->f, Logtcprxmt,
1898               "%I.%d -> %I.%d: recovery complete, una %u, rtx %u, nxt %u, recovery %u\n",
1899               s->laddr, s->lport, s->raddr, s->rport,
1900               tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt, tcb->snd.recovery_pt);
1901        tcb->snd.recovery = 0;
1902        tcb->snd.recovery_pt = 0;
1903        tcb->snd.loss_hint = 0;
1904        tcb->snd.flush_sacks = FALSE;
1905        tcb->snd.sack_loss_hint = 0;
1906}
1907
1908static bool is_dup_ack(Tcpctl *tcb, Tcp *seg)
1909{
1910        /* this is a pure ack w/o window update */
1911        return (seg->ack == tcb->snd.una) &&
1912               (tcb->snd.una != tcb->snd.nxt) &&
1913               (seg->len == 0) &&
1914               (seg->wnd == tcb->snd.wnd);
1915}
1916
1917/* If we have sacks, we'll ignore dupacks and look at the sacks ahead of una
1918 * (which are managed by the TCB).  The tcb will not have old sacks (below
1919 * ack/snd.rtx).  Receivers often send sacks below their ack point when we are
1920 * coming out of a loss, and we don't want those to count.
1921 *
1922 * Note the tcb could have sacks (in the future), but the receiver stopped using
1923 * them (reneged).  We'll catch that with the RTO.  If we try to catch it here,
1924 * we could get in a state where we never allow them to renege. */
1925static bool is_potential_loss(Tcpctl *tcb, Tcp *seg)
1926{
1927        if (seg->nr_sacks > 0)
1928                return tcb->snd.nr_sacks > 0;
1929        else
1930                return is_dup_ack(tcb, seg);
1931}
1932
1933/* When we use timestamps for RTTM, RFC 7323 suggests scaling by
1934 * expected_samples (per cwnd).  They say:
1935 *
1936 * ExpectedSamples = ceiling(FlightSize / (SMSS * 2))
1937 *
1938 * However, SMMS * 2 is really "number of bytes expected to be acked in a
1939 * packet.".  We'll use 'acked' to approximate that.  When the receiver uses
1940 * LRO, they'll send back large ACKs, which decreases the number of samples.
1941 *
1942 * If it turns out that all the divides are bad, we can just go back to not
1943 * using expected_samples at all. */
1944static int expected_samples_ts(Tcpctl *tcb, uint32_t acked)
1945{
1946        assert(acked);
1947        return MAX(DIV_ROUND_UP(tcb->snd.nxt - tcb->snd.una, acked), 1);
1948}
1949
1950/* Updates the RTT, given the currently sampled RTT and the number samples per
1951 * cwnd.  For non-TS RTTM, that'll be 1. */
1952static void update_rtt(Tcpctl *tcb, int rtt_sample, int expected_samples)
1953{
1954        int delta;
1955
1956        tcb->backoff = 0;
1957        tcb->backedoff = 0;
1958        if (tcb->srtt == 0) {
1959                tcb->srtt = rtt_sample;
1960                tcb->mdev = rtt_sample / 2;
1961        } else {
1962                delta = rtt_sample - tcb->srtt;
1963                tcb->srtt += (delta >> RTTM_ALPHA_SHIFT) / expected_samples;
1964                if (tcb->srtt <= 0)
1965                        tcb->srtt = 1;
1966                tcb->mdev += ((abs(delta) - tcb->mdev) >> RTTM_BRAVO_SHIFT) /
1967                             expected_samples;
1968                if (tcb->mdev <= 0)
1969                        tcb->mdev = 1;
1970        }
1971        tcpsettimer(tcb);
1972}
1973
/* Processes the ACK portion of an incoming segment: advances snd.una,
 * maintains sacks and the in-flight estimate, detects loss and drives
 * recovery, updates the send window and cwnd, samples the RTT, and discards
 * acked bytes from the write queue. */
static void update(struct conv *s, Tcp *seg)
{
	int rtt;
	Tcpctl *tcb;
	uint32_t acked, expand;
	struct tcppriv *tpriv;

	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	/* Ignore acks outside [snd.una, snd.nxt]: old or bogus. */
	if (!seq_within(seg->ack, tcb->snd.una, tcb->snd.nxt))
		return;

	acked = seg->ack - tcb->snd.una;
	tcb->snd.una = seg->ack;
	/* A cumulative ack past snd.rtx means that data needn't be rexmited. */
	if (seq_gt(seg->ack, tcb->snd.rtx))
		tcb->snd.rtx = seg->ack;

	update_sacks(s, tcb, seg);
	set_in_flight(tcb);

	/* We treat either a dupack or forward SACKs as a hint that there is a
	 * loss.  The RFCs suggest three dupacks before treating it as a loss
	 * (alternative is reordered packets).  We'll treat three SACKs the same
	 * way. */
	if (is_potential_loss(tcb, seg) && !tcb->snd.recovery) {
		tcb->snd.loss_hint++;
		if (tcb->snd.loss_hint == TCPREXMTTHRESH) {
			netlog(s->p->f, Logtcprxmt,
			       "%I.%d -> %I.%d: loss hint thresh, nr sacks %u, nxt %u, una %u, cwnd %u\n",
			       s->laddr, s->lport, s->raddr, s->rport,
			       tcb->snd.nr_sacks, tcb->snd.nxt, tcb->snd.una,
			       tcb->cwind);
			tcp_loss_event(s, tcb);
			tcb->snd.recovery_pt = tcb->snd.nxt;
			/* Recovery flavor depends on whether we have sacks to
			 * steer the retransmissions. */
			if (tcb->snd.nr_sacks) {
				tcb->snd.recovery = SACK_RETRANS_RECOVERY;
				tcb->snd.flush_sacks = FALSE;
				tcb->snd.sack_loss_hint = 0;
			} else {
				tcb->snd.recovery = FAST_RETRANS_RECOVERY;
			}
			tcprxmit(s);
		}
	}

	/*
	 *  update window
	 */
	if (seq_gt(seg->ack, tcb->snd.wl2)
		|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)) {
		tcb->snd.wnd = seg->wnd;
		tcb->snd.wl2 = seg->ack;
	}

	if (!acked) {
		/*
		 *  don't let us hangup if sending into a closed window and
		 *  we're still getting acks
		 */
		if (tcb->snd.recovery && (tcb->snd.wnd == 0))
			tcb->backedoff = MAXBACKMS / 4;
		return;
	}
	/* At this point, they have acked something new. (positive ack, ack >
	 * una).
	 *
	 * If we hadn't reached the threshold for recovery yet, the positive ACK
	 * will reset our loss_hint count. */
	if (!tcb->snd.recovery)
		tcb->snd.loss_hint = 0;
	else if (seq_ge(seg->ack, tcb->snd.recovery_pt))
		reset_recovery(s, tcb);

	/* avoid slow start and timers for SYN acks */
	if ((tcb->flags & SYNACK) == 0) {
		tcb->flags |= SYNACK;
		/* The SYN consumed one sequence number and one flag count. */
		acked--;
		tcb->flgcnt--;
		goto done;
	}

	/* slow start as long as we're not recovering from lost packets */
	if (tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
		if (tcb->cwind < tcb->ssthresh) {
			/* We increase the cwind by every byte we receive.  We
			 * want to increase the cwind by one MSS for every MSS
			 * that gets ACKed.  Note that multiple MSSs can be
			 * ACKed in a single ACK.  If we had a remainder of
			 * acked / MSS, we'd add just that remainder - not 0 or
			 * 1 MSS. */
			expand = acked;
		} else {
			/* Every RTT, which consists of CWND bytes, we're
			 * supposed to expand by MSS bytes.  The classic
			 * algorithm was
			 *      expand = (tcb->mss * tcb->mss) / tcb->cwind;
			 * which assumes the ACK was for MSS bytes.  Instead,
			 * for every 'acked' bytes, we increase the window by
			 * acked / CWND (in units of MSS). */
			expand = MAX(acked, tcb->typical_mss) * tcb->typical_mss
				 / tcb->cwind;
		}

		/* NOTE(review): the first check looks like an unsigned
		 * overflow guard, but it clamps to the same value as the
		 * second branch - confirm that is intended. */
		if (tcb->cwind + expand < tcb->cwind)
			expand = tcb->snd.wnd - tcb->cwind;
		if (tcb->cwind + expand > tcb->snd.wnd)
			expand = tcb->snd.wnd - tcb->cwind;
		tcb->cwind += expand;
	}
	adjust_tx_qio_limit(s);

	/* RTT sampling: prefer timestamps when the peer echoes them; otherwise
	 * use the rtt_timer armed for a specific sequence (rttseq).  The timer
	 * path skips samples taken during recovery. */
	if (tcb->ts_recent) {
		update_rtt(tcb, abs(milliseconds() - seg->ts_ecr),
			   expected_samples_ts(tcb, acked));
	} else if (tcb->rtt_timer.state == TcptimerON &&
		   seq_ge(seg->ack, tcb->rttseq)) {
		/* Adjust the timers according to the round trip time */
		tcphalt(tpriv, &tcb->rtt_timer);
		if (!tcb->snd.recovery) {
			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
			if (rtt == 0) {
				/* o/w all close systems will rxmit in 0 time */
				rtt = 1;
			}
			rtt *= MSPTICK;
			update_rtt(tcb, rtt, 1);
		}
	}

done:
	if (qdiscard(s->wq, acked) < acked) {
		tcb->flgcnt--;
		/* This happened due to another bug where acked was very large
		 * (negative), which was interpreted as "hey, one less flag,
		 * since they acked one of our flags (like a SYN).  If flgcnt
		 * goes negative, get_xmit_segment() will attempt to send out
		 * large packets. */
		assert(tcb->flgcnt >= 0);
	}

	if (seq_gt(seg->ack, tcb->snd.urg))
		tcb->snd.urg = seg->ack;

	/* Keep the rexmit timer running iff data is still unacked. */
	if (tcb->snd.una != tcb->snd.nxt)
		tcpgo(tpriv, &tcb->timer);
	else
		tcphalt(tpriv, &tcb->timer);

	tcb->backoff = 0;
	tcb->backedoff = 0;
}
2126
2127static void update_tcb_ts(Tcpctl *tcb, Tcp *seg)
2128{
2129        /* Get timestamp info from the tcp header.  Even though the timestamps
2130         * aren't sequence numbers, we still need to protect for wraparound.
2131         * Though if the values were 0, assume that means we need an update.  We
2132         * could have an initial ts_val that appears negative (signed). */
2133        if (!tcb->ts_recent || !tcb->last_ack_sent ||
2134            (seq_ge(seg->ts_val, tcb->ts_recent) &&
2135             seq_le(seg->seq, tcb->last_ack_sent)))
2136                tcb->ts_recent = seg->ts_val;
2137}
2138
2139/* Overlap happens when one sack's left edge is inside another sack. */
2140static bool sacks_overlap(struct sack_block *x, struct sack_block *y)
2141{
2142        return (seq_le(x->left, y->left) && seq_le(y->left, x->right)) ||
2143               (seq_le(y->left, x->left) && seq_le(x->left, y->right));
2144}
2145
2146static void make_sack_first(Tcpctl *tcb, struct sack_block *tcb_sack)
2147{
2148        struct sack_block temp;
2149
2150        if (tcb_sack == &tcb->rcv.sacks[0])
2151                return;
2152        temp = tcb->rcv.sacks[0];
2153        tcb->rcv.sacks[0] = *tcb_sack;
2154        *tcb_sack = temp;
2155}
2156
2157/* Track sack in our tcb for a block of data we received.  This handles all the
2158 * stuff: making sure sack is first (since it's the most recent sack change),
2159 * updating or merging sacks, and dropping excess sacks (we only need to
2160 * maintain 3).  Unlike on the snd side, our tcb sacks are *not* sorted. */
2161static void track_rcv_sack(Tcpctl *tcb, uint32_t left, uint32_t right)
2162{
2163        struct sack_block *tcb_sack;
2164        struct sack_block sack[1];
2165
2166        if (!tcb->sack_ok)
2167                return;
2168        if (left == right)
2169                return;
2170        assert(seq_lt(left, right));
2171        sack->left = left;
2172        sack->right = right;
2173        /* We can reuse an existing sack if we're merging or overlapping. */
2174        for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2175                tcb_sack = &tcb->rcv.sacks[i];
2176                if (sacks_overlap(tcb_sack, sack)) {
2177                        tcb_sack->left = seq_min(tcb_sack->left, sack->left);
2178                        tcb_sack->right = seq_max(tcb_sack->right, sack->right);
2179                        make_sack_first(tcb, tcb_sack);
2180                        return;
2181                }
2182        }
2183        /* We can discard the last sack (right shift) - we should have sent it
2184         * at least once by now.  If not, oh well. */
2185        memmove(tcb->rcv.sacks + 1, tcb->rcv.sacks, sizeof(struct sack_block) *
2186                MIN(MAX_NR_RCV_SACKS - 1, tcb->rcv.nr_sacks));
2187        tcb->rcv.sacks[0] = *sack;
2188        if (tcb->rcv.nr_sacks < MAX_NR_RCV_SACKS)
2189                tcb->rcv.nr_sacks++;
2190}
2191
2192/* Once we receive everything and move rcv.nxt past a sack, we don't need to
2193 * track it.  I've seen Linux report sacks in the past, but we probably
2194 * shouldn't. */
2195static void drop_old_rcv_sacks(Tcpctl *tcb)
2196{
2197        struct sack_block *tcb_sack;
2198
2199        for (int i = 0; i < tcb->rcv.nr_sacks; i++) {
2200                tcb_sack = &tcb->rcv.sacks[i];
2201                /* Moving up to or past the left is enough to drop it. */
2202                if (seq_ge(tcb->rcv.nxt, tcb_sack->left)) {
2203                        memmove(tcb->rcv.sacks + i, tcb->rcv.sacks + i + 1,
2204                                sizeof(struct sack_block) * (tcb->rcv.nr_sacks -
2205                                                             i - 1));
2206                        tcb->rcv.nr_sacks--;
2207                        i--;
2208                }
2209        }
2210}
2211
2212static void tcpiput(struct Proto *tcp, struct Ipifc *unused, struct block *bp)
2213{
2214        ERRSTACK(1);
2215        Tcp seg;
2216        Tcp4hdr *h4;
2217        Tcp6hdr *h6;
2218        int hdrlen;
2219        Tcpctl *tcb;
2220        uint16_t length;
2221        uint8_t source[IPaddrlen], dest[IPaddrlen];
2222        struct conv *s;
2223        struct Fs *f;
2224        struct tcppriv *tpriv;
2225        uint8_t version;
2226
2227        f = tcp->f;
2228        tpriv = tcp->priv;
2229
2230        tpriv->stats[InSegs]++;
2231
2232        h4 = (Tcp4hdr *) (bp->rp);
2233        h6 = (Tcp6hdr *) (bp->rp);
2234
2235        if ((h4->vihl & 0xF0) == IP_VER4) {
2236                uint8_t ttl;
2237
2238                version = V4;
2239                length = nhgets(h4->length);
2240                v4tov6(dest, h4->tcpdst);
2241                v4tov6(source, h4->tcpsrc);
2242
2243                /* ttl isn't part of the xsum pseudo header, but bypass needs
2244                 * it. */
2245                ttl = h4->Unused;
2246                h4->Unused = 0;
2247                hnputs(h4->tcplen, length - TCP4_PKT);
2248                if (!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1])
2249                    && ptclcsum(bp, TCP4_IPLEN, length - TCP4_IPLEN)) {
2250                        tpriv->stats[CsumErrs]++;
2251                        tpriv->stats[InErrs]++;
2252                        netlog(f, Logtcp, "bad tcp proto cksum\n");
2253                        freeblist(bp);
2254                        return;
2255                }
2256                h4->Unused = ttl;
2257
2258                hdrlen = ntohtcp4(&seg, &bp);
2259                if (hdrlen < 0) {
2260                        tpriv->stats[HlenErrs]++;
2261                        tpriv->stats[InErrs]++;
2262                        netlog(f, Logtcp, "bad tcp hdr len\n");
2263                        return;
2264                }
2265
2266                s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2267                if (s && s->state == Bypass) {
2268                        bypass_or_drop(s, bp);
2269                        return;
2270                }
2271
2272                /* trim the packet to the size claimed by the datagram */
2273                length -= hdrlen + TCP4_PKT;
2274                bp = trimblock(bp, hdrlen + TCP4_PKT, length);
2275                if (bp == NULL) {
2276                        tpriv->stats[LenErrs]++;
2277                        tpriv->stats[InErrs]++;
2278                        netlog(f, Logtcp, "tcp len < 0 after trim\n");
2279                        return;
2280                }
2281        } else {
2282                int ttl = h6->ttl;
2283                int proto = h6->proto;
2284
2285                version = V6;
2286                length = nhgets(h6->ploadlen);
2287                ipmove(dest, h6->tcpdst);
2288                ipmove(source, h6->tcpsrc);
2289
2290                h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2291                h6->ttl = proto;
2292                hnputl(h6->vcf, length);
2293                if ((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2294                        ptclcsum(bp, TCP6_IPLEN, length + TCP6_PHDRSIZE)) {
2295                        tpriv->stats[CsumErrs]++;
2296                        tpriv->stats[InErrs]++;
2297                        netlog(f, Logtcp, "bad tcp proto cksum\n");
2298                        freeblist(bp);
2299                        return;
2300                }
2301                h6->ttl = ttl;
2302                h6->proto = proto;
2303                hnputs(h6->ploadlen, length);
2304
2305                hdrlen = ntohtcp6(&seg, &bp);
2306                if (hdrlen < 0) {
2307                        tpriv->stats[HlenErrs]++;
2308                        tpriv->stats[InErrs]++;
2309                        netlog(f, Logtcp, "bad tcp hdr len\n");
2310                        return;
2311                }
2312
2313                s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2314                if (s && s->state == Bypass) {
2315                        bypass_or_drop(s, bp);
2316                        return;
2317                }
2318
2319                /* trim the packet to the size claimed by the datagram */
2320                length -= hdrlen;
2321                bp = trimblock(bp, hdrlen + TCP6_PKT, length);
2322                if (bp == NULL) {
2323                        tpriv->stats[LenErrs]++;
2324                        tpriv->stats[InErrs]++;
2325                        netlog(f, Logtcp, "tcp len < 0 after trim\n");
2326                        return;
2327                }
2328        }
2329
2330        /* s, the conv matching the n-tuple, was set above */
2331        if (s == NULL) {
2332                netlog(f, Logtcpreset,
2333                       "iphtlook failed: src %I:%u, dst %I:%u\n",
2334                       source, seg.source, dest, seg.dest);
2335reset:
2336                sndrst(tcp, source, dest, length, &seg, version,
2337                       "no conversation");
2338                freeblist(bp);
2339                return;
2340        }
2341
2342        /* lock protocol for unstate Plan 9 invariants.  funcs like limbo or
2343         * incoming might rely on it. */
2344        qlock(&tcp->qlock);
2345
2346        /* if it's a listener, look for the right flags and get a new conv */
2347        tcb = (Tcpctl *) s->ptcl;
2348        if (tcb->state == Listen) {
2349                if (seg.flags & RST) {
2350                        limborst(s, &seg, source, dest, version);
2351                        qunlock(&tcp->qlock);
2352                        freeblist(bp);
2353                        return;
2354                }
2355
2356                /* if this is a new SYN, put the call into limbo */
2357                if ((seg.flags & SYN) && (seg.flags & ACK) == 0) {
2358                        limbo(s, source, dest, &seg, version);
2359                        qunlock(&tcp->qlock);
2360                        freeblist(bp);
2361                        return;
2362                }
2363
2364                /* if there's a matching call in limbo, tcpincoming will return
2365                 * it */
2366                s = tcpincoming(s, &seg, source, dest, version);
2367                if (s == NULL) {
2368                        qunlock(&tcp->qlock);
2369                        goto reset;
2370                }
2371        }
2372
2373        /* The rest of the input state machine is run with the control block
2374         * locked and implements the state machine directly out of the RFC.
2375         * Out-of-band data is ignored - it was always a bad idea.
2376         */
2377        tcb = (Tcpctl *) s->ptcl;
2378        if (waserror()) {
2379                qunlock(&s->qlock);
2380                nexterror();
2381        }
2382        qlock(&s->qlock);
2383        qunlock(&tcp->qlock);
2384
2385        update_tcb_ts(tcb, &seg);
2386        /* fix up window */
2387        seg.wnd <<= tcb->rcv.scale;
2388
2389        /* every input packet in puts off the keep alive time out */
2390        tcpsetkacounter(tcb);
2391
2392        switch (tcb->state) {
2393        case Closed:
2394                sndrst(tcp, source, dest, length, &seg, version,
2395                           "sending to Closed");
2396                goto raise;
2397        case Syn_sent:
2398                if (seg.flags & ACK) {
2399                        if (!seq_within(seg.ack, tcb->iss + 1,
2400                                        tcb->snd.nxt)) {
2401                                sndrst(tcp, source, dest, length, &seg,
2402                                       version, "bad seq in Syn_sent");
2403                                goto raise;
2404                        }
2405                }
2406                if (seg.flags & RST) {
2407                        if (seg.flags & ACK)
2408                                localclose(s, "connection refused");
2409                        goto raise;
2410                }
2411
2412                if (seg.flags & SYN) {
2413                        procsyn(s, &seg);
2414                        if (seg.flags & ACK) {
2415                                update(s, &seg);
2416                                tcpsynackrtt(s);
2417                                tcpsetstate(s, Established);
2418                                /* Here's where we get the results of
2419                                 * header option negotiations for
2420                                 * connections we started. (SYNACK has
2421                                 * the response) */
2422                                tcpsetscale(s, tcb, seg.ws, tcb->scale);
2423                                tcb->sack_ok = seg.sack_ok;
2424                        } else {
2425                                sndrst(tcp, source, dest, length, &seg,
2426                                       version, "Got SYN with no ACK");
2427                                goto raise;
2428                        }
2429
2430                        if (length != 0 || (seg.flags & FIN))
2431                                break;
2432
2433                        freeblist(bp);
2434                        goto output;
2435                } else
2436                        freeblist(bp);
2437
2438                qunlock(&s->qlock);
2439                poperror();
2440                return;
2441        }
2442
2443        /*
2444         *  One DOS attack is to open connections to us and then forget about
2445         *  them, thereby tying up a conv at no long term cost to the attacker.
2446         *  This is an attempt to defeat these stateless DOS attacks.  See
2447         *  corresponding code in tcpsendka().
2448         */
2449        if ((seg.flags & RST) == 0) {
2450                if (tcpporthogdefense
2451                        && seq_within(seg.ack, tcb->snd.una - (1 << 31),
2452                                                  tcb->snd.una - (1 << 29))) {
2453                        printd("stateless hog %I.%d->%I.%d f 0x%x 0x%lx - 0x%lx - 0x%lx\n",
2454                               source, seg.source, dest, seg.dest, seg.flags,
2455                               tcb->snd.una - (1 << 31), seg.ack,
2456                               tcb->snd.una - (1 << 29));
2457                        localclose(s, "stateless hog");
2458                }
2459        }
2460
2461        /* Cut the data to fit the receive window */
2462        if (tcptrim(tcb, &seg, &bp, &length) == -1) {
2463                netlog(f, Logtcp, "%I.%d -> %I.%d: tcp len < 0, %lu %d\n",
2464                       s->raddr, s->rport, s->laddr, s->lport, seg.seq, length);
2465                update(s, &seg);
2466                if (qlen(s->wq) + tcb->flgcnt == 0 && tcb->state == Closing) {
2467                        tcphalt(tpriv, &tcb->rtt_timer);
2468                        tcphalt(tpriv, &tcb->acktimer);
2469                        tcphalt(tpriv, &tcb->katimer);
2470                        tcpsetstate(s, Time_wait);
2471                        tcb->timer.start = MSL2 * (1000 / MSPTICK);
2472                        tcpgo(tpriv, &tcb->timer);
2473                }
2474                if (!(seg.flags & RST)) {
2475                        tcb->flags |= FORCE;
2476                        goto output;
2477                }
2478                qunlock(&s->qlock);
2479                poperror();
2480                return;
2481        }
2482
2483        /* Cannot accept so answer with a rst */
2484        if (length && tcb->state == Closed) {
2485                sndrst(tcp, source, dest, length, &seg, version,
2486                       "sending to Closed");
2487                goto raise;
2488        }
2489
2490        /* The segment is beyond the current receive pointer so
2491         * queue the data in the resequence queue
2492         */
2493        if (seg.seq != tcb->rcv.nxt)
2494                if (length != 0 || (seg.flags & (SYN | FIN))) {
2495                        update(s, &seg);
2496                        if (addreseq(tcb, tpriv, &seg, bp, length) < 0)
2497                                printd("reseq %I.%d -> %I.%d\n", s->raddr,
2498                                       s->rport, s->laddr, s->lport);
2499                        tcb->flags |= FORCE;
2500                        goto output;
2501                }
2502
2503        /*
2504         *  keep looping till we've processed this packet plus any
2505         *  adjacent packets in the resequence queue
2506         */
2507        for (;;) {
2508                if (seg.flags & RST) {
2509                        if (tcb->state == Established) {
2510                                tpriv->stats[EstabResets]++;
2511                                if (tcb->rcv.nxt != seg.seq)
2512                                        printd("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt 0x%lx seq 0x%lx\n",
2513                                               s->raddr, s->rport, s->laddr,
2514                                               s->lport, tcb->rcv.nxt, seg.seq);
2515                        }
2516                        localclose(s, "connection refused");
2517                        goto raise;
2518                }
2519
2520                if ((seg.flags & ACK) == 0)
2521                        goto raise;
2522
2523                switch (tcb->state) {
2524                case Established:
2525                case Close_wait:
2526                        update(s, &seg);
2527                        break;
2528                case Finwait1:
2529                        update(s, &seg);
2530                        if (qlen(s->wq) + tcb->flgcnt == 0) {
2531                                tcphalt(tpriv, &tcb->rtt_timer);
2532                                tcphalt(tpriv, &tcb->acktimer);
2533                                tcpsetkacounter(tcb);
2534                                tcb->time = NOW;
2535                                tcpsetstate(s, Finwait2);
2536                                tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2537                                tcpgo(tpriv, &tcb->katimer);
2538                        }
2539                        break;
2540                case Finwait2:
2541                        update(s, &seg);
2542                        break;
2543                case Closing:
2544                        update(s, &seg);
2545                        if (qlen(s->wq) + tcb->flgcnt == 0) {
2546                                tcphalt(tpriv, &tcb->rtt_timer);
2547                                tcphalt(tpriv, &tcb->acktimer);
2548                                tcphalt(tpriv, &tcb->katimer);
2549                                tcpsetstate(s, Time_wait);
2550                                tcb->timer.start = MSL2 * (1000 / MSPTICK);
2551                                tcpgo(tpriv, &tcb->timer);
2552                        }
2553                        break;
2554                case Last_ack:
2555                        update(s, &seg);
2556                        if (qlen(s->wq) + tcb->flgcnt == 0) {
2557                                localclose(s, NULL);
2558                                goto raise;
2559                        }
2560                case Time_wait:
2561                        if (seg.flags & FIN)
2562                                tcb->flags |= FORCE;
2563                        if (tcb->timer.state != TcptimerON)
2564                                tcpgo(tpriv, &tcb->timer);
2565                }
2566
2567                if ((seg.flags & URG) && seg.urg) {
2568                        if (seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2569                                tcb->rcv.urg = seg.urg + seg.seq;
2570                                pullblock(&bp, seg.urg);
2571                        }
2572                } else if (seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2573                        tcb->rcv.urg = tcb->rcv.nxt;
2574
2575                if (length == 0) {
2576                        if (bp != NULL)
2577                                freeblist(bp);
2578                } else {
2579                        switch (tcb->state) {
2580                        default:
2581                                /* Ignore segment text */
2582                                if (bp != NULL)
2583                                        freeblist(bp);
2584                                break;
2585
2586                        case Established:
2587                        case Finwait1:
2588                                /* If we still have some data place on
2589                                 * receive queue
2590                                 */
2591                                if (bp) {
2592                                        bp = packblock(bp);
2593                                        if (bp == NULL)
2594                                                panic("tcp packblock");
2595                                        qpassnolim(s->rq, bp);
2596                                        bp = NULL;
2597
2598                                        /*
2599                                         * Force an ack every 2 data messages.
2600                                         * This is a hack for rob to make his
2601                                         * home system run faster.
2602                                         *
2603                                         * this also keeps the standard TCP
2604                                         * congestion control working since it
2605                                         * needs an ack every 2 max segs worth.
2606                                         * This is not quite that, but under a
2607                                         * real stream is equivalent since every
2608                                         * packet has a max seg in it.
2609                                         */
2610                                        if (++(tcb->rcv.una) >= 2)
2611                                                tcb->flags |= FORCE;
2612                                }
2613                                tcb->rcv.nxt += length;
2614                                drop_old_rcv_sacks(tcb);
2615
2616                                /*
2617                                 *  update our rcv window
2618                                 */
2619                                tcprcvwin(s);
2620
2621                                /*
2622                                 *  turn on the acktimer if there's something
2623                                 *  to ack
2624                                 */
2625                                if (tcb->acktimer.state != TcptimerON)
2626                                        tcpgo(tpriv, &tcb->acktimer);
2627
2628                                break;
2629                        case Finwait2:
2630                                /* no process to read the data, send a reset */
2631                                if (bp != NULL)
2632                                        freeblist(bp);
2633                                sndrst(tcp, source, dest, length, &seg, version,
2634                                           "send to Finwait2");
2635                                qunlock(&s->qlock);
2636                                poperror();
2637                                return;
2638                        }
2639                }
2640
2641                if (seg.flags & FIN) {
2642                        tcb->flags |= FORCE;
2643
2644                        switch (tcb->state) {
2645                        case Established:
2646                                tcb->rcv.nxt++;
2647                                tcpsetstate(s, Close_wait);
2648                                break;
2649                        case Finwait1:
2650                                tcb->rcv.nxt++;
2651                                if (qlen(s->wq) + tcb->flgcnt == 0) {
2652                                        tcphalt(tpriv, &tcb->rtt_timer);
2653                                        tcphalt(tpriv, &tcb->acktimer);
2654                                        tcphalt(tpriv, &tcb->katimer);
2655                                        tcpsetstate(s, Time_wait);
2656                                        tcb->timer.start = MSL2 * (1000 /
2657                                                                   MSPTICK);
2658                                        tcpgo(tpriv, &tcb->timer);
2659                                } else
2660                                        tcpsetstate(s, Closing);
2661                                break;
2662                        case Finwait2:
2663                                tcb->rcv.nxt++;
2664                                tcphalt(tpriv, &tcb->rtt_timer);
2665                                tcphalt(tpriv, &tcb->acktimer);
2666                                tcphalt(tpriv, &tcb->katimer);
2667                                tcpsetstate(s, Time_wait);
2668                                tcb->timer.start = MSL2 * (1000 / MSPTICK);
2669                                tcpgo(tpriv, &tcb->timer);
2670                                break;
2671                        case Close_wait:
2672                        case Closing:
2673                        case Last_ack:
2674                                break;
2675                        case Time_wait:
2676                                tcpgo(tpriv, &tcb->timer);
2677                                break;
2678                        }
2679                }
2680
2681                /*
2682                 *  get next adjacent segment from the resequence queue.
2683                 *  dump/trim any overlapping segments
2684                 */
2685                for (;;) {
2686                        if (tcb->reseq == NULL)
2687                                goto output;
2688
2689                        if (seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2690                                goto output;
2691
2692                        getreseq(tcb, &seg, &bp, &length);
2693
2694                        if (tcptrim(tcb, &seg, &bp, &length) == 0)
2695                                break;
2696                }
2697        }
2698output:
2699        tcpoutput(s);
2700        qunlock(&s->qlock);
2701        poperror();
2702        return;
2703raise:
2704        qunlock(&s->qlock);
2705        poperror();
2706        freeblist(bp);
2707        tcpkick(s);
2708}
2709
2710/* The advertised mss = data + TCP headers */
2711static uint16_t derive_payload_mss(Tcpctl *tcb)
2712{
2713        uint16_t payload_mss = tcb->mss;
2714        uint16_t opt_size = 0;
2715
2716        if (tcb->ts_recent) {
2717                opt_size += TS_LENGTH;
2718                /* Note that when we're a SYN, we overestimate slightly.  This
2719                 * is safe, and not really a problem. */
2720                opt_size += TS_SEND_PREPAD;
2721        }
2722        if (tcb->rcv.nr_sacks)
2723                opt_size += 2 + tcb->rcv.nr_sacks * 8;
2724        opt_size = ROUNDUP(opt_size, 4);
2725        payload_mss -= opt_size;
2726        return payload_mss;
2727}
2728
2729/* Decreases the xmit amt, given the MSS / TSO. */
2730static uint32_t throttle_for_mss(Tcpctl *tcb, uint32_t ssize,
2731                                 uint16_t payload_mss, bool retrans)
2732{
2733        if (ssize > payload_mss) {
2734                if ((tcb->flags & TSO) == 0) {
2735                        ssize = payload_mss;
2736                } else {
2737                        /* Don't send too much.  32K is arbitrary.. */
2738                        if (ssize > 32 * 1024)
2739                                ssize = 32 * 1024;
2740                        if (!retrans) {
2741                                /* Clamp xmit to an integral MSS to avoid ragged
2742                                 * tail segments causing poor link utilization.
2743                                 */
2744                                ssize = ROUNDDOWN(ssize, payload_mss);
2745                        }
2746                }
2747        }
2748        return ssize;
2749}
2750
2751/* Reduces ssize for a variety of reasons.  Returns FALSE if we should abort
2752 * sending the packet.  o/w returns TRUE and modifies ssize by reference. */
2753static bool throttle_ssize(struct conv *s, Tcpctl *tcb, uint32_t *ssize_p,
2754                           uint16_t payload_mss, bool retrans)
2755{
2756        struct Fs *f = s->p->f;
2757        uint32_t usable;
2758        uint32_t ssize = *ssize_p;
2759
2760        /* Compute usable segment based on offered window and limit
2761         * window probes to one */
2762        if (tcb->snd.wnd == 0) {
2763                if (tcb->snd.in_flight != 0) {
2764                        if ((tcb->flags & FORCE) == 0)
2765                                return FALSE;
2766                }
2767                usable = 1;
2768        } else {
2769                usable = tcb->cwind;
2770                if (tcb->snd.wnd < usable)
2771                        usable = tcb->snd.wnd;
2772                if (usable > tcb->snd.in_flight)
2773                        usable -= tcb->snd.in_flight;
2774                else
2775                        usable = 0;
2776                /* Avoid Silly Window Syndrome.  This is a little different
2777                 * thant RFC 813.  I took their additional enhancement of "<
2778                 * MSS" as an AND, not an OR.  25% of a large snd.wnd is pretty
2779                 * large, and our main goal is to avoid packets smaller than
2780                 * MSS.  I still use the 25% threshold, because it is important
2781                 * that there is *some* data in_flight.  If usable < MSS because
2782                 * snd.wnd is very small (but not 0), we might never get an ACK
2783                 * and would need to set up a timer.
2784                 *
2785                 * Also, I'm using 'ssize' as a proxy for a PSH point.  If
2786                 * there's just a small blob in the qio (or retrans!), then we
2787                 * might as well just send it. */
2788                if ((usable < tcb->typical_mss) && (usable < tcb->snd.wnd >> 2)
2789                    && (usable < ssize)) {
2790                        return FALSE;
2791                }
2792        }
2793        if (ssize && usable < 2)
2794                netlog(s->p->f, Logtcpverbose,
2795                       "%I.%d -> %I.%d: throttled snd.wnd %lu cwind %lu\n",
2796                       s->laddr, s->lport, s->raddr, s->rport,
2797                       tcb->snd.wnd, tcb->cwind);
2798        if (usable < ssize)
2799                ssize = usable;
2800
2801        ssize = throttle_for_mss(tcb, ssize, payload_mss, retrans);
2802
2803        *ssize_p = ssize;
2804        return TRUE;
2805}
2806
/* Helper, picks the next segment to send, which is possibly a retransmission.
 * Returns TRUE if we have a segment, FALSE o/w.  Returns ssize, from_seq, and
 * sent by reference.
 *
 * from_seq is the seq number we are transmitting from.
 *
 * sent includes all seq from una to from_seq *including* any previously sent
 * flags (part of tcb->flgcnt), for instance an unacknowledged SYN (which counts
 * as a seq number).  Those flags are in the e.g. snd.nxt - snd.una range, and
 * they get dropped after qdiscard.
 *
 * ssize is the amount of data we are sending, starting from from_seq, and it
 * will include any *new* flags, which haven't been accounted for yet.
 *
 * tcb->flgcnt consists of the flags both in ssize and in sent.
 *
 * Note that we could be in recovery and not sack_retrans a segment. */
static bool get_xmit_segment(struct conv *s, Tcpctl *tcb, uint16_t payload_mss,
                             uint32_t *from_seq_p, uint32_t *sent_p,
                             uint32_t *ssize_p)
{
	struct Fs *f = s->p->f;
	struct tcppriv *tpriv = s->p->priv;
	uint32_t ssize, sent, from_seq;
	bool sack_retrans = FALSE;
	struct sack_block *tcb_sack = 0;

	/* First preference: fill the earliest hole below a peer-reported SACK
	 * block, i.e. retransmit [snd.rtx, tcb_sack->left). */
	for (int i = 0; i < tcb->snd.nr_sacks; i++) {
		tcb_sack = &tcb->snd.sacks[i];
		if (seq_lt(tcb->snd.rtx, tcb_sack->left)) {
			/* So ssize is supposed to include any *new* flags to
			 * flgcnt, which at this point would be a FIN.
			 *
			 * It might be possible that flgcnt is incremented so we
			 * send a FIN, even for an intermediate sack retrans.
			 * Perhaps the user closed the conv.
			 *
			 * However, the way the "flgcnt for FIN" works is that
			 * it inflates the desired amount we'd like to send
			 * (qlen + flgcnt).  Eventually, we reach the end of the
			 * queue and fail to extract all of dsize.  At that
			 * point, we put on the FIN, and that's where the extra
			 * 'byte' comes from.
			 *
			 * For sack retrans, since we're extracting from parts
			 * of the qio that aren't the right-most edge, we don't
			 * need to consider flgcnt when setting ssize. */
			from_seq = tcb->snd.rtx;
			sent = from_seq - tcb->snd.una;
			ssize = tcb_sack->left - from_seq;
			sack_retrans = TRUE;
			break;
		}
	}
	/* SACK holes have first dibs, but we can still opportunistically send
	 * new data.
	 *
	 * During other types of recovery, we'll just send from the retrans
	 * point.  If we're in an RTO while we still have sacks, we could be
	 * resending data that wasn't lost.  Consider a sack that is still
	 * growing (usually the right-most), but we haven't received the ACK
	 * yet.  rxt may be included in that area.  Given we had two losses or
	 * otherwise timed out, I'm not too concerned.
	 *
	 * Note that Fast and RTO can send data beyond nxt.  If we change that,
	 * change the accounting below. */
	if (!sack_retrans) {
		switch (tcb->snd.recovery) {
		default:
		case SACK_RETRANS_RECOVERY:
			from_seq = tcb->snd.nxt;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			from_seq = tcb->snd.rtx;
			break;
		}
		sent = from_seq - tcb->snd.una;
		/* qlen + flgcnt is every seq we want to have sent, including
		 * unack'd data, unacked flags, and new flags. */
		ssize = qlen(s->wq) + tcb->flgcnt - sent;
	}

	if (!throttle_ssize(s, tcb, &ssize, payload_mss, sack_retrans))
		return FALSE;

	/* This counts flags, which is a little hokey, but it's okay since
	 * in_flight gets reset on each ACK */
	tcb->snd.in_flight += ssize;
	/* Log and track rxmit.  This covers both SACK (retrans) and fast rxmit.
	 */
	if (ssize && seq_lt(tcb->snd.rtx, tcb->snd.nxt)) {
		netlog(f, Logtcpverbose,
		       "%I.%d -> %I.%d: rxmit: rtx %u amt %u, nxt %u\n",
		       s->laddr, s->lport, s->raddr, s->rport,
		       tcb->snd.rtx, MIN(tcb->snd.nxt - tcb->snd.rtx, ssize),
		       tcb->snd.nxt);
		tpriv->stats[RetransSegs]++;
	}
	if (sack_retrans) {
		/* If we'll send up to the left edge, advance snd.rtx to the
		 * right.
		 *
		 * This includes the largest sack.  It might get removed later,
		 * in which case we'll underestimate the amount in-flight.  The
		 * alternative is to not count the rightmost sack, but when it
		 * gets removed, we'll retrans it anyway.  No matter what, we'd
		 * count it. */
		tcb->snd.rtx += ssize;
		if (tcb->snd.rtx == tcb_sack->left)
			tcb->snd.rtx = tcb_sack->right;
		/* RFC 6675 says we MAY rearm the RTO timer on each retrans,
		 * since we might not be getting ACKs for a while. */
		tcpsettimer(tcb);
	} else {
		switch (tcb->snd.recovery) {
		default:
			/* under normal op, we drag rtx along with nxt.  this
			 * prevents us from sending sacks too early (up above),
			 * since rtx doesn't get reset to una until we have a
			 * loss (e.g. 3 dupacks/sacks). */
			tcb->snd.nxt += ssize;
			tcb->snd.rtx = tcb->snd.nxt;
			break;
		case SACK_RETRANS_RECOVERY:
			/* We explicitly do not want to increase rtx here.  We
			 * might still need it to fill in a sack gap below nxt
			 * if we get new, higher sacks. */
			tcb->snd.nxt += ssize;
			break;
		case FAST_RETRANS_RECOVERY:
		case RTO_RETRANS_RECOVERY:
			tcb->snd.rtx += ssize;
			/* Fast and RTO can send new data, advancing nxt. */
			if (seq_gt(tcb->snd.rtx, tcb->snd.nxt))
				tcb->snd.nxt = tcb->snd.rtx;
			break;
		}
	}
	*from_seq_p = from_seq;
	*sent_p = sent;
	*ssize_p = ssize;

	return TRUE;
}
2952
/*
 *  always enters and exits with the s locked.  We drop
 *  the lock to ipoput the packet so some care has to be
 *  taken by callers.
 *
 *  Main transmit loop: repeatedly picks the next segment (new data or a
 *  retransmission, via get_xmit_segment), builds the TCP header, and hands the
 *  packet to the IP layer.  Bounded at 100 iterations per call, with a
 *  periodic yield so inbound ACK processing can run.
 */
static void tcpoutput(struct conv *s)
{
	Tcp seg;
	int msgs;
	int next_yield = 1;
	Tcpctl *tcb;
	struct block *hbp, *bp;
	uint32_t ssize, dsize, sent, from_seq;
	struct Fs *f;
	struct tcppriv *tpriv;
	uint8_t version;
	uint16_t payload_mss;

	f = s->p->f;
	tpriv = s->p->priv;
	version = s->ipversion;

	for (msgs = 0; msgs < 100; msgs++) {
		/* re-read tcb each pass: we may have dropped the qlock below */
		tcb = (Tcpctl *) s->ptcl;

		switch (tcb->state) {
		case Listen:
		case Closed:
		case Finwait2:
			return;
		}

		/* force an ack when a window has opened up */
		if (tcb->rcv.blocked && tcb->rcv.wnd >= tcb->mss) {
			tcb->rcv.blocked = 0;
			tcb->flags |= FORCE;
		}

		/* Don't send anything else until our SYN has been acked */
		if (tcb->snd.nxt != tcb->iss && (tcb->flags & SYNACK) == 0)
			break;

		/* payload_mss is the actual amount of data in the packet, which
		 * is the advertised (mss - header opts).  This varies from
		 * packet to packet, based on the options that might be present
		 * (e.g. always timestamps, sometimes SACKs) */
		payload_mss = derive_payload_mss(tcb);

		if (!get_xmit_segment(s, tcb, payload_mss, &from_seq, &sent,
				      &ssize))
			break;

		dsize = ssize;
		seg.urg = 0;

		/* Nothing to send and not FORCEd (e.g. pure-ack duty): done. */
		if (ssize == 0)
			if ((tcb->flags & FORCE) == 0)
				break;

		tcb->flags &= ~FORCE;
		tcprcvwin(s);

		/* By default we will generate an ack, so we can normally turn
		 * off the timer.  If we're blocked, we'll want the timer so we
		 * can send a window update. */
		if (!tcb->rcv.blocked)
			tcphalt(tpriv, &tcb->acktimer);
		/* rcv.una doubles as the "data messages since last ack"
		 * counter (see the ack-every-2 logic in tcpiput); reset it. */
		tcb->rcv.una = 0;
		seg.source = s->lport;
		seg.dest = s->rport;
		seg.flags = ACK;
		seg.mss = 0;
		seg.ws = 0;
		seg.sack_ok = FALSE;
		seg.nr_sacks = 0;
		/* When outputting, Syn_sent means "send the Syn", for
		 * connections we initiate.  SYNACKs are sent from sndsynack
		 * directly. */
		if (tcb->state == Syn_sent) {
			seg.flags = 0;
			/* here's where we advertise SACK */
			seg.sack_ok = SACK_SUPPORTED;
			if (tcb->snd.nxt - ssize == tcb->iss) {
				seg.flags |= SYN;
				/* the SYN consumes one seq but carries no
				 * payload byte */
				dsize--;
				seg.mss = tcb->mss;
				seg.ws = tcb->scale;
			} else {
				/* TODO: Not sure why we'd get here. */
				warn("TCP: weird Syn_sent state, tell someone you saw this");
			}
		}
		seg.seq = from_seq;
		seg.ack = tcb->rcv.nxt;
		tcb->last_ack_sent = seg.ack;
		seg.wnd = tcb->rcv.wnd;
		seg.ts_val = tcb->ts_recent;

		/* Pull out data to send */
		bp = NULL;
		if (dsize != 0) {
			/* copy (not dequeue) - data stays queued until acked */
			bp = qcopy(s->wq, dsize, sent);
			if (BLEN(bp) != dsize) {
				/* Here's where the flgcnt kicked in.  Note
				 * dsize is decremented, but ssize isn't.  Not
				 * that we use ssize for much anymore.
				 * Decrementing dsize prevents us from sending a
				 * PSH with the FIN. */
				seg.flags |= FIN;
				dsize--;
			}
			if (BLEN(bp) > payload_mss) {
				bp->flag |= Btso;
				bp->mss = payload_mss;
			}
		}

		/* PSH when this segment drains everything we want to send */
		if (sent + dsize == qlen(s->wq) + tcb->flgcnt)
			seg.flags |= PSH;

		/* Build header, link data and compute cksum */
		switch (version) {
		case V4:
			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
			if (hbp == NULL) {
				freeblist(bp);
				return;
			}
			break;
		case V6:
			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
			if (hbp == NULL) {
				freeblist(bp);
				return;
			}
			break;
		default:
			hbp = NULL;	/* to suppress a warning */
			panic("tcpoutput: version %d", version);
		}

		/* Start the transmission timers if there is new data and we
		 * expect acknowledges
		 */
		if (ssize != 0) {
			if (tcb->timer.state != TcptimerON)
				tcpgo(tpriv, &tcb->timer);

			/* Legacy RTT sampling, only used when timestamps were
			 * not negotiated (ts_recent == 0). */
			if (!tcb->ts_recent && (tcb->rtt_timer.state !=
						TcptimerON)) {
				tcpgo(tpriv, &tcb->rtt_timer);
				tcb->rttseq = from_seq + ssize;
			}
		}

		tpriv->stats[OutSegs]++;

		/* put off the next keep alive */
		tcpgo(tpriv, &tcb->katimer);

		switch (version) {
		case V4:
			if (ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0) {
				/* a negative return means no route */
				localclose(s, "no route");
			}
			break;
		case V6:
			if (ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0) {
				/* a negative return means no route */
				localclose(s, "no route");
			}
			break;
		default:
			panic("tcpoutput2: version %d", version);
		}
		if (ssize) {
			/* The outer loop thinks we sent one packet.  If we used
			 * TSO, we might have sent several.  Minus one for the
			 * loop increment. */
			msgs += DIV_ROUND_UP(ssize, payload_mss) - 1;
		}
		/* Old Plan 9 tidbit - yield every four messages.  We want to
		 * break out and unlock so we can process inbound ACKs which
		 * might do things like say "slow down". */
		if (msgs >= next_yield) {
			next_yield = msgs + 4;
			qunlock(&s->qlock);
			kthread_yield();
			qlock(&s->qlock);
		}
	}
}
3148
3149/*
3150 *  the BSD convention (hack?) for keep alives.  resend last uint8_t acked.
3151 */
3152static void tcpsendka(struct conv *s)
3153{
3154        Tcp seg;
3155        Tcpctl *tcb;
3156        struct block *hbp, *dbp;
3157
3158        tcb = (Tcpctl *) s->ptcl;
3159
3160        dbp = NULL;
3161        seg.urg = 0;
3162        seg.source = s->lport;
3163        seg.dest = s->rport;
3164        seg.flags = ACK | PSH;
3165        seg.mss = 0;
3166        seg.ws = 0;
3167        seg.sack_ok = FALSE;
3168        seg.nr_sacks = 0;
3169        if (tcpporthogdefense)
3170                urandom_read(&seg.seq, sizeof(seg.seq));
3171        else
3172                seg.seq = tcb->snd.una - 1;
3173        seg.ack = tcb->rcv.nxt;
3174        tcb->last_ack_sent = seg.ack;
3175        tcb->rcv.una = 0;
3176        seg.wnd = tcb->rcv.wnd;
3177        seg.ts_val = tcb->ts_recent;
3178        if (tcb->state == Finwait2) {
3179                seg.flags |= FIN;
3180        } else {
3181                dbp = block_alloc(1, MEM_WAIT);
3182                dbp->wp++;
3183        }
3184
3185        if (isv4(s->raddr)) {
3186                /* Build header, link data and compute cksum */
3187                tcb->protohdr.tcp4hdr.vihl = IP_VER4;
3188                hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
3189                if (hbp == NULL) {
3190                        freeblist(dbp);
3191                        return;
3192                }
3193                ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
3194        } else {
3195                /* Build header, link data and compute cksum */
3196                tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
3197                hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
3198                if (hbp == NULL) {
3199                        freeblist(dbp);
3200                        return;
3201                }
3202                ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
3203        }
3204}
3205
3206/*
3207 *  set connection to time out after 12 minutes
3208 */
3209static void tcpsetkacounter(Tcpctl *tcb)
3210{
3211        tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start * MSPTICK);
3212        if (tcb->kacounter < 3)
3213                tcb->kacounter = 3;
3214}
3215
3216/*
3217 *  if we've timed out, close the connection
3218 *  otherwise, send a keepalive and restart the timer
3219 */
/* Keepalive timer callback (rearmed below via tcb->katimer).  Takes
 * s->qlock itself.  Decrements the budget set by tcpsetkacounter() and
 * closes the conversation when it hits zero; otherwise sends a probe and
 * rearms the timer. */
static void tcpkeepalive(void *v)
{
	ERRSTACK(1);
	Tcpctl *tcb;
	struct conv *s;

	s = v;
	tcb = (Tcpctl *) s->ptcl;
	qlock(&s->qlock);
	/* localclose/tcpsendka may throw; drop the qlock on the way out */
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	if (tcb->state != Closed) {
		if (--(tcb->kacounter) <= 0) {
			localclose(s, "connection timed out");
		} else {
			/* probe the peer and put off the next keepalive */
			tcpsendka(s);
			tcpgo(s->p->priv, &tcb->katimer);
		}
	}
	qunlock(&s->qlock);
	poperror();
}
3244
3245/*
3246 *  start keepalive timer
3247 */
3248static void tcpstartka(struct conv *s, char **f, int n)
3249{
3250        Tcpctl *tcb;
3251        int x;
3252
3253        tcb = (Tcpctl *) s->ptcl;
3254        if (tcb->state != Established)
3255                error(ENOTCONN, "connection must be in Establised state");
3256        if (n > 1) {
3257                x = atoi(f[1]);
3258                if (x >= MSPTICK)
3259                        tcb->katimer.start = x / MSPTICK;
3260        }
3261        tcpsetkacounter(tcb);
3262        tcpgo(s->p->priv, &tcb->katimer);
3263}
3264
3265/*
3266 *  turn checksums on/off
3267 */
3268static void tcpsetchecksum(struct conv *s, char **f, int unused)
3269{
3270        Tcpctl *tcb;
3271
3272        tcb = (Tcpctl *) s->ptcl;
3273        tcb->nochecksum = !atoi(f[1]);
3274}
3275
3276static void tcp_loss_event(struct conv *s, Tcpctl *tcb)
3277{
3278        uint32_t old_cwnd = tcb->cwind;
3279
3280        /* Reno */
3281        tcb->ssthresh = tcb->cwind / 2;
3282        tcb->cwind = tcb->ssthresh;
3283        netlog(s->p->f, Logtcprxmt,
3284               "%I.%d -> %I.%d: loss event, cwnd was %d, now %d\n",
3285               s->laddr, s->lport, s->raddr, s->rport,
3286               old_cwnd, tcb->cwind);
3287}
3288
3289/* Called when we need to retrans the entire outstanding window (everything
3290 * previously sent, but unacknowledged). */
3291static void tcprxmit(struct conv *s)
3292{
3293        Tcpctl *tcb;
3294
3295        tcb = (Tcpctl *) s->ptcl;
3296
3297        tcb->flags |= FORCE;
3298        tcb->snd.rtx = tcb->snd.una;
3299        set_in_flight(tcb);
3300
3301        tcpoutput(s);
3302}
3303
3304/* The original RFC said to drop sacks on a timeout, since the receiver could
3305 * renege.  Later RFCs say we can keep them around, so long as we are careful.
3306 *
3307 * We'll go with a "flush if we have two timeouts" plan.  This doesn't have to
3308 * be perfect - there might be cases where we accidentally flush the sacks too
3309 * often.  Perhaps we never get dup_acks to start fast/sack rxmit.  The main
3310 * thing is that after multiple timeouts we flush the sacks, since the receiver
3311 * might renege.
3312 *
3313 * We also have an Akaros-specific problem.  We use the sacks to determine
3314 * in_flight.  Specifically, the (snd.nxt - upper right edge) is tracked as in
3315 * flight.  Usually the receiver will keep sacking that right edge all the way
3316 * up to snd.nxt, but they might not, and the gap might be quite large.  After a
3317 * timeout, that data is definitely not in flight.  If that block's size is
3318 * greater than cwnd, we'll never transmit.  This should be rare, and in that
3319 * case we can just dump the sacks.  The typical_mss fudge factor is so we can
3320 * send a reasonably-sized packet. */
3321static void timeout_handle_sacks(Tcpctl *tcb)
3322{
3323        struct sack_block *last_sack;
3324
3325        if (tcb->snd.nr_sacks) {
3326                last_sack = &tcb->snd.sacks[tcb->snd.nr_sacks - 1];
3327                if (tcb->snd.flush_sacks || (tcb->snd.nxt - last_sack->right >=
3328                                             tcb->cwind - tcb->typical_mss)) {
3329                        tcb->snd.nr_sacks = 0;
3330                        tcb->snd.flush_sacks = FALSE;
3331                } else {
3332                        tcb->snd.flush_sacks = TRUE;
3333                }
3334        }
3335}
3336
/* Retransmission timer callback (tcb->timer): backs off, declares a loss
 * event, and retransmits the outstanding window.  Gives up and closes the
 * connection once the accumulated backoff exceeds the limit. */
static void tcptimeout(void *arg)
{
	ERRSTACK(1);
	struct conv *s;
	Tcpctl *tcb;
	int maxback;
	struct tcppriv *tpriv;

	s = (struct conv *)arg;
	tpriv = s->p->priv;
	tcb = (Tcpctl *) s->ptcl;

	qlock(&s->qlock);
	/* localclose/tcprxmit may throw; drop the qlock on the way out */
	if (waserror()) {
		qunlock(&s->qlock);
		nexterror();
	}
	switch (tcb->state) {
	default:
		tcb->backoff++;
		/* connections that never got established get half the
		 * patience */
		if (tcb->state == Syn_sent)
			maxback = MAXBACKMS / 2;
		else
			maxback = MAXBACKMS;
		tcb->backedoff += tcb->timer.start * MSPTICK;
		if (tcb->backedoff >= maxback) {
			localclose(s, "connection timed out");
			break;
		}
		netlog(s->p->f, Logtcprxmt,
		       "%I.%d -> %I.%d: timeout rxmit una %u, rtx %u, nxt %u, in_flight %u, timer.start %u\n",
		       s->laddr, s->lport, s->raddr, s->rport,
		       tcb->snd.una, tcb->snd.rtx, tcb->snd.nxt,
		       tcb->snd.in_flight, tcb->timer.start);
		/* recompute RTO with the larger backoff before rxmitting */
		tcpsettimer(tcb);
		tcp_loss_event(s, tcb);
		/* Advance the recovery point.  Any dupacks/sacks below this
		 * won't trigger a new loss, since we won't reset_recovery()
		 * until we ack past recovery_pt. */
		tcb->snd.recovery = RTO_RETRANS_RECOVERY;
		tcb->snd.recovery_pt = tcb->snd.nxt;
		timeout_handle_sacks(tcb);
		tcprxmit(s);
		tpriv->stats[RetransTimeouts]++;
		break;
	case Time_wait:
		/* Time_wait timer fired; tear down with no error string */
		localclose(s, NULL);
		break;
	case Closed:
		break;
	}
	qunlock(&s->qlock);
	poperror();
}
3391
3392static int inwindow(Tcpctl *tcb, int seq)
3393{
3394        return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd - 1);
3395}
3396
3397/*
3398 *  set up state for a received SYN (or SYN ACK) packet
3399 */
3400static void procsyn(struct conv *s, Tcp *seg)
3401{
3402        Tcpctl *tcb;
3403
3404        tcb = (Tcpctl *) s->ptcl;
3405        tcb->flags |= FORCE;
3406
3407        tcb->rcv.nxt = seg->seq + 1;
3408        tcb->rcv.urg = tcb->rcv.nxt;
3409        tcb->irs = seg->seq;
3410
3411        /* our sending max segment size cannot be bigger than what he asked for
3412         */
3413        if (seg->mss != 0 && seg->mss < tcb->mss) {
3414                tcb->mss = seg->mss;
3415                tcb->typical_mss = tcb->mss;
3416        }
3417        adjust_typical_mss_for_opts(seg, tcb);
3418
3419        tcb->snd.wnd = seg->wnd;
3420        tcb->cwind = tcb->typical_mss * CWIND_SCALE;
3421}
3422
3423static int addreseq(Tcpctl *tcb, struct tcppriv *tpriv, Tcp *seg,
3424                    struct block *bp, uint16_t length)
3425{
3426        Reseq *rp, *rp1;
3427        int i, rqlen, qmax;
3428
3429        rp = kzmalloc(sizeof(Reseq), 0);
3430        if (rp == NULL) {
3431                freeblist(bp);  /* bp always consumed by add_reseq */
3432                return 0;
3433        }
3434
3435        rp->seg = *seg;
3436        rp->bp = bp;
3437        rp->length = length;
3438
3439        track_rcv_sack(tcb, seg->seq, seg->seq + length);
3440        /* Place on reassembly list sorting by starting seq number */
3441        rp1 = tcb->reseq;
3442        if (rp1 == NULL || seq_lt(seg->seq, rp1->seg.seq)) {
3443                rp->next = rp1;
3444                tcb->reseq = rp;
3445                if (rp->next != NULL)
3446                        tpriv->stats[OutOfOrder]++;
3447                return 0;
3448        }
3449
3450        rqlen = 0;
3451        for (i = 0;; i++) {
3452                rqlen += rp1->length;
3453                if (rp1->next == NULL || seq_lt(seg->seq, rp1->next->seg.seq)) {
3454                        rp->next = rp1->next;
3455                        rp1->next = rp;
3456                        if (rp->next != NULL)
3457                                tpriv->stats[OutOfOrder]++;
3458                        break;
3459                }
3460                rp1 = rp1->next;
3461        }
3462        qmax = QMAX << tcb->rcv.scale;
3463        /* Here's where we're reneging on previously reported sacks. */
3464        if (rqlen > qmax) {
3465                printd("resequence queue > window: %d > %d\n", rqlen, qmax);
3466                i = 0;
3467                for (rp1 = tcb->reseq; rp1 != NULL; rp1 = rp1->next) {
3468                        printd("0x%#lx 0x%#lx 0x%#x\n", rp1->seg.seq,
3469                                   rp1->seg.ack, rp1->seg.flags);
3470                        if (i++ > 10) {
3471                                printd("...\n");
3472                                break;
3473                        }
3474                }
3475
3476                // delete entire reassembly queue; wait for retransmit.
3477                // - should we be smarter and only delete the tail?
3478                for (rp = tcb->reseq; rp != NULL; rp = rp1) {
3479                        rp1 = rp->next;
3480                        freeblist(rp->bp);
3481                        kfree(rp);
3482                }
3483                tcb->reseq = NULL;
3484                tcb->rcv.nr_sacks = 0;
3485
3486                return -1;
3487        }
3488        return 0;
3489}
3490
3491static void getreseq(Tcpctl *tcb, Tcp *seg, struct block **bp, uint16_t *length)
3492{
3493        Reseq *rp;
3494
3495        rp = tcb->reseq;
3496        if (rp == NULL)
3497                return;
3498
3499        tcb->reseq = rp->next;
3500
3501        *seg = rp->seg;
3502        *bp = rp->bp;
3503        *length = rp->length;
3504
3505        kfree(rp);
3506}
3507
/* Trims the segment to fit the receive window: drops data we already have
 * (below rcv.nxt) and data beyond rcv.nxt + rcv.wnd, adjusting seq,
 * *length, the urgent pointer, and the SYN/FIN/URG flags to match.
 * Returns 0 if some part of the segment is acceptable, -1 (freeing *bp)
 * if none is. */
static int tcptrim(Tcpctl *tcb, Tcp *seg, struct block **bp, uint16_t *length)
{
	uint16_t len;
	uint8_t accept;
	int dupcnt, excess;

	accept = 0;
	len = *length;
	/* SYN and FIN each occupy one unit of sequence space */
	if (seg->flags & SYN)
		len++;
	if (seg->flags & FIN)
		len++;

	if (tcb->rcv.wnd == 0) {
		/* zero window: only a zero-length segment at exactly rcv.nxt
		 * is acceptable */
		if (len == 0 && seg->seq == tcb->rcv.nxt)
			return 0;
	} else {
		/* Some part of the segment should be in the window */
		if (inwindow(tcb, seg->seq))
			accept++;
		else if (len != 0) {
			if (inwindow(tcb, seg->seq + len - 1) ||
				seq_within(tcb->rcv.nxt, seg->seq,
					   seg->seq + len - 1))
				accept++;
		}
	}
	if (!accept) {
		freeblist(*bp);
		return -1;
	}
	/* dupcnt: sequence units at the front that we already received */
	dupcnt = tcb->rcv.nxt - seg->seq;
	if (dupcnt > 0) {
		tcb->rerecv += dupcnt;
		if (seg->flags & SYN) {
			/* the SYN accounts for the first duplicate unit */
			seg->flags &= ~SYN;
			seg->seq++;

			if (seg->urg > 1)
				seg->urg--;
			else
				seg->flags &= ~URG;
			dupcnt--;
		}
		if (dupcnt > 0) {
			/* drop the remaining duplicate payload bytes */
			pullblock(bp, (uint16_t) dupcnt);
			seg->seq += dupcnt;
			*length -= dupcnt;

			if (seg->urg > dupcnt)
				seg->urg -= dupcnt;
			else {
				seg->flags &= ~URG;
				seg->urg = 0;
			}
		}
	}
	/* excess: bytes past the right edge of the window; chop them (and
	 * any FIN) off the tail */
	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
	if (excess > 0) {
		tcb->rerecv += excess;
		*length -= excess;
		*bp = trimblock(*bp, 0, *length);
		if (*bp == NULL)
			panic("presotto is a boofhead");
		seg->flags &= ~FIN;
	}
	return 0;
}
3576
3577static void tcpadvise(struct Proto *tcp, struct block *bp, char *msg)
3578{
3579        Tcp4hdr *h4;
3580        Tcp6hdr *h6;
3581        Tcpctl *tcb;
3582        uint8_t source[IPaddrlen];
3583        uint8_t dest[IPaddrlen];
3584        uint16_t psource, pdest;
3585        struct conv *s, **p;
3586
3587        h4 = (Tcp4hdr *) (bp->rp);
3588        h6 = (Tcp6hdr *) (bp->rp);
3589
3590        if ((h4->vihl & 0xF0) == IP_VER4) {
3591                v4tov6(dest, h4->tcpdst);
3592                v4tov6(source, h4->tcpsrc);
3593                psource = nhgets(h4->tcpsport);
3594                pdest = nhgets(h4->tcpdport);
3595        } else {
3596                ipmove(dest, h6->tcpdst);
3597                ipmove(source, h6->tcpsrc);
3598                psource = nhgets(h6->tcpsport);
3599                pdest = nhgets(h6->tcpdport);
3600        }
3601
3602        /* Look for a connection */
3603        for (p = tcp->conv; *p; p++) {
3604                s = *p;
3605                tcb = (Tcpctl *) s->ptcl;
3606                if ((s->rport == pdest) && (s->lport == psource)
3607                    && (tcb->state != Closed) && (ipcmp(s->raddr, dest) == 0)
3608                    && (ipcmp(s->laddr, source) == 0)) {
3609                        qlock(&s->qlock);
3610                        switch (tcb->state) {
3611                        case Syn_sent:
3612                                localclose(s, msg);
3613                                break;
3614                        }
3615                        qunlock(&s->qlock);
3616                        freeblist(bp);
3617                        return;
3618                }
3619        }
3620        freeblist(bp);
3621}
3622
3623static void tcpporthogdefensectl(char *val)
3624{
3625        if (strcmp(val, "on") == 0)
3626                tcpporthogdefense = 1;
3627        else if (strcmp(val, "off") == 0)
3628                tcpporthogdefense = 0;
3629        else
3630                error(EINVAL, "unknown value for tcpporthogdefense");
3631}
3632
3633/* called with c qlocked */
3634static void tcpctl(struct conv *c, char **f, int n)
3635{
3636        if (n == 1 && strcmp(f[0], "hangup") == 0)
3637                tcphangup(c);
3638        else if (n >= 1 && strcmp(f[0], "keepalive") == 0)
3639                tcpstartka(c, f, n);
3640        else if (n >= 1 && strcmp(f[0], "checksum") == 0)
3641                tcpsetchecksum(c, f, n);
3642        else if (n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3643                tcpporthogdefensectl(f[1]);
3644        else
3645                error(EINVAL, "unknown command to %s", __func__);
3646}
3647
3648static int tcpstats(struct Proto *tcp, char *buf, int len)
3649{
3650        struct tcppriv *priv;
3651        char *p, *e;
3652        int i;
3653
3654        priv = tcp->priv;
3655        p = buf;
3656        e = p + len;
3657        for (i = 0; i < Nstats; i++)
3658                p = seprintf(p, e, "%s: %u\n", statnames[i], priv->stats[i]);
3659        return p - buf;
3660}
3661
3662/*
3663 *  garbage collect any stale conversations:
3664 *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3665 *      - Finwait2 after 5 minutes
3666 *
3667 *  this is called whenever we run out of channels.  Both checks are
3668 *  of questionable validity so we try to use them only when we're
3669 *  up against the wall.
3670 */
3671static int tcpgc(struct Proto *tcp)
3672{
3673        struct conv *c, **pp, **ep;
3674        int n;
3675        Tcpctl *tcb;
3676
3677        n = 0;
3678        ep = &tcp->conv[tcp->nc];
3679        for (pp = tcp->conv; pp < ep; pp++) {
3680                c = *pp;
3681                if (c == NULL)
3682                        break;
3683                if (!canqlock(&c->qlock))
3684                        continue;
3685                tcb = (Tcpctl *) c->ptcl;
3686                if (tcb->state == Finwait2) {
3687                        if (NOW - tcb->time > 5 * 60 * 1000) {
3688                                localclose(c, "timed out");
3689                                n++;
3690                        }
3691                }
3692                qunlock(&c->qlock);
3693        }
3694        return n;
3695}
3696
3697static void tcpsettimer(Tcpctl *tcb)
3698{
3699        int x;
3700
3701        /* round trip dependency */
3702        x = backoff(tcb->backoff) * (tcb->srtt + MAX(4 * tcb->mdev, MSPTICK));
3703        x = DIV_ROUND_UP(x, MSPTICK);
3704
3705        /* Bounded twixt 1/2 and 64 seconds.  RFC 6298 suggested min is 1
3706         * second. */
3707        if (x < 500 / MSPTICK)
3708                x = 500 / MSPTICK;
3709        else if (x > (64000 / MSPTICK))
3710                x = 64000 / MSPTICK;
3711        tcb->timer.start = x;
3712}
3713
/* Stashed pointer to the tcppriv set up by tcpinit(), for debugging */
static struct tcppriv *debug_priv;

/* Kfunc this */
/* Dumps the TCP connection hash table.  Returns -1 if tcpinit() has not
 * run yet, 0 on success. */
int dump_tcp_ht(void)
{
	if (!debug_priv)
		return -1;
	dump_ipht(&debug_priv->ht);
	return 0;
}
3724
3725void tcpinit(struct Fs *fs)
3726{
3727        struct Proto *tcp;
3728        struct tcppriv *tpriv;
3729
3730        tcp = kzmalloc(sizeof(struct Proto), 0);
3731        tpriv = tcp->priv = kzmalloc(sizeof(struct tcppriv), 0);
3732        debug_priv = tpriv;
3733        qlock_init(&tpriv->tl);
3734        qlock_init(&tpriv->apl);
3735        tcp->name = "tcp";
3736        tcp->connect = tcpconnect;
3737        tcp->announce = tcpannounce;
3738        tcp->bypass = tcpbypass;
3739        tcp->ctl = tcpctl;
3740        tcp->state = tcpstate;
3741        tcp->create = tcpcreate;
3742        tcp->close = tcpclose;
3743        tcp->shutdown = tcpshutdown;
3744        tcp->rcv = tcpiput;
3745        tcp->advise = tcpadvise;
3746        tcp->stats = tcpstats;
3747        tcp->inuse = tcpinuse;
3748        tcp->gc = tcpgc;
3749        tcp->ipproto = IP_TCPPROTO;
3750        tcp->nc = 4096;
3751        tcp->ptclsize = sizeof(Tcpctl);
3752        tpriv->stats[MaxConn] = tcp->nc;
3753
3754        Fsproto(fs, tcp);
3755}
3756
3757static void tcpsetscale(struct conv *s, Tcpctl *tcb, uint16_t rcvscale,
3758                        uint16_t sndscale)
3759{
3760        if (rcvscale) {
3761                tcb->rcv.scale = rcvscale & 0xff;
3762                tcb->snd.scale = sndscale & 0xff;
3763                tcb->window = QMAX << tcb->rcv.scale;
3764        } else {
3765                tcb->rcv.scale = 0;
3766                tcb->snd.scale = 0;
3767                tcb->window = QMAX;
3768        }
3769}
3770