akaros/kern/src/net/devip.c
<<
>>
Prefs
/* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
 * Portions Copyright © 1997-1999 Vita Nuova Limited
 * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
 *                                (www.vitanuova.com)
 * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
 *
 * Modified for the Akaros operating system:
 * Copyright (c) 2013-2014 The Regents of the University of California
 * Copyright (c) 2013-2015 Google Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE. */
  28
  29#include <slab.h>
  30#include <kmalloc.h>
  31#include <kref.h>
  32#include <string.h>
  33#include <stdio.h>
  34#include <assert.h>
  35#include <error.h>
  36#include <cpio.h>
  37#include <pmap.h>
  38#include <smp.h>
  39#include <net/ip.h>
  40
  41struct dev ipdevtab;
  42
  43static char *devname(void)
  44{
  45        return ipdevtab.name;
  46}
  47
  48enum {
  49        Qtopdir = 1,    /* top level directory */
  50        Qtopbase,
  51        Qarp = Qtopbase,
  52        Qndb,
  53        Qiproute,
  54        Qiprouter,
  55        Qipselftab,
  56        Qlog,
  57
  58        Qprotodir,      /* directory for a protocol */
  59        Qprotobase,
  60        Qclone = Qprotobase,
  61        Qstats,
  62
  63        Qconvdir,       /* directory for a conversation */
  64        Qconvbase,
  65        Qctl = Qconvbase,
  66        Qdata,
  67        Qerr,
  68        Qlisten,
  69        Qlocal,
  70        Qremote,
  71        Qstatus,
  72        Qsnoop,
  73
  74        Logtype = 5,
  75        Masktype = (1 << Logtype) - 1,
  76        Logconv = 12,
  77        Maskconv = (1 << Logconv) - 1,
  78        Shiftconv = Logtype,
  79        Logproto = 8,
  80        Maskproto = (1 << Logproto) - 1,
  81        Shiftproto = Logtype + Logconv,
  82
  83        Nfs = 32,
  84        BYPASS_QMAX = 64 * MiB,
  85        IPROUTE_LEN = 2 * PGSIZE,
  86};
  /* Unpack the file type, conversation index and protocol index from a qid. */
  #define TYPE(x)  (((uint32_t)(x).path) & Masktype)
  #define CONV(x)  ((((uint32_t)(x).path) >> Shiftconv) & Maskconv)
  #define PROTO(x) ((((uint32_t)(x).path) >> Shiftproto) & Maskproto)
  /* Pack protocol index p, conversation index c and file type y into a path. */
  #define QID(p, c, y) (((p) << (Shiftproto)) | ((c) << Shiftconv) | (y))
  /* Owner name used for the files that are not owned by a conversation's user. */
  static char network[] = "network";
  92
  93qlock_t fslock;
  94struct Fs *ipfs[Nfs];                   /* attached fs's */
  95struct queue *qlog;
  96
  97extern void nullmediumlink(void);
  98extern void pktmediumlink(void);
  99extern struct username eve;
 100static long ndbwrite(struct Fs *, char *unused_char_p_t, uint32_t, int);
 101static void closeconv(struct conv *);
 102static void setup_proto_qio_bypass(struct conv *cv);
 103static void undo_proto_qio_bypass(struct conv *cv);
 104static int connected(void *a);
 105
 106static struct conv *chan2conv(struct chan *chan)
 107{
 108        /* That's a lot of pointers to get to the conv! */
 109        return ipfs[chan->dev]->p[PROTO(chan->qid)]->conv[CONV(chan->qid)];
 110}
 111
 112static inline int founddevdir(struct chan *c, struct qid q, char *n,
 113                              int64_t length, char *user, long perm,
 114                              struct dir *db)
 115{
 116        devdir(c, q, n, length, user, perm, db);
 117        return 1;
 118}
 119
 120static int topdirgen(struct chan *c, struct dir *dp)
 121{
 122        struct qid q;
 123
 124        mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR);
 125        snprintf(get_cur_genbuf(), GENBUF_SZ, "#%s%lu", devname(), c->dev);
 126        return founddevdir(c, q, get_cur_genbuf(), 0, network, 0555, dp);
 127}
 128
 129/* Computes the perm field for a stat for Qdata.  Since select() polls the
 130 * 'actionability' of a socket via the qdata FD, we'll also report listenable
 131 * and connected conversations.  It's a minor hack.  =( */
 132static int qdata_stat_perm(struct conv *cv)
 133{
 134        int perm;
 135
 136        perm = cv->perm;
 137        /* If there is ever a listener, then it's readable.  Ideally, we'd only
 138         * report this on the Qlisten file (which we also do).  The socket crap
 139         * should never use a listening socket for data, so there shouldn't be
 140         * any confusion when a Qdata shows up as readable. */
 141        perm |= cv->incall ? DMREADABLE : 0;
 142        /* For connectable convs, they need to be both connected and qio
 143         * readable/writable.  The way to think about this is that the convs are
 144         * not truly writable/readable until they are connected.  Conveniently,
 145         * this means that when select polls Qdata for non-blocking connect(), a
 146         * connected conversation pops up as writable (the qio is writable too).
 147         *
 148         * Note that a conversation can be 'Connected' even if it failed to
 149         * connect.  At least that's what the 9ns TCP code does.  It's more like
 150         * "the protocol did what it needed and the connectctlmsg call (or its
 151         * non-blocking equivalent) is done".  For instance, TCP has a few
 152         * reasons to call Fsconnected, such as when we send the SYN and get a
 153         * RST. */
 154        if (!cv->p->connect || connected(cv)) {
 155                perm |= qreadable(cv->rq) ? DMREADABLE : 0;
 156                perm |= qwritable(cv->wq) ? DMWRITABLE : 0;
 157        }
 158        return perm;
 159}
 160
 161static int ip3gen(struct chan *c, int i, struct dir *dp)
 162{
 163        struct qid q;
 164        struct conv *cv;
 165        char *p;
 166        int perm;
 167
 168        cv = chan2conv(c);
 169        if (cv->owner == NULL)
 170                kstrdup(&cv->owner, eve.name);
 171        mkqid(&q, QID(PROTO(c->qid), CONV(c->qid), i), 0, QTFILE);
 172
 173        switch (i) {
 174        default:
 175                return -1;
 176        case Qctl:
 177                return founddevdir(c, q, "ctl", 0, cv->owner, cv->perm, dp);
 178        case Qdata:
 179                perm = qdata_stat_perm(cv);
 180                return founddevdir(c, q, "data", qlen(cv->rq), cv->owner, perm,
 181                                   dp);
 182        case Qerr:
 183                perm = cv->perm;
 184                perm |= qreadable(cv->eq) ? DMREADABLE : 0;
 185                return founddevdir(c, q, "err", qlen(cv->eq), cv->owner, perm,
 186                                   dp);
 187        case Qlisten:
 188                perm = cv->perm;
 189                perm |= cv->incall ? DMREADABLE : 0;
 190                return founddevdir(c, q, "listen", 0, cv->owner, perm, dp);
 191        case Qlocal:
 192                p = "local";
 193                break;
 194        case Qremote:
 195                p = "remote";
 196                break;
 197        case Qsnoop:
 198                if (strcmp(cv->p->name, "ipifc") != 0)
 199                        return -1;
 200                perm = 0400;
 201                perm |= qreadable(cv->sq) ? DMREADABLE : 0;
 202                return founddevdir(c, q, "snoop", qlen(cv->sq), cv->owner, perm,
 203                                   dp);
 204        case Qstatus:
 205                p = "status";
 206                break;
 207        }
 208        return founddevdir(c, q, p, 0, cv->owner, 0444, dp);
 209}
 210
 211static int ip2gen(struct chan *c, int i, struct dir *dp)
 212{
 213        struct qid q;
 214
 215        mkqid(&q, QID(PROTO(c->qid), 0, i), 0, QTFILE);
 216        switch (i) {
 217        case Qclone:
 218                return founddevdir(c, q, "clone", 0, network, 0666, dp);
 219        case Qstats:
 220                return founddevdir(c, q, "stats", 0, network, 0444, dp);
 221        }
 222        return -1;
 223}
 224
 225static int ip1gen(struct chan *c, int i, struct dir *dp)
 226{
 227        struct qid q;
 228        char *p;
 229        int prot;
 230        int len = 0;
 231        struct Fs *f;
 232        extern uint32_t kerndate;
 233
 234        f = ipfs[c->dev];
 235
 236        prot = 0666;
 237        mkqid(&q, QID(0, 0, i), 0, QTFILE);
 238        switch (i) {
 239        default:
 240                return -1;
 241        case Qarp:
 242                p = "arp";
 243                break;
 244        case Qndb:
 245                p = "ndb";
 246                len = strlen(f->ndb);
 247                q.vers = f->ndbvers;
 248                break;
 249        case Qiproute:
 250                p = "iproute";
 251                break;
 252        case Qipselftab:
 253                p = "ipselftab";
 254                prot = 0444;
 255                break;
 256        case Qiprouter:
 257                p = "iprouter";
 258                break;
 259        case Qlog:
 260                p = "log";
 261                break;
 262        }
 263        devdir(c, q, p, len, network, prot, dp);
 264        if (i == Qndb && f->ndbmtime > kerndate)
 265                dp->mtime.tv_sec = f->ndbmtime;
 266        return 1;
 267}
 268
 269static int ipgen(struct chan *c, char *unused_char_p_t, struct dirtab *d,
 270                 int unused_int, int s, struct dir *dp)
 271{
 272        struct qid q;
 273        struct conv *cv;
 274        struct Fs *f;
 275
 276        f = ipfs[c->dev];
 277
 278        switch (TYPE(c->qid)) {
 279        case Qtopdir:
 280                if (s == DEVDOTDOT)
 281                        return topdirgen(c, dp);
 282                if (s < f->np) {
 283                        /* protocol with no user interface */
 284                        if (f->p[s]->connect == NULL)
 285                                return 0;
 286                        mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
 287                        return founddevdir(c, q, f->p[s]->name, 0, network,
 288                                           0555, dp);
 289                }
 290                s -= f->np;
 291                return ip1gen(c, s + Qtopbase, dp);
 292        case Qarp:
 293        case Qndb:
 294        case Qlog:
 295        case Qiproute:
 296        case Qiprouter:
 297        case Qipselftab:
 298                return ip1gen(c, TYPE(c->qid), dp);
 299        case Qprotodir:
 300                if (s == DEVDOTDOT)
 301                        return topdirgen(c, dp);
 302                else if (s < f->p[PROTO(c->qid)]->ac) {
 303                        cv = f->p[PROTO(c->qid)]->conv[s];
 304                        snprintf(get_cur_genbuf(), GENBUF_SZ, "%d", s);
 305                        mkqid(&q, QID(PROTO(c->qid), s, Qconvdir), 0, QTDIR);
 306                        return founddevdir(c, q, get_cur_genbuf(), 0, cv->owner,
 307                                           0555, dp);
 308                }
 309                s -= f->p[PROTO(c->qid)]->ac;
 310                return ip2gen(c, s + Qprotobase, dp);
 311        case Qclone:
 312        case Qstats:
 313                return ip2gen(c, TYPE(c->qid), dp);
 314        case Qconvdir:
 315                if (s == DEVDOTDOT) {
 316                        s = PROTO(c->qid);
 317                        mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
 318                        devdir(c, q, f->p[s]->name, 0, network, 0555, dp);
 319                        return 1;
 320                }
 321                return ip3gen(c, s + Qconvbase, dp);
 322        case Qctl:
 323        case Qdata:
 324        case Qerr:
 325        case Qlisten:
 326        case Qlocal:
 327        case Qremote:
 328        case Qstatus:
 329        case Qsnoop:
 330                return ip3gen(c, TYPE(c->qid), dp);
 331        }
 332        return -1;
 333}
 334
 335static void ipinit(void)
 336{
 337        qlock_init(&fslock);
 338        nullmediumlink();
 339        pktmediumlink();
 340/* if only
 341        fmtinstall('i', eipfmt);
 342        fmtinstall('I', eipfmt);
 343        fmtinstall('E', eipfmt);
 344        fmtinstall('V', eipfmt);
 345        fmtinstall('M', eipfmt);
 346*/
 347}
 348
 /* Reset hook for the device.  Intentionally does nothing. */
 static void ipreset(void)
 {
         /* nothing to reset */
 }
 352
 353static struct Fs *ipgetfs(int dev)
 354{
 355        extern void (*ipprotoinit[]) (struct Fs *);
 356        struct Fs *f;
 357        int i;
 358
 359        if (dev >= Nfs)
 360                return NULL;
 361
 362        qlock(&fslock);
 363        if (ipfs[dev] == NULL) {
 364                f = kzmalloc(sizeof(struct Fs), MEM_WAIT);
 365                rwinit(&f->rwlock);
 366                qlock_init(&f->iprouter.qlock);
 367                ip_init(f);
 368                arpinit(f);
 369                netloginit(f);
 370                for (i = 0; ipprotoinit[i]; i++)
 371                        ipprotoinit[i] (f);
 372                f->dev = dev;
 373                ipfs[dev] = f;
 374        }
 375        qunlock(&fslock);
 376
 377        return ipfs[dev];
 378}
 379
 380struct IPaux *newipaux(char *owner, char *tag)
 381{
 382        struct IPaux *a;
 383        int n;
 384
 385        a = kzmalloc(sizeof(*a), 0);
 386        kstrdup(&a->owner, owner);
 387        memset(a->tag, ' ', sizeof(a->tag));
 388        n = strlen(tag);
 389        if (n > sizeof(a->tag))
 390                n = sizeof(a->tag);
 391        memmove(a->tag, tag, n);
 392        return a;
 393}
 394
 /* Owner name stored in the struct IPaux that chan c's aux pointer refers to. */
 #define ATTACHER(c) (((struct IPaux *)((c)->aux))->owner)
 396
 397static struct chan *ipattach(char *spec)
 398{
 399        struct chan *c;
 400        int dev;
 401
 402        dev = atoi(spec);
 403        if (dev >= Nfs)
 404                error(EFAIL, "bad specification");
 405
 406        ipgetfs(dev);
 407        c = devattach(devname(), spec);
 408        mkqid(&c->qid, QID(0, 0, Qtopdir), 0, QTDIR);
 409        c->dev = dev;
 410
 411        c->aux = newipaux(commonuser(), "none");
 412
 413        return c;
 414}
 415
 416static struct walkqid *ipwalk(struct chan *c, struct chan *nc, char **name,
 417                                                          unsigned int nname)
 418{
 419        struct IPaux *a = c->aux;
 420        struct walkqid *w;
 421
 422        w = devwalk(c, nc, name, nname, NULL, 0, ipgen);
 423        if (w != NULL && w->clone != NULL)
 424                w->clone->aux = newipaux(a->owner, a->tag);
 425        return w;
 426}
 427
 428static size_t ipstat(struct chan *c, uint8_t *db, size_t n)
 429{
 430        return devstat(c, db, n, NULL, 0, ipgen);
 431}
 432
 433static int should_wake(void *arg)
 434{
 435        struct conv *cv = arg;
 436        /* signal that the conv is closed */
 437        if (qisclosed(cv->rq))
 438                return TRUE;
 439        return cv->incall != NULL;
 440}
 441
 442static struct chan *ipopen(struct chan *c, int omode)
 443{
 444        ERRSTACK(2);
 445        struct conv *cv, *nc;
 446        struct Proto *p;
 447        int perm;
 448        struct Fs *f;
 449
 450        /* perm is a lone rwx, not the rwx------ from the conversion */
 451        perm = omode_to_rwx(omode) >> 6;
 452
 453        f = ipfs[c->dev];
 454
 455        switch (TYPE(c->qid)) {
 456        default:
 457                break;
 458        case Qndb:
 459                if (omode & (O_WRITE | O_TRUNC) && !iseve())
 460                        error(EPERM, ERROR_FIXME);
 461                if ((omode & (O_WRITE | O_TRUNC)) == (O_WRITE | O_TRUNC))
 462                        f->ndb[0] = 0;
 463                break;
 464        case Qlog:
 465                netlogopen(f);
 466                break;
 467        case Qiprouter:
 468                iprouteropen(f);
 469                break;
 470        case Qiproute:
 471                c->synth_buf = kpages_zalloc(IPROUTE_LEN, MEM_WAIT);
 472                routeread(f, c->synth_buf, 0, IPROUTE_LEN);
 473                break;
 474        case Qtopdir:
 475        case Qprotodir:
 476        case Qconvdir:
 477        case Qstatus:
 478        case Qremote:
 479        case Qlocal:
 480        case Qstats:
 481        case Qipselftab:
 482                if (omode & O_WRITE)
 483                        error(EPERM, ERROR_FIXME);
 484                break;
 485        case Qsnoop:
 486                if (omode & O_WRITE)
 487                        error(EPERM, ERROR_FIXME);
 488                /* might be racy. note the lack of a proto lock, unlike Qdata */
 489                p = f->p[PROTO(c->qid)];
 490                cv = p->conv[CONV(c->qid)];
 491                if (strcmp(ATTACHER(c), cv->owner) != 0 && !iseve())
 492                        error(EPERM, ERROR_FIXME);
 493                atomic_inc(&cv->snoopers);
 494                break;
 495        case Qclone:
 496                p = f->p[PROTO(c->qid)];
 497                qlock(&p->qlock);
 498                if (waserror()) {
 499                        qunlock(&p->qlock);
 500                        nexterror();
 501                }
 502                cv = Fsprotoclone(p, ATTACHER(c));
 503                qunlock(&p->qlock);
 504                poperror();
 505                if (cv == NULL) {
 506                        error(ENODEV, "Null conversation from Fsprotoclone");
 507                        break;
 508                }
 509                mkqid(&c->qid, QID(p->x, cv->x, Qctl), 0, QTFILE);
 510                break;
 511        case Qdata:
 512        case Qctl:
 513        case Qerr:
 514                p = f->p[PROTO(c->qid)];
 515                qlock(&p->qlock);
 516                cv = p->conv[CONV(c->qid)];
 517                qlock(&cv->qlock);
 518                if (waserror()) {
 519                        qunlock(&cv->qlock);
 520                        qunlock(&p->qlock);
 521                        nexterror();
 522                }
 523                if ((perm & (cv->perm >> 6)) != perm) {
 524                        if (strcmp(ATTACHER(c), cv->owner) != 0)
 525                                error(EPERM, ERROR_FIXME);
 526                        if ((perm & cv->perm) != perm)
 527                                error(EPERM, ERROR_FIXME);
 528
 529                }
 530                cv->inuse++;
 531                if (cv->inuse == 1) {
 532                        kstrdup(&cv->owner, ATTACHER(c));
 533                        cv->perm = 0660;
 534                }
 535                qunlock(&cv->qlock);
 536                qunlock(&p->qlock);
 537                poperror();
 538                break;
 539        case Qlisten:
 540                cv = f->p[PROTO(c->qid)]->conv[CONV(c->qid)];
 541                /* No permissions or Announce checks required.  We'll see if
 542                 * that's a good idea or not. (the perm check would do nothing,
 543                 * as is, since an O_PATH perm is 0).
 544                 *
 545                 * But we probably want to incref to keep the conversation
 546                 * around until this FD/chan is closed.  #ip is a little weird
 547                 * in that objects never really go away (high water mark for
 548                 * convs, you can always find them in the ns).  I think it is
 549                 * possible to namec/ipgen a chan, then have that conv close,
 550                 * then have that chan be opened.  You can probably do this with
 551                 * a data file. */
 552                if (omode & O_PATH) {
 553                        qlock(&cv->qlock);
 554                        cv->inuse++;
 555                        qunlock(&cv->qlock);
 556                        break;
 557                }
 558                if ((perm & (cv->perm >> 6)) != perm) {
 559                        if (strcmp(ATTACHER(c), cv->owner) != 0)
 560                                error(EPERM, ERROR_FIXME);
 561                        if ((perm & cv->perm) != perm)
 562                                error(EPERM, ERROR_FIXME);
 563
 564                }
 565
 566                if (cv->state != Announced)
 567                        error(EFAIL, "not announced");
 568
 569                if (waserror()) {
 570                        closeconv(cv);
 571                        nexterror();
 572                }
 573                qlock(&cv->qlock);
 574                cv->inuse++;
 575                qunlock(&cv->qlock);
 576
 577                nc = NULL;
 578                while (nc == NULL) {
 579                        /* give up if we got a hangup */
 580                        if (qisclosed(cv->rq))
 581                                error(EFAIL, "listen hungup");
 582
 583                        qlock(&cv->listenq);
 584                        if (waserror()) {
 585                                qunlock(&cv->listenq);
 586                                nexterror();
 587                        }
 588                        /* we can peek at incall without grabbing the cv qlock.
 589                         * if anything is there, it'll remain there until we
 590                         * dequeue it.  no one else can, since we hold the
 591                         * listenq lock */
 592                        if ((c->flag & O_NONBLOCK) && !cv->incall)
 593                                error(EAGAIN, "listen queue empty");
 594                        /* wait for a connect */
 595                        rendez_sleep(&cv->listenr, should_wake, cv);
 596
 597                        /* if there is a concurrent hangup, they will hold the
 598                         * qlock until the hangup is complete, including closing
 599                         * the cv->rq */
 600                        qlock(&cv->qlock);
 601                        nc = cv->incall;
 602                        if (nc != NULL) {
 603                                cv->incall = nc->next;
 604                                mkqid(&c->qid, QID(PROTO(c->qid), nc->x, Qctl),
 605                                      0, QTFILE);
 606                                kstrdup(&cv->owner, ATTACHER(c));
 607                        }
 608                        qunlock(&cv->qlock);
 609
 610                        qunlock(&cv->listenq);
 611                        poperror();
 612                }
 613                closeconv(cv);
 614                poperror();
 615                break;
 616        }
 617        c->mode = openmode(omode);
 618        c->flag |= COPEN;
 619        c->offset = 0;
 620        return c;
 621}
 622
 623static size_t ipwstat(struct chan *c, uint8_t *dp, size_t n)
 624{
 625        ERRSTACK(2);
 626        struct dir *d;
 627        struct conv *cv;
 628        struct Fs *f;
 629        struct Proto *p;
 630
 631        f = ipfs[c->dev];
 632        switch (TYPE(c->qid)) {
 633        default:
 634                error(EPERM, ERROR_FIXME);
 635                break;
 636        case Qctl:
 637        case Qdata:
 638                break;
 639        }
 640
 641        d = kzmalloc(sizeof(*d) + n, 0);
 642        if (waserror()) {
 643                kfree(d);
 644                nexterror();
 645        }
 646        n = convM2D(dp, n, d, (char *)&d[1]);
 647        if (n == 0)
 648                error(ENODATA, ERROR_FIXME);
 649        p = f->p[PROTO(c->qid)];
 650        cv = p->conv[CONV(c->qid)];
 651        if (!iseve() && strcmp(ATTACHER(c), cv->owner) != 0)
 652                error(EPERM, ERROR_FIXME);
 653        if (!emptystr(d->uid))
 654                kstrdup(&cv->owner, d->uid);
 655        if (d->mode != -1)
 656                cv->perm = d->mode & 0777;
 657        poperror();
 658        kfree(d);
 659        return n;
 660}
 661
 662/* Should be able to handle any file type chan. Feel free to extend it. */
 663static char *ipchaninfo(struct chan *ch, char *ret, size_t ret_l)
 664{
 665        struct conv *conv;
 666        struct Proto *proto;
 667        char *p;
 668        struct Fs *f;
 669
 670        f = ipfs[ch->dev];
 671
 672        switch (TYPE(ch->qid)) {
 673        default:
 674                ret = "Unknown type";
 675                break;
 676        case Qdata:
 677                proto = f->p[PROTO(ch->qid)];
 678                conv = proto->conv[CONV(ch->qid)];
 679                snprintf(ret, ret_l,
 680                         "Qdata, %s, proto %s, conv idx %d, rq len %d, wq len %d, total read %llu",
 681                         SLIST_EMPTY(&conv->data_taps) ? "untapped" : "tapped",
 682                         proto->name, conv->x, qlen(conv->rq), qlen(conv->wq),
 683                                 q_bytes_read(conv->rq));
 684                break;
 685        case Qarp:
 686                ret = "Qarp";
 687                break;
 688        case Qiproute:
 689                ret = "Qiproute";
 690                break;
 691        case Qlisten:
 692                proto = f->p[PROTO(ch->qid)];
 693                conv = proto->conv[CONV(ch->qid)];
 694                snprintf(ret, ret_l,
 695                         "Qlisten, %s proto %s, conv idx %d, has %sincalls",
 696                         SLIST_EMPTY(&conv->listen_taps) ? "untapped"
 697                                                         : "tapped",
 698                         proto->name, conv->x, conv->incall ? "" : "no ");
 699                break;
 700        case Qlog:
 701                ret = "Qlog";
 702                break;
 703        case Qndb:
 704                ret = "Qndb";
 705                break;
 706        case Qctl:
 707                proto = f->p[PROTO(ch->qid)];
 708                conv = proto->conv[CONV(ch->qid)];
 709                snprintf(ret, ret_l, "Qctl, proto %s, conv idx %d", proto->name,
 710                                 conv->x);
 711                break;
 712        }
 713        return ret;
 714}
 715
 716static void closeconv(struct conv *cv)
 717{
 718        ERRSTACK(1);
 719        struct conv *nc;
 720        struct Ipmulti *mp;
 721
 722        qlock(&cv->qlock);
 723
 724        if (--cv->inuse > 0) {
 725                qunlock(&cv->qlock);
 726                return;
 727        }
 728        if (waserror()) {
 729                qunlock(&cv->qlock);
 730                nexterror();
 731        }
 732        /* close all incoming calls since no listen will ever happen */
 733        for (nc = cv->incall; nc; nc = cv->incall) {
 734                cv->incall = nc->next;
 735                closeconv(nc);
 736        }
 737        cv->incall = NULL;
 738
 739        kstrdup(&cv->owner, network);
 740        cv->perm = 0660;
 741
 742        while ((mp = cv->multi) != NULL)
 743                ipifcremmulti(cv, mp->ma, mp->ia);
 744
 745        cv->r = NULL;
 746        cv->rgen = 0;
 747        if (cv->state == Bypass)
 748                undo_proto_qio_bypass(cv);
 749        cv->p->close(cv);
 750        cv->state = Idle;
 751        qunlock(&cv->qlock);
 752        poperror();
 753}
 754
 755static void ipclose(struct chan *c)
 756{
 757        struct Fs *f;
 758
 759        f = ipfs[c->dev];
 760        switch (TYPE(c->qid)) {
 761        default:
 762                break;
 763        case Qlog:
 764                if (c->flag & COPEN)
 765                        netlogclose(f);
 766                break;
 767        case Qiprouter:
 768                if (c->flag & COPEN)
 769                        iprouterclose(f);
 770                break;
 771        case Qdata:
 772        case Qctl:
 773        case Qerr:
 774        case Qlisten:
 775                if (c->flag & COPEN)
 776                        closeconv(f->p[PROTO(c->qid)]->conv[CONV(c->qid)]);
 777                break;
 778        case Qsnoop:
 779                if (c->flag & COPEN)
 780                        atomic_dec(&f->p[PROTO(c->qid)]->conv[CONV(c->qid)]->snoopers);
 781                break;
 782        case Qiproute:
 783                if (c->flag & COPEN) {
 784                        kpages_free(c->synth_buf, IPROUTE_LEN);
 785                        c->synth_buf = NULL;
 786                }
 787                break;
 788        }
 789        kfree(((struct IPaux *)c->aux)->owner);
 790        kfree(c->aux);
 791}
 792
 /* Size of the scratch buffer ipread() formats the remote, local, status and
  * stats files into. */
 enum {
         Statelen = 32 * 1024,
 };
 796
 797static size_t ipread(struct chan *ch, void *a, size_t n, off64_t off)
 798{
 799        struct conv *c;
 800        struct Proto *x;
 801        char *buf, *p;
 802        long rv;
 803        struct Fs *f;
 804        uint32_t offset = off;
 805
 806        f = ipfs[ch->dev];
 807
 808        p = a;
 809        switch (TYPE(ch->qid)) {
 810        default:
 811                error(EPERM, ERROR_FIXME);
 812        case Qtopdir:
 813        case Qprotodir:
 814        case Qconvdir:
 815                return devdirread(ch, a, n, 0, 0, ipgen);
 816        case Qarp:
 817                return arpread(f->arp, a, offset, n);
 818        case Qndb:
 819                return readstr(offset, a, n, f->ndb);
 820        case Qiproute:
 821                return readmem(offset, a, n, ch->synth_buf, IPROUTE_LEN);
 822        case Qiprouter:
 823                return iprouterread(f, a, n);
 824        case Qipselftab:
 825                return ipselftabread(f, a, offset, n);
 826        case Qlog:
 827                return netlogread(f, a, offset, n);
 828        case Qctl:
 829                snprintf(get_cur_genbuf(), GENBUF_SZ, "%lu", CONV(ch->qid));
 830                return readstr(offset, p, n, get_cur_genbuf());
 831        case Qremote:
 832                buf = kzmalloc(Statelen, 0);
 833                x = f->p[PROTO(ch->qid)];
 834                c = x->conv[CONV(ch->qid)];
 835                if (x->remote == NULL) {
 836                        snprintf(buf, Statelen, "%I!%d\n", c->raddr, c->rport);
 837                } else {
 838                        (*x->remote) (c, buf, Statelen - 2);
 839                }
 840                rv = readstr(offset, p, n, buf);
 841                kfree(buf);
 842                return rv;
 843        case Qlocal:
 844                buf = kzmalloc(Statelen, 0);
 845                x = f->p[PROTO(ch->qid)];
 846                c = x->conv[CONV(ch->qid)];
 847                if (x->local == NULL) {
 848                        snprintf(buf, Statelen, "%I!%d\n", c->laddr, c->lport);
 849                } else {
 850                        (*x->local) (c, buf, Statelen - 2);
 851                }
 852                rv = readstr(offset, p, n, buf);
 853                kfree(buf);
 854                return rv;
 855        case Qstatus:
 856                /* this all is a bit screwed up since the size of some state's
 857                 * buffers will change from one invocation to another.  a reader
 858                 * will come in and read the entire buffer.  then it will come
 859                 * again and read from the next offset, expecting EOF.  if the
 860                 * buffer changed sizes, it'll reprint the end of the buffer
 861                 * slightly. */
 862                buf = kzmalloc(Statelen, 0);
 863                x = f->p[PROTO(ch->qid)];
 864                c = x->conv[CONV(ch->qid)];
 865                if (c->state == Bypass)
 866                        snprintf(buf, Statelen, "Bypassed\n");
 867                else
 868                        (*x->state)(c, buf, Statelen - 2);
 869                rv = readstr(offset, p, n, buf);
 870                kfree(buf);
 871                return rv;
 872        case Qdata:
 873                c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
 874                if (ch->flag & O_NONBLOCK)
 875                        return qread_nonblock(c->rq, a, n);
 876                else
 877                        return qread(c->rq, a, n);
 878        case Qerr:
 879                c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
 880                return qread(c->eq, a, n);
 881        case Qsnoop:
 882                c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
 883                return qread(c->sq, a, n);
 884        case Qstats:
 885                x = f->p[PROTO(ch->qid)];
 886                if (x->stats == NULL)
 887                        error(EFAIL, "stats not implemented");
 888                buf = kzmalloc(Statelen, 0);
 889                (*x->stats) (x, buf, Statelen);
 890                rv = readstr(offset, p, n, buf);
 891                kfree(buf);
 892                return rv;
 893        }
 894}
 895
 896static struct block *ipbread(struct chan *ch, size_t n, off64_t offset)
 897{
 898        struct conv *c;
 899
 900        switch (TYPE(ch->qid)) {
 901        case Qdata:
 902                c = chan2conv(ch);
 903                if (ch->flag & O_NONBLOCK)
 904                        return qbread_nonblock(c->rq, n);
 905                else
 906                        return qbread(c->rq, n);
 907        default:
 908                return devbread(ch, n, offset);
 909        }
 910}
 911
 912/*
 913 *  set local address to be that of the ifc closest to remote address
 914 */
 915static void setladdr(struct conv *c)
 916{
 917        findlocalip(c->p->f, c->laddr, c->raddr);
 918}
 919
 920/*
 921 *  set a local port making sure the quad of raddr,rport,laddr,lport is unique
 922 */
 923static void setluniqueport(struct conv *c, int lport)
 924{
 925        struct Proto *p;
 926        struct conv *xp;
 927        int x;
 928
 929        p = c->p;
 930
 931        qlock(&p->qlock);
 932        for (x = 0; x < p->nc; x++) {
 933                xp = p->conv[x];
 934                if (xp == NULL)
 935                        break;
 936                if (xp == c)
 937                        continue;
 938                if ((xp->state == Connected || xp->state == Announced
 939                                            || xp->state == Bypass)
 940                        && xp->lport == lport
 941                        && xp->rport == c->rport
 942                        && ipcmp(xp->raddr, c->raddr) == 0
 943                        && ipcmp(xp->laddr, c->laddr) == 0) {
 944                        qunlock(&p->qlock);
 945                        error(EFAIL, "address in use");
 946                }
 947        }
 948        c->lport = lport;
 949        qunlock(&p->qlock);
 950}
 951
 952/*
 953 *  pick a local port and set it
 954 */
 955static void setlport(struct conv *c)
 956{
 957        struct Proto *p;
 958        uint16_t *pp;
 959        int x, found;
 960
 961        p = c->p;
 962        if (c->restricted)
 963                pp = &p->nextrport;
 964        else
 965                pp = &p->nextport;
 966        qlock(&p->qlock);
 967        for (;; (*pp)++) {
 968                /*
 969                 * Fsproto initialises p->nextport to 0 and the restricted
 970                 * ports (p->nextrport) to 600.
 971                 * Restricted ports must lie between 600 and 1024.  For the
 972                 * initial condition or if the unrestricted port number has
 973                 * wrapped round, select a random port between 5000 and 1<<15 to
 974                 * start at.
 975                 */
 976                if (c->restricted) {
 977                        if (*pp >= 1024)
 978                                *pp = 600;
 979                } else
 980                        while (*pp < 5000)
 981                                urandom_read(pp, sizeof(*pp));
 982
 983                found = 0;
 984                for (x = 0; x < p->nc; x++) {
 985                        if (p->conv[x] == NULL)
 986                                break;
 987                        if (p->conv[x]->lport == *pp) {
 988                                found = 1;
 989                                break;
 990                        }
 991                }
 992                if (!found)
 993                        break;
 994        }
 995        c->lport = (*pp)++;
 996        qunlock(&p->qlock);
 997}
 998
 999/*
1000 *  set a local address and port from a string of the form
1001 *      [address!]port[!r]
1002 */
1003static void setladdrport(struct conv *c, char *str, int announcing)
1004{
1005        char *p;
1006        uint16_t lport;
1007        uint8_t addr[IPaddrlen];
1008
1009        /*
1010         *  ignore restricted part if it exists.  it's
1011         *  meaningless on local ports.
1012         */
1013        p = strchr(str, '!');
1014        if (p != NULL) {
1015                *p++ = 0;
1016                if (strcmp(p, "r") == 0)
1017                        p = NULL;
1018        }
1019
1020        c->lport = 0;
1021        if (p == NULL) {
1022                if (announcing)
1023                        ipmove(c->laddr, IPnoaddr);
1024                else
1025                        setladdr(c);
1026                p = str;
1027        } else {
1028                if (strcmp(str, "*") == 0)
1029                        ipmove(c->laddr, IPnoaddr);
1030                else {
1031                        parseip(addr, str);
1032                        if (ipforme(c->p->f, addr))
1033                                ipmove(c->laddr, addr);
1034                        else
1035                                error(EFAIL, "not a local IP address");
1036                }
1037        }
1038
1039        /* one process can get all connections */
1040        if (announcing && strcmp(p, "*") == 0) {
1041                if (!iseve())
1042                        error(EPERM, ERROR_FIXME);
1043                setluniqueport(c, 0);
1044        }
1045
1046        lport = atoi(p);
1047        if (lport <= 0)
1048                setlport(c);
1049        else
1050                setluniqueport(c, lport);
1051}
1052
1053static void setraddrport(struct conv *c, char *str)
1054{
1055        char *p;
1056
1057        p = strchr(str, '!');
1058        if (p == NULL)
1059                error(EFAIL, "malformed address");
1060        *p++ = 0;
1061        parseip(c->raddr, str);
1062        c->rport = atoi(p);
1063        p = strchr(p, '!');
1064        if (p) {
1065                if (strstr(p, "!r") != NULL)
1066                        c->restricted = 1;
1067        }
1068}
1069
1070/*
1071 *  called by protocol connect routine to set addresses
1072 */
1073void Fsstdconnect(struct conv *c, char *argv[], int argc)
1074{
1075        switch (argc) {
1076        default:
1077                error(EINVAL, "bad args to %s", __func__);
1078        case 2:
1079                setraddrport(c, argv[1]);
1080                setladdr(c);
1081                setlport(c);
1082                break;
1083        case 3:
1084                setraddrport(c, argv[1]);
1085                setladdrport(c, argv[2], 0);
1086                break;
1087        }
1088
1089        /* TODO: why is an IPnoaddr (in v6 format, equivalent to v6Unspecified),
1090         * a v4 format? */
1091        if ((memcmp(c->raddr, v4prefix, IPv4off) == 0 &&
1092                 memcmp(c->laddr, v4prefix, IPv4off) == 0)
1093                || ipcmp(c->raddr, IPnoaddr) == 0)
1094                c->ipversion = V4;
1095        else
1096                c->ipversion = V6;
1097        /* Linux has taught people to use zeros for local interfaces.  TODO: We
1098         * might need this for v6 in the future. */
1099        if (!ipcmp(c->raddr, IPv4_zeroes))
1100                ipmove(c->raddr, IPv4_loopback);
1101}
1102
1103/*
1104 *  initiate connection and sleep till its set up
1105 */
1106static int connected(void *a)
1107{
1108        return ((struct conv *)a)->state == Connected;
1109}
1110
1111static void connectctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb,
1112                          struct chan *chan)
1113{
1114        ERRSTACK(1);
1115        char *p;
1116
1117        if (c->state != 0)
1118                error(EBUSY, ERROR_FIXME);
1119        c->state = Connecting;
1120        c->cerr[0] = '\0';
1121        if (x->connect == NULL)
1122                error(EFAIL, "connect not supported");
1123        /* It's up to the proto connect method to not block the kthread.  This
1124         * is currently the case for e.g. TCP. */
1125        x->connect(c, cb->f, cb->nf);
1126        /* This is notionally right before the rendez_sleep: either we block or
1127         * we kick back to userspace.  We do this before the unlock to avoid
1128         * races with c->state (rendez's internal lock deals with its race with
1129         * the waker) and to avoid the excessive unlock and relock.
1130         *
1131         * Also, it's important that we don't do anything important for the
1132         * functionality of the conv after the rendez sleep.  The non-blocking
1133         * style won't call back into the kernel - it just wants the event.  I
1134         * considered allowing multiple connect calls, where we just return if
1135         * it was already connected, but that would break UDP, which allows
1136         * multiple different connect calls. */
1137        if ((chan->flag & O_NONBLOCK) && !connected(c))
1138                error(EINPROGRESS, "connection not ready yet");
1139        qunlock(&c->qlock);
1140        if (waserror()) {
1141                qlock(&c->qlock);
1142                nexterror();
1143        }
1144        rendez_sleep(&c->cr, connected, c);
1145        qlock(&c->qlock);
1146        poperror();
1147
1148        if (c->cerr[0] != '\0')
1149                error(EFAIL, c->cerr);
1150}
1151
1152/*
1153 *  called by protocol announce routine to set addresses
1154 */
1155void Fsstdannounce(struct conv *c, char *argv[], int argc)
1156{
1157        memset(c->raddr, 0, sizeof(c->raddr));
1158        c->rport = 0;
1159        switch (argc) {
1160        default:
1161                error(EINVAL, "bad args to announce");
1162        case 2:
1163                setladdrport(c, argv[1], 1);
1164                break;
1165        }
1166}
1167
1168/*
1169 *  initiate announcement and sleep till its set up
1170 */
1171static int announced(void *a)
1172{
1173        return ((struct conv *)a)->state == Announced;
1174}
1175
1176static void announcectlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1177{
1178        ERRSTACK(1);
1179        char *p;
1180
1181        if (c->state != 0)
1182                error(EBUSY, ERROR_FIXME);
1183        c->state = Announcing;
1184        c->cerr[0] = '\0';
1185        if (x->announce == NULL)
1186                error(EFAIL, "announce not supported");
1187        x->announce(c, cb->f, cb->nf);
1188
1189        qunlock(&c->qlock);
1190        if (waserror()) {
1191                qlock(&c->qlock);
1192                nexterror();
1193        }
1194        rendez_sleep(&c->cr, announced, c);
1195        qlock(&c->qlock);
1196        poperror();
1197
1198        if (c->cerr[0] != '\0')
1199                error(EFAIL, c->cerr);
1200}
1201
1202/*
1203 *  called by protocol bind routine to set addresses
1204 */
1205void Fsstdbind(struct conv *c, char *argv[], int argc)
1206{
1207        switch (argc) {
1208        default:
1209                error(EINVAL, "bad args to bind");
1210        case 2:
1211                setladdrport(c, argv[1], 0);
1212                break;
1213        }
1214}
1215
1216static void bindctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1217{
1218        if (x->bind == NULL)
1219                Fsstdbind(c, cb->f, cb->nf);
1220        else
1221                x->bind(c, cb->f, cb->nf);
1222}
1223
1224/* Helper, called by protocols to use the bypass.
1225 *
1226 * This is a bit nasty due to the overall nastiness of #ip.  We need to lock
1227 * before checking the state and hold the qlock throughout, because a concurrent
1228 * closeconv() could tear down the bypass.  Specifically, it could free the
1229 * bypass queues.  The root issue is that conversation lifetimes are not managed
1230 * well.
1231 *
1232 * If we fail, it's our responsibility to consume (free) the block(s). */
1233void bypass_or_drop(struct conv *cv, struct block *bp)
1234{
1235        qlock(&cv->qlock);
1236        if (cv->state == Bypass)
1237                qpass(cv->rq, bp);
1238        else
1239                freeblist(bp);
1240        qunlock(&cv->qlock);
1241}
1242
1243/* Push the block directly to the approprite ipoput function.
1244 *
1245 * It's the protocol's responsibility (and thus ours here) to make sure there is
1246 * at least the right amount of the IP header in the block (ipoput{4,6} assumes
1247 * it has the right amount, and the other protocols account for the IP header in
1248 * their own header).
1249 *
1250 * For the TTL and TOS, we just use the default ones.  If we want, we could look
1251 * into the actual block and see what the user wanted, though we're bypassing
1252 * the protocol layer, not the IP layer. */
1253static void proto_bypass_kick(void *arg, struct block *bp)
1254{
1255        struct conv *cv = (struct conv*)arg;
1256        uint8_t vers_nibble;
1257        struct Fs *f;
1258
1259        f = cv->p->f;
1260
1261        bp = pullupblock(bp, 1);
1262        if (!bp)
1263                error(EINVAL, "Proto bypass unable to pullup a byte!");
1264        vers_nibble = *(uint8_t*)bp->rp & 0xf0;
1265        switch (vers_nibble) {
1266        case IP_VER4:
1267                bp = pullupblock(bp, IPV4HDR_LEN);
1268                if (!bp)
1269                        error(EINVAL,
1270                              "Proto bypass unable to pullup v4 header");
1271                ipoput4(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1272                break;
1273        case IP_VER6:
1274                bp = pullupblock(bp, IPV6HDR_LEN);
1275                if (!bp)
1276                        error(EINVAL,
1277                              "Proto bypass unable to pullup v6 header");
1278                ipoput6(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1279                break;
1280        default:
1281                error(EINVAL, "Proto bypass block had unknown IP version 0x%x",
1282                      vers_nibble);
1283        }
1284}
1285
1286/* Sets up cv for the protocol bypass.  We use different queues for two reasons:
1287 * 1) To be protocol independent.  For instance, TCP and UDP could use very
1288 * different QIO styles.
1289 * 2) To set up our own kick/bypass method.  Note how udpcreate() and here uses
1290 * qbypass() (just blast it out), while TCP uses qopen() with a kick.  TCP still
1291 * follows queuing discipline.
1292 *
1293 * It's like we are our own protocol, the bypass protocol, when it comes to how
1294 * we interact with qio.  The conv still is of the real protocol type (e.g.
1295 * TCP).
1296 *
1297 * Note that we can't free the old queues.  The way #ip works, the queues are
1298 * created when the conv is created, but the conv is never freed.  It's like a
1299 * slab allocator that never frees objects, but just reinitializes them a
1300 * little.
1301 *
1302 * For the queues, we're basically like UDP:
1303 * - We take packets for rq and drop on overflow.
1304 * - rq is also Qmsg, but we also have Qcoalesce, to ignore out zero-len blocks
1305 * - We kick for our outbound (wq) messages.
1306 *
1307 * Note that Qmsg can drop parts of packets.  It's up to the user to read
1308 * enough.  If they didn't read enough, the extra is dropped.  This is similar
1309 * to SOCK_DGRAM and recvfrom().  Minus major changes, there's no nice way to
1310 * get individual messages with read().  Userspace using the bypass will need to
1311 * find out the MTU of the NIC the IP stack is attached to, and make sure to
1312 * read in at least that amount each time. */
1313static void setup_proto_qio_bypass(struct conv *cv)
1314{
1315        cv->rq_save = cv->rq;
1316        cv->wq_save = cv->wq;
1317        cv->rq = qopen(BYPASS_QMAX, Qmsg | Qcoalesce, 0, 0);
1318        cv->wq = qbypass(proto_bypass_kick, cv);
1319}
1320
1321static void undo_proto_qio_bypass(struct conv *cv)
1322{
1323        qfree(cv->rq);
1324        qfree(cv->wq);
1325        cv->rq = cv->rq_save;
1326        cv->wq = cv->wq_save;
1327        cv->rq_save = NULL;
1328        cv->wq_save = NULL;
1329}
1330
1331void Fsstdbypass(struct conv *cv, char *argv[], int argc)
1332{
1333        memset(cv->raddr, 0, sizeof(cv->raddr));
1334        cv->rport = 0;
1335        switch (argc) {
1336        case 2:
1337                setladdrport(cv, argv[1], 1);
1338                break;
1339        default:
1340                error(EINVAL, "Bad args (was %d, need 2) to bypass", argc);
1341        }
1342}
1343
1344static void bypassctlmsg(struct Proto *x, struct conv *cv, struct cmdbuf *cb)
1345{
1346        if (!x->bypass)
1347                error(EFAIL, "Protocol %s does not support bypass", x->name);
1348        /* The protocol needs to set the port (usually by calling Fsstdbypass)
1349         * and then do whatever it needs to make sure it can find the conv again
1350         * during receive (usually by adding to a hash table). */
1351        x->bypass(cv, cb->f, cb->nf);
1352        setup_proto_qio_bypass(cv);
1353        cv->state = Bypass;
1354}
1355
1356static void shutdownctlmsg(struct conv *cv, struct cmdbuf *cb)
1357{
1358        if (cb->nf < 2)
1359                goto err;
1360        if (!strcmp(cb->f[1], "rd")) {
1361                qhangup(cv->rq, "shutdown");
1362                if (cv->p->shutdown)
1363                        cv->p->shutdown(cv, SHUT_RD);
1364        } else if (!strcmp(cb->f[1], "wr")) {
1365                qhangup(cv->wq, "shutdown");
1366                if (cv->p->shutdown)
1367                        cv->p->shutdown(cv, SHUT_WR);
1368        } else if (!strcmp(cb->f[1], "rdwr")) {
1369                qhangup(cv->rq, "shutdown");
1370                qhangup(cv->wq, "shutdown");
1371                if (cv->p->shutdown)
1372                        cv->p->shutdown(cv, SHUT_RDWR);
1373        } else {
1374                goto err;
1375        }
1376        return;
1377err:
1378        error(EINVAL, "shutdown [rx|tx|rxtx]");
1379}
1380
1381static void tosctlmsg(struct conv *c, struct cmdbuf *cb)
1382{
1383        if (cb->nf < 2)
1384                c->tos = 0;
1385        else
1386                c->tos = atoi(cb->f[1]);
1387}
1388
1389static void ttlctlmsg(struct conv *c, struct cmdbuf *cb)
1390{
1391        if (cb->nf < 2)
1392                c->ttl = MAXTTL;
1393        else
1394                c->ttl = atoi(cb->f[1]);
1395}
1396
1397/* Binds a conversation, as if the user wrote "bind *" into ctl. */
1398static void autobind(struct conv *cv)
1399{
1400        ERRSTACK(1);
1401        struct cmdbuf *cb;
1402
1403        cb = parsecmd("bind *", 7);
1404        if (waserror()) {
1405                kfree(cb);
1406                nexterror();
1407        }
1408        bindctlmsg(cv->p, cv, cb);
1409        poperror();
1410        kfree(cb);
1411}
1412
1413static size_t ipwrite(struct chan *ch, void *v, size_t n, off64_t off)
1414{
1415        ERRSTACK(1);
1416        struct conv *c;
1417        struct Proto *x;
1418        char *p;
1419        struct cmdbuf *cb;
1420        uint8_t ia[IPaddrlen], ma[IPaddrlen];
1421        struct Fs *f;
1422        char *a;
1423
1424        a = v;
1425        f = ipfs[ch->dev];
1426
1427        switch (TYPE(ch->qid)) {
1428        default:
1429                error(EPERM, ERROR_FIXME);
1430        case Qdata:
1431                x = f->p[PROTO(ch->qid)];
1432                c = x->conv[CONV(ch->qid)];
1433                /* connection-less protocols (UDP) can write without manually
1434                 * binding. */
1435                if (c->lport == 0)
1436                        autobind(c);
1437                if (ch->flag & O_NONBLOCK)
1438                        qwrite_nonblock(c->wq, a, n);
1439                else
1440                        qwrite(c->wq, a, n);
1441                break;
1442        case Qarp:
1443                return arpwrite(f, a, n);
1444        case Qiproute:
1445                return routewrite(f, ch, a, n);
1446        case Qlog:
1447                netlogctl(f, a, n);
1448                return n;
1449        case Qndb:
1450                return ndbwrite(f, a, off, n);
1451        case Qctl:
1452                x = f->p[PROTO(ch->qid)];
1453                c = x->conv[CONV(ch->qid)];
1454                cb = parsecmd(a, n);
1455
1456                qlock(&c->qlock);
1457                if (waserror()) {
1458                        qunlock(&c->qlock);
1459                        kfree(cb);
1460                        nexterror();
1461                }
1462                if (cb->nf < 1)
1463                        error(EFAIL, "short control request");
1464                if (strcmp(cb->f[0], "connect") == 0)
1465                        connectctlmsg(x, c, cb, ch);
1466                else if (strcmp(cb->f[0], "announce") == 0)
1467                        announcectlmsg(x, c, cb);
1468                else if (strcmp(cb->f[0], "bind") == 0)
1469                        bindctlmsg(x, c, cb);
1470                else if (strcmp(cb->f[0], "bypass") == 0)
1471                        bypassctlmsg(x, c, cb);
1472                else if (strcmp(cb->f[0], "shutdown") == 0)
1473                        shutdownctlmsg(c, cb);
1474                else if (strcmp(cb->f[0], "ttl") == 0)
1475                        ttlctlmsg(c, cb);
1476                else if (strcmp(cb->f[0], "tos") == 0)
1477                        tosctlmsg(c, cb);
1478                else if (strcmp(cb->f[0], "ignoreadvice") == 0)
1479                        c->ignoreadvice = 1;
1480                else if (strcmp(cb->f[0], "addmulti") == 0) {
1481                        if (cb->nf < 2)
1482                                error(EFAIL,
1483                                      "addmulti needs interface address");
1484                        if (cb->nf == 2) {
1485                                if (!ipismulticast(c->raddr))
1486                                        error(EFAIL, "addmulti for a non multicast address");
1487                                parseip(ia, cb->f[1]);
1488                                ipifcaddmulti(c, c->raddr, ia);
1489                        } else {
1490                                parseip(ma, cb->f[2]);
1491                                if (!ipismulticast(ma))
1492                                        error(EFAIL, "addmulti for a non multicast address");
1493                                parseip(ia, cb->f[1]);
1494                                ipifcaddmulti(c, ma, ia);
1495                        }
1496                } else if (strcmp(cb->f[0], "remmulti") == 0) {
1497                        if (cb->nf < 2)
1498                                error(EFAIL,
1499                                      "remmulti needs interface address");
1500                        if (!ipismulticast(c->raddr))
1501                                error(EFAIL,
1502                                      "remmulti for a non multicast address");
1503                        parseip(ia, cb->f[1]);
1504                        ipifcremmulti(c, c->raddr, ia);
1505                } else if (x->ctl != NULL) {
1506                        x->ctl(c, cb->f, cb->nf);
1507                } else
1508                        error(EFAIL, "unknown control request");
1509                qunlock(&c->qlock);
1510                kfree(cb);
1511                poperror();
1512        }
1513        return n;
1514}
1515
1516static size_t ipbwrite(struct chan *ch, struct block *bp, off64_t offset)
1517{
1518        struct conv *c;
1519        size_t n;
1520
1521        switch (TYPE(ch->qid)) {
1522        case Qdata:
1523                c = chan2conv(ch);
1524                if (bp->next)
1525                        bp = concatblock(bp);
1526                n = BLEN(bp);
1527                if (ch->flag & O_NONBLOCK)
1528                        qbwrite_nonblock(c->wq, bp);
1529                else
1530                        qbwrite(c->wq, bp);
1531                return n;
1532        default:
1533                return devbwrite(ch, bp, offset);
1534        }
1535}
1536
1537static void fire_data_taps(struct conv *conv, int filter)
1538{
1539        struct fd_tap *tap_i;
1540
1541        /* At this point, we have an event we want to send to our taps (if any).
1542         * The lock protects list integrity and the existence of the tap.
1543         *
1544         * Previously, I thought of using the conv qlock.  That actually breaks,
1545         * due to weird usages of the qlock (someone holds it for a long time,
1546         * blocking the inbound wakeup from etherread4).
1547         *
1548         * I opted for a spinlock for a couple reasons:
1549         * - fire_tap should not block.  ideally it'll be fast too (it's mostly
1550         *   a send_event).
1551         * - our callers might not want to block.  A lot of network wakeups will
1552         * come network processes (etherread4) or otherwise unrelated to this
1553         * particular conversation.  I'd rather do something like fire off a
1554         * KMSG than block those.
1555         * - if fire_tap takes a while, holding the lock only slows down other
1556         * events on this *same* conversation, or other tap registration.  not a
1557         * huge deal. */
1558        spin_lock(&conv->tap_lock);
1559        SLIST_FOREACH(tap_i, &conv->data_taps, link)
1560                fire_tap(tap_i, filter);
1561        spin_unlock(&conv->tap_lock);
1562}
1563
1564static void ip_wake_cb(struct queue *q, void *data, int filter)
1565{
1566        struct conv *conv = (struct conv*)data;
1567
1568        /* For these two, we want to ignore events on the opposite end of the
1569         * queues.  For instance, we want to know when the WQ is writable.  Our
1570         * writes will actually make it readable - we don't want to trigger a
1571         * tap for that.  However, qio doesn't know how/why we are using a
1572         * queue, or even who the ends are (hence the callbacks) */
1573        if ((filter & FDTAP_FILT_READABLE) && (q == conv->wq))
1574                return;
1575        if ((filter & FDTAP_FILT_WRITABLE) && (q == conv->rq))
1576                return;
1577        fire_data_taps(conv, filter);
1578}
1579
1580int iptapfd(struct chan *chan, struct fd_tap *tap, int cmd)
1581{
1582        struct conv *conv = chan2conv(chan);
1583        int ret;
1584
1585#define DEVIP_LEGAL_DATA_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE |     \
1586                               FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |       \
1587                               FDTAP_FILT_ERROR)
1588#define DEVIP_LEGAL_LISTEN_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_HANGUP)
1589
1590        switch (TYPE(chan->qid)) {
1591        case Qdata:
1592                if (tap->filter & ~DEVIP_LEGAL_DATA_TAPS) {
1593                        set_errno(ENOSYS);
1594                        set_errstr("Unsupported #%s data tap %p, must be %p",
1595                                   devname(), tap->filter,
1596                                   DEVIP_LEGAL_DATA_TAPS);
1597                        return -1;
1598                }
1599                spin_lock(&conv->tap_lock);
1600                switch (cmd) {
1601                case (FDTAP_CMD_ADD):
1602                        if (SLIST_EMPTY(&conv->data_taps)) {
1603                                qio_set_wake_cb(conv->rq, ip_wake_cb, conv);
1604                                qio_set_wake_cb(conv->wq, ip_wake_cb, conv);
1605                        }
1606                        SLIST_INSERT_HEAD(&conv->data_taps, tap, link);
1607                        ret = 0;
1608                        break;
1609                case (FDTAP_CMD_REM):
1610                        SLIST_REMOVE(&conv->data_taps, tap, fd_tap, link);
1611                        if (SLIST_EMPTY(&conv->data_taps)) {
1612                                qio_set_wake_cb(conv->rq, 0, conv);
1613                                qio_set_wake_cb(conv->wq, 0, conv);
1614                        }
1615                        ret = 0;
1616                        break;
1617                default:
1618                        set_errno(ENOSYS);
1619                        set_errstr("Unsupported #%s data tap command %p",
1620                                   devname(), cmd);
1621                        ret = -1;
1622                }
1623                spin_unlock(&conv->tap_lock);
1624                return ret;
1625        case Qlisten:
1626                if (tap->filter & ~DEVIP_LEGAL_LISTEN_TAPS) {
1627                        set_errno(ENOSYS);
1628                        set_errstr("Unsupported #%s listen tap %p, must be %p",
1629                                   devname(), tap->filter,
1630                                   DEVIP_LEGAL_LISTEN_TAPS);
1631                        return -1;
1632                }
1633                spin_lock(&conv->tap_lock);
1634                switch (cmd) {
1635                case (FDTAP_CMD_ADD):
1636                        SLIST_INSERT_HEAD(&conv->listen_taps, tap, link);
1637                        ret = 0;
1638                        break;
1639                case (FDTAP_CMD_REM):
1640                        SLIST_REMOVE(&conv->listen_taps, tap, fd_tap, link);
1641                        ret = 0;
1642                        break;
1643                default:
1644                        set_errno(ENOSYS);
1645                        set_errstr("Unsupported #%s listen tap command %p",
1646                                   devname(), cmd);
1647                        ret = -1;
1648                }
1649                spin_unlock(&conv->tap_lock);
1650                return ret;
1651        default:
1652                set_errno(ENOSYS);
1653                set_errstr("Can't tap #%s file type %d", devname(),
1654                           TYPE(chan->qid));
1655                return -1;
1656        }
1657}
1658
1659static unsigned long ip_chan_ctl(struct chan *c, int op, unsigned long a1,
1660                                 unsigned long a2, unsigned long a3,
1661                                 unsigned long a4)
1662{
1663        switch (op) {
1664        case CCTL_SET_FL:
1665                return 0;
1666        default:
1667                error(EINVAL, "%s does not support %d", __func__, op);
1668        }
1669}
1670
1671struct dev ipdevtab __devtab = {
1672        .name = "ip",
1673
1674        .reset = ipreset,
1675        .init = ipinit,
1676        .shutdown = devshutdown,
1677        .attach = ipattach,
1678        .walk = ipwalk,
1679        .stat = ipstat,
1680        .open = ipopen,
1681        .create = devcreate,
1682        .close = ipclose,
1683        .read = ipread,
1684        .bread = ipbread,
1685        .write = ipwrite,
1686        .bwrite = ipbwrite,
1687        .remove = devremove,
1688        .wstat = ipwstat,
1689        .power = devpower,
1690        .chaninfo = ipchaninfo,
1691        .tapfd = iptapfd,
1692        .chan_ctl = ip_chan_ctl,
1693};
1694
1695int Fsproto(struct Fs *f, struct Proto *p)
1696{
1697        if (f->np >= Maxproto)
1698                return -1;
1699
1700        qlock_init(&p->qlock);
1701        p->f = f;
1702
1703        if (p->ipproto > 0) {
1704                if (f->t2p[p->ipproto] != NULL)
1705                        return -1;
1706                f->t2p[p->ipproto] = p;
1707        }
1708
1709        p->qid.type = QTDIR;
1710        p->qid.path = QID(f->np, 0, Qprotodir);
1711        p->conv = kzmalloc(sizeof(struct conv *) * (p->nc + 1), 0);
1712        if (p->conv == NULL)
1713                panic("Fsproto");
1714
1715        p->x = f->np;
1716        p->nextport = 0;
1717        p->nextrport = 600;
1718        f->p[f->np++] = p;
1719
1720        return 0;
1721}
1722
1723/*
1724 *  return true if this protocol is
1725 *  built in
1726 */
1727int Fsbuiltinproto(struct Fs *f, uint8_t proto)
1728{
1729        return f->t2p[proto] != NULL;
1730}
1731
1732/*
1733 *  called with protocol locked
1734 */
1735struct conv *Fsprotoclone(struct Proto *p, char *user)
1736{
1737        struct conv *c, **pp, **ep;
1738
1739retry:
1740        c = NULL;
1741        ep = &p->conv[p->nc];
1742        for (pp = p->conv; pp < ep; pp++) {
1743                c = *pp;
1744                if (c == NULL) {
1745                        c = kzmalloc(sizeof(struct conv), 0);
1746                        if (c == NULL)
1747                                error(ENOMEM,
1748                                      "conv kzmalloc(%d, 0) failed in Fsprotoclone",
1749                                      sizeof(struct conv));
1750                        qlock_init(&c->qlock);
1751                        qlock_init(&c->listenq);
1752                        rendez_init(&c->cr);
1753                        rendez_init(&c->listenr);
1754                        /* already = 0; set to be futureproof */
1755                        SLIST_INIT(&c->data_taps);
1756                        SLIST_INIT(&c->listen_taps);
1757                        spinlock_init(&c->tap_lock);
1758                        qlock(&c->qlock);
1759                        c->p = p;
1760                        c->x = pp - p->conv;
1761                        if (p->ptclsize != 0) {
1762                                c->ptcl = kzmalloc(p->ptclsize, 0);
1763                                if (c->ptcl == NULL) {
1764                                        kfree(c);
1765                                        error(ENOMEM,
1766                                              "ptcl kzmalloc(%d, 0) failed in Fsprotoclone",
1767                                              p->ptclsize);
1768                                }
1769                        }
1770                        *pp = c;
1771                        p->ac++;
1772                        c->eq = qopen(1024, Qmsg, 0, 0);
1773                        (*p->create) (c);
1774                        assert(c->rq && c->wq);
1775                        break;
1776                }
1777                if (canqlock(&c->qlock)) {
1778                        /*
1779                         *  make sure both processes and protocol
1780                         *  are done with this Conv
1781                         */
1782                        if (c->inuse == 0 && (p->inuse == NULL ||
1783                                              (*p->inuse)(c) == 0))
1784                                break;
1785
1786                        qunlock(&c->qlock);
1787                }
1788        }
1789        if (pp >= ep) {
1790                if (p->gc != NULL && (*p->gc) (p))
1791                        goto retry;
1792                return NULL;
1793        }
1794
1795        c->inuse = 1;
1796        kstrdup(&c->owner, user);
1797        c->perm = 0660;
1798        c->state = Idle;
1799        ipmove(c->laddr, IPnoaddr);
1800        ipmove(c->raddr, IPnoaddr);
1801        c->r = NULL;
1802        c->rgen = 0;
1803        c->lport = 0;
1804        c->rport = 0;
1805        c->restricted = 0;
1806        c->ttl = MAXTTL;
1807        c->tos = DFLTTOS;
1808        qreopen(c->rq);
1809        qreopen(c->wq);
1810        qreopen(c->eq);
1811
1812        qunlock(&c->qlock);
1813        return c;
1814}
1815
1816int Fsconnected(struct conv *c, char *msg)
1817{
1818        if (msg != NULL && *msg != '\0')
1819                strlcpy(c->cerr, msg, sizeof(c->cerr));
1820
1821        switch (c->state) {
1822                case Announcing:
1823                        c->state = Announced;
1824                        break;
1825
1826                case Connecting:
1827                        c->state = Connected;
1828                        break;
1829        }
1830
1831        rendez_wakeup(&c->cr);
1832        /* The user can poll or tap the connection status via Qdata */
1833        fire_data_taps(c, FDTAP_FILT_WRITABLE);
1834        return 0;
1835}
1836
1837struct Proto *Fsrcvpcol(struct Fs *f, uint8_t proto)
1838{
1839        if (f->ipmux)
1840                return f->ipmux;
1841        else
1842                return f->t2p[proto];
1843}
1844
1845struct Proto *Fsrcvpcolx(struct Fs *f, uint8_t proto)
1846{
1847        return f->t2p[proto];
1848}
1849
1850static void fire_listener_taps(struct conv *conv)
1851{
1852        struct fd_tap *tap_i;
1853        if (SLIST_EMPTY(&conv->listen_taps))
1854                return;
1855        spin_lock(&conv->tap_lock);
1856        SLIST_FOREACH(tap_i, &conv->listen_taps, link)
1857                fire_tap(tap_i, FDTAP_FILT_READABLE);
1858        spin_unlock(&conv->tap_lock);
1859}
1860
1861/*
1862 *  called with protocol locked
1863 */
1864struct conv *Fsnewcall(struct conv *c, uint8_t *raddr, uint16_t rport,
1865                       uint8_t *laddr, uint16_t lport, uint8_t version)
1866{
1867        struct conv *nc;
1868        struct conv **l;
1869        int i;
1870
1871        qlock(&c->qlock);
1872        i = 0;
1873        for (l = &c->incall; *l; l = &(*l)->next)
1874                i++;
1875        if (i >= Maxincall) {
1876                qunlock(&c->qlock);
1877                return NULL;
1878        }
1879
1880        /* find a free conversation */
1881        nc = Fsprotoclone(c->p, network);
1882        if (nc == NULL) {
1883                qunlock(&c->qlock);
1884                return NULL;
1885        }
1886        ipmove(nc->raddr, raddr);
1887        nc->rport = rport;
1888        ipmove(nc->laddr, laddr);
1889        nc->lport = lport;
1890        nc->next = NULL;
1891        *l = nc;
1892        nc->state = Connected;
1893        nc->ipversion = version;
1894
1895        qunlock(&c->qlock);
1896
1897        rendez_wakeup(&c->listenr);
1898        fire_listener_taps(c);
1899
1900        return nc;
1901}
1902
1903static long ndbwrite(struct Fs *f, char *a, uint32_t off, int n)
1904{
1905        if (off > strlen(f->ndb))
1906                error(EIO, ERROR_FIXME);
1907        if (off + n >= sizeof(f->ndb) - 1)
1908                error(EIO, ERROR_FIXME);
1909        memmove(f->ndb + off, a, n);
1910        f->ndb[off + n] = 0;
1911        f->ndbvers++;
1912        f->ndbmtime = seconds();
1913        return n;
1914}
1915
1916uint32_t scalednconv(void)
1917{
1918        //if(conf.npage*BY2PG >= 128*MB)
1919        return Nchans * 4;
1920        //  return Nchans;
1921}
1922