1 /* Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
2 * Portions Copyright © 1997-1999 Vita Nuova Limited
3 * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
5 * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
7 * Modified for the Akaros operating system:
8 * Copyright (c) 2013-2014 The Regents of the University of California
9 * Copyright (c) 2013-2015 Google Inc.
11 * Permission is hereby granted, free of charge, to any person obtaining a copy
12 * of this software and associated documentation files (the "Software"), to deal
13 * in the Software without restriction, including without limitation the rights
14 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15 * copies of the Software, and to permit persons to whom the Software is
16 * furnished to do so, subject to the following conditions:
18 * The above copyright notice and this permission notice shall be included in
19 * all copies or substantial portions of the Software.
21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
43 static char *devname(void)
49 Qtopdir = 1, /* top level directory */
58 Qprotodir, /* directory for a protocol */
63 Qconvdir, /* directory for a conversation */
75 Masktype = (1 << Logtype) - 1,
77 Maskconv = (1 << Logconv) - 1,
80 Maskproto = (1 << Logproto) - 1,
81 Shiftproto = Logtype + Logconv,
84 BYPASS_QMAX = 64 * MiB,
85 IPROUTE_LEN = 2 * PGSIZE,
87 #define TYPE(x) ( ((uint32_t)(x).path) & Masktype )
88 #define CONV(x) ( (((uint32_t)(x).path) >> Shiftconv) & Maskconv )
89 #define PROTO(x) ( (((uint32_t)(x).path) >> Shiftproto) & Maskproto )
90 #define QID(p, c, y) ( ((p)<<(Shiftproto)) | ((c)<<Shiftconv) | (y))
91 static char network[] = "network";
94 struct Fs *ipfs[Nfs]; /* attached fs's */
97 extern void nullmediumlink(void);
98 extern void pktmediumlink(void);
99 extern struct username eve;
100 static long ndbwrite(struct Fs *, char *unused_char_p_t, uint32_t, int);
101 static void closeconv(struct conv *);
102 static void setup_proto_qio_bypass(struct conv *cv);
103 static void undo_proto_qio_bypass(struct conv *cv);
104 static int connected(void *a);
106 static struct conv *chan2conv(struct chan *chan)
108 /* That's a lot of pointers to get to the conv! */
109 return ipfs[chan->dev]->p[PROTO(chan->qid)]->conv[CONV(chan->qid)];
112 static inline int founddevdir(struct chan *c, struct qid q, char *n,
113 int64_t length, char *user, long perm,
116 devdir(c, q, n, length, user, perm, db);
120 static int topdirgen(struct chan *c, struct dir *dp)
123 mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR);
124 snprintf(get_cur_genbuf(), GENBUF_SZ, "#%s%lu", devname(), c->dev);
125 return founddevdir(c, q, get_cur_genbuf(), 0, network, 0555, dp);
128 /* Computes the perm field for a stat for Qdata. Since select() polls the
129 * 'actionability' of a socket via the qdata FD, we'll also report listenable
130 * and connected conversations. It's a minor hack. =( */
131 static int qdata_stat_perm(struct conv *cv)
136 /* If there is ever a listener, then it's readable. Ideally, we'd only
137 * report this on the Qlisten file (which we also do). The socket crap
138 * should never use a listening socket for data, so there shouldn't be any
139 * confusion when a Qdata shows up as readable. */
140 perm |= cv->incall ? DMREADABLE : 0;
141 /* For connectable convs, they need to be both connected and qio
142 * readable/writable. The way to think about this is that the convs are not
143 * truly writable/readable until they are connected. Conveniently, this
144 * means that when select polls Qdata for non-blocking connect(), a
145 * connected conversation pops up as writable (the qio is writable too).
147 * Note that a conversation can be 'Connected' even if it failed to connect.
148 * At least that's what the 9ns TCP code does. It's more like "the protocol
149 * did what it needed and the connectctlmsg call (or its non-blocking
150 * equivalent) is done". For instance, TCP has a few reasons to call
151 * Fsconnected, such as when we send the SYN and get a RST. */
152 if (!cv->p->connect || connected(cv)) {
153 perm |= qreadable(cv->rq) ? DMREADABLE : 0;
154 perm |= qwritable(cv->wq) ? DMWRITABLE : 0;
159 static int ip3gen(struct chan *c, int i, struct dir *dp)
167 if (cv->owner == NULL)
168 kstrdup(&cv->owner, eve.name);
169 mkqid(&q, QID(PROTO(c->qid), CONV(c->qid), i), 0, QTFILE);
175 return founddevdir(c, q, "ctl", 0,
176 cv->owner, cv->perm, dp);
178 perm = qdata_stat_perm(cv);
179 return founddevdir(c, q, "data", qlen(cv->rq),
180 cv->owner, perm, dp);
183 perm |= qreadable(cv->eq) ? DMREADABLE : 0;
184 return founddevdir(c, q, "err", qlen(cv->eq),
185 cv->owner, perm, dp);
188 perm |= cv->incall ? DMREADABLE : 0;
189 return founddevdir(c, q, "listen", 0, cv->owner, perm, dp);
197 if (strcmp(cv->p->name, "ipifc") != 0)
200 perm |= qreadable(cv->sq) ? DMREADABLE : 0;
201 return founddevdir(c, q, "snoop", qlen(cv->sq),
202 cv->owner, perm, dp);
207 return founddevdir(c, q, p, 0, cv->owner, 0444, dp);
210 static int ip2gen(struct chan *c, int i, struct dir *dp)
213 mkqid(&q, QID(PROTO(c->qid), 0, i), 0, QTFILE);
216 return founddevdir(c, q, "clone", 0, network, 0666, dp);
218 return founddevdir(c, q, "stats", 0, network, 0444, dp);
223 static int ip1gen(struct chan *c, int i, struct dir *dp)
230 extern uint32_t kerndate;
235 mkqid(&q, QID(0, 0, i), 0, QTFILE);
244 len = strlen(f->ndb);
261 devdir(c, q, p, len, network, prot, dp);
262 if (i == Qndb && f->ndbmtime > kerndate)
263 dp->mtime.tv_sec = f->ndbmtime;
268 ipgen(struct chan *c, char *unused_char_p_t, struct dirtab *d, int unused_int,
269 int s, struct dir *dp)
277 switch (TYPE(c->qid)) {
280 return topdirgen(c, dp);
282 if (f->p[s]->connect == NULL)
283 return 0; /* protocol with no user interface */
284 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
285 return founddevdir(c, q, f->p[s]->name, 0, network, 0555, dp);
288 return ip1gen(c, s + Qtopbase, dp);
295 return ip1gen(c, TYPE(c->qid), dp);
298 return topdirgen(c, dp);
299 else if (s < f->p[PROTO(c->qid)]->ac) {
300 cv = f->p[PROTO(c->qid)]->conv[s];
301 snprintf(get_cur_genbuf(), GENBUF_SZ, "%d", s);
302 mkqid(&q, QID(PROTO(c->qid), s, Qconvdir), 0, QTDIR);
304 founddevdir(c, q, get_cur_genbuf(), 0, cv->owner, 0555, dp);
306 s -= f->p[PROTO(c->qid)]->ac;
307 return ip2gen(c, s + Qprotobase, dp);
310 return ip2gen(c, TYPE(c->qid), dp);
312 if (s == DEVDOTDOT) {
314 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
315 devdir(c, q, f->p[s]->name, 0, network, 0555, dp);
318 return ip3gen(c, s + Qconvbase, dp);
327 return ip3gen(c, TYPE(c->qid), dp);
332 static void ipinit(void)
338 fmtinstall('i', eipfmt);
339 fmtinstall('I', eipfmt);
340 fmtinstall('E', eipfmt);
341 fmtinstall('V', eipfmt);
342 fmtinstall('M', eipfmt);
346 static void ipreset(void)
350 static struct Fs *ipgetfs(int dev)
352 extern void (*ipprotoinit[]) (struct Fs *);
360 if (ipfs[dev] == NULL) {
361 f = kzmalloc(sizeof(struct Fs), MEM_WAIT);
363 qlock_init(&f->iprouter.qlock);
367 for (i = 0; ipprotoinit[i]; i++)
377 struct IPaux *newipaux(char *owner, char *tag)
382 a = kzmalloc(sizeof(*a), 0);
383 kstrdup(&a->owner, owner);
384 memset(a->tag, ' ', sizeof(a->tag));
386 if (n > sizeof(a->tag))
388 memmove(a->tag, tag, n);
392 #define ATTACHER(c) (((struct IPaux*)((c)->aux))->owner)
394 static struct chan *ipattach(char *spec)
401 error(EFAIL, "bad specification");
404 c = devattach(devname(), spec);
405 mkqid(&c->qid, QID(0, 0, Qtopdir), 0, QTDIR);
408 c->aux = newipaux(commonuser(), "none");
413 static struct walkqid *ipwalk(struct chan *c, struct chan *nc, char **name,
416 struct IPaux *a = c->aux;
419 w = devwalk(c, nc, name, nname, NULL, 0, ipgen);
420 if (w != NULL && w->clone != NULL)
421 w->clone->aux = newipaux(a->owner, a->tag);
425 static size_t ipstat(struct chan *c, uint8_t *db, size_t n)
427 return devstat(c, db, n, NULL, 0, ipgen);
430 static int should_wake(void *arg)
432 struct conv *cv = arg;
433 /* signal that the conv is closed */
434 if (qisclosed(cv->rq))
436 return cv->incall != NULL;
439 static struct chan *ipopen(struct chan *c, int omode)
442 struct conv *cv, *nc;
447 /* perm is a lone rwx, not the rwx------ from the conversion */
448 perm = omode_to_rwx(omode) >> 6;
452 switch (TYPE(c->qid)) {
456 if (omode & (O_WRITE | O_TRUNC) && !iseve())
457 error(EPERM, ERROR_FIXME);
458 if ((omode & (O_WRITE | O_TRUNC)) == (O_WRITE | O_TRUNC))
468 c->synth_buf = kpages_zalloc(IPROUTE_LEN, MEM_WAIT);
469 routeread(f, c->synth_buf, 0, IPROUTE_LEN);
480 error(EPERM, ERROR_FIXME);
484 error(EPERM, ERROR_FIXME);
485 /* might be racy. note the lack of a proto lock, unlike Qdata */
486 p = f->p[PROTO(c->qid)];
487 cv = p->conv[CONV(c->qid)];
488 if (strcmp(ATTACHER(c), cv->owner) != 0 && !iseve())
489 error(EPERM, ERROR_FIXME);
490 atomic_inc(&cv->snoopers);
493 p = f->p[PROTO(c->qid)];
499 cv = Fsprotoclone(p, ATTACHER(c));
503 error(ENODEV, "Null conversation from Fsprotoclone");
506 mkqid(&c->qid, QID(p->x, cv->x, Qctl), 0, QTFILE);
511 p = f->p[PROTO(c->qid)];
513 cv = p->conv[CONV(c->qid)];
520 if ((perm & (cv->perm >> 6)) != perm) {
521 if (strcmp(ATTACHER(c), cv->owner) != 0)
522 error(EPERM, ERROR_FIXME);
523 if ((perm & cv->perm) != perm)
524 error(EPERM, ERROR_FIXME);
528 if (cv->inuse == 1) {
529 kstrdup(&cv->owner, ATTACHER(c));
537 cv = f->p[PROTO(c->qid)]->conv[CONV(c->qid)];
538 /* No permissions or Announce checks required. We'll see if that's
539 * a good idea or not. (the perm check would do nothing, as is,
540 * since an O_PATH perm is 0).
542 * But we probably want to incref to keep the conversation around
543 * until this FD/chan is closed. #ip is a little weird in that
544 * objects never really go away (high water mark for convs, you can
545 * always find them in the ns). I think it is possible to
546 * namec/ipgen a chan, then have that conv close, then have that
547 * chan be opened. You can probably do this with a data file. */
548 if (omode & O_PATH) {
554 if ((perm & (cv->perm >> 6)) != perm) {
555 if (strcmp(ATTACHER(c), cv->owner) != 0)
556 error(EPERM, ERROR_FIXME);
557 if ((perm & cv->perm) != perm)
558 error(EPERM, ERROR_FIXME);
562 if (cv->state != Announced)
563 error(EFAIL, "not announced");
575 /* give up if we got a hangup */
576 if (qisclosed(cv->rq))
577 error(EFAIL, "listen hungup");
581 qunlock(&cv->listenq);
584 /* we can peek at incall without grabbing the cv qlock. if
585 * anything is there, it'll remain there until we dequeue it.
586 * no one else can, since we hold the listenq lock */
587 if ((c->flag & O_NONBLOCK) && !cv->incall)
588 error(EAGAIN, "listen queue empty");
589 /* wait for a connect */
590 rendez_sleep(&cv->listenr, should_wake, cv);
592 /* if there is a concurrent hangup, they will hold the qlock
593 * until the hangup is complete, including closing the cv->rq */
597 cv->incall = nc->next;
598 mkqid(&c->qid, QID(PROTO(c->qid), nc->x, Qctl), 0, QTFILE);
599 kstrdup(&cv->owner, ATTACHER(c));
603 qunlock(&cv->listenq);
610 c->mode = openmode(omode);
616 static size_t ipwstat(struct chan *c, uint8_t *dp, size_t n)
625 switch (TYPE(c->qid)) {
627 error(EPERM, ERROR_FIXME);
634 d = kzmalloc(sizeof(*d) + n, 0);
639 n = convM2D(dp, n, d, (char *)&d[1]);
641 error(ENODATA, ERROR_FIXME);
642 p = f->p[PROTO(c->qid)];
643 cv = p->conv[CONV(c->qid)];
644 if (!iseve() && strcmp(ATTACHER(c), cv->owner) != 0)
645 error(EPERM, ERROR_FIXME);
646 if (!emptystr(d->uid))
647 kstrdup(&cv->owner, d->uid);
649 cv->perm = d->mode & 0777;
655 /* Should be able to handle any file type chan. Feel free to extend it. */
656 static char *ipchaninfo(struct chan *ch, char *ret, size_t ret_l)
665 switch (TYPE(ch->qid)) {
667 ret = "Unknown type";
670 proto = f->p[PROTO(ch->qid)];
671 conv = proto->conv[CONV(ch->qid)];
673 "Qdata, %s, proto %s, conv idx %d, rq len %d, wq len %d, total read %llu",
674 SLIST_EMPTY(&conv->data_taps) ? "untapped" : "tapped",
675 proto->name, conv->x, qlen(conv->rq), qlen(conv->wq),
676 q_bytes_read(conv->rq));
685 proto = f->p[PROTO(ch->qid)];
686 conv = proto->conv[CONV(ch->qid)];
688 "Qlisten, %s proto %s, conv idx %d, has %sincalls",
689 SLIST_EMPTY(&conv->listen_taps) ? "untapped" : "tapped",
690 proto->name, conv->x, conv->incall ? "" : "no ");
699 proto = f->p[PROTO(ch->qid)];
700 conv = proto->conv[CONV(ch->qid)];
701 snprintf(ret, ret_l, "Qctl, proto %s, conv idx %d", proto->name,
708 static void closeconv(struct conv *cv)
716 if (--cv->inuse > 0) {
724 /* close all incoming calls since no listen will ever happen */
725 for (nc = cv->incall; nc; nc = cv->incall) {
726 cv->incall = nc->next;
731 kstrdup(&cv->owner, network);
734 while ((mp = cv->multi) != NULL)
735 ipifcremmulti(cv, mp->ma, mp->ia);
739 if (cv->state == Bypass)
740 undo_proto_qio_bypass(cv);
747 static void ipclose(struct chan *c)
752 switch (TYPE(c->qid)) {
768 closeconv(f->p[PROTO(c->qid)]->conv[CONV(c->qid)]);
772 atomic_dec(&f->p[PROTO(c->qid)]->conv[CONV(c->qid)]->snoopers);
776 kpages_free(c->synth_buf, IPROUTE_LEN);
779 kfree(((struct IPaux *)c->aux)->owner);
784 Statelen = 32 * 1024,
787 static size_t ipread(struct chan *ch, void *a, size_t n, off64_t off)
794 uint32_t offset = off;
799 switch (TYPE(ch->qid)) {
801 error(EPERM, ERROR_FIXME);
805 return devdirread(ch, a, n, 0, 0, ipgen);
807 return arpread(f->arp, a, offset, n);
809 return readstr(offset, a, n, f->ndb);
811 return readmem(offset, a, n, ch->synth_buf, IPROUTE_LEN);
813 return iprouterread(f, a, n);
815 return ipselftabread(f, a, offset, n);
817 return netlogread(f, a, offset, n);
819 snprintf(get_cur_genbuf(), GENBUF_SZ, "%lu", CONV(ch->qid));
820 return readstr(offset, p, n, get_cur_genbuf());
822 buf = kzmalloc(Statelen, 0);
823 x = f->p[PROTO(ch->qid)];
824 c = x->conv[CONV(ch->qid)];
825 if (x->remote == NULL) {
826 snprintf(buf, Statelen, "%I!%d\n", c->raddr, c->rport);
828 (*x->remote) (c, buf, Statelen - 2);
830 rv = readstr(offset, p, n, buf);
834 buf = kzmalloc(Statelen, 0);
835 x = f->p[PROTO(ch->qid)];
836 c = x->conv[CONV(ch->qid)];
837 if (x->local == NULL) {
838 snprintf(buf, Statelen, "%I!%d\n", c->laddr, c->lport);
840 (*x->local) (c, buf, Statelen - 2);
842 rv = readstr(offset, p, n, buf);
846 /* this all is a bit screwed up since the size of some state's
847 * buffers will change from one invocation to another. a reader
848 * will come in and read the entire buffer. then it will come again
849 * and read from the next offset, expecting EOF. if the buffer
850 * changed sizes, it'll reprint the end of the buffer slightly. */
851 buf = kzmalloc(Statelen, 0);
852 x = f->p[PROTO(ch->qid)];
853 c = x->conv[CONV(ch->qid)];
854 if (c->state == Bypass)
855 snprintf(buf, Statelen, "Bypassed\n");
857 (*x->state)(c, buf, Statelen - 2);
858 rv = readstr(offset, p, n, buf);
862 c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
863 if (ch->flag & O_NONBLOCK)
864 return qread_nonblock(c->rq, a, n);
866 return qread(c->rq, a, n);
868 c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
869 return qread(c->eq, a, n);
871 c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
872 return qread(c->sq, a, n);
874 x = f->p[PROTO(ch->qid)];
875 if (x->stats == NULL)
876 error(EFAIL, "stats not implemented");
877 buf = kzmalloc(Statelen, 0);
878 (*x->stats) (x, buf, Statelen);
879 rv = readstr(offset, p, n, buf);
885 static struct block *ipbread(struct chan *ch, size_t n, off64_t offset)
889 switch (TYPE(ch->qid)) {
892 if (ch->flag & O_NONBLOCK)
893 return qbread_nonblock(c->rq, n);
895 return qbread(c->rq, n);
897 return devbread(ch, n, offset);
902 * set local address to be that of the ifc closest to remote address
904 static void setladdr(struct conv *c)
906 findlocalip(c->p->f, c->laddr, c->raddr);
910 * set a local port making sure the quad of raddr,rport,laddr,lport is unique
912 static void setluniqueport(struct conv *c, int lport)
921 for (x = 0; x < p->nc; x++) {
927 if ((xp->state == Connected || xp->state == Announced
928 || xp->state == Bypass)
929 && xp->lport == lport
930 && xp->rport == c->rport
931 && ipcmp(xp->raddr, c->raddr) == 0
932 && ipcmp(xp->laddr, c->laddr) == 0) {
934 error(EFAIL, "address in use");
942 * pick a local port and set it
944 static void setlport(struct conv *c)
958 * Fsproto initialises p->nextport to 0 and the restricted
959 * ports (p->nextrport) to 600.
960 * Restricted ports must lie between 600 and 1024.
961 * For the initial condition or if the unrestricted port number
962 * has wrapped round, select a random port between 5000 and 1<<15
970 urandom_read(pp, sizeof(*pp));
973 for (x = 0; x < p->nc; x++) {
974 if (p->conv[x] == NULL)
976 if (p->conv[x]->lport == *pp) {
989 * set a local address and port from a string of the form
992 static void setladdrport(struct conv *c, char *str, int announcing)
996 uint8_t addr[IPaddrlen];
999 * ignore restricted part if it exists. it's
1000 * meaningless on local ports.
1002 p = strchr(str, '!');
1005 if (strcmp(p, "r") == 0)
1012 ipmove(c->laddr, IPnoaddr);
1017 if (strcmp(str, "*") == 0)
1018 ipmove(c->laddr, IPnoaddr);
1021 if (ipforme(c->p->f, addr))
1022 ipmove(c->laddr, addr);
1024 error(EFAIL, "not a local IP address");
1028 /* one process can get all connections */
1029 if (announcing && strcmp(p, "*") == 0) {
1031 error(EPERM, ERROR_FIXME);
1032 setluniqueport(c, 0);
1039 setluniqueport(c, lport);
1042 static void setraddrport(struct conv *c, char *str)
1046 p = strchr(str, '!');
1048 error(EFAIL, "malformed address");
1050 parseip(c->raddr, str);
1054 if (strstr(p, "!r") != NULL)
1060 * called by protocol connect routine to set addresses
1062 void Fsstdconnect(struct conv *c, char *argv[], int argc)
1066 error(EINVAL, "bad args to %s", __func__);
1068 setraddrport(c, argv[1]);
1073 setraddrport(c, argv[1]);
1074 setladdrport(c, argv[2], 0);
1078 /* TODO: why is an IPnoaddr (in v6 format, equivalent to v6Unspecified),
1080 if ((memcmp(c->raddr, v4prefix, IPv4off) == 0 &&
1081 memcmp(c->laddr, v4prefix, IPv4off) == 0)
1082 || ipcmp(c->raddr, IPnoaddr) == 0)
1086 /* Linux has taught people to use zeros for local interfaces. TODO: We
1087 * might need this for v6 in the future. */
1088 if (!ipcmp(c->raddr, IPv4_zeroes))
1089 ipmove(c->raddr, IPv4_loopback);
1093 * initiate connection and sleep till it's set up
1095 static int connected(void *a)
1097 return ((struct conv *)a)->state == Connected;
1100 static void connectctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb,
1107 error(EBUSY, ERROR_FIXME);
1108 c->state = Connecting;
1110 if (x->connect == NULL)
1111 error(EFAIL, "connect not supported");
1112 /* It's up to the proto connect method to not block the kthread. This is
1113 * currently the case for e.g. TCP. */
1114 x->connect(c, cb->f, cb->nf);
1115 /* This is notionally right before the rendez_sleep: either we block or we
1116 * kick back to userspace. We do this before the unlock to avoid races with
1117 * c->state (rendez's internal lock deals with its race with the waker) and
1118 * to avoid the excessive unlock and relock.
1120 * Also, it's important that we don't do anything important for the
1121 * functionality of the conv after the rendez sleep. The non-blocking style
1122 * won't call back into the kernel - it just wants the event. I considered
1123 * allowing multiple connect calls, where we just return if it was already
1124 * connected, but that would break UDP, which allows multiple different
1126 if ((chan->flag & O_NONBLOCK) && !connected(c))
1127 error(EINPROGRESS, "connection not ready yet");
1133 rendez_sleep(&c->cr, connected, c);
1137 if (c->cerr[0] != '\0')
1138 error(EFAIL, c->cerr);
1142 * called by protocol announce routine to set addresses
1144 void Fsstdannounce(struct conv *c, char *argv[], int argc)
1146 memset(c->raddr, 0, sizeof(c->raddr));
1150 error(EINVAL, "bad args to announce");
1152 setladdrport(c, argv[1], 1);
1158 * initiate announcement and sleep till it's set up
1160 static int announced(void *a)
1162 return ((struct conv *)a)->state == Announced;
1165 static void announcectlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1171 error(EBUSY, ERROR_FIXME);
1172 c->state = Announcing;
1174 if (x->announce == NULL)
1175 error(EFAIL, "announce not supported");
1176 x->announce(c, cb->f, cb->nf);
1183 rendez_sleep(&c->cr, announced, c);
1187 if (c->cerr[0] != '\0')
1188 error(EFAIL, c->cerr);
1192 * called by protocol bind routine to set addresses
1194 void Fsstdbind(struct conv *c, char *argv[], int argc)
1198 error(EINVAL, "bad args to bind");
1200 setladdrport(c, argv[1], 0);
1205 static void bindctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1207 if (x->bind == NULL)
1208 Fsstdbind(c, cb->f, cb->nf);
1210 x->bind(c, cb->f, cb->nf);
1213 /* Helper, called by protocols to use the bypass.
1215 * This is a bit nasty due to the overall nastiness of #ip. We need to lock
1216 * before checking the state and hold the qlock throughout, because a concurrent
1217 * closeconv() could tear down the bypass. Specifically, it could free the
1218 * bypass queues. The root issue is that conversation lifetimes are not managed
1221 * If we fail, it's our responsibility to consume (free) the block(s). */
1222 void bypass_or_drop(struct conv *cv, struct block *bp)
1225 if (cv->state == Bypass)
1229 qunlock(&cv->qlock);
1232 /* Push the block directly to the appropriate ipoput function.
1234 * It's the protocol's responsibility (and thus ours here) to make sure there is
1235 * at least the right amount of the IP header in the block (ipoput{4,6} assumes
1236 * it has the right amount, and the other protocols account for the IP header in
1237 * their own header).
1239 * For the TTL and TOS, we just use the default ones. If we want, we could look
1240 * into the actual block and see what the user wanted, though we're bypassing
1241 * the protocol layer, not the IP layer. */
1242 static void proto_bypass_kick(void *arg, struct block *bp)
1244 struct conv *cv = (struct conv*)arg;
1245 uint8_t vers_nibble;
1250 bp = pullupblock(bp, 1);
1252 error(EINVAL, "Proto bypass unable to pullup a byte!");
1253 vers_nibble = *(uint8_t*)bp->rp & 0xf0;
1254 switch (vers_nibble) {
1256 bp = pullupblock(bp, IPV4HDR_LEN);
1258 error(EINVAL, "Proto bypass unable to pullup v4 header");
1259 ipoput4(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1262 bp = pullupblock(bp, IPV6HDR_LEN);
1264 error(EINVAL, "Proto bypass unable to pullup v6 header");
1265 ipoput6(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1268 error(EINVAL, "Proto bypass block had unknown IP version 0x%x",
1273 /* Sets up cv for the protocol bypass. We use different queues for two reasons:
1274 * 1) To be protocol independent. For instance, TCP and UDP could use very
1275 * different QIO styles.
1276 * 2) To set up our own kick/bypass method. Note how udpcreate() and this function use
1277 * qbypass() (just blast it out), while TCP uses qopen() with a kick. TCP still
1278 * follows queuing discipline.
1280 * It's like we are our own protocol, the bypass protocol, when it comes to how
1281 * we interact with qio. The conv still is of the real protocol type (e.g.
1284 * Note that we can't free the old queues. The way #ip works, the queues are
1285 * created when the conv is created, but the conv is never freed. It's like a
1286 * slab allocator that never frees objects, but just reinitializes them a
1289 * For the queues, we're basically like UDP:
1290 * - We take packets for rq and drop on overflow.
1291 * - rq is also Qmsg, but we also have Qcoalesce, to ignore zero-len blocks
1292 * - We kick for our outbound (wq) messages.
1294 * Note that Qmsg can drop parts of packets. It's up to the user to read
1295 * enough. If they didn't read enough, the extra is dropped. This is similar
1296 * to SOCK_DGRAM and recvfrom(). Minus major changes, there's no nice way to
1297 * get individual messages with read(). Userspace using the bypass will need to
1298 * find out the MTU of the NIC the IP stack is attached to, and make sure to
1299 * read in at least that amount each time. */
1300 static void setup_proto_qio_bypass(struct conv *cv)
1302 cv->rq_save = cv->rq;
1303 cv->wq_save = cv->wq;
1304 cv->rq = qopen(BYPASS_QMAX, Qmsg | Qcoalesce, 0, 0);
1305 cv->wq = qbypass(proto_bypass_kick, cv);
1308 static void undo_proto_qio_bypass(struct conv *cv)
1312 cv->rq = cv->rq_save;
1313 cv->wq = cv->wq_save;
1318 void Fsstdbypass(struct conv *cv, char *argv[], int argc)
1320 memset(cv->raddr, 0, sizeof(cv->raddr));
1324 setladdrport(cv, argv[1], 1);
1327 error(EINVAL, "Bad args (was %d, need 2) to bypass", argc);
1331 static void bypassctlmsg(struct Proto *x, struct conv *cv, struct cmdbuf *cb)
1334 error(EFAIL, "Protocol %s does not support bypass", x->name);
1335 /* The protocol needs to set the port (usually by calling Fsstdbypass) and
1336 * then do whatever it needs to make sure it can find the conv again during
1337 * receive (usually by adding to a hash table). */
1338 x->bypass(cv, cb->f, cb->nf);
1339 setup_proto_qio_bypass(cv);
1343 static void shutdownctlmsg(struct conv *cv, struct cmdbuf *cb)
1347 if (!strcmp(cb->f[1], "rd")) {
1348 qhangup(cv->rq, "shutdown");
1349 if (cv->p->shutdown)
1350 cv->p->shutdown(cv, SHUT_RD);
1351 } else if (!strcmp(cb->f[1], "wr")) {
1352 qhangup(cv->wq, "shutdown");
1353 if (cv->p->shutdown)
1354 cv->p->shutdown(cv, SHUT_WR);
1355 } else if (!strcmp(cb->f[1], "rdwr")) {
1356 qhangup(cv->rq, "shutdown");
1357 qhangup(cv->wq, "shutdown");
1358 if (cv->p->shutdown)
1359 cv->p->shutdown(cv, SHUT_RDWR);
1365 error(EINVAL, "shutdown [rx|tx|rxtx]");
1368 static void tosctlmsg(struct conv *c, struct cmdbuf *cb)
1373 c->tos = atoi(cb->f[1]);
1376 static void ttlctlmsg(struct conv *c, struct cmdbuf *cb)
1381 c->ttl = atoi(cb->f[1]);
1384 /* Binds a conversation, as if the user wrote "bind *" into ctl. */
1385 static void autobind(struct conv *cv)
1390 cb = parsecmd("bind *", 7);
1395 bindctlmsg(cv->p, cv, cb);
1400 static size_t ipwrite(struct chan *ch, void *v, size_t n, off64_t off)
1407 uint8_t ia[IPaddrlen], ma[IPaddrlen];
1414 switch (TYPE(ch->qid)) {
1416 error(EPERM, ERROR_FIXME);
1418 x = f->p[PROTO(ch->qid)];
1419 c = x->conv[CONV(ch->qid)];
1420 /* connection-less protocols (UDP) can write without manually
1424 if (ch->flag & O_NONBLOCK)
1425 qwrite_nonblock(c->wq, a, n);
1427 qwrite(c->wq, a, n);
1430 return arpwrite(f, a, n);
1432 return routewrite(f, ch, a, n);
1437 return ndbwrite(f, a, off, n);
1439 x = f->p[PROTO(ch->qid)];
1440 c = x->conv[CONV(ch->qid)];
1441 cb = parsecmd(a, n);
1450 error(EFAIL, "short control request");
1451 if (strcmp(cb->f[0], "connect") == 0)
1452 connectctlmsg(x, c, cb, ch);
1453 else if (strcmp(cb->f[0], "announce") == 0)
1454 announcectlmsg(x, c, cb);
1455 else if (strcmp(cb->f[0], "bind") == 0)
1456 bindctlmsg(x, c, cb);
1457 else if (strcmp(cb->f[0], "bypass") == 0)
1458 bypassctlmsg(x, c, cb);
1459 else if (strcmp(cb->f[0], "shutdown") == 0)
1460 shutdownctlmsg(c, cb);
1461 else if (strcmp(cb->f[0], "ttl") == 0)
1463 else if (strcmp(cb->f[0], "tos") == 0)
1465 else if (strcmp(cb->f[0], "ignoreadvice") == 0)
1466 c->ignoreadvice = 1;
1467 else if (strcmp(cb->f[0], "addmulti") == 0) {
1469 error(EFAIL, "addmulti needs interface address");
1471 if (!ipismulticast(c->raddr))
1472 error(EFAIL, "addmulti for a non multicast address");
1473 parseip(ia, cb->f[1]);
1474 ipifcaddmulti(c, c->raddr, ia);
1476 parseip(ma, cb->f[2]);
1477 if (!ipismulticast(ma))
1478 error(EFAIL, "addmulti for a non multicast address");
1479 parseip(ia, cb->f[1]);
1480 ipifcaddmulti(c, ma, ia);
1482 } else if (strcmp(cb->f[0], "remmulti") == 0) {
1484 error(EFAIL, "remmulti needs interface address");
1485 if (!ipismulticast(c->raddr))
1486 error(EFAIL, "remmulti for a non multicast address");
1487 parseip(ia, cb->f[1]);
1488 ipifcremmulti(c, c->raddr, ia);
1489 } else if (x->ctl != NULL) {
1490 x->ctl(c, cb->f, cb->nf);
1492 error(EFAIL, "unknown control request");
1500 static size_t ipbwrite(struct chan *ch, struct block *bp, off64_t offset)
1505 switch (TYPE(ch->qid)) {
1509 bp = concatblock(bp);
1511 if (ch->flag & O_NONBLOCK)
1512 qbwrite_nonblock(c->wq, bp);
1517 return devbwrite(ch, bp, offset);
1521 static void fire_data_taps(struct conv *conv, int filter)
1523 struct fd_tap *tap_i;
1525 /* At this point, we have an event we want to send to our taps (if any).
1526 * The lock protects list integrity and the existence of the tap.
1528 * Previously, I thought of using the conv qlock. That actually breaks, due
1529 * to weird usages of the qlock (someone holds it for a long time, blocking
1530 * the inbound wakeup from etherread4).
1532 * I opted for a spinlock for a couple reasons:
1533 * - fire_tap should not block. ideally it'll be fast too (it's mostly a
1535 * - our callers might not want to block. A lot of network wakeups will
1536 * come from network processes (etherread4) or otherwise unrelated to this
1537 * particular conversation. I'd rather do something like fire off a KMSG
1539 * - if fire_tap takes a while, holding the lock only slows down other
1540 * events on this *same* conversation, or other tap registration. not a
1542 spin_lock(&conv->tap_lock);
1543 SLIST_FOREACH(tap_i, &conv->data_taps, link)
1544 fire_tap(tap_i, filter);
1545 spin_unlock(&conv->tap_lock);
1548 static void ip_wake_cb(struct queue *q, void *data, int filter)
1550 struct conv *conv = (struct conv*)data;
1552 /* For these two, we want to ignore events on the opposite end of the
1553 * queues. For instance, we want to know when the WQ is writable. Our
1554 * writes will actually make it readable - we don't want to trigger a tap
1555 * for that. However, qio doesn't know how/why we are using a queue, or
1556 * even who the ends are (hence the callbacks) */
1557 if ((filter & FDTAP_FILT_READABLE) && (q == conv->wq))
1559 if ((filter & FDTAP_FILT_WRITABLE) && (q == conv->rq))
1561 fire_data_taps(conv, filter);
1564 int iptapfd(struct chan *chan, struct fd_tap *tap, int cmd)
1566 struct conv *conv = chan2conv(chan);
1569 #define DEVIP_LEGAL_DATA_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE | \
1570 FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY | \
1572 #define DEVIP_LEGAL_LISTEN_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_HANGUP)
1574 switch (TYPE(chan->qid)) {
1576 if (tap->filter & ~DEVIP_LEGAL_DATA_TAPS) {
1578 set_errstr("Unsupported #%s data tap %p, must be %p", devname(),
1579 tap->filter, DEVIP_LEGAL_DATA_TAPS);
1582 spin_lock(&conv->tap_lock);
1584 case (FDTAP_CMD_ADD):
1585 if (SLIST_EMPTY(&conv->data_taps)) {
1586 qio_set_wake_cb(conv->rq, ip_wake_cb, conv);
1587 qio_set_wake_cb(conv->wq, ip_wake_cb, conv);
1589 SLIST_INSERT_HEAD(&conv->data_taps, tap, link);
1592 case (FDTAP_CMD_REM):
1593 SLIST_REMOVE(&conv->data_taps, tap, fd_tap, link);
1594 if (SLIST_EMPTY(&conv->data_taps)) {
1595 qio_set_wake_cb(conv->rq, 0, conv);
1596 qio_set_wake_cb(conv->wq, 0, conv);
1602 set_errstr("Unsupported #%s data tap command %p",
1606 spin_unlock(&conv->tap_lock);
1609 if (tap->filter & ~DEVIP_LEGAL_LISTEN_TAPS) {
1611 set_errstr("Unsupported #%s listen tap %p, must be %p",
1612 devname(), tap->filter, DEVIP_LEGAL_LISTEN_TAPS);
1615 spin_lock(&conv->tap_lock);
1617 case (FDTAP_CMD_ADD):
1618 SLIST_INSERT_HEAD(&conv->listen_taps, tap, link);
1621 case (FDTAP_CMD_REM):
1622 SLIST_REMOVE(&conv->listen_taps, tap, fd_tap, link);
1627 set_errstr("Unsupported #%s listen tap command %p",
1631 spin_unlock(&conv->tap_lock);
1635 set_errstr("Can't tap #%s file type %d", devname(),
1641 static unsigned long ip_chan_ctl(struct chan *c, int op, unsigned long a1,
1642 unsigned long a2, unsigned long a3,
1649 error(EINVAL, "%s does not support %d", __func__, op);
1653 struct dev ipdevtab __devtab = {
1658 .shutdown = devshutdown,
1663 .create = devcreate,
1669 .remove = devremove,
1672 .chaninfo = ipchaninfo,
1674 .chan_ctl = ip_chan_ctl,
1677 int Fsproto(struct Fs *f, struct Proto *p)
1679 if (f->np >= Maxproto)
1682 qlock_init(&p->qlock);
1685 if (p->ipproto > 0) {
1686 if (f->t2p[p->ipproto] != NULL)
1688 f->t2p[p->ipproto] = p;
1691 p->qid.type = QTDIR;
1692 p->qid.path = QID(f->np, 0, Qprotodir);
1693 p->conv = kzmalloc(sizeof(struct conv *) * (p->nc + 1), 0);
1694 if (p->conv == NULL)
1706 * return true if this protocol is
1709 int Fsbuiltinproto(struct Fs *f, uint8_t proto)
1711 return f->t2p[proto] != NULL;
1715 * called with protocol locked
1717 struct conv *Fsprotoclone(struct Proto *p, char *user)
1719 struct conv *c, **pp, **ep;
1723 ep = &p->conv[p->nc];
1724 for (pp = p->conv; pp < ep; pp++) {
1727 c = kzmalloc(sizeof(struct conv), 0);
1730 "conv kzmalloc(%d, 0) failed in Fsprotoclone",
1731 sizeof(struct conv));
1732 qlock_init(&c->qlock);
1733 qlock_init(&c->listenq);
1734 rendez_init(&c->cr);
1735 rendez_init(&c->listenr);
1736 SLIST_INIT(&c->data_taps); /* already = 0; set to be futureproof */
1737 SLIST_INIT(&c->listen_taps);
1738 spinlock_init(&c->tap_lock);
1741 c->x = pp - p->conv;
1742 if (p->ptclsize != 0) {
1743 c->ptcl = kzmalloc(p->ptclsize, 0);
1744 if (c->ptcl == NULL) {
1747 "ptcl kzmalloc(%d, 0) failed in Fsprotoclone",
1753 c->eq = qopen(1024, Qmsg, 0, 0);
1755 assert(c->rq && c->wq);
1758 if (canqlock(&c->qlock)) {
1760 * make sure both processes and protocol
1761 * are done with this Conv
1763 if (c->inuse == 0 && (p->inuse == NULL || (*p->inuse) (c) == 0))
1770 if (p->gc != NULL && (*p->gc) (p))
1776 kstrdup(&c->owner, user);
1779 ipmove(c->laddr, IPnoaddr);
1780 ipmove(c->raddr, IPnoaddr);
1796 int Fsconnected(struct conv *c, char *msg)
1798 if (msg != NULL && *msg != '\0')
1799 strlcpy(c->cerr, msg, sizeof(c->cerr));
1803 c->state = Announced;
1807 c->state = Connected;
1811 rendez_wakeup(&c->cr);
1812 /* The user can poll or tap the connection status via Qdata */
1813 fire_data_taps(c, FDTAP_FILT_WRITABLE);
1817 struct Proto *Fsrcvpcol(struct Fs *f, uint8_t proto)
1822 return f->t2p[proto];
1825 struct Proto *Fsrcvpcolx(struct Fs *f, uint8_t proto)
1827 return f->t2p[proto];
1830 static void fire_listener_taps(struct conv *conv)
1832 struct fd_tap *tap_i;
1833 if (SLIST_EMPTY(&conv->listen_taps))
1835 spin_lock(&conv->tap_lock);
1836 SLIST_FOREACH(tap_i, &conv->listen_taps, link)
1837 fire_tap(tap_i, FDTAP_FILT_READABLE);
1838 spin_unlock(&conv->tap_lock);
1842 * called with protocol locked
1844 struct conv *Fsnewcall(struct conv *c, uint8_t * raddr, uint16_t rport,
1845 uint8_t * laddr, uint16_t lport, uint8_t version)
1853 for (l = &c->incall; *l; l = &(*l)->next)
1855 if (i >= Maxincall) {
1860 /* find a free conversation */
1861 nc = Fsprotoclone(c->p, network);
1866 ipmove(nc->raddr, raddr);
1868 ipmove(nc->laddr, laddr);
1872 nc->state = Connected;
1873 nc->ipversion = version;
1877 rendez_wakeup(&c->listenr);
1878 fire_listener_taps(c);
1883 static long ndbwrite(struct Fs *f, char *a, uint32_t off, int n)
1885 if (off > strlen(f->ndb))
1886 error(EIO, ERROR_FIXME);
1887 if (off + n >= sizeof(f->ndb) - 1)
1888 error(EIO, ERROR_FIXME);
1889 memmove(f->ndb + off, a, n);
1890 f->ndb[off + n] = 0;
1892 f->ndbmtime = seconds();
1896 uint32_t scalednconv(void)
1898 //if(conf.npage*BY2PG >= 128*MB)