9ns: Use an int for perm and mode
[akaros.git] / kern / src / net / devip.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 struct dev ipdevtab;
44
45 static char *devname(void)
46 {
47         return ipdevtab.name;
48 }
49
/*
 * File types served by this device, plus the constants that describe how a
 * qid path is packed (see the TYPE/CONV/PROTO/QID macros).
 *
 * The Q* values form three consecutive groups, one per directory level.
 * Each group's "*base" constant is the first file of that directory; the
 * gen functions add a directory index to the base to get the file type.
 */
enum {
	Qtopdir = 1,				/* top level directory */
	Qtopbase,				/* first file in the top directory */
	Qarp = Qtopbase,
	Qndb,
	Qiproute,
	Qiprouter,
	Qipselftab,
	Qlog,

	Qprotodir,	/* directory for a protocol */
	Qprotobase,	/* first file in a protocol directory */
	Qclone = Qprotobase,
	Qstats,

	Qconvdir,	/* directory for a conversation */
	Qconvbase,	/* first file in a conversation directory */
	Qctl = Qconvbase,
	Qdata,
	Qerr,
	Qlisten,
	Qlocal,
	Qremote,
	Qstatus,
	Qsnoop,

	/* qid path layout, low bits first: type, conversation, protocol */
	Logtype = 5,				/* bits holding the Q* type */
	Masktype = (1 << Logtype) - 1,
	Logconv = 12,				/* bits holding the conversation index */
	Maskconv = (1 << Logconv) - 1,
	Shiftconv = Logtype,
	Logproto = 8,				/* bits holding the protocol index */
	Maskproto = (1 << Logproto) - 1,
	Shiftproto = Logtype + Logconv,

	Nfs = 32,				/* number of slots in ipfs[] */
	/* NOTE(review): not used in the visible part of the file; presumably the
	 * queue limit applied when a conversation is put in bypass mode. */
	BYPASS_QMAX = 64 * MiB,
	IPROUTE_LEN = 2 * PGSIZE,	/* size of the buffer backing an open iproute */
};
/*
 * Accessors for the three fields packed into a qid path.  'x' is a struct
 * qid (not a pointer).  QID() builds a path from a protocol index, a
 * conversation index and a Q* file type.
 */
#define TYPE(x)		( ((uint32_t)(x).path) & Masktype )
#define CONV(x)		( (((uint32_t)(x).path) >> Shiftconv) & Maskconv )
#define PROTO(x)	( (((uint32_t)(x).path) >> Shiftproto) & Maskproto )
#define QID(p, c, y)	( ((p)<<(Shiftproto)) | ((c)<<Shiftconv) | (y))
/* Owner name given to files that do not belong to a particular user. */
static char network[] = "network";

qlock_t fslock;				/* taken by ipgetfs() while it fills an ipfs[] slot */
struct Fs *ipfs[Nfs];			/* attached fs's */
struct queue *qlog;

extern void nullmediumlink(void);
extern void pktmediumlink(void);
extern struct username eve;
/* Forward declarations for helpers defined later in this file. */
static long ndbwrite(struct Fs *, char *unused_char_p_t, uint32_t, int);
static void closeconv(struct conv *);
static void setup_proto_qio_bypass(struct conv *cv);
static void undo_proto_qio_bypass(struct conv *cv);
106
107 static struct conv *chan2conv(struct chan *chan)
108 {
109         /* That's a lot of pointers to get to the conv! */
110         return ipfs[chan->dev]->p[PROTO(chan->qid)]->conv[CONV(chan->qid)];
111 }
112
113 static inline int founddevdir(struct chan *c, struct qid q, char *n,
114                                                           int64_t length, char *user, long perm,
115                                                           struct dir *db)
116 {
117         devdir(c, q, n, length, user, perm, db);
118         return 1;
119 }
120
121 static int topdirgen(struct chan *c, struct dir *dp)
122 {
123         struct qid q;
124         mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR);
125         snprintf(get_cur_genbuf(), GENBUF_SZ, "#%s%lu", devname(), c->dev);
126         return founddevdir(c, q, get_cur_genbuf(), 0, network, 0555, dp);
127 }
128
129
130 static int ip3gen(struct chan *c, int i, struct dir *dp)
131 {
132         struct qid q;
133         struct conv *cv;
134         char *p;
135         int perm;
136
137         cv = chan2conv(c);
138         if (cv->owner == NULL)
139                 kstrdup(&cv->owner, eve.name);
140         mkqid(&q, QID(PROTO(c->qid), CONV(c->qid), i), 0, QTFILE);
141
142         switch (i) {
143                 default:
144                         return -1;
145                 case Qctl:
146                         return founddevdir(c, q, "ctl", 0,
147                                                    cv->owner, cv->perm, dp);
148                 case Qdata:
149                         perm = cv->perm;
150                         perm |= qreadable(cv->rq) ? DMREADABLE : 0;
151                         perm |= qwritable(cv->wq) ? DMWRITABLE : 0;
152                         return founddevdir(c, q, "data", qlen(cv->rq),
153                                                            cv->owner, perm, dp);
154                 case Qerr:
155                         perm = cv->perm;
156                         perm |= qreadable(cv->eq) ? DMREADABLE : 0;
157                         return founddevdir(c, q, "err", qlen(cv->eq),
158                                                            cv->owner, perm, dp);
159                 case Qlisten:
160                         perm = cv->perm;
161                         perm |= cv->incall ? DMREADABLE : 0;
162                         return founddevdir(c, q, "listen", 0, cv->owner, perm, dp);
163                 case Qlocal:
164                         p = "local";
165                         break;
166                 case Qremote:
167                         p = "remote";
168                         break;
169                 case Qsnoop:
170                         if (strcmp(cv->p->name, "ipifc") != 0)
171                                 return -1;
172                         perm = 0400;
173                         perm |= qreadable(cv->sq) ? DMREADABLE : 0;
174                         return founddevdir(c, q, "snoop", qlen(cv->sq),
175                                                            cv->owner, perm, dp);
176                 case Qstatus:
177                         p = "status";
178                         break;
179         }
180         return founddevdir(c, q, p, 0, cv->owner, 0444, dp);
181 }
182
183 static int ip2gen(struct chan *c, int i, struct dir *dp)
184 {
185         struct qid q;
186         mkqid(&q, QID(PROTO(c->qid), 0, i), 0, QTFILE);
187         switch (i) {
188                 case Qclone:
189                         return founddevdir(c, q, "clone", 0, network, 0666, dp);
190                 case Qstats:
191                         return founddevdir(c, q, "stats", 0, network, 0444, dp);
192         }
193         return -1;
194 }
195
196 static int ip1gen(struct chan *c, int i, struct dir *dp)
197 {
198         struct qid q;
199         char *p;
200         int prot;
201         int len = 0;
202         struct Fs *f;
203         extern uint32_t kerndate;
204
205         f = ipfs[c->dev];
206
207         prot = 0666;
208         mkqid(&q, QID(0, 0, i), 0, QTFILE);
209         switch (i) {
210                 default:
211                         return -1;
212                 case Qarp:
213                         p = "arp";
214                         break;
215                 case Qndb:
216                         p = "ndb";
217                         len = strlen(f->ndb);
218                         q.vers = f->ndbvers;
219                         break;
220                 case Qiproute:
221                         p = "iproute";
222                         break;
223                 case Qipselftab:
224                         p = "ipselftab";
225                         prot = 0444;
226                         break;
227                 case Qiprouter:
228                         p = "iprouter";
229                         break;
230                 case Qlog:
231                         p = "log";
232                         break;
233         }
234         devdir(c, q, p, len, network, prot, dp);
235         if (i == Qndb && f->ndbmtime > kerndate)
236                 dp->mtime = f->ndbmtime;
237         return 1;
238 }
239
240 static int
241 ipgen(struct chan *c, char *unused_char_p_t, struct dirtab *d, int unused_int,
242           int s, struct dir *dp)
243 {
244         struct qid q;
245         struct conv *cv;
246         struct Fs *f;
247
248         f = ipfs[c->dev];
249
250         switch (TYPE(c->qid)) {
251                 case Qtopdir:
252                         if (s == DEVDOTDOT)
253                                 return topdirgen(c, dp);
254                         if (s < f->np) {
255                                 if (f->p[s]->connect == NULL)
256                                         return 0;       /* protocol with no user interface */
257                                 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
258                                 return founddevdir(c, q, f->p[s]->name, 0, network, 0555, dp);
259                         }
260                         s -= f->np;
261                         return ip1gen(c, s + Qtopbase, dp);
262                 case Qarp:
263                 case Qndb:
264                 case Qlog:
265                 case Qiproute:
266                 case Qiprouter:
267                 case Qipselftab:
268                         return ip1gen(c, TYPE(c->qid), dp);
269                 case Qprotodir:
270                         if (s == DEVDOTDOT)
271                                 return topdirgen(c, dp);
272                         else if (s < f->p[PROTO(c->qid)]->ac) {
273                                 cv = f->p[PROTO(c->qid)]->conv[s];
274                                 snprintf(get_cur_genbuf(), GENBUF_SZ, "%d", s);
275                                 mkqid(&q, QID(PROTO(c->qid), s, Qconvdir), 0, QTDIR);
276                                 return
277                                         founddevdir(c, q, get_cur_genbuf(), 0, cv->owner, 0555, dp);
278                         }
279                         s -= f->p[PROTO(c->qid)]->ac;
280                         return ip2gen(c, s + Qprotobase, dp);
281                 case Qclone:
282                 case Qstats:
283                         return ip2gen(c, TYPE(c->qid), dp);
284                 case Qconvdir:
285                         if (s == DEVDOTDOT) {
286                                 s = PROTO(c->qid);
287                                 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
288                                 devdir(c, q, f->p[s]->name, 0, network, 0555, dp);
289                                 return 1;
290                         }
291                         return ip3gen(c, s + Qconvbase, dp);
292                 case Qctl:
293                 case Qdata:
294                 case Qerr:
295                 case Qlisten:
296                 case Qlocal:
297                 case Qremote:
298                 case Qstatus:
299                 case Qsnoop:
300                         return ip3gen(c, TYPE(c->qid), dp);
301         }
302         return -1;
303 }
304
305 static void ipinit(void)
306 {
307         qlock_init(&fslock);
308         nullmediumlink();
309         pktmediumlink();
310 /* if only
311         fmtinstall('i', eipfmt);
312         fmtinstall('I', eipfmt);
313         fmtinstall('E', eipfmt);
314         fmtinstall('V', eipfmt);
315         fmtinstall('M', eipfmt);
316 */
317 }
318
/* Device reset hook; this device has nothing to do on reset. */
static void ipreset(void)
{
	/* intentionally empty */
}
322
323 static struct Fs *ipgetfs(int dev)
324 {
325         extern void (*ipprotoinit[]) (struct Fs *);
326         struct Fs *f;
327         int i;
328
329         if (dev >= Nfs)
330                 return NULL;
331
332         qlock(&fslock);
333         if (ipfs[dev] == NULL) {
334                 f = kzmalloc(sizeof(struct Fs), MEM_WAIT);
335                 rwinit(&f->rwlock);
336                 qlock_init(&f->iprouter.qlock);
337                 ip_init(f);
338                 arpinit(f);
339                 netloginit(f);
340                 for (i = 0; ipprotoinit[i]; i++)
341                         ipprotoinit[i] (f);
342                 f->dev = dev;
343                 ipfs[dev] = f;
344         }
345         qunlock(&fslock);
346
347         return ipfs[dev];
348 }
349
350 struct IPaux *newipaux(char *owner, char *tag)
351 {
352         struct IPaux *a;
353         int n;
354
355         a = kzmalloc(sizeof(*a), 0);
356         kstrdup(&a->owner, owner);
357         memset(a->tag, ' ', sizeof(a->tag));
358         n = strlen(tag);
359         if (n > sizeof(a->tag))
360                 n = sizeof(a->tag);
361         memmove(a->tag, tag, n);
362         return a;
363 }
364
/* Owner name stored in chan c's IPaux, i.e. the user the chan was attached
 * or walked as.  c->aux must point to a struct IPaux. */
#define ATTACHER(c) (((struct IPaux*)((c)->aux))->owner)
366
367 static struct chan *ipattach(char *spec)
368 {
369         struct chan *c;
370         int dev;
371
372         dev = atoi(spec);
373         if (dev >= Nfs)
374                 error(EFAIL, "bad specification");
375
376         ipgetfs(dev);
377         c = devattach(devname(), spec);
378         mkqid(&c->qid, QID(0, 0, Qtopdir), 0, QTDIR);
379         c->dev = dev;
380
381         c->aux = newipaux(commonuser(), "none");
382
383         return c;
384 }
385
386 static struct walkqid *ipwalk(struct chan *c, struct chan *nc, char **name,
387                                                           int nname)
388 {
389         struct IPaux *a = c->aux;
390         struct walkqid *w;
391
392         w = devwalk(c, nc, name, nname, NULL, 0, ipgen);
393         if (w != NULL && w->clone != NULL)
394                 w->clone->aux = newipaux(a->owner, a->tag);
395         return w;
396 }
397
398 static int ipstat(struct chan *c, uint8_t * db, int n)
399 {
400         return devstat(c, db, n, NULL, 0, ipgen);
401 }
402
403 static int should_wake(void *arg)
404 {
405         struct conv *cv = arg;
406         /* signal that the conv is closed */
407         if (qisclosed(cv->rq))
408                 return TRUE;
409         return cv->incall != NULL;
410 }
411
/*
 * Opens chan 'c' with open mode 'omode' and returns it.
 *
 * What happens depends on the file type in c's qid:
 *  - ndb: writing/truncating requires eve; O_WRITE|O_TRUNC empties the ndb.
 *  - log, iprouter: the matching *open() hook is called.
 *  - iproute: a snapshot of the routes is read into c->synth_buf.
 *  - directories and the read-only files: opening for write fails.
 *  - snoop: read-only, owner or eve only; bumps the conv's snooper count.
 *  - clone: a new conversation is made and c's qid is changed to its ctl.
 *  - data, ctl, err: permission check, then the conv's use count is bumped;
 *    the first user becomes the owner and perm is reset to 0660.
 *  - listen: waits for an incoming call and changes c's qid to the new
 *    conversation's ctl (or, with O_PATH, just takes a reference).
 *
 * On success the chan's mode is set, COPEN is added to its flags and its
 * offset is reset to 0.  Failures are thrown with error().
 */
static struct chan *ipopen(struct chan *c, int omode)
{
	ERRSTACK(2);
	struct conv *cv, *nc;
	struct Proto *p;
	int perm;
	struct Fs *f;

	/* perm is a lone rwx, not the rwx------ from the conversion */
	perm = omode_to_rwx(omode) >> 6;

	f = ipfs[c->dev];

	switch (TYPE(c->qid)) {
		default:
			break;
		case Qndb:
			if (omode & (O_WRITE | O_TRUNC) && !iseve())
				error(EPERM, ERROR_FIXME);
			if ((omode & (O_WRITE | O_TRUNC)) == (O_WRITE | O_TRUNC))
				f->ndb[0] = 0;
			break;
		case Qlog:
			netlogopen(f);
			break;
		case Qiprouter:
			iprouteropen(f);
			break;
		case Qiproute:
			/* snapshot taken at open time; freed again in ipclose() */
			c->synth_buf = kpages_zalloc(IPROUTE_LEN, MEM_WAIT);
			routeread(f, c->synth_buf, 0, IPROUTE_LEN);
			break;
		case Qtopdir:
		case Qprotodir:
		case Qconvdir:
		case Qstatus:
		case Qremote:
		case Qlocal:
		case Qstats:
		case Qipselftab:
			if (omode & O_WRITE)
				error(EPERM, ERROR_FIXME);
			break;
		case Qsnoop:
			if (omode & O_WRITE)
				error(EPERM, ERROR_FIXME);
			/* might be racy.  note the lack of a proto lock, unlike Qdata */
			p = f->p[PROTO(c->qid)];
			cv = p->conv[CONV(c->qid)];
			if (strcmp(ATTACHER(c), cv->owner) != 0 && !iseve())
				error(EPERM, ERROR_FIXME);
			atomic_inc(&cv->snoopers);
			break;
		case Qclone:
			p = f->p[PROTO(c->qid)];
			qlock(&p->qlock);
			if (waserror()) {
				qunlock(&p->qlock);
				nexterror();
			}
			cv = Fsprotoclone(p, ATTACHER(c));
			qunlock(&p->qlock);
			poperror();
			if (cv == NULL) {
				error(ENODEV, "Null conversation from Fsprotoclone");
				break;
			}
			/* the chan now refers to the new conversation's ctl file */
			mkqid(&c->qid, QID(p->x, cv->x, Qctl), 0, QTFILE);
			break;
		case Qdata:
		case Qctl:
		case Qerr:
			p = f->p[PROTO(c->qid)];
			qlock(&p->qlock);
			cv = p->conv[CONV(c->qid)];
			qlock(&cv->qlock);
			if (waserror()) {
				qunlock(&cv->qlock);
				qunlock(&p->qlock);
				nexterror();
			}
			/* If the bits three positions up in cv->perm do not already
			 * grant the request, the opener must be the owner and the
			 * low bits of cv->perm must grant it. */
			if ((perm & (cv->perm >> 6)) != perm) {
				if (strcmp(ATTACHER(c), cv->owner) != 0)
					error(EPERM, ERROR_FIXME);
				if ((perm & cv->perm) != perm)
					error(EPERM, ERROR_FIXME);

			}
			cv->inuse++;
			/* first opener takes ownership and resets the permissions */
			if (cv->inuse == 1) {
				kstrdup(&cv->owner, ATTACHER(c));
				cv->perm = 0660;
			}
			qunlock(&cv->qlock);
			qunlock(&p->qlock);
			poperror();
			break;
		case Qlisten:
			cv = f->p[PROTO(c->qid)]->conv[CONV(c->qid)];
			/* No permissions or Announce checks required.  We'll see if that's
			 * a good idea or not. (the perm check would do nothing, as is,
			 * since an O_PATH perm is 0).
			 *
			 * But we probably want to incref to keep the conversation around
			 * until this FD/chan is closed.  #ip is a little weird in that
			 * objects never really go away (high water mark for convs, you can
			 * always find them in the ns).  I think it is possible to
			 * namec/ipgen a chan, then have that conv close, then have that
			 * chan be opened.  You can probably do this with a data file. */
			if (omode & O_PATH) {
				qlock(&cv->qlock);
				cv->inuse++;
				qunlock(&cv->qlock);
				break;
			}
			/* same permission rule as for data/ctl/err above */
			if ((perm & (cv->perm >> 6)) != perm) {
				if (strcmp(ATTACHER(c), cv->owner) != 0)
					error(EPERM, ERROR_FIXME);
				if ((perm & cv->perm) != perm)
					error(EPERM, ERROR_FIXME);

			}

			if (cv->state != Announced)
				error(EFAIL, "not announced");

			/* Hold a reference on the listening conv while we wait; it is
			 * dropped by closeconv() on both the error and success paths. */
			if (waserror()) {
				closeconv(cv);
				nexterror();
			}
			qlock(&cv->qlock);
			cv->inuse++;
			qunlock(&cv->qlock);

			nc = NULL;
			while (nc == NULL) {
				/* give up if we got a hangup */
				if (qisclosed(cv->rq))
					error(EFAIL, "listen hungup");

				qlock(&cv->listenq);
				if (waserror()) {
					qunlock(&cv->listenq);
					nexterror();
				}
				/* we can peek at incall without grabbing the cv qlock.  if
				 * anything is there, it'll remain there until we dequeue it.
				 * no one else can, since we hold the listenq lock */
				if ((c->flag & O_NONBLOCK) && !cv->incall)
					error(EAGAIN, "listen queue empty");
				/* wait for a connect */
				rendez_sleep(&cv->listenr, should_wake, cv);

				/* if there is a concurrent hangup, they will hold the qlock
				 * until the hangup is complete, including closing the cv->rq */
				qlock(&cv->qlock);
				nc = cv->incall;
				if (nc != NULL) {
					/* dequeue the call; the chan now names its ctl file */
					cv->incall = nc->next;
					mkqid(&c->qid, QID(PROTO(c->qid), nc->x, Qctl), 0, QTFILE);
					kstrdup(&cv->owner, ATTACHER(c));
				}
				qunlock(&cv->qlock);

				qunlock(&cv->listenq);
				poperror();
			}
			closeconv(cv);
			poperror();
			break;
	}
	c->mode = openmode(omode);
	c->flag |= COPEN;
	c->offset = 0;
	return c;
}
588
589 static int ipwstat(struct chan *c, uint8_t * dp, int n)
590 {
591         ERRSTACK(2);
592         struct dir *d;
593         struct conv *cv;
594         struct Fs *f;
595         struct Proto *p;
596
597         f = ipfs[c->dev];
598         switch (TYPE(c->qid)) {
599                 default:
600                         error(EPERM, ERROR_FIXME);
601                         break;
602                 case Qctl:
603                 case Qdata:
604                         break;
605         }
606
607         d = kzmalloc(sizeof(*d) + n, 0);
608         if (waserror()) {
609                 kfree(d);
610                 nexterror();
611         }
612         n = convM2D(dp, n, d, (char *)&d[1]);
613         if (n == 0)
614                 error(ENODATA, ERROR_FIXME);
615         p = f->p[PROTO(c->qid)];
616         cv = p->conv[CONV(c->qid)];
617         if (!iseve() && strcmp(ATTACHER(c), cv->owner) != 0)
618                 error(EPERM, ERROR_FIXME);
619         if (!emptystr(d->uid))
620                 kstrdup(&cv->owner, d->uid);
621         if (d->mode != -1)
622                 cv->perm = d->mode & 0777;
623         poperror();
624         kfree(d);
625         return n;
626 }
627
628 /* Should be able to handle any file type chan. Feel free to extend it. */
629 static char *ipchaninfo(struct chan *ch, char *ret, size_t ret_l)
630 {
631         struct conv *conv;
632         struct Proto *proto;
633         char *p;
634         struct Fs *f;
635
636         f = ipfs[ch->dev];
637
638         switch (TYPE(ch->qid)) {
639                 default:
640                         ret = "Unknown type";
641                         break;
642                 case Qdata:
643                         proto = f->p[PROTO(ch->qid)];
644                         conv = proto->conv[CONV(ch->qid)];
645                         snprintf(ret, ret_l, "Qdata, %s, proto %s, conv idx %d, rq len %d, wq len %d",
646                                  SLIST_EMPTY(&conv->data_taps) ? "untapped" : "tapped",
647                                  proto->name, conv->x, qlen(conv->rq), qlen(conv->wq));
648                         break;
649                 case Qarp:
650                         ret = "Qarp";
651                         break;
652                 case Qiproute:
653                         ret = "Qiproute";
654                         break;
655                 case Qlisten:
656                         proto = f->p[PROTO(ch->qid)];
657                         conv = proto->conv[CONV(ch->qid)];
658                         snprintf(ret, ret_l,
659                                  "Qlisten, %s proto %s, conv idx %d, has %sincalls",
660                                  SLIST_EMPTY(&conv->listen_taps) ? "untapped" : "tapped",
661                                  proto->name, conv->x, conv->incall ? "" : "no ");
662                         break;
663                 case Qlog:
664                         ret = "Qlog";
665                         break;
666                 case Qndb:
667                         ret = "Qndb";
668                         break;
669                 case Qctl:
670                         proto = f->p[PROTO(ch->qid)];
671                         conv = proto->conv[CONV(ch->qid)];
672                         snprintf(ret, ret_l, "Qctl, proto %s, conv idx %d", proto->name,
673                                          conv->x);
674                         break;
675         }
676         return ret;
677 }
678
679 static void closeconv(struct conv *cv)
680 {
681         ERRSTACK(1);
682         struct conv *nc;
683         struct Ipmulti *mp;
684
685         qlock(&cv->qlock);
686
687         if (--cv->inuse > 0) {
688                 qunlock(&cv->qlock);
689                 return;
690         }
691         if (waserror()) {
692                 qunlock(&cv->qlock);
693                 nexterror();
694         }
695         /* close all incoming calls since no listen will ever happen */
696         for (nc = cv->incall; nc; nc = cv->incall) {
697                 cv->incall = nc->next;
698                 closeconv(nc);
699         }
700         cv->incall = NULL;
701
702         kstrdup(&cv->owner, network);
703         cv->perm = 0660;
704
705         while ((mp = cv->multi) != NULL)
706                 ipifcremmulti(cv, mp->ma, mp->ia);
707
708         cv->r = NULL;
709         cv->rgen = 0;
710         if (cv->state == Bypass)
711                 undo_proto_qio_bypass(cv);
712         cv->p->close(cv);
713         cv->state = Idle;
714         qunlock(&cv->qlock);
715         poperror();
716 }
717
718 static void ipclose(struct chan *c)
719 {
720         struct Fs *f;
721
722         f = ipfs[c->dev];
723         switch (TYPE(c->qid)) {
724                 default:
725                         break;
726                 case Qlog:
727                         if (c->flag & COPEN)
728                                 netlogclose(f);
729                         break;
730                 case Qiprouter:
731                         if (c->flag & COPEN)
732                                 iprouterclose(f);
733                         break;
734                 case Qdata:
735                 case Qctl:
736                 case Qerr:
737                 case Qlisten:
738                         if (c->flag & COPEN)
739                                 closeconv(f->p[PROTO(c->qid)]->conv[CONV(c->qid)]);
740                         break;
741                 case Qsnoop:
742                         if (c->flag & COPEN)
743                                 atomic_dec(&f->p[PROTO(c->qid)]->conv[CONV(c->qid)]->snoopers);
744                         break;
745                 case Qiproute:
746                         if (c->flag & COPEN)
747                                 kpages_free(c->synth_buf, IPROUTE_LEN);
748                         break;
749         }
750         kfree(((struct IPaux *)c->aux)->owner);
751         kfree(c->aux);
752 }
753
/* Size of the temporary buffer ipread() allocates to format the contents of
 * a conversation's remote, local and status files. */
enum {
	Statelen = 32 * 1024,
};
757
758 static long ipread(struct chan *ch, void *a, long n, int64_t off)
759 {
760         struct conv *c;
761         struct Proto *x;
762         char *buf, *p;
763         long rv;
764         struct Fs *f;
765         uint32_t offset = off;
766
767         f = ipfs[ch->dev];
768
769         p = a;
770         switch (TYPE(ch->qid)) {
771                 default:
772                         error(EPERM, ERROR_FIXME);
773                 case Qtopdir:
774                 case Qprotodir:
775                 case Qconvdir:
776                         return devdirread(ch, a, n, 0, 0, ipgen);
777                 case Qarp:
778                         return arpread(f->arp, a, offset, n);
779                 case Qndb:
780                         return readstr(offset, a, n, f->ndb);
781                 case Qiproute:
782                         return readmem(offset, a, n, ch->synth_buf, IPROUTE_LEN);
783                 case Qiprouter:
784                         return iprouterread(f, a, n);
785                 case Qipselftab:
786                         return ipselftabread(f, a, offset, n);
787                 case Qlog:
788                         return netlogread(f, a, offset, n);
789                 case Qctl:
790                         snprintf(get_cur_genbuf(), GENBUF_SZ, "%lu", CONV(ch->qid));
791                         return readstr(offset, p, n, get_cur_genbuf());
792                 case Qremote:
793                         buf = kzmalloc(Statelen, 0);
794                         x = f->p[PROTO(ch->qid)];
795                         c = x->conv[CONV(ch->qid)];
796                         if (x->remote == NULL) {
797                                 snprintf(buf, Statelen, "%I!%d\n", c->raddr, c->rport);
798                         } else {
799                                 (*x->remote) (c, buf, Statelen - 2);
800                         }
801                         rv = readstr(offset, p, n, buf);
802                         kfree(buf);
803                         return rv;
804                 case Qlocal:
805                         buf = kzmalloc(Statelen, 0);
806                         x = f->p[PROTO(ch->qid)];
807                         c = x->conv[CONV(ch->qid)];
808                         if (x->local == NULL) {
809                                 snprintf(buf, Statelen, "%I!%d\n", c->laddr, c->lport);
810                         } else {
811                                 (*x->local) (c, buf, Statelen - 2);
812                         }
813                         rv = readstr(offset, p, n, buf);
814                         kfree(buf);
815                         return rv;
816                 case Qstatus:
817                         /* this all is a bit screwed up since the size of some state's
818                          * buffers will change from one invocation to another.  a reader
819                          * will come in and read the entire buffer.  then it will come again
820                          * and read from the next offset, expecting EOF.  if the buffer
821                          * changed sizes, it'll reprint the end of the buffer slightly. */
822                         buf = kzmalloc(Statelen, 0);
823                         x = f->p[PROTO(ch->qid)];
824                         c = x->conv[CONV(ch->qid)];
825                         if (c->state == Bypass)
826                                 snprintf(buf, Statelen, "Bypassed\n");
827                         else
828                                 (*x->state)(c, buf, Statelen - 2);
829                         rv = readstr(offset, p, n, buf);
830                         kfree(buf);
831                         return rv;
832                 case Qdata:
833                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
834                         if (ch->flag & O_NONBLOCK)
835                                 return qread_nonblock(c->rq, a, n);
836                         else
837                                 return qread(c->rq, a, n);
838                 case Qerr:
839                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
840                         return qread(c->eq, a, n);
841                 case Qsnoop:
842                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
843                         return qread(c->sq, a, n);
844                 case Qstats:
845                         x = f->p[PROTO(ch->qid)];
846                         if (x->stats == NULL)
847                                 error(EFAIL, "stats not implemented");
848                         buf = kzmalloc(Statelen, 0);
849                         (*x->stats) (x, buf, Statelen);
850                         rv = readstr(offset, p, n, buf);
851                         kfree(buf);
852                         return rv;
853         }
854 }
855
856 static struct block *ipbread(struct chan *ch, long n, uint32_t offset)
857 {
858         struct conv *c;
859
860         switch (TYPE(ch->qid)) {
861                 case Qdata:
862                         c = chan2conv(ch);
863                         if (ch->flag & O_NONBLOCK)
864                                 return qbread_nonblock(c->rq, n);
865                         else
866                                 return qbread(c->rq, n);
867                 default:
868                         return devbread(ch, n, offset);
869         }
870 }
871
872 /*
873  *  set local address to be that of the ifc closest to remote address
874  */
875 static void setladdr(struct conv *c)
876 {
877         findlocalip(c->p->f, c->laddr, c->raddr);
878 }
879
880 /*
881  *  set a local port making sure the quad of raddr,rport,laddr,lport is unique
882  */
883 static void setluniqueport(struct conv *c, int lport)
884 {
885         struct Proto *p;
886         struct conv *xp;
887         int x;
888
889         p = c->p;
890
891         qlock(&p->qlock);
892         for (x = 0; x < p->nc; x++) {
893                 xp = p->conv[x];
894                 if (xp == NULL)
895                         break;
896                 if (xp == c)
897                         continue;
898                 if ((xp->state == Connected || xp->state == Announced
899                                             || xp->state == Bypass)
900                         && xp->lport == lport
901                         && xp->rport == c->rport
902                         && ipcmp(xp->raddr, c->raddr) == 0
903                         && ipcmp(xp->laddr, c->laddr) == 0) {
904                         qunlock(&p->qlock);
905                         error(EFAIL, "address in use");
906                 }
907         }
908         c->lport = lport;
909         qunlock(&p->qlock);
910 }
911
912 /*
913  *  pick a local port and set it
914  */
915 static void setlport(struct conv *c)
916 {
917         struct Proto *p;
918         uint16_t *pp;
919         int x, found;
920
921         p = c->p;
922         if (c->restricted)
923                 pp = &p->nextrport;
924         else
925                 pp = &p->nextport;
926         qlock(&p->qlock);
927         for (;; (*pp)++) {
928                 /*
929                  * Fsproto initialises p->nextport to 0 and the restricted
930                  * ports (p->nextrport) to 600.
931                  * Restricted ports must lie between 600 and 1024.
932                  * For the initial condition or if the unrestricted port number
933                  * has wrapped round, select a random port between 5000 and 1<<15
934                  * to start at.
935                  */
936                 if (c->restricted) {
937                         if (*pp >= 1024)
938                                 *pp = 600;
939                 } else
940                         while (*pp < 5000)
941                                 urandom_read(pp, sizeof(*pp));
942
943                 found = 0;
944                 for (x = 0; x < p->nc; x++) {
945                         if (p->conv[x] == NULL)
946                                 break;
947                         if (p->conv[x]->lport == *pp) {
948                                 found = 1;
949                                 break;
950                         }
951                 }
952                 if (!found)
953                         break;
954         }
955         c->lport = (*pp)++;
956         qunlock(&p->qlock);
957 }
958
959 /*
960  *  set a local address and port from a string of the form
961  *      [address!]port[!r]
962  */
963 static void setladdrport(struct conv *c, char *str, int announcing)
964 {
965         char *p;
966         uint16_t lport;
967         uint8_t addr[IPaddrlen];
968
969         /*
970          *  ignore restricted part if it exists.  it's
971          *  meaningless on local ports.
972          */
973         p = strchr(str, '!');
974         if (p != NULL) {
975                 *p++ = 0;
976                 if (strcmp(p, "r") == 0)
977                         p = NULL;
978         }
979
980         c->lport = 0;
981         if (p == NULL) {
982                 if (announcing)
983                         ipmove(c->laddr, IPnoaddr);
984                 else
985                         setladdr(c);
986                 p = str;
987         } else {
988                 if (strcmp(str, "*") == 0)
989                         ipmove(c->laddr, IPnoaddr);
990                 else {
991                         parseip(addr, str);
992                         if (ipforme(c->p->f, addr))
993                                 ipmove(c->laddr, addr);
994                         else
995                                 error(EFAIL, "not a local IP address");
996                 }
997         }
998
999         /* one process can get all connections */
1000         if (announcing && strcmp(p, "*") == 0) {
1001                 if (!iseve())
1002                         error(EPERM, ERROR_FIXME);
1003                 setluniqueport(c, 0);
1004         }
1005
1006         lport = atoi(p);
1007         if (lport <= 0)
1008                 setlport(c);
1009         else
1010                 setluniqueport(c, lport);
1011 }
1012
1013 static void setraddrport(struct conv *c, char *str)
1014 {
1015         char *p;
1016
1017         p = strchr(str, '!');
1018         if (p == NULL)
1019                 error(EFAIL, "malformed address");
1020         *p++ = 0;
1021         parseip(c->raddr, str);
1022         c->rport = atoi(p);
1023         p = strchr(p, '!');
1024         if (p) {
1025                 if (strstr(p, "!r") != NULL)
1026                         c->restricted = 1;
1027         }
1028 }
1029
1030 /*
1031  *  called by protocol connect routine to set addresses
1032  */
1033 void Fsstdconnect(struct conv *c, char *argv[], int argc)
1034 {
1035         switch (argc) {
1036                 default:
1037                         error(EINVAL, "bad args to %s", __func__);
1038                 case 2:
1039                         setraddrport(c, argv[1]);
1040                         setladdr(c);
1041                         setlport(c);
1042                         break;
1043                 case 3:
1044                         setraddrport(c, argv[1]);
1045                         setladdrport(c, argv[2], 0);
1046                         break;
1047         }
1048
1049         if ((memcmp(c->raddr, v4prefix, IPv4off) == 0 &&
1050                  memcmp(c->laddr, v4prefix, IPv4off) == 0)
1051                 || ipcmp(c->raddr, IPnoaddr) == 0)
1052                 c->ipversion = V4;
1053         else
1054                 c->ipversion = V6;
1055 }
1056
1057 /*
1058  *  initiate connection and sleep till its set up
1059  */
1060 static int connected(void *a)
1061 {
1062         return ((struct conv *)a)->state == Connected;
1063 }
1064
1065 static void connectctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1066 {
1067         ERRSTACK(1);
1068         char *p;
1069
1070         if (c->state != 0)
1071                 error(EBUSY, ERROR_FIXME);
1072         c->state = Connecting;
1073         c->cerr[0] = '\0';
1074         if (x->connect == NULL)
1075                 error(EFAIL, "connect not supported");
1076         x->connect(c, cb->f, cb->nf);
1077
1078         qunlock(&c->qlock);
1079         if (waserror()) {
1080                 qlock(&c->qlock);
1081                 nexterror();
1082         }
1083         rendez_sleep(&c->cr, connected, c);
1084         qlock(&c->qlock);
1085         poperror();
1086
1087         if (c->cerr[0] != '\0')
1088                 error(EFAIL, c->cerr);
1089 }
1090
1091 /*
1092  *  called by protocol announce routine to set addresses
1093  */
1094 void Fsstdannounce(struct conv *c, char *argv[], int argc)
1095 {
1096         memset(c->raddr, 0, sizeof(c->raddr));
1097         c->rport = 0;
1098         switch (argc) {
1099                 default:
1100                         error(EINVAL, "bad args to announce");
1101                 case 2:
1102                         setladdrport(c, argv[1], 1);
1103                         break;
1104         }
1105 }
1106
1107 /*
1108  *  initiate announcement and sleep till its set up
1109  */
1110 static int announced(void *a)
1111 {
1112         return ((struct conv *)a)->state == Announced;
1113 }
1114
1115 static void announcectlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1116 {
1117         ERRSTACK(1);
1118         char *p;
1119
1120         if (c->state != 0)
1121                 error(EBUSY, ERROR_FIXME);
1122         c->state = Announcing;
1123         c->cerr[0] = '\0';
1124         if (x->announce == NULL)
1125                 error(EFAIL, "announce not supported");
1126         x->announce(c, cb->f, cb->nf);
1127
1128         qunlock(&c->qlock);
1129         if (waserror()) {
1130                 qlock(&c->qlock);
1131                 nexterror();
1132         }
1133         rendez_sleep(&c->cr, announced, c);
1134         qlock(&c->qlock);
1135         poperror();
1136
1137         if (c->cerr[0] != '\0')
1138                 error(EFAIL, c->cerr);
1139 }
1140
1141 /*
1142  *  called by protocol bind routine to set addresses
1143  */
1144 void Fsstdbind(struct conv *c, char *argv[], int argc)
1145 {
1146         switch (argc) {
1147                 default:
1148                         error(EINVAL, "bad args to bind");
1149                 case 2:
1150                         setladdrport(c, argv[1], 0);
1151                         break;
1152         }
1153 }
1154
1155 static void bindctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1156 {
1157         if (x->bind == NULL)
1158                 Fsstdbind(c, cb->f, cb->nf);
1159         else
1160                 x->bind(c, cb->f, cb->nf);
1161 }
1162
1163 /* Helper, called by protocols to use the bypass.
1164  *
1165  * This is a bit nasty due to the overall nastiness of #ip.  We need to lock
1166  * before checking the state and hold the qlock throughout, because a concurrent
1167  * closeconv() could tear down the bypass.  Specifically, it could free the
1168  * bypass queues.  The root issue is that conversation lifetimes are not managed
1169  * well.
1170  *
1171  * If we fail, it's our responsibility to consume (free) the block(s). */
1172 void bypass_or_drop(struct conv *cv, struct block *bp)
1173 {
1174         qlock(&cv->qlock);
1175         if (cv->state == Bypass)
1176                 qpass(cv->rq, bp);
1177         else
1178                 freeblist(bp);
1179         qunlock(&cv->qlock);
1180 }
1181
1182 /* Push the block directly to the approprite ipoput function.
1183  *
1184  * It's the protocol's responsibility (and thus ours here) to make sure there is
1185  * at least the right amount of the IP header in the block (ipoput{4,6} assumes
1186  * it has the right amount, and the other protocols account for the IP header in
1187  * their own header).
1188  *
1189  * For the TTL and TOS, we just use the default ones.  If we want, we could look
1190  * into the actual block and see what the user wanted, though we're bypassing
1191  * the protocol layer, not the IP layer. */
1192 static void proto_bypass_kick(void *arg, struct block *bp)
1193 {
1194         struct conv *cv = (struct conv*)arg;
1195         uint8_t vers_nibble;
1196         struct Fs *f;
1197
1198         f = cv->p->f;
1199
1200         bp = pullupblock(bp, 1);
1201         if (!bp)
1202                 error(EINVAL, "Proto bypass unable to pullup a byte!");
1203         vers_nibble = *(uint8_t*)bp->rp & 0xf0;
1204         switch (vers_nibble) {
1205         case IP_VER4:
1206                 bp = pullupblock(bp, IPV4HDR_LEN);
1207                 if (!bp)
1208                         error(EINVAL, "Proto bypass unable to pullup v4 header");
1209                 ipoput4(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1210                 break;
1211         case IP_VER6:
1212                 bp = pullupblock(bp, IPV6HDR_LEN);
1213                 if (!bp)
1214                         error(EINVAL, "Proto bypass unable to pullup v6 header");
1215                 ipoput6(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1216                 break;
1217         default:
1218                 error(EINVAL, "Proto bypass block had unknown IP version 0x%x",
1219                       vers_nibble);
1220         }
1221 }
1222
1223 /* Sets up cv for the protocol bypass.  We use different queues for two reasons:
1224  * 1) To be protocol independent.  For instance, TCP and UDP could use very
1225  * different QIO styles.
1226  * 2) To set up our own kick/bypass method.  Note how udpcreate() and here uses
1227  * qbypass() (just blast it out), while TCP uses qopen() with a kick.  TCP still
1228  * follows queuing discipline.
1229  *
1230  * It's like we are our own protocol, the bypass protocol, when it comes to how
1231  * we interact with qio.  The conv still is of the real protocol type (e.g.
1232  * TCP).
1233  *
1234  * Note that we can't free the old queues.  The way #ip works, the queues are
1235  * created when the conv is created, but the conv is never freed.  It's like a
1236  * slab allocator that never frees objects, but just reinitializes them a
1237  * little.
1238  *
1239  * For the queues, we're basically like UDP:
1240  * - We take packets for rq and drop on overflow.
1241  * - rq is also Qmsg, but we also have Qcoalesce, to ignore out zero-len blocks
1242  * - We kick for our outbound (wq) messages.
1243  *
1244  * Note that Qmsg can drop parts of packets.  It's up to the user to read
1245  * enough.  If they didn't read enough, the extra is dropped.  This is similar
1246  * to SOCK_DGRAM and recvfrom().  Minus major changes, there's no nice way to
1247  * get individual messages with read().  Userspace using the bypass will need to
1248  * find out the MTU of the NIC the IP stack is attached to, and make sure to
1249  * read in at least that amount each time. */
1250 static void setup_proto_qio_bypass(struct conv *cv)
1251 {
1252         cv->rq_save = cv->rq;
1253         cv->wq_save = cv->wq;
1254         cv->rq = qopen(BYPASS_QMAX, Qmsg | Qcoalesce, 0, 0);
1255         cv->wq = qbypass(proto_bypass_kick, cv);
1256 }
1257
1258 static void undo_proto_qio_bypass(struct conv *cv)
1259 {
1260         qfree(cv->rq);
1261         qfree(cv->wq);
1262         cv->rq = cv->rq_save;
1263         cv->wq = cv->wq_save;
1264         cv->rq_save = NULL;
1265         cv->wq_save = NULL;
1266 }
1267
1268 void Fsstdbypass(struct conv *cv, char *argv[], int argc)
1269 {
1270         memset(cv->raddr, 0, sizeof(cv->raddr));
1271         cv->rport = 0;
1272         switch (argc) {
1273         case 2:
1274                 setladdrport(cv, argv[1], 1);
1275                 break;
1276         default:
1277                 error(EINVAL, "Bad args (was %d, need 2) to bypass", argc);
1278         }
1279 }
1280
1281 static void bypassctlmsg(struct Proto *x, struct conv *cv, struct cmdbuf *cb)
1282 {
1283         if (!x->bypass)
1284                 error(EFAIL, "Protocol %s does not support bypass", x->name);
1285         /* The protocol needs to set the port (usually by calling Fsstdbypass) and
1286          * then do whatever it needs to make sure it can find the conv again during
1287          * receive (usually by adding to a hash table). */
1288         x->bypass(cv, cb->f, cb->nf);
1289         setup_proto_qio_bypass(cv);
1290         cv->state = Bypass;
1291 }
1292
1293 static void shutdownctlmsg(struct conv *cv, struct cmdbuf *cb)
1294 {
1295         if (cb->nf < 2)
1296                 goto err;
1297         if (!strcmp(cb->f[1], "rd")) {
1298                 qhangup(cv->rq, "shutdown");
1299                 if (cv->p->shutdown)
1300                         cv->p->shutdown(cv, SHUT_RD);
1301         } else if (!strcmp(cb->f[1], "wr")) {
1302                 qhangup(cv->wq, "shutdown");
1303                 if (cv->p->shutdown)
1304                         cv->p->shutdown(cv, SHUT_WR);
1305         } else if (!strcmp(cb->f[1], "rdwr")) {
1306                 qhangup(cv->rq, "shutdown");
1307                 qhangup(cv->wq, "shutdown");
1308                 if (cv->p->shutdown)
1309                         cv->p->shutdown(cv, SHUT_RDWR);
1310         } else {
1311                 goto err;
1312         }
1313         return;
1314 err:
1315         error(EINVAL, "shutdown [rx|tx|rxtx]");
1316 }
1317
1318 static void tosctlmsg(struct conv *c, struct cmdbuf *cb)
1319 {
1320         if (cb->nf < 2)
1321                 c->tos = 0;
1322         else
1323                 c->tos = atoi(cb->f[1]);
1324 }
1325
1326 static void ttlctlmsg(struct conv *c, struct cmdbuf *cb)
1327 {
1328         if (cb->nf < 2)
1329                 c->ttl = MAXTTL;
1330         else
1331                 c->ttl = atoi(cb->f[1]);
1332 }
1333
1334 /* Binds a conversation, as if the user wrote "bind *" into ctl. */
1335 static void autobind(struct conv *cv)
1336 {
1337         ERRSTACK(1);
1338         struct cmdbuf *cb;
1339
1340         cb = parsecmd("bind *", 7);
1341         if (waserror()) {
1342                 kfree(cb);
1343                 nexterror();
1344         }
1345         bindctlmsg(cv->p, cv, cb);
1346         poperror();
1347         kfree(cb);
1348 }
1349
1350 static long ipwrite(struct chan *ch, void *v, long n, int64_t off)
1351 {
1352         ERRSTACK(1);
1353         struct conv *c;
1354         struct Proto *x;
1355         char *p;
1356         struct cmdbuf *cb;
1357         uint8_t ia[IPaddrlen], ma[IPaddrlen];
1358         struct Fs *f;
1359         char *a;
1360
1361         a = v;
1362         f = ipfs[ch->dev];
1363
1364         switch (TYPE(ch->qid)) {
1365                 default:
1366                         error(EPERM, ERROR_FIXME);
1367                 case Qdata:
1368                         x = f->p[PROTO(ch->qid)];
1369                         c = x->conv[CONV(ch->qid)];
1370                         /* connection-less protocols (UDP) can write without manually
1371                          * binding. */
1372                         if (c->lport == 0)
1373                                 autobind(c);
1374                         if (ch->flag & O_NONBLOCK)
1375                                 qwrite_nonblock(c->wq, a, n);
1376                         else
1377                                 qwrite(c->wq, a, n);
1378                         break;
1379                 case Qarp:
1380                         return arpwrite(f, a, n);
1381                 case Qiproute:
1382                         return routewrite(f, ch, a, n);
1383                 case Qlog:
1384                         netlogctl(f, a, n);
1385                         return n;
1386                 case Qndb:
1387                         return ndbwrite(f, a, off, n);
1388                 case Qctl:
1389                         x = f->p[PROTO(ch->qid)];
1390                         c = x->conv[CONV(ch->qid)];
1391                         cb = parsecmd(a, n);
1392
1393                         qlock(&c->qlock);
1394                         if (waserror()) {
1395                                 qunlock(&c->qlock);
1396                                 kfree(cb);
1397                                 nexterror();
1398                         }
1399                         if (cb->nf < 1)
1400                                 error(EFAIL, "short control request");
1401                         if (strcmp(cb->f[0], "connect") == 0)
1402                                 connectctlmsg(x, c, cb);
1403                         else if (strcmp(cb->f[0], "announce") == 0)
1404                                 announcectlmsg(x, c, cb);
1405                         else if (strcmp(cb->f[0], "bind") == 0)
1406                                 bindctlmsg(x, c, cb);
1407                         else if (strcmp(cb->f[0], "bypass") == 0)
1408                                 bypassctlmsg(x, c, cb);
1409                         else if (strcmp(cb->f[0], "shutdown") == 0)
1410                                 shutdownctlmsg(c, cb);
1411                         else if (strcmp(cb->f[0], "ttl") == 0)
1412                                 ttlctlmsg(c, cb);
1413                         else if (strcmp(cb->f[0], "tos") == 0)
1414                                 tosctlmsg(c, cb);
1415                         else if (strcmp(cb->f[0], "ignoreadvice") == 0)
1416                                 c->ignoreadvice = 1;
1417                         else if (strcmp(cb->f[0], "addmulti") == 0) {
1418                                 if (cb->nf < 2)
1419                                         error(EFAIL, "addmulti needs interface address");
1420                                 if (cb->nf == 2) {
1421                                         if (!ipismulticast(c->raddr))
1422                                                 error(EFAIL, "addmulti for a non multicast address");
1423                                         parseip(ia, cb->f[1]);
1424                                         ipifcaddmulti(c, c->raddr, ia);
1425                                 } else {
1426                                         parseip(ma, cb->f[2]);
1427                                         if (!ipismulticast(ma))
1428                                                 error(EFAIL, "addmulti for a non multicast address");
1429                                         parseip(ia, cb->f[1]);
1430                                         ipifcaddmulti(c, ma, ia);
1431                                 }
1432                         } else if (strcmp(cb->f[0], "remmulti") == 0) {
1433                                 if (cb->nf < 2)
1434                                         error(EFAIL, "remmulti needs interface address");
1435                                 if (!ipismulticast(c->raddr))
1436                                         error(EFAIL, "remmulti for a non multicast address");
1437                                 parseip(ia, cb->f[1]);
1438                                 ipifcremmulti(c, c->raddr, ia);
1439                         } else if (x->ctl != NULL) {
1440                                 x->ctl(c, cb->f, cb->nf);
1441                         } else
1442                                 error(EFAIL, "unknown control request");
1443                         qunlock(&c->qlock);
1444                         kfree(cb);
1445                         poperror();
1446         }
1447         return n;
1448 }
1449
1450 static long ipbwrite(struct chan *ch, struct block *bp, uint32_t offset)
1451 {
1452         struct conv *c;
1453         int n;
1454
1455         switch (TYPE(ch->qid)) {
1456                 case Qdata:
1457                         c = chan2conv(ch);
1458                         if (bp->next)
1459                                 bp = concatblock(bp);
1460                         n = BLEN(bp);
1461                         if (ch->flag & O_NONBLOCK)
1462                                 qbwrite_nonblock(c->wq, bp);
1463                         else
1464                                 qbwrite(c->wq, bp);
1465                         return n;
1466                 default:
1467                         return devbwrite(ch, bp, offset);
1468         }
1469 }
1470
1471 static void ip_wake_cb(struct queue *q, void *data, int filter)
1472 {
1473         struct conv *conv = (struct conv*)data;
1474         struct fd_tap *tap_i;
1475         /* For these two, we want to ignore events on the opposite end of the
1476          * queues.  For instance, we want to know when the WQ is writable.  Our
1477          * writes will actually make it readable - we don't want to trigger a tap
1478          * for that.  However, qio doesn't know how/why we are using a queue, or
1479          * even who the ends are (hence the callbacks) */
1480         if ((filter & FDTAP_FILT_READABLE) && (q == conv->wq))
1481                 return;
1482         if ((filter & FDTAP_FILT_WRITABLE) && (q == conv->rq))
1483                 return;
1484         /* At this point, we have an event we want to send to our taps (if any).
1485          * The lock protects list integrity and the existence of the tap.
1486          *
1487          * Previously, I thought of using the conv qlock.  That actually breaks, due
1488          * to weird usages of the qlock (someone holds it for a long time, blocking
1489          * the inbound wakeup from etherread4).
1490          *
1491          * I opted for a spinlock for a couple reasons:
1492          * - fire_tap should not block.  ideally it'll be fast too (it's mostly a
1493          * send_event).
1494          * - our callers might not want to block.  A lot of network wakeups will
1495          * come network processes (etherread4) or otherwise unrelated to this
1496          * particular conversation.  I'd rather do something like fire off a KMSG
1497          * than block those.
1498          * - if fire_tap takes a while, holding the lock only slows down other
1499          * events on this *same* conversation, or other tap registration.  not a
1500          * huge deal. */
1501         spin_lock(&conv->tap_lock);
1502         SLIST_FOREACH(tap_i, &conv->data_taps, link)
1503                 fire_tap(tap_i, filter);
1504         spin_unlock(&conv->tap_lock);
1505 }
1506
1507 int iptapfd(struct chan *chan, struct fd_tap *tap, int cmd)
1508 {
1509         struct conv *conv = chan2conv(chan);
1510         int ret;
1511
1512         #define DEVIP_LEGAL_DATA_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE | \
1513                                        FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |   \
1514                                        FDTAP_FILT_ERROR)
1515         #define DEVIP_LEGAL_LISTEN_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_HANGUP)
1516
1517         switch (TYPE(chan->qid)) {
1518                 case Qdata:
1519                         if (tap->filter & ~DEVIP_LEGAL_DATA_TAPS) {
1520                                 set_errno(ENOSYS);
1521                                 set_errstr("Unsupported #%s data tap %p, must be %p", devname(),
1522                                            tap->filter, DEVIP_LEGAL_DATA_TAPS);
1523                                 return -1;
1524                         }
1525                         spin_lock(&conv->tap_lock);
1526                         switch (cmd) {
1527                                 case (FDTAP_CMD_ADD):
1528                                         if (SLIST_EMPTY(&conv->data_taps)) {
1529                                                 qio_set_wake_cb(conv->rq, ip_wake_cb, conv);
1530                                                 qio_set_wake_cb(conv->wq, ip_wake_cb, conv);
1531                                         }
1532                                         SLIST_INSERT_HEAD(&conv->data_taps, tap, link);
1533                                         ret = 0;
1534                                         break;
1535                                 case (FDTAP_CMD_REM):
1536                                         SLIST_REMOVE(&conv->data_taps, tap, fd_tap, link);
1537                                         if (SLIST_EMPTY(&conv->data_taps)) {
1538                                                 qio_set_wake_cb(conv->rq, 0, conv);
1539                                                 qio_set_wake_cb(conv->wq, 0, conv);
1540                                         }
1541                                         ret = 0;
1542                                         break;
1543                                 default:
1544                                         set_errno(ENOSYS);
1545                                         set_errstr("Unsupported #%s data tap command %p",
1546                                                    devname(), cmd);
1547                                         ret = -1;
1548                         }
1549                         spin_unlock(&conv->tap_lock);
1550                         return ret;
1551                 case Qlisten:
1552                         if (tap->filter & ~DEVIP_LEGAL_LISTEN_TAPS) {
1553                                 set_errno(ENOSYS);
1554                                 set_errstr("Unsupported #%s listen tap %p, must be %p",
1555                                            devname(), tap->filter, DEVIP_LEGAL_LISTEN_TAPS);
1556                                 return -1;
1557                         }
1558                         spin_lock(&conv->tap_lock);
1559                         switch (cmd) {
1560                                 case (FDTAP_CMD_ADD):
1561                                         SLIST_INSERT_HEAD(&conv->listen_taps, tap, link);
1562                                         ret = 0;
1563                                         break;
1564                                 case (FDTAP_CMD_REM):
1565                                         SLIST_REMOVE(&conv->listen_taps, tap, fd_tap, link);
1566                                         ret = 0;
1567                                         break;
1568                                 default:
1569                                         set_errno(ENOSYS);
1570                                         set_errstr("Unsupported #%s listen tap command %p",
1571                                                    devname(), cmd);
1572                                         ret = -1;
1573                         }
1574                         spin_unlock(&conv->tap_lock);
1575                         return ret;
1576                 default:
1577                         set_errno(ENOSYS);
1578                         set_errstr("Can't tap #%s file type %d", devname(),
1579                                    TYPE(chan->qid));
1580                         return -1;
1581         }
1582 }
1583
1584 struct dev ipdevtab __devtab = {
1585         .name = "ip",
1586
1587         .reset = ipreset,
1588         .init = ipinit,
1589         .shutdown = devshutdown,
1590         .attach = ipattach,
1591         .walk = ipwalk,
1592         .stat = ipstat,
1593         .open = ipopen,
1594         .create = devcreate,
1595         .close = ipclose,
1596         .read = ipread,
1597         .bread = ipbread,
1598         .write = ipwrite,
1599         .bwrite = ipbwrite,
1600         .remove = devremove,
1601         .wstat = ipwstat,
1602         .power = devpower,
1603         .chaninfo = ipchaninfo,
1604         .tapfd = iptapfd,
1605 };
1606
1607 int Fsproto(struct Fs *f, struct Proto *p)
1608 {
1609         if (f->np >= Maxproto)
1610                 return -1;
1611
1612         qlock_init(&p->qlock);
1613         p->f = f;
1614
1615         if (p->ipproto > 0) {
1616                 if (f->t2p[p->ipproto] != NULL)
1617                         return -1;
1618                 f->t2p[p->ipproto] = p;
1619         }
1620
1621         p->qid.type = QTDIR;
1622         p->qid.path = QID(f->np, 0, Qprotodir);
1623         p->conv = kzmalloc(sizeof(struct conv *) * (p->nc + 1), 0);
1624         if (p->conv == NULL)
1625                 panic("Fsproto");
1626
1627         p->x = f->np;
1628         p->nextport = 0;
1629         p->nextrport = 600;
1630         f->p[f->np++] = p;
1631
1632         return 0;
1633 }
1634
1635 /*
1636  *  return true if this protocol is
1637  *  built in
1638  */
1639 int Fsbuiltinproto(struct Fs *f, uint8_t proto)
1640 {
1641         return f->t2p[proto] != NULL;
1642 }
1643
1644 /*
1645  *  called with protocol locked
1646  */
1647 struct conv *Fsprotoclone(struct Proto *p, char *user)
1648 {
1649         struct conv *c, **pp, **ep;
1650
1651 retry:
1652         c = NULL;
1653         ep = &p->conv[p->nc];
1654         for (pp = p->conv; pp < ep; pp++) {
1655                 c = *pp;
1656                 if (c == NULL) {
1657                         c = kzmalloc(sizeof(struct conv), 0);
1658                         if (c == NULL)
1659                                 error(ENOMEM,
1660                                       "conv kzmalloc(%d, 0) failed in Fsprotoclone",
1661                                       sizeof(struct conv));
1662                         qlock_init(&c->qlock);
1663                         qlock_init(&c->listenq);
1664                         rendez_init(&c->cr);
1665                         rendez_init(&c->listenr);
1666                         SLIST_INIT(&c->data_taps);      /* already = 0; set to be futureproof */
1667                         SLIST_INIT(&c->listen_taps);
1668                         spinlock_init(&c->tap_lock);
1669                         qlock(&c->qlock);
1670                         c->p = p;
1671                         c->x = pp - p->conv;
1672                         if (p->ptclsize != 0) {
1673                                 c->ptcl = kzmalloc(p->ptclsize, 0);
1674                                 if (c->ptcl == NULL) {
1675                                         kfree(c);
1676                                         error(ENOMEM,
1677                                               "ptcl kzmalloc(%d, 0) failed in Fsprotoclone",
1678                                               p->ptclsize);
1679                                 }
1680                         }
1681                         *pp = c;
1682                         p->ac++;
1683                         c->eq = qopen(1024, Qmsg, 0, 0);
1684                         (*p->create) (c);
1685                         assert(c->rq && c->wq);
1686                         break;
1687                 }
1688                 if (canqlock(&c->qlock)) {
1689                         /*
1690                          *  make sure both processes and protocol
1691                          *  are done with this Conv
1692                          */
1693                         if (c->inuse == 0 && (p->inuse == NULL || (*p->inuse) (c) == 0))
1694                                 break;
1695
1696                         qunlock(&c->qlock);
1697                 }
1698         }
1699         if (pp >= ep) {
1700                 if (p->gc != NULL && (*p->gc) (p))
1701                         goto retry;
1702                 return NULL;
1703         }
1704
1705         c->inuse = 1;
1706         kstrdup(&c->owner, user);
1707         c->perm = 0660;
1708         c->state = Idle;
1709         ipmove(c->laddr, IPnoaddr);
1710         ipmove(c->raddr, IPnoaddr);
1711         c->r = NULL;
1712         c->rgen = 0;
1713         c->lport = 0;
1714         c->rport = 0;
1715         c->restricted = 0;
1716         c->ttl = MAXTTL;
1717         c->tos = DFLTTOS;
1718         qreopen(c->rq);
1719         qreopen(c->wq);
1720         qreopen(c->eq);
1721
1722         qunlock(&c->qlock);
1723         return c;
1724 }
1725
1726 int Fsconnected(struct conv *c, char *msg)
1727 {
1728         if (msg != NULL && *msg != '\0')
1729                 strlcpy(c->cerr, msg, sizeof(c->cerr));
1730
1731         switch (c->state) {
1732                 case Announcing:
1733                         c->state = Announced;
1734                         break;
1735
1736                 case Connecting:
1737                         c->state = Connected;
1738                         break;
1739         }
1740
1741         rendez_wakeup(&c->cr);
1742         return 0;
1743 }
1744
1745 struct Proto *Fsrcvpcol(struct Fs *f, uint8_t proto)
1746 {
1747         if (f->ipmux)
1748                 return f->ipmux;
1749         else
1750                 return f->t2p[proto];
1751 }
1752
1753 struct Proto *Fsrcvpcolx(struct Fs *f, uint8_t proto)
1754 {
1755         return f->t2p[proto];
1756 }
1757
1758 static void fire_listener_taps(struct conv *conv)
1759 {
1760         struct fd_tap *tap_i;
1761         if (SLIST_EMPTY(&conv->listen_taps))
1762                 return;
1763         spin_lock(&conv->tap_lock);
1764         SLIST_FOREACH(tap_i, &conv->listen_taps, link)
1765                 fire_tap(tap_i, FDTAP_FILT_READABLE);
1766         spin_unlock(&conv->tap_lock);
1767 }
1768
1769 /*
1770  *  called with protocol locked
1771  */
1772 struct conv *Fsnewcall(struct conv *c, uint8_t * raddr, uint16_t rport,
1773                                            uint8_t * laddr, uint16_t lport, uint8_t version)
1774 {
1775         struct conv *nc;
1776         struct conv **l;
1777         int i;
1778
1779         qlock(&c->qlock);
1780         i = 0;
1781         for (l = &c->incall; *l; l = &(*l)->next)
1782                 i++;
1783         if (i >= Maxincall) {
1784                 qunlock(&c->qlock);
1785                 return NULL;
1786         }
1787
1788         /* find a free conversation */
1789         nc = Fsprotoclone(c->p, network);
1790         if (nc == NULL) {
1791                 qunlock(&c->qlock);
1792                 return NULL;
1793         }
1794         ipmove(nc->raddr, raddr);
1795         nc->rport = rport;
1796         ipmove(nc->laddr, laddr);
1797         nc->lport = lport;
1798         nc->next = NULL;
1799         *l = nc;
1800         nc->state = Connected;
1801         nc->ipversion = version;
1802
1803         qunlock(&c->qlock);
1804
1805         rendez_wakeup(&c->listenr);
1806         fire_listener_taps(c);
1807
1808         return nc;
1809 }
1810
1811 static long ndbwrite(struct Fs *f, char *a, uint32_t off, int n)
1812 {
1813         if (off > strlen(f->ndb))
1814                 error(EIO, ERROR_FIXME);
1815         if (off + n >= sizeof(f->ndb) - 1)
1816                 error(EIO, ERROR_FIXME);
1817         memmove(f->ndb + off, a, n);
1818         f->ndb[off + n] = 0;
1819         f->ndbvers++;
1820         f->ndbmtime = seconds();
1821         return n;
1822 }
1823
1824 uint32_t scalednconv(void)
1825 {
1826         //if(conf.npage*BY2PG >= 128*MB)
1827         return Nchans * 4;
1828         //  return Nchans;
1829 }