net: Add a protocol 'bypass' command for convs
[akaros.git] / kern / src / net / devip.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 struct dev ipdevtab;
44
45 static char *devname(void)
46 {
47         return ipdevtab.name;
48 }
49
50 enum {
51         Qtopdir = 1,                            /* top level directory */
52         Qtopbase,
53         Qarp = Qtopbase,
54         Qndb,
55         Qiproute,
56         Qiprouter,
57         Qipselftab,
58         Qlog,
59
60         Qprotodir,      /* directory for a protocol */
61         Qprotobase,
62         Qclone = Qprotobase,
63         Qstats,
64
65         Qconvdir,       /* directory for a conversation */
66         Qconvbase,
67         Qctl = Qconvbase,
68         Qdata,
69         Qerr,
70         Qlisten,
71         Qlocal,
72         Qremote,
73         Qstatus,
74         Qsnoop,
75
76         Logtype = 5,
77         Masktype = (1 << Logtype) - 1,
78         Logconv = 12,
79         Maskconv = (1 << Logconv) - 1,
80         Shiftconv = Logtype,
81         Logproto = 8,
82         Maskproto = (1 << Logproto) - 1,
83         Shiftproto = Logtype + Logconv,
84
85         Nfs = 32,
86         BYPASS_QMAX = 64 * MiB,
87 };
/* Accessors for the fields packed into a qid path.  'x' is a struct qid; only
 * the low 32 bits of its path are examined. */
#define TYPE(x)		(((uint32_t)(x).path) & Masktype)
#define CONV(x)		((((uint32_t)(x).path) >> Shiftconv) & Maskconv)
#define PROTO(x)	((((uint32_t)(x).path) >> Shiftproto) & Maskproto)
/* Packs protocol index p, conversation index c and file type y into a path */
#define QID(p, c, y)	(((p) << (Shiftproto)) | ((c) << Shiftconv) | (y))
92 static char network[] = "network";
93
94 qlock_t fslock;
95 struct Fs *ipfs[Nfs];                   /* attached fs's */
96 struct queue *qlog;
97
98 extern void nullmediumlink(void);
99 extern void pktmediumlink(void);
100 extern char *eve;
101 static long ndbwrite(struct Fs *, char *unused_char_p_t, uint32_t, int);
102 static void closeconv(struct conv *);
103 static void setup_proto_qio_bypass(struct conv *cv);
104 static void undo_proto_qio_bypass(struct conv *cv);
105
106 static struct conv *chan2conv(struct chan *chan)
107 {
108         /* That's a lot of pointers to get to the conv! */
109         return ipfs[chan->dev]->p[PROTO(chan->qid)]->conv[CONV(chan->qid)];
110 }
111
112 static inline int founddevdir(struct chan *c, struct qid q, char *n,
113                                                           int64_t length, char *user, long perm,
114                                                           struct dir *db)
115 {
116         devdir(c, q, n, length, user, perm, db);
117         return 1;
118 }
119
120 static int topdirgen(struct chan *c, struct dir *dp)
121 {
122         struct qid q;
123         mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR);
124         snprintf(get_cur_genbuf(), GENBUF_SZ, "#%s%lu", devname(), c->dev);
125         return founddevdir(c, q, get_cur_genbuf(), 0, network, 0555, dp);
126 }
127
128
129 static int ip3gen(struct chan *c, int i, struct dir *dp)
130 {
131         struct qid q;
132         struct conv *cv;
133         char *p;
134         int perm;
135
136         cv = chan2conv(c);
137         if (cv->owner == NULL)
138                 kstrdup(&cv->owner, eve);
139         mkqid(&q, QID(PROTO(c->qid), CONV(c->qid), i), 0, QTFILE);
140
141         switch (i) {
142                 default:
143                         return -1;
144                 case Qctl:
145                         return founddevdir(c, q, "ctl", 0,
146                                                    cv->owner, cv->perm, dp);
147                 case Qdata:
148                         perm = cv->perm;
149                         perm |= qreadable(cv->rq) ? DMREADABLE : 0;
150                         perm |= qwritable(cv->wq) ? DMWRITABLE : 0;
151                         return founddevdir(c, q, "data", qlen(cv->rq),
152                                                            cv->owner, perm, dp);
153                 case Qerr:
154                         perm = cv->perm;
155                         perm |= qreadable(cv->eq) ? DMREADABLE : 0;
156                         return founddevdir(c, q, "err", qlen(cv->eq),
157                                                            cv->owner, perm, dp);
158                 case Qlisten:
159                         return founddevdir(c, q, "listen", 0, cv->owner, cv->perm, dp);
160                 case Qlocal:
161                         p = "local";
162                         break;
163                 case Qremote:
164                         p = "remote";
165                         break;
166                 case Qsnoop:
167                         if (strcmp(cv->p->name, "ipifc") != 0)
168                                 return -1;
169                         perm = 0400;
170                         perm |= qreadable(cv->sq) ? DMREADABLE : 0;
171                         return founddevdir(c, q, "snoop", qlen(cv->sq),
172                                                            cv->owner, perm, dp);
173                 case Qstatus:
174                         p = "status";
175                         break;
176         }
177         return founddevdir(c, q, p, 0, cv->owner, 0444, dp);
178 }
179
180 static int ip2gen(struct chan *c, int i, struct dir *dp)
181 {
182         struct qid q;
183         mkqid(&q, QID(PROTO(c->qid), 0, i), 0, QTFILE);
184         switch (i) {
185                 case Qclone:
186                         return founddevdir(c, q, "clone", 0, network, 0666, dp);
187                 case Qstats:
188                         return founddevdir(c, q, "stats", 0, network, 0444, dp);
189         }
190         return -1;
191 }
192
193 static int ip1gen(struct chan *c, int i, struct dir *dp)
194 {
195         struct qid q;
196         char *p;
197         int prot;
198         int len = 0;
199         struct Fs *f;
200         extern uint32_t kerndate;
201
202         f = ipfs[c->dev];
203
204         prot = 0666;
205         mkqid(&q, QID(0, 0, i), 0, QTFILE);
206         switch (i) {
207                 default:
208                         return -1;
209                 case Qarp:
210                         p = "arp";
211                         break;
212                 case Qndb:
213                         p = "ndb";
214                         len = strlen(f->ndb);
215                         q.vers = f->ndbvers;
216                         break;
217                 case Qiproute:
218                         p = "iproute";
219                         break;
220                 case Qipselftab:
221                         p = "ipselftab";
222                         prot = 0444;
223                         break;
224                 case Qiprouter:
225                         p = "iprouter";
226                         break;
227                 case Qlog:
228                         p = "log";
229                         break;
230         }
231         devdir(c, q, p, len, network, prot, dp);
232         if (i == Qndb && f->ndbmtime > kerndate)
233                 dp->mtime = f->ndbmtime;
234         return 1;
235 }
236
237 static int
238 ipgen(struct chan *c, char *unused_char_p_t, struct dirtab *d, int unused_int,
239           int s, struct dir *dp)
240 {
241         struct qid q;
242         struct conv *cv;
243         struct Fs *f;
244
245         f = ipfs[c->dev];
246
247         switch (TYPE(c->qid)) {
248                 case Qtopdir:
249                         if (s == DEVDOTDOT)
250                                 return topdirgen(c, dp);
251                         if (s < f->np) {
252                                 if (f->p[s]->connect == NULL)
253                                         return 0;       /* protocol with no user interface */
254                                 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
255                                 return founddevdir(c, q, f->p[s]->name, 0, network, 0555, dp);
256                         }
257                         s -= f->np;
258                         return ip1gen(c, s + Qtopbase, dp);
259                 case Qarp:
260                 case Qndb:
261                 case Qlog:
262                 case Qiproute:
263                 case Qiprouter:
264                 case Qipselftab:
265                         return ip1gen(c, TYPE(c->qid), dp);
266                 case Qprotodir:
267                         if (s == DEVDOTDOT)
268                                 return topdirgen(c, dp);
269                         else if (s < f->p[PROTO(c->qid)]->ac) {
270                                 cv = f->p[PROTO(c->qid)]->conv[s];
271                                 snprintf(get_cur_genbuf(), GENBUF_SZ, "%d", s);
272                                 mkqid(&q, QID(PROTO(c->qid), s, Qconvdir), 0, QTDIR);
273                                 return
274                                         founddevdir(c, q, get_cur_genbuf(), 0, cv->owner, 0555, dp);
275                         }
276                         s -= f->p[PROTO(c->qid)]->ac;
277                         return ip2gen(c, s + Qprotobase, dp);
278                 case Qclone:
279                 case Qstats:
280                         return ip2gen(c, TYPE(c->qid), dp);
281                 case Qconvdir:
282                         if (s == DEVDOTDOT) {
283                                 s = PROTO(c->qid);
284                                 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
285                                 devdir(c, q, f->p[s]->name, 0, network, 0555, dp);
286                                 return 1;
287                         }
288                         return ip3gen(c, s + Qconvbase, dp);
289                 case Qctl:
290                 case Qdata:
291                 case Qerr:
292                 case Qlisten:
293                 case Qlocal:
294                 case Qremote:
295                 case Qstatus:
296                 case Qsnoop:
297                         return ip3gen(c, TYPE(c->qid), dp);
298         }
299         return -1;
300 }
301
302 static void ipinit(void)
303 {
304         qlock_init(&fslock);
305         nullmediumlink();
306         pktmediumlink();
307 /* if only
308         fmtinstall('i', eipfmt);
309         fmtinstall('I', eipfmt);
310         fmtinstall('E', eipfmt);
311         fmtinstall('V', eipfmt);
312         fmtinstall('M', eipfmt);
313 */
314 }
315
/* Device reset hook; #ip has nothing to do here. */
static void ipreset(void)
{
	/* intentionally empty */
}
319
320 static struct Fs *ipgetfs(int dev)
321 {
322         extern void (*ipprotoinit[]) (struct Fs *);
323         struct Fs *f;
324         int i;
325
326         if (dev >= Nfs)
327                 return NULL;
328
329         qlock(&fslock);
330         if (ipfs[dev] == NULL) {
331                 f = kzmalloc(sizeof(struct Fs), MEM_WAIT);
332                 rwinit(&f->rwlock);
333                 qlock_init(&f->iprouter.qlock);
334                 ip_init(f);
335                 arpinit(f);
336                 netloginit(f);
337                 for (i = 0; ipprotoinit[i]; i++)
338                         ipprotoinit[i] (f);
339                 f->dev = dev;
340                 ipfs[dev] = f;
341         }
342         qunlock(&fslock);
343
344         return ipfs[dev];
345 }
346
347 struct IPaux *newipaux(char *owner, char *tag)
348 {
349         struct IPaux *a;
350         int n;
351
352         a = kzmalloc(sizeof(*a), 0);
353         kstrdup(&a->owner, owner);
354         memset(a->tag, ' ', sizeof(a->tag));
355         n = strlen(tag);
356         if (n > sizeof(a->tag))
357                 n = sizeof(a->tag);
358         memmove(a->tag, tag, n);
359         return a;
360 }
361
362 #define ATTACHER(c) (((struct IPaux*)((c)->aux))->owner)
363
364 static struct chan *ipattach(char *spec)
365 {
366         struct chan *c;
367         int dev;
368
369         dev = atoi(spec);
370         if (dev >= Nfs)
371                 error(EFAIL, "bad specification");
372
373         ipgetfs(dev);
374         c = devattach(devname(), spec);
375         mkqid(&c->qid, QID(0, 0, Qtopdir), 0, QTDIR);
376         c->dev = dev;
377
378         c->aux = newipaux(commonuser(), "none");
379
380         return c;
381 }
382
383 static struct walkqid *ipwalk(struct chan *c, struct chan *nc, char **name,
384                                                           int nname)
385 {
386         struct IPaux *a = c->aux;
387         struct walkqid *w;
388
389         w = devwalk(c, nc, name, nname, NULL, 0, ipgen);
390         if (w != NULL && w->clone != NULL)
391                 w->clone->aux = newipaux(a->owner, a->tag);
392         return w;
393 }
394
395 static int ipstat(struct chan *c, uint8_t * db, int n)
396 {
397         return devstat(c, db, n, NULL, 0, ipgen);
398 }
399
400 static int should_wake(void *arg)
401 {
402         struct conv *cv = arg;
403         /* signal that the conv is closed */
404         if (qisclosed(cv->rq))
405                 return TRUE;
406         return cv->incall != NULL;
407 }
408
409 static struct chan *ipopen(struct chan *c, int omode)
410 {
411         ERRSTACK(2);
412         struct conv *cv, *nc;
413         struct Proto *p;
414         int perm;
415         struct Fs *f;
416
417         /* perm is a lone rwx, not the rwx------ from the conversion */
418         perm = omode_to_rwx(omode) >> 6;
419
420         f = ipfs[c->dev];
421
422         switch (TYPE(c->qid)) {
423                 default:
424                         break;
425                 case Qndb:
426                         if (omode & (O_WRITE | O_TRUNC) && !iseve())
427                                 error(EPERM, ERROR_FIXME);
428                         if ((omode & (O_WRITE | O_TRUNC)) == (O_WRITE | O_TRUNC))
429                                 f->ndb[0] = 0;
430                         break;
431                 case Qlog:
432                         netlogopen(f);
433                         break;
434                 case Qiprouter:
435                         iprouteropen(f);
436                         break;
437                 case Qiproute:
438                         break;
439                 case Qtopdir:
440                 case Qprotodir:
441                 case Qconvdir:
442                 case Qstatus:
443                 case Qremote:
444                 case Qlocal:
445                 case Qstats:
446                 case Qipselftab:
447                         if (omode & O_WRITE)
448                                 error(EPERM, ERROR_FIXME);
449                         break;
450                 case Qsnoop:
451                         if (omode & O_WRITE)
452                                 error(EPERM, ERROR_FIXME);
453                         /* might be racy.  note the lack of a proto lock, unlike Qdata */
454                         p = f->p[PROTO(c->qid)];
455                         cv = p->conv[CONV(c->qid)];
456                         if (strcmp(ATTACHER(c), cv->owner) != 0 && !iseve())
457                                 error(EPERM, ERROR_FIXME);
458                         atomic_inc(&cv->snoopers);
459                         break;
460                 case Qclone:
461                         p = f->p[PROTO(c->qid)];
462                         qlock(&p->qlock);
463                         if (waserror()) {
464                                 qunlock(&p->qlock);
465                                 nexterror();
466                         }
467                         cv = Fsprotoclone(p, ATTACHER(c));
468                         qunlock(&p->qlock);
469                         poperror();
470                         if (cv == NULL) {
471                                 error(ENODEV, ERROR_FIXME);
472                                 break;
473                         }
474                         mkqid(&c->qid, QID(p->x, cv->x, Qctl), 0, QTFILE);
475                         break;
476                 case Qdata:
477                 case Qctl:
478                 case Qerr:
479                         p = f->p[PROTO(c->qid)];
480                         qlock(&p->qlock);
481                         cv = p->conv[CONV(c->qid)];
482                         qlock(&cv->qlock);
483                         if (waserror()) {
484                                 qunlock(&cv->qlock);
485                                 qunlock(&p->qlock);
486                                 nexterror();
487                         }
488                         if ((perm & (cv->perm >> 6)) != perm) {
489                                 if (strcmp(ATTACHER(c), cv->owner) != 0)
490                                         error(EPERM, ERROR_FIXME);
491                                 if ((perm & cv->perm) != perm)
492                                         error(EPERM, ERROR_FIXME);
493
494                         }
495                         cv->inuse++;
496                         if (cv->inuse == 1) {
497                                 kstrdup(&cv->owner, ATTACHER(c));
498                                 cv->perm = 0660;
499                         }
500                         qunlock(&cv->qlock);
501                         qunlock(&p->qlock);
502                         poperror();
503                         break;
504                 case Qlisten:
505                         cv = f->p[PROTO(c->qid)]->conv[CONV(c->qid)];
506                         /* No permissions or Announce checks required.  We'll see if that's
507                          * a good idea or not. (the perm check would do nothing, as is,
508                          * since an O_PATH perm is 0).
509                          *
510                          * But we probably want to incref to keep the conversation around
511                          * until this FD/chan is closed.  #ip is a little weird in that
512                          * objects never really go away (high water mark for convs, you can
513                          * always find them in the ns).  I think it is possible to
514                          * namec/ipgen a chan, then have that conv close, then have that
515                          * chan be opened.  You can probably do this with a data file. */
516                         if (omode & O_PATH) {
517                                 qlock(&cv->qlock);
518                                 cv->inuse++;
519                                 qunlock(&cv->qlock);
520                                 break;
521                         }
522                         if ((perm & (cv->perm >> 6)) != perm) {
523                                 if (strcmp(ATTACHER(c), cv->owner) != 0)
524                                         error(EPERM, ERROR_FIXME);
525                                 if ((perm & cv->perm) != perm)
526                                         error(EPERM, ERROR_FIXME);
527
528                         }
529
530                         if (cv->state != Announced)
531                                 error(EFAIL, "not announced");
532
533                         if (waserror()) {
534                                 closeconv(cv);
535                                 nexterror();
536                         }
537                         qlock(&cv->qlock);
538                         cv->inuse++;
539                         qunlock(&cv->qlock);
540
541                         nc = NULL;
542                         while (nc == NULL) {
543                                 /* give up if we got a hangup */
544                                 if (qisclosed(cv->rq))
545                                         error(EFAIL, "listen hungup");
546
547                                 qlock(&cv->listenq);
548                                 if (waserror()) {
549                                         qunlock(&cv->listenq);
550                                         nexterror();
551                                 }
552                                 /* we can peek at incall without grabbing the cv qlock.  if
553                                  * anything is there, it'll remain there until we dequeue it.
554                                  * no one else can, since we hold the listenq lock */
555                                 if ((c->flag & O_NONBLOCK) && !cv->incall)
556                                         error(EAGAIN, "listen queue empty");
557                                 /* wait for a connect */
558                                 rendez_sleep(&cv->listenr, should_wake, cv);
559
560                                 /* if there is a concurrent hangup, they will hold the qlock
561                                  * until the hangup is complete, including closing the cv->rq */
562                                 qlock(&cv->qlock);
563                                 nc = cv->incall;
564                                 if (nc != NULL) {
565                                         cv->incall = nc->next;
566                                         mkqid(&c->qid, QID(PROTO(c->qid), nc->x, Qctl), 0, QTFILE);
567                                         kstrdup(&cv->owner, ATTACHER(c));
568                                 }
569                                 qunlock(&cv->qlock);
570
571                                 qunlock(&cv->listenq);
572                                 poperror();
573                         }
574                         closeconv(cv);
575                         poperror();
576                         break;
577         }
578         c->mode = openmode(omode);
579         c->flag |= COPEN;
580         c->offset = 0;
581         return c;
582 }
583
584 static int ipwstat(struct chan *c, uint8_t * dp, int n)
585 {
586         ERRSTACK(2);
587         struct dir *d;
588         struct conv *cv;
589         struct Fs *f;
590         struct Proto *p;
591
592         f = ipfs[c->dev];
593         switch (TYPE(c->qid)) {
594                 default:
595                         error(EPERM, ERROR_FIXME);
596                         break;
597                 case Qctl:
598                 case Qdata:
599                         break;
600         }
601
602         d = kzmalloc(sizeof(*d) + n, 0);
603         if (waserror()) {
604                 kfree(d);
605                 nexterror();
606         }
607         n = convM2D(dp, n, d, (char *)&d[1]);
608         if (n == 0)
609                 error(ENODATA, ERROR_FIXME);
610         p = f->p[PROTO(c->qid)];
611         cv = p->conv[CONV(c->qid)];
612         if (!iseve() && strcmp(ATTACHER(c), cv->owner) != 0)
613                 error(EPERM, ERROR_FIXME);
614         if (!emptystr(d->uid))
615                 kstrdup(&cv->owner, d->uid);
616         if (d->mode != ~0UL)
617                 cv->perm = d->mode & 0777;
618         poperror();
619         kfree(d);
620         return n;
621 }
622
623 /* Should be able to handle any file type chan. Feel free to extend it. */
624 static char *ipchaninfo(struct chan *ch, char *ret, size_t ret_l)
625 {
626         struct conv *conv;
627         struct Proto *proto;
628         char *p;
629         struct Fs *f;
630
631         f = ipfs[ch->dev];
632
633         switch (TYPE(ch->qid)) {
634                 default:
635                         ret = "Unknown type";
636                         break;
637                 case Qdata:
638                         proto = f->p[PROTO(ch->qid)];
639                         conv = proto->conv[CONV(ch->qid)];
640                         snprintf(ret, ret_l, "Qdata, %s, proto %s, conv idx %d, rq len %d, wq len %d",
641                                  SLIST_EMPTY(&conv->data_taps) ? "untapped" : "tapped",
642                                  proto->name, conv->x, qlen(conv->rq), qlen(conv->wq));
643                         break;
644                 case Qarp:
645                         ret = "Qarp";
646                         break;
647                 case Qiproute:
648                         ret = "Qiproute";
649                         break;
650                 case Qlisten:
651                         proto = f->p[PROTO(ch->qid)];
652                         conv = proto->conv[CONV(ch->qid)];
653                         snprintf(ret, ret_l, "Qlisten, %s proto %s, conv idx %d",
654                                  SLIST_EMPTY(&conv->listen_taps) ? "untapped" : "tapped",
655                                  proto->name, conv->x);
656                         break;
657                 case Qlog:
658                         ret = "Qlog";
659                         break;
660                 case Qndb:
661                         ret = "Qndb";
662                         break;
663                 case Qctl:
664                         proto = f->p[PROTO(ch->qid)];
665                         conv = proto->conv[CONV(ch->qid)];
666                         snprintf(ret, ret_l, "Qctl, proto %s, conv idx %d", proto->name,
667                                          conv->x);
668                         break;
669         }
670         return ret;
671 }
672
673 static void closeconv(struct conv *cv)
674 {
675         ERRSTACK(1);
676         struct conv *nc;
677         struct Ipmulti *mp;
678
679         qlock(&cv->qlock);
680
681         if (--cv->inuse > 0) {
682                 qunlock(&cv->qlock);
683                 return;
684         }
685         if (waserror()) {
686                 qunlock(&cv->qlock);
687                 nexterror();
688         }
689         /* close all incoming calls since no listen will ever happen */
690         for (nc = cv->incall; nc; nc = cv->incall) {
691                 cv->incall = nc->next;
692                 closeconv(nc);
693         }
694         cv->incall = NULL;
695
696         kstrdup(&cv->owner, network);
697         cv->perm = 0660;
698
699         while ((mp = cv->multi) != NULL)
700                 ipifcremmulti(cv, mp->ma, mp->ia);
701
702         cv->r = NULL;
703         cv->rgen = 0;
704         if (cv->state == Bypass)
705                 undo_proto_qio_bypass(cv);
706         cv->p->close(cv);
707         cv->state = Idle;
708         qunlock(&cv->qlock);
709         poperror();
710 }
711
712 static void ipclose(struct chan *c)
713 {
714         struct Fs *f;
715
716         f = ipfs[c->dev];
717         switch (TYPE(c->qid)) {
718                 default:
719                         break;
720                 case Qlog:
721                         if (c->flag & COPEN)
722                                 netlogclose(f);
723                         break;
724                 case Qiprouter:
725                         if (c->flag & COPEN)
726                                 iprouterclose(f);
727                         break;
728                 case Qdata:
729                 case Qctl:
730                 case Qerr:
731                 case Qlisten:
732                         if (c->flag & COPEN)
733                                 closeconv(f->p[PROTO(c->qid)]->conv[CONV(c->qid)]);
734                         break;
735                 case Qsnoop:
736                         if (c->flag & COPEN)
737                                 atomic_dec(&f->p[PROTO(c->qid)]->conv[CONV(c->qid)]->snoopers);
738                         break;
739         }
740         kfree(((struct IPaux *)c->aux)->owner);
741         kfree(c->aux);
742 }
743
744 enum {
745         Statelen = 32 * 1024,
746 };
747
748 static long ipread(struct chan *ch, void *a, long n, int64_t off)
749 {
750         struct conv *c;
751         struct Proto *x;
752         char *buf, *p;
753         long rv;
754         struct Fs *f;
755         uint32_t offset = off;
756
757         f = ipfs[ch->dev];
758
759         p = a;
760         switch (TYPE(ch->qid)) {
761                 default:
762                         error(EPERM, ERROR_FIXME);
763                 case Qtopdir:
764                 case Qprotodir:
765                 case Qconvdir:
766                         return devdirread(ch, a, n, 0, 0, ipgen);
767                 case Qarp:
768                         return arpread(f->arp, a, offset, n);
769                 case Qndb:
770                         return readstr(offset, a, n, f->ndb);
771                 case Qiproute:
772                         return routeread(f, a, offset, n);
773                 case Qiprouter:
774                         return iprouterread(f, a, n);
775                 case Qipselftab:
776                         return ipselftabread(f, a, offset, n);
777                 case Qlog:
778                         return netlogread(f, a, offset, n);
779                 case Qctl:
780                         snprintf(get_cur_genbuf(), GENBUF_SZ, "%lu", CONV(ch->qid));
781                         return readstr(offset, p, n, get_cur_genbuf());
782                 case Qremote:
783                         buf = kzmalloc(Statelen, 0);
784                         x = f->p[PROTO(ch->qid)];
785                         c = x->conv[CONV(ch->qid)];
786                         if (x->remote == NULL) {
787                                 snprintf(buf, Statelen, "%I!%d\n", c->raddr, c->rport);
788                         } else {
789                                 (*x->remote) (c, buf, Statelen - 2);
790                         }
791                         rv = readstr(offset, p, n, buf);
792                         kfree(buf);
793                         return rv;
794                 case Qlocal:
795                         buf = kzmalloc(Statelen, 0);
796                         x = f->p[PROTO(ch->qid)];
797                         c = x->conv[CONV(ch->qid)];
798                         if (x->local == NULL) {
799                                 snprintf(buf, Statelen, "%I!%d\n", c->laddr, c->lport);
800                         } else {
801                                 (*x->local) (c, buf, Statelen - 2);
802                         }
803                         rv = readstr(offset, p, n, buf);
804                         kfree(buf);
805                         return rv;
806                 case Qstatus:
807                         /* this all is a bit screwed up since the size of some state's
808                          * buffers will change from one invocation to another.  a reader
809                          * will come in and read the entire buffer.  then it will come again
810                          * and read from the next offset, expecting EOF.  if the buffer
811                          * changed sizes, it'll reprint the end of the buffer slightly. */
812                         buf = kzmalloc(Statelen, 0);
813                         x = f->p[PROTO(ch->qid)];
814                         c = x->conv[CONV(ch->qid)];
815                         if (c->state == Bypass)
816                                 snprintf(buf, Statelen, "Bypassed\n");
817                         else
818                                 (*x->state)(c, buf, Statelen - 2);
819                         rv = readstr(offset, p, n, buf);
820                         kfree(buf);
821                         return rv;
822                 case Qdata:
823                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
824                         if (ch->flag & O_NONBLOCK)
825                                 return qread_nonblock(c->rq, a, n);
826                         else
827                                 return qread(c->rq, a, n);
828                 case Qerr:
829                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
830                         return qread(c->eq, a, n);
831                 case Qsnoop:
832                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
833                         return qread(c->sq, a, n);
834                 case Qstats:
835                         x = f->p[PROTO(ch->qid)];
836                         if (x->stats == NULL)
837                                 error(EFAIL, "stats not implemented");
838                         buf = kzmalloc(Statelen, 0);
839                         (*x->stats) (x, buf, Statelen);
840                         rv = readstr(offset, p, n, buf);
841                         kfree(buf);
842                         return rv;
843         }
844 }
845
846 static struct block *ipbread(struct chan *ch, long n, uint32_t offset)
847 {
848         struct conv *c;
849
850         switch (TYPE(ch->qid)) {
851                 case Qdata:
852                         c = chan2conv(ch);
853                         if (ch->flag & O_NONBLOCK)
854                                 return qbread_nonblock(c->rq, n);
855                         else
856                                 return qbread(c->rq, n);
857                 default:
858                         return devbread(ch, n, offset);
859         }
860 }
861
862 /*
863  *  set local address to be that of the ifc closest to remote address
864  */
865 static void setladdr(struct conv *c)
866 {
867         findlocalip(c->p->f, c->laddr, c->raddr);
868 }
869
870 /*
871  *  set a local port making sure the quad of raddr,rport,laddr,lport is unique
872  */
873 static void setluniqueport(struct conv *c, int lport)
874 {
875         struct Proto *p;
876         struct conv *xp;
877         int x;
878
879         p = c->p;
880
881         qlock(&p->qlock);
882         for (x = 0; x < p->nc; x++) {
883                 xp = p->conv[x];
884                 if (xp == NULL)
885                         break;
886                 if (xp == c)
887                         continue;
888                 if ((xp->state == Connected || xp->state == Announced
889                                             || xp->state == Bypass)
890                         && xp->lport == lport
891                         && xp->rport == c->rport
892                         && ipcmp(xp->raddr, c->raddr) == 0
893                         && ipcmp(xp->laddr, c->laddr) == 0) {
894                         qunlock(&p->qlock);
895                         error(EFAIL, "address in use");
896                 }
897         }
898         c->lport = lport;
899         qunlock(&p->qlock);
900 }
901
902 /*
903  *  pick a local port and set it
904  */
905 static void setlport(struct conv *c)
906 {
907         struct Proto *p;
908         uint16_t *pp;
909         int x, found;
910
911         p = c->p;
912         if (c->restricted)
913                 pp = &p->nextrport;
914         else
915                 pp = &p->nextport;
916         qlock(&p->qlock);
917         for (;; (*pp)++) {
918                 /*
919                  * Fsproto initialises p->nextport to 0 and the restricted
920                  * ports (p->nextrport) to 600.
921                  * Restricted ports must lie between 600 and 1024.
922                  * For the initial condition or if the unrestricted port number
923                  * has wrapped round, select a random port between 5000 and 1<<15
924                  * to start at.
925                  */
926                 if (c->restricted) {
927                         if (*pp >= 1024)
928                                 *pp = 600;
929                 } else
930                         while (*pp < 5000)
931                                 urandom_read(pp, sizeof(*pp));
932
933                 found = 0;
934                 for (x = 0; x < p->nc; x++) {
935                         if (p->conv[x] == NULL)
936                                 break;
937                         if (p->conv[x]->lport == *pp) {
938                                 found = 1;
939                                 break;
940                         }
941                 }
942                 if (!found)
943                         break;
944         }
945         c->lport = (*pp)++;
946         qunlock(&p->qlock);
947 }
948
949 /*
950  *  set a local address and port from a string of the form
951  *      [address!]port[!r]
952  */
953 static void setladdrport(struct conv *c, char *str, int announcing)
954 {
955         char *p;
956         uint16_t lport;
957         uint8_t addr[IPaddrlen];
958
959         /*
960          *  ignore restricted part if it exists.  it's
961          *  meaningless on local ports.
962          */
963         p = strchr(str, '!');
964         if (p != NULL) {
965                 *p++ = 0;
966                 if (strcmp(p, "r") == 0)
967                         p = NULL;
968         }
969
970         c->lport = 0;
971         if (p == NULL) {
972                 if (announcing)
973                         ipmove(c->laddr, IPnoaddr);
974                 else
975                         setladdr(c);
976                 p = str;
977         } else {
978                 if (strcmp(str, "*") == 0)
979                         ipmove(c->laddr, IPnoaddr);
980                 else {
981                         parseip(addr, str);
982                         if (ipforme(c->p->f, addr))
983                                 ipmove(c->laddr, addr);
984                         else
985                                 error(EFAIL, "not a local IP address");
986                 }
987         }
988
989         /* one process can get all connections */
990         if (announcing && strcmp(p, "*") == 0) {
991                 if (!iseve())
992                         error(EPERM, ERROR_FIXME);
993                 setluniqueport(c, 0);
994         }
995
996         lport = atoi(p);
997         if (lport <= 0)
998                 setlport(c);
999         else
1000                 setluniqueport(c, lport);
1001 }
1002
1003 static void setraddrport(struct conv *c, char *str)
1004 {
1005         char *p;
1006
1007         p = strchr(str, '!');
1008         if (p == NULL)
1009                 error(EFAIL, "malformed address");
1010         *p++ = 0;
1011         parseip(c->raddr, str);
1012         c->rport = atoi(p);
1013         p = strchr(p, '!');
1014         if (p) {
1015                 if (strstr(p, "!r") != NULL)
1016                         c->restricted = 1;
1017         }
1018 }
1019
1020 /*
1021  *  called by protocol connect routine to set addresses
1022  */
1023 void Fsstdconnect(struct conv *c, char *argv[], int argc)
1024 {
1025         switch (argc) {
1026                 default:
1027                         error(EINVAL, "bad args to %s", __func__);
1028                 case 2:
1029                         setraddrport(c, argv[1]);
1030                         setladdr(c);
1031                         setlport(c);
1032                         break;
1033                 case 3:
1034                         setraddrport(c, argv[1]);
1035                         setladdrport(c, argv[2], 0);
1036                         break;
1037         }
1038
1039         if ((memcmp(c->raddr, v4prefix, IPv4off) == 0 &&
1040                  memcmp(c->laddr, v4prefix, IPv4off) == 0)
1041                 || ipcmp(c->raddr, IPnoaddr) == 0)
1042                 c->ipversion = V4;
1043         else
1044                 c->ipversion = V6;
1045 }
1046
1047 /*
1048  *  initiate connection and sleep till its set up
1049  */
1050 static int connected(void *a)
1051 {
1052         return ((struct conv *)a)->state == Connected;
1053 }
1054
1055 static void connectctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1056 {
1057         ERRSTACK(1);
1058         char *p;
1059
1060         if (c->state != 0)
1061                 error(EBUSY, ERROR_FIXME);
1062         c->state = Connecting;
1063         c->cerr[0] = '\0';
1064         if (x->connect == NULL)
1065                 error(EFAIL, "connect not supported");
1066         x->connect(c, cb->f, cb->nf);
1067
1068         qunlock(&c->qlock);
1069         if (waserror()) {
1070                 qlock(&c->qlock);
1071                 nexterror();
1072         }
1073         rendez_sleep(&c->cr, connected, c);
1074         qlock(&c->qlock);
1075         poperror();
1076
1077         if (c->cerr[0] != '\0')
1078                 error(EFAIL, c->cerr);
1079 }
1080
1081 /*
1082  *  called by protocol announce routine to set addresses
1083  */
1084 void Fsstdannounce(struct conv *c, char *argv[], int argc)
1085 {
1086         memset(c->raddr, 0, sizeof(c->raddr));
1087         c->rport = 0;
1088         switch (argc) {
1089                 default:
1090                         error(EINVAL, "bad args to announce");
1091                 case 2:
1092                         setladdrport(c, argv[1], 1);
1093                         break;
1094         }
1095 }
1096
1097 /*
1098  *  initiate announcement and sleep till its set up
1099  */
1100 static int announced(void *a)
1101 {
1102         return ((struct conv *)a)->state == Announced;
1103 }
1104
1105 static void announcectlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1106 {
1107         ERRSTACK(1);
1108         char *p;
1109
1110         if (c->state != 0)
1111                 error(EBUSY, ERROR_FIXME);
1112         c->state = Announcing;
1113         c->cerr[0] = '\0';
1114         if (x->announce == NULL)
1115                 error(EFAIL, "announce not supported");
1116         x->announce(c, cb->f, cb->nf);
1117
1118         qunlock(&c->qlock);
1119         if (waserror()) {
1120                 qlock(&c->qlock);
1121                 nexterror();
1122         }
1123         rendez_sleep(&c->cr, announced, c);
1124         qlock(&c->qlock);
1125         poperror();
1126
1127         if (c->cerr[0] != '\0')
1128                 error(EFAIL, c->cerr);
1129 }
1130
1131 /*
1132  *  called by protocol bind routine to set addresses
1133  */
1134 void Fsstdbind(struct conv *c, char *argv[], int argc)
1135 {
1136         switch (argc) {
1137                 default:
1138                         error(EINVAL, "bad args to bind");
1139                 case 2:
1140                         setladdrport(c, argv[1], 0);
1141                         break;
1142         }
1143 }
1144
1145 static void bindctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1146 {
1147         if (x->bind == NULL)
1148                 Fsstdbind(c, cb->f, cb->nf);
1149         else
1150                 x->bind(c, cb->f, cb->nf);
1151 }
1152
1153 /* Helper, called by protocols to use the bypass.
1154  *
1155  * This is a bit nasty due to the overall nastiness of #ip.  We need to lock
1156  * before checking the state and hold the qlock throughout, because a concurrent
1157  * closeconv() could tear down the bypass.  Specifically, it could free the
1158  * bypass queues.  The root issue is that conversation lifetimes are not managed
1159  * well.
1160  *
1161  * If we fail, it's our responsibility to consume (free) the block(s). */
1162 void bypass_or_drop(struct conv *cv, struct block *bp)
1163 {
1164         qlock(&cv->qlock);
1165         if (cv->state == Bypass)
1166                 qpass(cv->rq, bp);
1167         else
1168                 freeblist(bp);
1169         qunlock(&cv->qlock);
1170 }
1171
1172 /* Push the block directly to the approprite ipoput function.
1173  *
1174  * It's the protocol's responsibility (and thus ours here) to make sure there is
1175  * at least the right amount of the IP header in the block (ipoput{4,6} assumes
1176  * it has the right amount, and the other protocols account for the IP header in
1177  * their own header).
1178  *
1179  * For the TTL and TOS, we just use the default ones.  If we want, we could look
1180  * into the actual block and see what the user wanted, though we're bypassing
1181  * the protocol layer, not the IP layer. */
1182 static void proto_bypass_kick(void *arg, struct block *bp)
1183 {
1184         struct conv *cv = (struct conv*)arg;
1185         uint8_t vers_nibble;
1186         struct Fs *f;
1187
1188         f = cv->p->f;
1189
1190         bp = pullupblock(bp, 1);
1191         if (!bp)
1192                 error(EINVAL, "Proto bypass unable to pullup a byte!");
1193         vers_nibble = *(uint8_t*)bp->rp & 0xf0;
1194         switch (vers_nibble) {
1195         case IP_VER4:
1196                 bp = pullupblock(bp, IPV4HDR_LEN);
1197                 if (!bp)
1198                         error(EINVAL, "Proto bypass unable to pullup v4 header");
1199                 ipoput4(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1200                 break;
1201         case IP_VER6:
1202                 bp = pullupblock(bp, IPV6HDR_LEN);
1203                 if (!bp)
1204                         error(EINVAL, "Proto bypass unable to pullup v6 header");
1205                 ipoput6(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1206                 break;
1207         default:
1208                 error(EINVAL, "Proto bypass block had unknown IP version 0x%x",
1209                       vers_nibble);
1210         }
1211 }
1212
1213 /* Sets up cv for the protocol bypass.  We use different queues for two reasons:
1214  * 1) To be protocol independent.  For instance, TCP and UDP could use very
1215  * different QIO styles.
1216  * 2) To set up our own kick/bypass method.  Note how udpcreate() and here uses
1217  * qbypass() (just blast it out), while TCP uses qopen() with a kick.  TCP still
1218  * follows queuing discipline.
1219  *
1220  * It's like we are our own protocol, the bypass protocol, when it comes to how
1221  * we interact with qio.  The conv still is of the real protocol type (e.g.
1222  * TCP).
1223  *
1224  * Note that we can't free the old queues.  The way #ip works, the queues are
1225  * created when the conv is created, but the conv is never freed.  It's like a
1226  * slab allocator that never frees objects, but just reinitializes them a
1227  * little.
1228  *
1229  * For the queues, we're basically like UDP:
1230  * - We take packets for rq and drop on overflow.
1231  * - rq is also Qmsg, but we also have Qcoalesce, to ignore out zero-len blocks
1232  * - We kick for our outbound (wq) messages.
1233  *
1234  * Note that Qmsg can drop parts of packets.  It's up to the user to read
1235  * enough.  If they didn't read enough, the extra is dropped.  This is similar
1236  * to SOCK_DGRAM and recvfrom().  Minus major changes, there's no nice way to
1237  * get individual messages with read().  Userspace using the bypass will need to
1238  * find out the MTU of the NIC the IP stack is attached to, and make sure to
1239  * read in at least that amount each time. */
1240 static void setup_proto_qio_bypass(struct conv *cv)
1241 {
1242         cv->rq_save = cv->rq;
1243         cv->wq_save = cv->wq;
1244         cv->rq = qopen(BYPASS_QMAX, Qmsg | Qcoalesce, 0, 0);
1245         cv->wq = qbypass(proto_bypass_kick, cv);
1246 }
1247
1248 static void undo_proto_qio_bypass(struct conv *cv)
1249 {
1250         qfree(cv->rq);
1251         qfree(cv->wq);
1252         cv->rq = cv->rq_save;
1253         cv->wq = cv->wq_save;
1254         cv->rq_save = NULL;
1255         cv->wq_save = NULL;
1256 }
1257
1258 void Fsstdbypass(struct conv *cv, char *argv[], int argc)
1259 {
1260         memset(cv->raddr, 0, sizeof(cv->raddr));
1261         cv->rport = 0;
1262         switch (argc) {
1263         case 2:
1264                 setladdrport(cv, argv[1], 1);
1265                 break;
1266         default:
1267                 error(EINVAL, "Bad args (was %d, need 2) to bypass", argc);
1268         }
1269 }
1270
1271 static void bypassctlmsg(struct Proto *x, struct conv *cv, struct cmdbuf *cb)
1272 {
1273         if (!x->bypass)
1274                 error(EFAIL, "Protocol %s does not support bypass", x->name);
1275         /* The protocol needs to set the port (usually by calling Fsstdbypass) and
1276          * then do whatever it needs to make sure it can find the conv again during
1277          * receive (usually by adding to a hash table). */
1278         x->bypass(cv, cb->f, cb->nf);
1279         setup_proto_qio_bypass(cv);
1280         cv->state = Bypass;
1281 }
1282
1283 static void shutdownctlmsg(struct conv *cv, struct cmdbuf *cb)
1284 {
1285         if (cb->nf < 2)
1286                 goto err;
1287         if (!strcmp(cb->f[1], "rd")) {
1288                 qhangup(cv->rq, "shutdown");
1289                 if (cv->p->shutdown)
1290                         cv->p->shutdown(cv, SHUT_RD);
1291         } else if (!strcmp(cb->f[1], "wr")) {
1292                 qhangup(cv->wq, "shutdown");
1293                 if (cv->p->shutdown)
1294                         cv->p->shutdown(cv, SHUT_WR);
1295         } else if (!strcmp(cb->f[1], "rdwr")) {
1296                 qhangup(cv->rq, "shutdown");
1297                 qhangup(cv->wq, "shutdown");
1298                 if (cv->p->shutdown)
1299                         cv->p->shutdown(cv, SHUT_RDWR);
1300         } else {
1301                 goto err;
1302         }
1303         return;
1304 err:
1305         error(EINVAL, "shutdown [rx|tx|rxtx]");
1306 }
1307
1308 static void tosctlmsg(struct conv *c, struct cmdbuf *cb)
1309 {
1310         if (cb->nf < 2)
1311                 c->tos = 0;
1312         else
1313                 c->tos = atoi(cb->f[1]);
1314 }
1315
1316 static void ttlctlmsg(struct conv *c, struct cmdbuf *cb)
1317 {
1318         if (cb->nf < 2)
1319                 c->ttl = MAXTTL;
1320         else
1321                 c->ttl = atoi(cb->f[1]);
1322 }
1323
1324 static long ipwrite(struct chan *ch, void *v, long n, int64_t off)
1325 {
1326         ERRSTACK(1);
1327         struct conv *c;
1328         struct Proto *x;
1329         char *p;
1330         struct cmdbuf *cb;
1331         uint8_t ia[IPaddrlen], ma[IPaddrlen];
1332         struct Fs *f;
1333         char *a;
1334
1335         a = v;
1336         f = ipfs[ch->dev];
1337
1338         switch (TYPE(ch->qid)) {
1339                 default:
1340                         error(EPERM, ERROR_FIXME);
1341                 case Qdata:
1342                         x = f->p[PROTO(ch->qid)];
1343                         c = x->conv[CONV(ch->qid)];
1344                         if (ch->flag & O_NONBLOCK)
1345                                 qwrite_nonblock(c->wq, a, n);
1346                         else
1347                                 qwrite(c->wq, a, n);
1348                         break;
1349                 case Qarp:
1350                         return arpwrite(f, a, n);
1351                 case Qiproute:
1352                         return routewrite(f, ch, a, n);
1353                 case Qlog:
1354                         netlogctl(f, a, n);
1355                         return n;
1356                 case Qndb:
1357                         return ndbwrite(f, a, off, n);
1358                 case Qctl:
1359                         x = f->p[PROTO(ch->qid)];
1360                         c = x->conv[CONV(ch->qid)];
1361                         cb = parsecmd(a, n);
1362
1363                         qlock(&c->qlock);
1364                         if (waserror()) {
1365                                 qunlock(&c->qlock);
1366                                 kfree(cb);
1367                                 nexterror();
1368                         }
1369                         if (cb->nf < 1)
1370                                 error(EFAIL, "short control request");
1371                         if (strcmp(cb->f[0], "connect") == 0)
1372                                 connectctlmsg(x, c, cb);
1373                         else if (strcmp(cb->f[0], "announce") == 0)
1374                                 announcectlmsg(x, c, cb);
1375                         else if (strcmp(cb->f[0], "bind") == 0)
1376                                 bindctlmsg(x, c, cb);
1377                         else if (strcmp(cb->f[0], "bypass") == 0)
1378                                 bypassctlmsg(x, c, cb);
1379                         else if (strcmp(cb->f[0], "shutdown") == 0)
1380                                 shutdownctlmsg(c, cb);
1381                         else if (strcmp(cb->f[0], "ttl") == 0)
1382                                 ttlctlmsg(c, cb);
1383                         else if (strcmp(cb->f[0], "tos") == 0)
1384                                 tosctlmsg(c, cb);
1385                         else if (strcmp(cb->f[0], "ignoreadvice") == 0)
1386                                 c->ignoreadvice = 1;
1387                         else if (strcmp(cb->f[0], "addmulti") == 0) {
1388                                 if (cb->nf < 2)
1389                                         error(EFAIL, "addmulti needs interface address");
1390                                 if (cb->nf == 2) {
1391                                         if (!ipismulticast(c->raddr))
1392                                                 error(EFAIL, "addmulti for a non multicast address");
1393                                         parseip(ia, cb->f[1]);
1394                                         ipifcaddmulti(c, c->raddr, ia);
1395                                 } else {
1396                                         parseip(ma, cb->f[2]);
1397                                         if (!ipismulticast(ma))
1398                                                 error(EFAIL, "addmulti for a non multicast address");
1399                                         parseip(ia, cb->f[1]);
1400                                         ipifcaddmulti(c, ma, ia);
1401                                 }
1402                         } else if (strcmp(cb->f[0], "remmulti") == 0) {
1403                                 if (cb->nf < 2)
1404                                         error(EFAIL, "remmulti needs interface address");
1405                                 if (!ipismulticast(c->raddr))
1406                                         error(EFAIL, "remmulti for a non multicast address");
1407                                 parseip(ia, cb->f[1]);
1408                                 ipifcremmulti(c, c->raddr, ia);
1409                         } else if (x->ctl != NULL) {
1410                                 x->ctl(c, cb->f, cb->nf);
1411                         } else
1412                                 error(EFAIL, "unknown control request");
1413                         qunlock(&c->qlock);
1414                         kfree(cb);
1415                         poperror();
1416         }
1417         return n;
1418 }
1419
1420 static long ipbwrite(struct chan *ch, struct block *bp, uint32_t offset)
1421 {
1422         struct conv *c;
1423         int n;
1424
1425         switch (TYPE(ch->qid)) {
1426                 case Qdata:
1427                         c = chan2conv(ch);
1428                         if (bp->next)
1429                                 bp = concatblock(bp);
1430                         n = BLEN(bp);
1431                         if (ch->flag & O_NONBLOCK)
1432                                 qbwrite_nonblock(c->wq, bp);
1433                         else
1434                                 qbwrite(c->wq, bp);
1435                         return n;
1436                 default:
1437                         return devbwrite(ch, bp, offset);
1438         }
1439 }
1440
1441 static void ip_wake_cb(struct queue *q, void *data, int filter)
1442 {
1443         struct conv *conv = (struct conv*)data;
1444         struct fd_tap *tap_i;
1445         /* For these two, we want to ignore events on the opposite end of the
1446          * queues.  For instance, we want to know when the WQ is writable.  Our
1447          * writes will actually make it readable - we don't want to trigger a tap
1448          * for that.  However, qio doesn't know how/why we are using a queue, or
1449          * even who the ends are (hence the callbacks) */
1450         if ((filter & FDTAP_FILT_READABLE) && (q == conv->wq))
1451                 return;
1452         if ((filter & FDTAP_FILT_WRITABLE) && (q == conv->rq))
1453                 return;
1454         /* At this point, we have an event we want to send to our taps (if any).
1455          * The lock protects list integrity and the existence of the tap.
1456          *
1457          * Previously, I thought of using the conv qlock.  That actually breaks, due
1458          * to weird usages of the qlock (someone holds it for a long time, blocking
1459          * the inbound wakeup from etherread4).
1460          *
1461          * I opted for a spinlock for a couple reasons:
1462          * - fire_tap should not block.  ideally it'll be fast too (it's mostly a
1463          * send_event).
1464          * - our callers might not want to block.  A lot of network wakeups will
1465          * come network processes (etherread4) or otherwise unrelated to this
1466          * particular conversation.  I'd rather do something like fire off a KMSG
1467          * than block those.
1468          * - if fire_tap takes a while, holding the lock only slows down other
1469          * events on this *same* conversation, or other tap registration.  not a
1470          * huge deal. */
1471         spin_lock(&conv->tap_lock);
1472         SLIST_FOREACH(tap_i, &conv->data_taps, link)
1473                 fire_tap(tap_i, filter);
1474         spin_unlock(&conv->tap_lock);
1475 }
1476
1477 int iptapfd(struct chan *chan, struct fd_tap *tap, int cmd)
1478 {
1479         struct conv *conv = chan2conv(chan);
1480         int ret;
1481
1482         #define DEVIP_LEGAL_DATA_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE | \
1483                                        FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |   \
1484                                        FDTAP_FILT_ERROR)
1485         #define DEVIP_LEGAL_LISTEN_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_HANGUP)
1486
1487         switch (TYPE(chan->qid)) {
1488                 case Qdata:
1489                         if (tap->filter & ~DEVIP_LEGAL_DATA_TAPS) {
1490                                 set_errno(ENOSYS);
1491                                 set_errstr("Unsupported #%s data tap %p, must be %p", devname(),
1492                                            tap->filter, DEVIP_LEGAL_DATA_TAPS);
1493                                 return -1;
1494                         }
1495                         spin_lock(&conv->tap_lock);
1496                         switch (cmd) {
1497                                 case (FDTAP_CMD_ADD):
1498                                         if (SLIST_EMPTY(&conv->data_taps)) {
1499                                                 qio_set_wake_cb(conv->rq, ip_wake_cb, conv);
1500                                                 qio_set_wake_cb(conv->wq, ip_wake_cb, conv);
1501                                         }
1502                                         SLIST_INSERT_HEAD(&conv->data_taps, tap, link);
1503                                         ret = 0;
1504                                         break;
1505                                 case (FDTAP_CMD_REM):
1506                                         SLIST_REMOVE(&conv->data_taps, tap, fd_tap, link);
1507                                         if (SLIST_EMPTY(&conv->data_taps)) {
1508                                                 qio_set_wake_cb(conv->rq, 0, conv);
1509                                                 qio_set_wake_cb(conv->wq, 0, conv);
1510                                         }
1511                                         ret = 0;
1512                                         break;
1513                                 default:
1514                                         set_errno(ENOSYS);
1515                                         set_errstr("Unsupported #%s data tap command %p",
1516                                                    devname(), cmd);
1517                                         ret = -1;
1518                         }
1519                         spin_unlock(&conv->tap_lock);
1520                         return ret;
1521                 case Qlisten:
1522                         if (tap->filter & ~DEVIP_LEGAL_LISTEN_TAPS) {
1523                                 set_errno(ENOSYS);
1524                                 set_errstr("Unsupported #%s listen tap %p, must be %p",
1525                                            devname(), tap->filter, DEVIP_LEGAL_LISTEN_TAPS);
1526                                 return -1;
1527                         }
1528                         spin_lock(&conv->tap_lock);
1529                         switch (cmd) {
1530                                 case (FDTAP_CMD_ADD):
1531                                         SLIST_INSERT_HEAD(&conv->listen_taps, tap, link);
1532                                         ret = 0;
1533                                         break;
1534                                 case (FDTAP_CMD_REM):
1535                                         SLIST_REMOVE(&conv->listen_taps, tap, fd_tap, link);
1536                                         ret = 0;
1537                                         break;
1538                                 default:
1539                                         set_errno(ENOSYS);
1540                                         set_errstr("Unsupported #%s listen tap command %p",
1541                                                    devname(), cmd);
1542                                         ret = -1;
1543                         }
1544                         spin_unlock(&conv->tap_lock);
1545                         return ret;
1546                 default:
1547                         set_errno(ENOSYS);
1548                         set_errstr("Can't tap #%s file type %d", devname(),
1549                                    TYPE(chan->qid));
1550                         return -1;
1551         }
1552 }
1553
1554 struct dev ipdevtab __devtab = {
1555         .name = "ip",
1556
1557         .reset = ipreset,
1558         .init = ipinit,
1559         .shutdown = devshutdown,
1560         .attach = ipattach,
1561         .walk = ipwalk,
1562         .stat = ipstat,
1563         .open = ipopen,
1564         .create = devcreate,
1565         .close = ipclose,
1566         .read = ipread,
1567         .bread = ipbread,
1568         .write = ipwrite,
1569         .bwrite = ipbwrite,
1570         .remove = devremove,
1571         .wstat = ipwstat,
1572         .power = devpower,
1573         .chaninfo = ipchaninfo,
1574         .tapfd = iptapfd,
1575 };
1576
1577 int Fsproto(struct Fs *f, struct Proto *p)
1578 {
1579         if (f->np >= Maxproto)
1580                 return -1;
1581
1582         qlock_init(&p->qlock);
1583         p->f = f;
1584
1585         if (p->ipproto > 0) {
1586                 if (f->t2p[p->ipproto] != NULL)
1587                         return -1;
1588                 f->t2p[p->ipproto] = p;
1589         }
1590
1591         p->qid.type = QTDIR;
1592         p->qid.path = QID(f->np, 0, Qprotodir);
1593         p->conv = kzmalloc(sizeof(struct conv *) * (p->nc + 1), 0);
1594         if (p->conv == NULL)
1595                 panic("Fsproto");
1596
1597         p->x = f->np;
1598         p->nextport = 0;
1599         p->nextrport = 600;
1600         f->p[f->np++] = p;
1601
1602         return 0;
1603 }
1604
1605 /*
1606  *  return true if this protocol is
1607  *  built in
1608  */
1609 int Fsbuiltinproto(struct Fs *f, uint8_t proto)
1610 {
1611         return f->t2p[proto] != NULL;
1612 }
1613
1614 /*
1615  *  called with protocol locked
1616  */
1617 struct conv *Fsprotoclone(struct Proto *p, char *user)
1618 {
1619         struct conv *c, **pp, **ep;
1620
1621 retry:
1622         c = NULL;
1623         ep = &p->conv[p->nc];
1624         for (pp = p->conv; pp < ep; pp++) {
1625                 c = *pp;
1626                 if (c == NULL) {
1627                         c = kzmalloc(sizeof(struct conv), 0);
1628                         if (c == NULL)
1629                                 error(ENOMEM, ERROR_FIXME);
1630                         qlock_init(&c->qlock);
1631                         qlock_init(&c->listenq);
1632                         rendez_init(&c->cr);
1633                         rendez_init(&c->listenr);
1634                         SLIST_INIT(&c->data_taps);      /* already = 0; set to be futureproof */
1635                         SLIST_INIT(&c->listen_taps);
1636                         spinlock_init(&c->tap_lock);
1637                         qlock(&c->qlock);
1638                         c->p = p;
1639                         c->x = pp - p->conv;
1640                         if (p->ptclsize != 0) {
1641                                 c->ptcl = kzmalloc(p->ptclsize, 0);
1642                                 if (c->ptcl == NULL) {
1643                                         kfree(c);
1644                                         error(ENOMEM, ERROR_FIXME);
1645                                 }
1646                         }
1647                         *pp = c;
1648                         p->ac++;
1649                         c->eq = qopen(1024, Qmsg, 0, 0);
1650                         (*p->create) (c);
1651                         assert(c->rq && c->wq);
1652                         break;
1653                 }
1654                 if (canqlock(&c->qlock)) {
1655                         /*
1656                          *  make sure both processes and protocol
1657                          *  are done with this Conv
1658                          */
1659                         if (c->inuse == 0 && (p->inuse == NULL || (*p->inuse) (c) == 0))
1660                                 break;
1661
1662                         qunlock(&c->qlock);
1663                 }
1664         }
1665         if (pp >= ep) {
1666                 if (p->gc != NULL && (*p->gc) (p))
1667                         goto retry;
1668                 return NULL;
1669         }
1670
1671         c->inuse = 1;
1672         kstrdup(&c->owner, user);
1673         c->perm = 0660;
1674         c->state = Idle;
1675         ipmove(c->laddr, IPnoaddr);
1676         ipmove(c->raddr, IPnoaddr);
1677         c->r = NULL;
1678         c->rgen = 0;
1679         c->lport = 0;
1680         c->rport = 0;
1681         c->restricted = 0;
1682         c->ttl = MAXTTL;
1683         c->tos = DFLTTOS;
1684         qreopen(c->rq);
1685         qreopen(c->wq);
1686         qreopen(c->eq);
1687
1688         qunlock(&c->qlock);
1689         return c;
1690 }
1691
1692 int Fsconnected(struct conv *c, char *msg)
1693 {
1694         if (msg != NULL && *msg != '\0')
1695                 strlcpy(c->cerr, msg, sizeof(c->cerr));
1696
1697         switch (c->state) {
1698                 case Announcing:
1699                         c->state = Announced;
1700                         break;
1701
1702                 case Connecting:
1703                         c->state = Connected;
1704                         break;
1705         }
1706
1707         rendez_wakeup(&c->cr);
1708         return 0;
1709 }
1710
1711 struct Proto *Fsrcvpcol(struct Fs *f, uint8_t proto)
1712 {
1713         if (f->ipmux)
1714                 return f->ipmux;
1715         else
1716                 return f->t2p[proto];
1717 }
1718
1719 struct Proto *Fsrcvpcolx(struct Fs *f, uint8_t proto)
1720 {
1721         return f->t2p[proto];
1722 }
1723
1724 static void fire_listener_taps(struct conv *conv)
1725 {
1726         struct fd_tap *tap_i;
1727         if (SLIST_EMPTY(&conv->listen_taps))
1728                 return;
1729         spin_lock(&conv->tap_lock);
1730         SLIST_FOREACH(tap_i, &conv->listen_taps, link)
1731                 fire_tap(tap_i, FDTAP_FILT_READABLE);
1732         spin_unlock(&conv->tap_lock);
1733 }
1734
1735 /*
1736  *  called with protocol locked
1737  */
1738 struct conv *Fsnewcall(struct conv *c, uint8_t * raddr, uint16_t rport,
1739                                            uint8_t * laddr, uint16_t lport, uint8_t version)
1740 {
1741         struct conv *nc;
1742         struct conv **l;
1743         int i;
1744
1745         qlock(&c->qlock);
1746         i = 0;
1747         for (l = &c->incall; *l; l = &(*l)->next)
1748                 i++;
1749         if (i >= Maxincall) {
1750                 qunlock(&c->qlock);
1751                 return NULL;
1752         }
1753
1754         /* find a free conversation */
1755         nc = Fsprotoclone(c->p, network);
1756         if (nc == NULL) {
1757                 qunlock(&c->qlock);
1758                 return NULL;
1759         }
1760         ipmove(nc->raddr, raddr);
1761         nc->rport = rport;
1762         ipmove(nc->laddr, laddr);
1763         nc->lport = lport;
1764         nc->next = NULL;
1765         *l = nc;
1766         nc->state = Connected;
1767         nc->ipversion = version;
1768
1769         qunlock(&c->qlock);
1770
1771         rendez_wakeup(&c->listenr);
1772         fire_listener_taps(c);
1773
1774         return nc;
1775 }
1776
1777 static long ndbwrite(struct Fs *f, char *a, uint32_t off, int n)
1778 {
1779         if (off > strlen(f->ndb))
1780                 error(EIO, ERROR_FIXME);
1781         if (off + n >= sizeof(f->ndb) - 1)
1782                 error(EIO, ERROR_FIXME);
1783         memmove(f->ndb + off, a, n);
1784         f->ndb[off + n] = 0;
1785         f->ndbvers++;
1786         f->ndbmtime = seconds();
1787         return n;
1788 }
1789
1790 uint32_t scalednconv(void)
1791 {
1792         //if(conf.npage*BY2PG >= 128*MB)
1793         return Nchans * 4;
1794         //  return Nchans;
1795 }