net: Support connects to 0.0.0.0
[akaros.git] / kern / src / net / devip.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 struct dev ipdevtab;
44
45 static char *devname(void)
46 {
47         return ipdevtab.name;
48 }
49
/* File types and qid-path layout for the #ip device.
 *
 * The Q* values identify which file or directory a chan refers to; they are
 * stored in the low Logtype bits of the qid path (see TYPE() and QID()).
 * The largest value, Qsnoop, is 19, which fits in the 5-bit type field. */
enum {
	Qtopdir = 1,				/* top level directory */
	Qtopbase,				/* first file generated in the top directory */
	Qarp = Qtopbase,
	Qndb,
	Qiproute,
	Qiprouter,
	Qipselftab,
	Qlog,

	Qprotodir,	/* directory for a protocol */
	Qprotobase,	/* first file generated in a protocol directory */
	Qclone = Qprotobase,
	Qstats,

	Qconvdir,	/* directory for a conversation */
	Qconvbase,	/* first file generated in a conversation directory */
	Qctl = Qconvbase,
	Qdata,
	Qerr,
	Qlisten,
	Qlocal,
	Qremote,
	Qstatus,
	Qsnoop,

	/* Bit-field widths, masks and shifts of the qid path:
	 * [ proto : Logproto | conv : Logconv | type : Logtype ] */
	Logtype = 5,
	Masktype = (1 << Logtype) - 1,
	Logconv = 12,
	Maskconv = (1 << Logconv) - 1,
	Shiftconv = Logtype,
	Logproto = 8,
	Maskproto = (1 << Logproto) - 1,
	Shiftproto = Logtype + Logconv,

	Nfs = 32,			/* number of slots in ipfs[] */
	BYPASS_QMAX = 64 * MiB,		/* NOTE(review): presumably a queue limit for
					 * bypass mode; its use is not visible here */
	IPROUTE_LEN = 2 * PGSIZE,	/* size of the per-open iproute snapshot */
};
/* Accessors for the three fields packed into a qid path.  The argument is a
 * struct qid (not a pointer); only the low 32 bits of its path are used. */
#define TYPE(x) 	( ((uint32_t)(x).path) & Masktype )
#define CONV(x) 	( (((uint32_t)(x).path) >> Shiftconv) & Maskconv )
#define PROTO(x)	( (((uint32_t)(x).path) >> Shiftproto) & Maskproto )
/* Build a qid path from a protocol index p, conversation index c and type y. */
#define QID(p, c, y) 	( ((p)<<(Shiftproto)) | ((c)<<Shiftconv) | (y))
/* Owner name reported for files that have no per-conversation owner. */
static char network[] = "network";
94
/* fslock is held by ipgetfs() while it creates an entry of ipfs[]. */
qlock_t fslock;
struct Fs *ipfs[Nfs];			/* attached fs's */
struct queue *qlog;

/* Medium registration hooks, called once from ipinit(). */
extern void nullmediumlink(void);
extern void pktmediumlink(void);
/* eve.name is used as the owner of a conversation that has none. */
extern struct username eve;
/* Forward declarations for helpers defined later in this file. */
static long ndbwrite(struct Fs *, char *unused_char_p_t, uint32_t, int);
static void closeconv(struct conv *);
static void setup_proto_qio_bypass(struct conv *cv);
static void undo_proto_qio_bypass(struct conv *cv);
106
107 static struct conv *chan2conv(struct chan *chan)
108 {
109         /* That's a lot of pointers to get to the conv! */
110         return ipfs[chan->dev]->p[PROTO(chan->qid)]->conv[CONV(chan->qid)];
111 }
112
113 static inline int founddevdir(struct chan *c, struct qid q, char *n,
114                                                           int64_t length, char *user, long perm,
115                                                           struct dir *db)
116 {
117         devdir(c, q, n, length, user, perm, db);
118         return 1;
119 }
120
121 static int topdirgen(struct chan *c, struct dir *dp)
122 {
123         struct qid q;
124         mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR);
125         snprintf(get_cur_genbuf(), GENBUF_SZ, "#%s%lu", devname(), c->dev);
126         return founddevdir(c, q, get_cur_genbuf(), 0, network, 0555, dp);
127 }
128
129
130 static int ip3gen(struct chan *c, int i, struct dir *dp)
131 {
132         struct qid q;
133         struct conv *cv;
134         char *p;
135         int perm;
136
137         cv = chan2conv(c);
138         if (cv->owner == NULL)
139                 kstrdup(&cv->owner, eve.name);
140         mkqid(&q, QID(PROTO(c->qid), CONV(c->qid), i), 0, QTFILE);
141
142         switch (i) {
143                 default:
144                         return -1;
145                 case Qctl:
146                         return founddevdir(c, q, "ctl", 0,
147                                                    cv->owner, cv->perm, dp);
148                 case Qdata:
149                         perm = cv->perm;
150                         perm |= qreadable(cv->rq) ? DMREADABLE : 0;
151                         perm |= qwritable(cv->wq) ? DMWRITABLE : 0;
152                         return founddevdir(c, q, "data", qlen(cv->rq),
153                                                            cv->owner, perm, dp);
154                 case Qerr:
155                         perm = cv->perm;
156                         perm |= qreadable(cv->eq) ? DMREADABLE : 0;
157                         return founddevdir(c, q, "err", qlen(cv->eq),
158                                                            cv->owner, perm, dp);
159                 case Qlisten:
160                         perm = cv->perm;
161                         perm |= cv->incall ? DMREADABLE : 0;
162                         return founddevdir(c, q, "listen", 0, cv->owner, perm, dp);
163                 case Qlocal:
164                         p = "local";
165                         break;
166                 case Qremote:
167                         p = "remote";
168                         break;
169                 case Qsnoop:
170                         if (strcmp(cv->p->name, "ipifc") != 0)
171                                 return -1;
172                         perm = 0400;
173                         perm |= qreadable(cv->sq) ? DMREADABLE : 0;
174                         return founddevdir(c, q, "snoop", qlen(cv->sq),
175                                                            cv->owner, perm, dp);
176                 case Qstatus:
177                         p = "status";
178                         break;
179         }
180         return founddevdir(c, q, p, 0, cv->owner, 0444, dp);
181 }
182
183 static int ip2gen(struct chan *c, int i, struct dir *dp)
184 {
185         struct qid q;
186         mkqid(&q, QID(PROTO(c->qid), 0, i), 0, QTFILE);
187         switch (i) {
188                 case Qclone:
189                         return founddevdir(c, q, "clone", 0, network, 0666, dp);
190                 case Qstats:
191                         return founddevdir(c, q, "stats", 0, network, 0444, dp);
192         }
193         return -1;
194 }
195
196 static int ip1gen(struct chan *c, int i, struct dir *dp)
197 {
198         struct qid q;
199         char *p;
200         int prot;
201         int len = 0;
202         struct Fs *f;
203         extern uint32_t kerndate;
204
205         f = ipfs[c->dev];
206
207         prot = 0666;
208         mkqid(&q, QID(0, 0, i), 0, QTFILE);
209         switch (i) {
210                 default:
211                         return -1;
212                 case Qarp:
213                         p = "arp";
214                         break;
215                 case Qndb:
216                         p = "ndb";
217                         len = strlen(f->ndb);
218                         q.vers = f->ndbvers;
219                         break;
220                 case Qiproute:
221                         p = "iproute";
222                         break;
223                 case Qipselftab:
224                         p = "ipselftab";
225                         prot = 0444;
226                         break;
227                 case Qiprouter:
228                         p = "iprouter";
229                         break;
230                 case Qlog:
231                         p = "log";
232                         break;
233         }
234         devdir(c, q, p, len, network, prot, dp);
235         if (i == Qndb && f->ndbmtime > kerndate)
236                 dp->mtime = f->ndbmtime;
237         return 1;
238 }
239
/* Directory-entry generator for the whole #ip tree; handed to devwalk(),
 * devstat() and devdirread().
 *
 * Dispatches on the type encoded in c's qid.  For a directory chan, s selects
 * the s'th child (or the parent when s == DEVDOTDOT); for a file chan the
 * entry for that file itself is produced.  The char *, dirtab and first int
 * arguments are unused.
 *
 * Returns 1 when dp was filled in, 0 for a protocol slot that has no user
 * interface, and -1 when there is no such entry. */
static int
ipgen(struct chan *c, char *unused_char_p_t, struct dirtab *d, int unused_int,
	  int s, struct dir *dp)
{
	struct qid q;
	struct conv *cv;
	struct Fs *f;

	f = ipfs[c->dev];

	switch (TYPE(c->qid)) {
		case Qtopdir:
			if (s == DEVDOTDOT)
				return topdirgen(c, dp);
			/* The first f->np entries are the protocol directories... */
			if (s < f->np) {
				if (f->p[s]->connect == NULL)
					return 0;	/* protocol with no user interface */
				mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
				return founddevdir(c, q, f->p[s]->name, 0, network, 0555, dp);
			}
			/* ...followed by the top-level files, starting at Qtopbase */
			s -= f->np;
			return ip1gen(c, s + Qtopbase, dp);
		case Qarp:
		case Qndb:
		case Qlog:
		case Qiproute:
		case Qiprouter:
		case Qipselftab:
			return ip1gen(c, TYPE(c->qid), dp);
		case Qprotodir:
			if (s == DEVDOTDOT)
				return topdirgen(c, dp);
			/* The first 'ac' entries are conversation directories, named
			 * by their decimal index... */
			else if (s < f->p[PROTO(c->qid)]->ac) {
				cv = f->p[PROTO(c->qid)]->conv[s];
				snprintf(get_cur_genbuf(), GENBUF_SZ, "%d", s);
				mkqid(&q, QID(PROTO(c->qid), s, Qconvdir), 0, QTDIR);
				return
					founddevdir(c, q, get_cur_genbuf(), 0, cv->owner, 0555, dp);
			}
			/* ...followed by the per-protocol files (clone, stats) */
			s -= f->p[PROTO(c->qid)]->ac;
			return ip2gen(c, s + Qprotobase, dp);
		case Qclone:
		case Qstats:
			return ip2gen(c, TYPE(c->qid), dp);
		case Qconvdir:
			if (s == DEVDOTDOT) {
				/* parent of a conversation is its protocol directory;
				 * s is reused to hold the protocol index */
				s = PROTO(c->qid);
				mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
				devdir(c, q, f->p[s]->name, 0, network, 0555, dp);
				return 1;
			}
			return ip3gen(c, s + Qconvbase, dp);
		case Qctl:
		case Qdata:
		case Qerr:
		case Qlisten:
		case Qlocal:
		case Qremote:
		case Qstatus:
		case Qsnoop:
			return ip3gen(c, TYPE(c->qid), dp);
	}
	return -1;
}
304
/* One-time device initialization: sets up fslock (which guards creation of
 * ipfs[] entries) and links in the null and pkt media. */
static void ipinit(void)
{
	qlock_init(&fslock);
	nullmediumlink();
	pktmediumlink();
/* if only
	fmtinstall('i', eipfmt);
	fmtinstall('I', eipfmt);
	fmtinstall('E', eipfmt);
	fmtinstall('V', eipfmt);
	fmtinstall('M', eipfmt);
*/
}
318
/* Device reset hook; intentionally does nothing. */
static void ipreset(void)
{
}
322
323 static struct Fs *ipgetfs(int dev)
324 {
325         extern void (*ipprotoinit[]) (struct Fs *);
326         struct Fs *f;
327         int i;
328
329         if (dev >= Nfs)
330                 return NULL;
331
332         qlock(&fslock);
333         if (ipfs[dev] == NULL) {
334                 f = kzmalloc(sizeof(struct Fs), MEM_WAIT);
335                 rwinit(&f->rwlock);
336                 qlock_init(&f->iprouter.qlock);
337                 ip_init(f);
338                 arpinit(f);
339                 netloginit(f);
340                 for (i = 0; ipprotoinit[i]; i++)
341                         ipprotoinit[i] (f);
342                 f->dev = dev;
343                 ipfs[dev] = f;
344         }
345         qunlock(&fslock);
346
347         return ipfs[dev];
348 }
349
350 struct IPaux *newipaux(char *owner, char *tag)
351 {
352         struct IPaux *a;
353         int n;
354
355         a = kzmalloc(sizeof(*a), 0);
356         kstrdup(&a->owner, owner);
357         memset(a->tag, ' ', sizeof(a->tag));
358         n = strlen(tag);
359         if (n > sizeof(a->tag))
360                 n = sizeof(a->tag);
361         memmove(a->tag, tag, n);
362         return a;
363 }
364
/* Owner name stored in the struct IPaux hanging off chan c's aux pointer. */
#define ATTACHER(c) (((struct IPaux*)((c)->aux))->owner)
366
367 static struct chan *ipattach(char *spec)
368 {
369         struct chan *c;
370         int dev;
371
372         dev = atoi(spec);
373         if (dev >= Nfs)
374                 error(EFAIL, "bad specification");
375
376         ipgetfs(dev);
377         c = devattach(devname(), spec);
378         mkqid(&c->qid, QID(0, 0, Qtopdir), 0, QTDIR);
379         c->dev = dev;
380
381         c->aux = newipaux(commonuser(), "none");
382
383         return c;
384 }
385
386 static struct walkqid *ipwalk(struct chan *c, struct chan *nc, char **name,
387                                                           int nname)
388 {
389         struct IPaux *a = c->aux;
390         struct walkqid *w;
391
392         w = devwalk(c, nc, name, nname, NULL, 0, ipgen);
393         if (w != NULL && w->clone != NULL)
394                 w->clone->aux = newipaux(a->owner, a->tag);
395         return w;
396 }
397
398 static int ipstat(struct chan *c, uint8_t * db, int n)
399 {
400         return devstat(c, db, n, NULL, 0, ipgen);
401 }
402
403 static int should_wake(void *arg)
404 {
405         struct conv *cv = arg;
406         /* signal that the conv is closed */
407         if (qisclosed(cv->rq))
408                 return TRUE;
409         return cv->incall != NULL;
410 }
411
/* Open the file or directory that chan c refers to.
 *
 * c:     chan to open; its qid selects the file.  For Qclone and Qlisten the
 *        qid is rewritten to the ctl file of the new/accepted conversation.
 * omode: open flags (O_WRITE, O_TRUNC, O_PATH, ...).
 *
 * Returns c with mode set, COPEN flagged and offset reset.
 * Throws (via error()) EPERM on permission failures, ENODEV if no
 * conversation could be cloned, EFAIL/EAGAIN on listen problems. */
static struct chan *ipopen(struct chan *c, int omode)
{
	ERRSTACK(2);
	struct conv *cv, *nc;
	struct Proto *p;
	int perm;
	struct Fs *f;

	/* perm is a lone rwx, not the rwx------ from the conversion */
	perm = omode_to_rwx(omode) >> 6;

	f = ipfs[c->dev];

	switch (TYPE(c->qid)) {
		default:
			break;
		case Qndb:
			/* only eve may write or truncate the ndb */
			if (omode & (O_WRITE | O_TRUNC) && !iseve())
				error(EPERM, ERROR_FIXME);
			/* write + truncate empties the current ndb contents */
			if ((omode & (O_WRITE | O_TRUNC)) == (O_WRITE | O_TRUNC))
				f->ndb[0] = 0;
			break;
		case Qlog:
			netlogopen(f);
			break;
		case Qiprouter:
			iprouteropen(f);
			break;
		case Qiproute:
			/* Snapshot the route table into a per-chan buffer at open time;
			 * ipclose() frees it. */
			c->synth_buf = kpages_zalloc(IPROUTE_LEN, MEM_WAIT);
			routeread(f, c->synth_buf, 0, IPROUTE_LEN);
			break;
		case Qtopdir:
		case Qprotodir:
		case Qconvdir:
		case Qstatus:
		case Qremote:
		case Qlocal:
		case Qstats:
		case Qipselftab:
			/* read-only files and directories */
			if (omode & O_WRITE)
				error(EPERM, ERROR_FIXME);
			break;
		case Qsnoop:
			if (omode & O_WRITE)
				error(EPERM, ERROR_FIXME);
			/* might be racy.  note the lack of a proto lock, unlike Qdata */
			p = f->p[PROTO(c->qid)];
			cv = p->conv[CONV(c->qid)];
			/* only the conversation's owner or eve may snoop */
			if (strcmp(ATTACHER(c), cv->owner) != 0 && !iseve())
				error(EPERM, ERROR_FIXME);
			atomic_inc(&cv->snoopers);
			break;
		case Qclone:
			/* Allocate a new conversation under the proto lock and turn
			 * this chan into that conversation's ctl file. */
			p = f->p[PROTO(c->qid)];
			qlock(&p->qlock);
			if (waserror()) {
				qunlock(&p->qlock);
				nexterror();
			}
			cv = Fsprotoclone(p, ATTACHER(c));
			qunlock(&p->qlock);
			poperror();
			if (cv == NULL) {
				error(ENODEV, "Null conversation from Fsprotoclone");
				break;
			}
			mkqid(&c->qid, QID(p->x, cv->x, Qctl), 0, QTFILE);
			break;
		case Qdata:
		case Qctl:
		case Qerr:
			/* Lock order: proto qlock, then conv qlock. */
			p = f->p[PROTO(c->qid)];
			qlock(&p->qlock);
			cv = p->conv[CONV(c->qid)];
			qlock(&cv->qlock);
			if (waserror()) {
				qunlock(&cv->qlock);
				qunlock(&p->qlock);
				nexterror();
			}
			/* If the top rwx bits of cv->perm do not grant the request,
			 * the opener must be the owner and the low rwx bits of
			 * cv->perm must grant it. */
			if ((perm & (cv->perm >> 6)) != perm) {
				if (strcmp(ATTACHER(c), cv->owner) != 0)
					error(EPERM, ERROR_FIXME);
				if ((perm & cv->perm) != perm)
					error(EPERM, ERROR_FIXME);

			}
			cv->inuse++;
			/* First user of the conversation becomes its owner */
			if (cv->inuse == 1) {
				kstrdup(&cv->owner, ATTACHER(c));
				cv->perm = 0660;
			}
			qunlock(&cv->qlock);
			qunlock(&p->qlock);
			poperror();
			break;
		case Qlisten:
			cv = f->p[PROTO(c->qid)]->conv[CONV(c->qid)];
			/* No permissions or Announce checks required.  We'll see if that's
			 * a good idea or not. (the perm check would do nothing, as is,
			 * since an O_PATH perm is 0).
			 *
			 * But we probably want to incref to keep the conversation around
			 * until this FD/chan is closed.  #ip is a little weird in that
			 * objects never really go away (high water mark for convs, you can
			 * always find them in the ns).  I think it is possible to
			 * namec/ipgen a chan, then have that conv close, then have that
			 * chan be opened.  You can probably do this with a data file. */
			if (omode & O_PATH) {
				qlock(&cv->qlock);
				cv->inuse++;
				qunlock(&cv->qlock);
				break;
			}
			/* same permission rule as for Qdata/Qctl/Qerr above */
			if ((perm & (cv->perm >> 6)) != perm) {
				if (strcmp(ATTACHER(c), cv->owner) != 0)
					error(EPERM, ERROR_FIXME);
				if ((perm & cv->perm) != perm)
					error(EPERM, ERROR_FIXME);

			}

			if (cv->state != Announced)
				error(EFAIL, "not announced");

			/* Hold a reference on the listening conv while waiting; any
			 * error thrown below drops it again via closeconv(). */
			if (waserror()) {
				closeconv(cv);
				nexterror();
			}
			qlock(&cv->qlock);
			cv->inuse++;
			qunlock(&cv->qlock);

			nc = NULL;
			while (nc == NULL) {
				/* give up if we got a hangup */
				if (qisclosed(cv->rq))
					error(EFAIL, "listen hungup");

				qlock(&cv->listenq);
				if (waserror()) {
					qunlock(&cv->listenq);
					nexterror();
				}
				/* we can peek at incall without grabbing the cv qlock.  if
				 * anything is there, it'll remain there until we dequeue it.
				 * no one else can, since we hold the listenq lock */
				if ((c->flag & O_NONBLOCK) && !cv->incall)
					error(EAGAIN, "listen queue empty");
				/* wait for a connect */
				rendez_sleep(&cv->listenr, should_wake, cv);

				/* if there is a concurrent hangup, they will hold the qlock
				 * until the hangup is complete, including closing the cv->rq */
				qlock(&cv->qlock);
				nc = cv->incall;
				if (nc != NULL) {
					/* dequeue the call; this chan becomes its ctl file */
					cv->incall = nc->next;
					mkqid(&c->qid, QID(PROTO(c->qid), nc->x, Qctl), 0, QTFILE);
					kstrdup(&cv->owner, ATTACHER(c));
				}
				qunlock(&cv->qlock);

				qunlock(&cv->listenq);
				poperror();
			}
			/* drop the reference taken on the listening conv above */
			closeconv(cv);
			poperror();
			break;
	}
	c->mode = openmode(omode);
	c->flag |= COPEN;
	c->offset = 0;
	return c;
}
588
589 static int ipwstat(struct chan *c, uint8_t * dp, int n)
590 {
591         ERRSTACK(2);
592         struct dir *d;
593         struct conv *cv;
594         struct Fs *f;
595         struct Proto *p;
596
597         f = ipfs[c->dev];
598         switch (TYPE(c->qid)) {
599                 default:
600                         error(EPERM, ERROR_FIXME);
601                         break;
602                 case Qctl:
603                 case Qdata:
604                         break;
605         }
606
607         d = kzmalloc(sizeof(*d) + n, 0);
608         if (waserror()) {
609                 kfree(d);
610                 nexterror();
611         }
612         n = convM2D(dp, n, d, (char *)&d[1]);
613         if (n == 0)
614                 error(ENODATA, ERROR_FIXME);
615         p = f->p[PROTO(c->qid)];
616         cv = p->conv[CONV(c->qid)];
617         if (!iseve() && strcmp(ATTACHER(c), cv->owner) != 0)
618                 error(EPERM, ERROR_FIXME);
619         if (!emptystr(d->uid))
620                 kstrdup(&cv->owner, d->uid);
621         if (d->mode != -1)
622                 cv->perm = d->mode & 0777;
623         poperror();
624         kfree(d);
625         return n;
626 }
627
628 /* Should be able to handle any file type chan. Feel free to extend it. */
629 static char *ipchaninfo(struct chan *ch, char *ret, size_t ret_l)
630 {
631         struct conv *conv;
632         struct Proto *proto;
633         char *p;
634         struct Fs *f;
635
636         f = ipfs[ch->dev];
637
638         switch (TYPE(ch->qid)) {
639                 default:
640                         ret = "Unknown type";
641                         break;
642                 case Qdata:
643                         proto = f->p[PROTO(ch->qid)];
644                         conv = proto->conv[CONV(ch->qid)];
645                         snprintf(ret, ret_l, "Qdata, %s, proto %s, conv idx %d, rq len %d, wq len %d",
646                                  SLIST_EMPTY(&conv->data_taps) ? "untapped" : "tapped",
647                                  proto->name, conv->x, qlen(conv->rq), qlen(conv->wq));
648                         break;
649                 case Qarp:
650                         ret = "Qarp";
651                         break;
652                 case Qiproute:
653                         ret = "Qiproute";
654                         break;
655                 case Qlisten:
656                         proto = f->p[PROTO(ch->qid)];
657                         conv = proto->conv[CONV(ch->qid)];
658                         snprintf(ret, ret_l,
659                                  "Qlisten, %s proto %s, conv idx %d, has %sincalls",
660                                  SLIST_EMPTY(&conv->listen_taps) ? "untapped" : "tapped",
661                                  proto->name, conv->x, conv->incall ? "" : "no ");
662                         break;
663                 case Qlog:
664                         ret = "Qlog";
665                         break;
666                 case Qndb:
667                         ret = "Qndb";
668                         break;
669                 case Qctl:
670                         proto = f->p[PROTO(ch->qid)];
671                         conv = proto->conv[CONV(ch->qid)];
672                         snprintf(ret, ret_l, "Qctl, proto %s, conv idx %d", proto->name,
673                                          conv->x);
674                         break;
675         }
676         return ret;
677 }
678
/* Drop one reference on conversation cv; on the last reference, tear it
 * down and return it to the Idle state.
 *
 * Teardown closes every queued incoming call, resets the owner to "network"
 * and the permissions to 0660, leaves all multicast groups, clears the
 * cached route, undoes bypass mode if active, and calls the protocol's
 * close hook.  The whole operation runs under cv->qlock; if anything throws,
 * the lock is released and the error is re-raised. */
static void closeconv(struct conv *cv)
{
	ERRSTACK(1);
	struct conv *nc;
	struct Ipmulti *mp;

	qlock(&cv->qlock);

	/* still in use by someone else: nothing more to do */
	if (--cv->inuse > 0) {
		qunlock(&cv->qlock);
		return;
	}
	if (waserror()) {
		qunlock(&cv->qlock);
		nexterror();
	}
	/* close all incoming calls since no listen will ever happen */
	for (nc = cv->incall; nc; nc = cv->incall) {
		/* unlink before closing, so the list head always advances */
		cv->incall = nc->next;
		closeconv(nc);
	}
	cv->incall = NULL;

	kstrdup(&cv->owner, network);
	cv->perm = 0660;

	/* The loop re-reads cv->multi each time, so it relies on
	 * ipifcremmulti() removing the entry from that list. */
	while ((mp = cv->multi) != NULL)
		ipifcremmulti(cv, mp->ma, mp->ia);

	cv->r = NULL;
	cv->rgen = 0;
	if (cv->state == Bypass)
		undo_proto_qio_bypass(cv);
	cv->p->close(cv);
	cv->state = Idle;
	qunlock(&cv->qlock);
	poperror();
}
717
718 static void ipclose(struct chan *c)
719 {
720         struct Fs *f;
721
722         f = ipfs[c->dev];
723         switch (TYPE(c->qid)) {
724                 default:
725                         break;
726                 case Qlog:
727                         if (c->flag & COPEN)
728                                 netlogclose(f);
729                         break;
730                 case Qiprouter:
731                         if (c->flag & COPEN)
732                                 iprouterclose(f);
733                         break;
734                 case Qdata:
735                 case Qctl:
736                 case Qerr:
737                 case Qlisten:
738                         if (c->flag & COPEN)
739                                 closeconv(f->p[PROTO(c->qid)]->conv[CONV(c->qid)]);
740                         break;
741                 case Qsnoop:
742                         if (c->flag & COPEN)
743                                 atomic_dec(&f->p[PROTO(c->qid)]->conv[CONV(c->qid)]->snoopers);
744                         break;
745                 case Qiproute:
746                         if (c->flag & COPEN)
747                                 kpages_free(c->synth_buf, IPROUTE_LEN);
748                         break;
749         }
750         kfree(((struct IPaux *)c->aux)->owner);
751         kfree(c->aux);
752 }
753
enum {
	/* Size of the temporary text buffer ipread() allocates when formatting
	 * the remote, local and status files. */
	Statelen = 32 * 1024,
};
757
758 static long ipread(struct chan *ch, void *a, long n, int64_t off)
759 {
760         struct conv *c;
761         struct Proto *x;
762         char *buf, *p;
763         long rv;
764         struct Fs *f;
765         uint32_t offset = off;
766
767         f = ipfs[ch->dev];
768
769         p = a;
770         switch (TYPE(ch->qid)) {
771                 default:
772                         error(EPERM, ERROR_FIXME);
773                 case Qtopdir:
774                 case Qprotodir:
775                 case Qconvdir:
776                         return devdirread(ch, a, n, 0, 0, ipgen);
777                 case Qarp:
778                         return arpread(f->arp, a, offset, n);
779                 case Qndb:
780                         return readstr(offset, a, n, f->ndb);
781                 case Qiproute:
782                         return readmem(offset, a, n, ch->synth_buf, IPROUTE_LEN);
783                 case Qiprouter:
784                         return iprouterread(f, a, n);
785                 case Qipselftab:
786                         return ipselftabread(f, a, offset, n);
787                 case Qlog:
788                         return netlogread(f, a, offset, n);
789                 case Qctl:
790                         snprintf(get_cur_genbuf(), GENBUF_SZ, "%lu", CONV(ch->qid));
791                         return readstr(offset, p, n, get_cur_genbuf());
792                 case Qremote:
793                         buf = kzmalloc(Statelen, 0);
794                         x = f->p[PROTO(ch->qid)];
795                         c = x->conv[CONV(ch->qid)];
796                         if (x->remote == NULL) {
797                                 snprintf(buf, Statelen, "%I!%d\n", c->raddr, c->rport);
798                         } else {
799                                 (*x->remote) (c, buf, Statelen - 2);
800                         }
801                         rv = readstr(offset, p, n, buf);
802                         kfree(buf);
803                         return rv;
804                 case Qlocal:
805                         buf = kzmalloc(Statelen, 0);
806                         x = f->p[PROTO(ch->qid)];
807                         c = x->conv[CONV(ch->qid)];
808                         if (x->local == NULL) {
809                                 snprintf(buf, Statelen, "%I!%d\n", c->laddr, c->lport);
810                         } else {
811                                 (*x->local) (c, buf, Statelen - 2);
812                         }
813                         rv = readstr(offset, p, n, buf);
814                         kfree(buf);
815                         return rv;
816                 case Qstatus:
817                         /* this all is a bit screwed up since the size of some state's
818                          * buffers will change from one invocation to another.  a reader
819                          * will come in and read the entire buffer.  then it will come again
820                          * and read from the next offset, expecting EOF.  if the buffer
821                          * changed sizes, it'll reprint the end of the buffer slightly. */
822                         buf = kzmalloc(Statelen, 0);
823                         x = f->p[PROTO(ch->qid)];
824                         c = x->conv[CONV(ch->qid)];
825                         if (c->state == Bypass)
826                                 snprintf(buf, Statelen, "Bypassed\n");
827                         else
828                                 (*x->state)(c, buf, Statelen - 2);
829                         rv = readstr(offset, p, n, buf);
830                         kfree(buf);
831                         return rv;
832                 case Qdata:
833                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
834                         if (ch->flag & O_NONBLOCK)
835                                 return qread_nonblock(c->rq, a, n);
836                         else
837                                 return qread(c->rq, a, n);
838                 case Qerr:
839                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
840                         return qread(c->eq, a, n);
841                 case Qsnoop:
842                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
843                         return qread(c->sq, a, n);
844                 case Qstats:
845                         x = f->p[PROTO(ch->qid)];
846                         if (x->stats == NULL)
847                                 error(EFAIL, "stats not implemented");
848                         buf = kzmalloc(Statelen, 0);
849                         (*x->stats) (x, buf, Statelen);
850                         rv = readstr(offset, p, n, buf);
851                         kfree(buf);
852                         return rv;
853         }
854 }
855
856 static struct block *ipbread(struct chan *ch, long n, uint32_t offset)
857 {
858         struct conv *c;
859
860         switch (TYPE(ch->qid)) {
861                 case Qdata:
862                         c = chan2conv(ch);
863                         if (ch->flag & O_NONBLOCK)
864                                 return qbread_nonblock(c->rq, n);
865                         else
866                                 return qbread(c->rq, n);
867                 default:
868                         return devbread(ch, n, offset);
869         }
870 }
871
872 /*
873  *  set local address to be that of the ifc closest to remote address
874  */
875 static void setladdr(struct conv *c)
876 {
877         findlocalip(c->p->f, c->laddr, c->raddr);
878 }
879
880 /*
881  *  set a local port making sure the quad of raddr,rport,laddr,lport is unique
882  */
883 static void setluniqueport(struct conv *c, int lport)
884 {
885         struct Proto *p;
886         struct conv *xp;
887         int x;
888
889         p = c->p;
890
891         qlock(&p->qlock);
892         for (x = 0; x < p->nc; x++) {
893                 xp = p->conv[x];
894                 if (xp == NULL)
895                         break;
896                 if (xp == c)
897                         continue;
898                 if ((xp->state == Connected || xp->state == Announced
899                                             || xp->state == Bypass)
900                         && xp->lport == lport
901                         && xp->rport == c->rport
902                         && ipcmp(xp->raddr, c->raddr) == 0
903                         && ipcmp(xp->laddr, c->laddr) == 0) {
904                         qunlock(&p->qlock);
905                         error(EFAIL, "address in use");
906                 }
907         }
908         c->lport = lport;
909         qunlock(&p->qlock);
910 }
911
912 /*
913  *  pick a local port and set it
914  */
915 static void setlport(struct conv *c)
916 {
917         struct Proto *p;
918         uint16_t *pp;
919         int x, found;
920
921         p = c->p;
922         if (c->restricted)
923                 pp = &p->nextrport;
924         else
925                 pp = &p->nextport;
926         qlock(&p->qlock);
927         for (;; (*pp)++) {
928                 /*
929                  * Fsproto initialises p->nextport to 0 and the restricted
930                  * ports (p->nextrport) to 600.
931                  * Restricted ports must lie between 600 and 1024.
932                  * For the initial condition or if the unrestricted port number
933                  * has wrapped round, select a random port between 5000 and 1<<15
934                  * to start at.
935                  */
936                 if (c->restricted) {
937                         if (*pp >= 1024)
938                                 *pp = 600;
939                 } else
940                         while (*pp < 5000)
941                                 urandom_read(pp, sizeof(*pp));
942
943                 found = 0;
944                 for (x = 0; x < p->nc; x++) {
945                         if (p->conv[x] == NULL)
946                                 break;
947                         if (p->conv[x]->lport == *pp) {
948                                 found = 1;
949                                 break;
950                         }
951                 }
952                 if (!found)
953                         break;
954         }
955         c->lport = (*pp)++;
956         qunlock(&p->qlock);
957 }
958
959 /*
960  *  set a local address and port from a string of the form
961  *      [address!]port[!r]
962  */
963 static void setladdrport(struct conv *c, char *str, int announcing)
964 {
965         char *p;
966         uint16_t lport;
967         uint8_t addr[IPaddrlen];
968
969         /*
970          *  ignore restricted part if it exists.  it's
971          *  meaningless on local ports.
972          */
973         p = strchr(str, '!');
974         if (p != NULL) {
975                 *p++ = 0;
976                 if (strcmp(p, "r") == 0)
977                         p = NULL;
978         }
979
980         c->lport = 0;
981         if (p == NULL) {
982                 if (announcing)
983                         ipmove(c->laddr, IPnoaddr);
984                 else
985                         setladdr(c);
986                 p = str;
987         } else {
988                 if (strcmp(str, "*") == 0)
989                         ipmove(c->laddr, IPnoaddr);
990                 else {
991                         parseip(addr, str);
992                         if (ipforme(c->p->f, addr))
993                                 ipmove(c->laddr, addr);
994                         else
995                                 error(EFAIL, "not a local IP address");
996                 }
997         }
998
999         /* one process can get all connections */
1000         if (announcing && strcmp(p, "*") == 0) {
1001                 if (!iseve())
1002                         error(EPERM, ERROR_FIXME);
1003                 setluniqueport(c, 0);
1004         }
1005
1006         lport = atoi(p);
1007         if (lport <= 0)
1008                 setlport(c);
1009         else
1010                 setluniqueport(c, lport);
1011 }
1012
1013 static void setraddrport(struct conv *c, char *str)
1014 {
1015         char *p;
1016
1017         p = strchr(str, '!');
1018         if (p == NULL)
1019                 error(EFAIL, "malformed address");
1020         *p++ = 0;
1021         parseip(c->raddr, str);
1022         c->rport = atoi(p);
1023         p = strchr(p, '!');
1024         if (p) {
1025                 if (strstr(p, "!r") != NULL)
1026                         c->restricted = 1;
1027         }
1028 }
1029
1030 /*
1031  *  called by protocol connect routine to set addresses
1032  */
1033 void Fsstdconnect(struct conv *c, char *argv[], int argc)
1034 {
1035         switch (argc) {
1036                 default:
1037                         error(EINVAL, "bad args to %s", __func__);
1038                 case 2:
1039                         setraddrport(c, argv[1]);
1040                         setladdr(c);
1041                         setlport(c);
1042                         break;
1043                 case 3:
1044                         setraddrport(c, argv[1]);
1045                         setladdrport(c, argv[2], 0);
1046                         break;
1047         }
1048
1049         /* TODO: why is an IPnoaddr (in v6 format, equivalent to v6Unspecified),
1050          * a v4 format? */
1051         if ((memcmp(c->raddr, v4prefix, IPv4off) == 0 &&
1052                  memcmp(c->laddr, v4prefix, IPv4off) == 0)
1053                 || ipcmp(c->raddr, IPnoaddr) == 0)
1054                 c->ipversion = V4;
1055         else
1056                 c->ipversion = V6;
1057         /* Linux has taught people to use zeros for local interfaces.  TODO: We
1058          * might need this for v6 in the future. */
1059         if (!ipcmp(c->raddr, IPv4_zeroes))
1060                 ipmove(c->raddr, IPv4_loopback);
1061 }
1062
1063 /*
1064  *  initiate connection and sleep till its set up
1065  */
1066 static int connected(void *a)
1067 {
1068         return ((struct conv *)a)->state == Connected;
1069 }
1070
1071 static void connectctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1072 {
1073         ERRSTACK(1);
1074         char *p;
1075
1076         if (c->state != 0)
1077                 error(EBUSY, ERROR_FIXME);
1078         c->state = Connecting;
1079         c->cerr[0] = '\0';
1080         if (x->connect == NULL)
1081                 error(EFAIL, "connect not supported");
1082         x->connect(c, cb->f, cb->nf);
1083
1084         qunlock(&c->qlock);
1085         if (waserror()) {
1086                 qlock(&c->qlock);
1087                 nexterror();
1088         }
1089         rendez_sleep(&c->cr, connected, c);
1090         qlock(&c->qlock);
1091         poperror();
1092
1093         if (c->cerr[0] != '\0')
1094                 error(EFAIL, c->cerr);
1095 }
1096
1097 /*
1098  *  called by protocol announce routine to set addresses
1099  */
1100 void Fsstdannounce(struct conv *c, char *argv[], int argc)
1101 {
1102         memset(c->raddr, 0, sizeof(c->raddr));
1103         c->rport = 0;
1104         switch (argc) {
1105                 default:
1106                         error(EINVAL, "bad args to announce");
1107                 case 2:
1108                         setladdrport(c, argv[1], 1);
1109                         break;
1110         }
1111 }
1112
1113 /*
1114  *  initiate announcement and sleep till its set up
1115  */
1116 static int announced(void *a)
1117 {
1118         return ((struct conv *)a)->state == Announced;
1119 }
1120
1121 static void announcectlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1122 {
1123         ERRSTACK(1);
1124         char *p;
1125
1126         if (c->state != 0)
1127                 error(EBUSY, ERROR_FIXME);
1128         c->state = Announcing;
1129         c->cerr[0] = '\0';
1130         if (x->announce == NULL)
1131                 error(EFAIL, "announce not supported");
1132         x->announce(c, cb->f, cb->nf);
1133
1134         qunlock(&c->qlock);
1135         if (waserror()) {
1136                 qlock(&c->qlock);
1137                 nexterror();
1138         }
1139         rendez_sleep(&c->cr, announced, c);
1140         qlock(&c->qlock);
1141         poperror();
1142
1143         if (c->cerr[0] != '\0')
1144                 error(EFAIL, c->cerr);
1145 }
1146
1147 /*
1148  *  called by protocol bind routine to set addresses
1149  */
1150 void Fsstdbind(struct conv *c, char *argv[], int argc)
1151 {
1152         switch (argc) {
1153                 default:
1154                         error(EINVAL, "bad args to bind");
1155                 case 2:
1156                         setladdrport(c, argv[1], 0);
1157                         break;
1158         }
1159 }
1160
1161 static void bindctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1162 {
1163         if (x->bind == NULL)
1164                 Fsstdbind(c, cb->f, cb->nf);
1165         else
1166                 x->bind(c, cb->f, cb->nf);
1167 }
1168
1169 /* Helper, called by protocols to use the bypass.
1170  *
1171  * This is a bit nasty due to the overall nastiness of #ip.  We need to lock
1172  * before checking the state and hold the qlock throughout, because a concurrent
1173  * closeconv() could tear down the bypass.  Specifically, it could free the
1174  * bypass queues.  The root issue is that conversation lifetimes are not managed
1175  * well.
1176  *
1177  * If we fail, it's our responsibility to consume (free) the block(s). */
1178 void bypass_or_drop(struct conv *cv, struct block *bp)
1179 {
1180         qlock(&cv->qlock);
1181         if (cv->state == Bypass)
1182                 qpass(cv->rq, bp);
1183         else
1184                 freeblist(bp);
1185         qunlock(&cv->qlock);
1186 }
1187
1188 /* Push the block directly to the approprite ipoput function.
1189  *
1190  * It's the protocol's responsibility (and thus ours here) to make sure there is
1191  * at least the right amount of the IP header in the block (ipoput{4,6} assumes
1192  * it has the right amount, and the other protocols account for the IP header in
1193  * their own header).
1194  *
1195  * For the TTL and TOS, we just use the default ones.  If we want, we could look
1196  * into the actual block and see what the user wanted, though we're bypassing
1197  * the protocol layer, not the IP layer. */
1198 static void proto_bypass_kick(void *arg, struct block *bp)
1199 {
1200         struct conv *cv = (struct conv*)arg;
1201         uint8_t vers_nibble;
1202         struct Fs *f;
1203
1204         f = cv->p->f;
1205
1206         bp = pullupblock(bp, 1);
1207         if (!bp)
1208                 error(EINVAL, "Proto bypass unable to pullup a byte!");
1209         vers_nibble = *(uint8_t*)bp->rp & 0xf0;
1210         switch (vers_nibble) {
1211         case IP_VER4:
1212                 bp = pullupblock(bp, IPV4HDR_LEN);
1213                 if (!bp)
1214                         error(EINVAL, "Proto bypass unable to pullup v4 header");
1215                 ipoput4(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1216                 break;
1217         case IP_VER6:
1218                 bp = pullupblock(bp, IPV6HDR_LEN);
1219                 if (!bp)
1220                         error(EINVAL, "Proto bypass unable to pullup v6 header");
1221                 ipoput6(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1222                 break;
1223         default:
1224                 error(EINVAL, "Proto bypass block had unknown IP version 0x%x",
1225                       vers_nibble);
1226         }
1227 }
1228
1229 /* Sets up cv for the protocol bypass.  We use different queues for two reasons:
1230  * 1) To be protocol independent.  For instance, TCP and UDP could use very
1231  * different QIO styles.
1232  * 2) To set up our own kick/bypass method.  Note how udpcreate() and here uses
1233  * qbypass() (just blast it out), while TCP uses qopen() with a kick.  TCP still
1234  * follows queuing discipline.
1235  *
1236  * It's like we are our own protocol, the bypass protocol, when it comes to how
1237  * we interact with qio.  The conv still is of the real protocol type (e.g.
1238  * TCP).
1239  *
1240  * Note that we can't free the old queues.  The way #ip works, the queues are
1241  * created when the conv is created, but the conv is never freed.  It's like a
1242  * slab allocator that never frees objects, but just reinitializes them a
1243  * little.
1244  *
1245  * For the queues, we're basically like UDP:
1246  * - We take packets for rq and drop on overflow.
1247  * - rq is also Qmsg, but we also have Qcoalesce, to ignore out zero-len blocks
1248  * - We kick for our outbound (wq) messages.
1249  *
1250  * Note that Qmsg can drop parts of packets.  It's up to the user to read
1251  * enough.  If they didn't read enough, the extra is dropped.  This is similar
1252  * to SOCK_DGRAM and recvfrom().  Minus major changes, there's no nice way to
1253  * get individual messages with read().  Userspace using the bypass will need to
1254  * find out the MTU of the NIC the IP stack is attached to, and make sure to
1255  * read in at least that amount each time. */
1256 static void setup_proto_qio_bypass(struct conv *cv)
1257 {
1258         cv->rq_save = cv->rq;
1259         cv->wq_save = cv->wq;
1260         cv->rq = qopen(BYPASS_QMAX, Qmsg | Qcoalesce, 0, 0);
1261         cv->wq = qbypass(proto_bypass_kick, cv);
1262 }
1263
1264 static void undo_proto_qio_bypass(struct conv *cv)
1265 {
1266         qfree(cv->rq);
1267         qfree(cv->wq);
1268         cv->rq = cv->rq_save;
1269         cv->wq = cv->wq_save;
1270         cv->rq_save = NULL;
1271         cv->wq_save = NULL;
1272 }
1273
1274 void Fsstdbypass(struct conv *cv, char *argv[], int argc)
1275 {
1276         memset(cv->raddr, 0, sizeof(cv->raddr));
1277         cv->rport = 0;
1278         switch (argc) {
1279         case 2:
1280                 setladdrport(cv, argv[1], 1);
1281                 break;
1282         default:
1283                 error(EINVAL, "Bad args (was %d, need 2) to bypass", argc);
1284         }
1285 }
1286
1287 static void bypassctlmsg(struct Proto *x, struct conv *cv, struct cmdbuf *cb)
1288 {
1289         if (!x->bypass)
1290                 error(EFAIL, "Protocol %s does not support bypass", x->name);
1291         /* The protocol needs to set the port (usually by calling Fsstdbypass) and
1292          * then do whatever it needs to make sure it can find the conv again during
1293          * receive (usually by adding to a hash table). */
1294         x->bypass(cv, cb->f, cb->nf);
1295         setup_proto_qio_bypass(cv);
1296         cv->state = Bypass;
1297 }
1298
1299 static void shutdownctlmsg(struct conv *cv, struct cmdbuf *cb)
1300 {
1301         if (cb->nf < 2)
1302                 goto err;
1303         if (!strcmp(cb->f[1], "rd")) {
1304                 qhangup(cv->rq, "shutdown");
1305                 if (cv->p->shutdown)
1306                         cv->p->shutdown(cv, SHUT_RD);
1307         } else if (!strcmp(cb->f[1], "wr")) {
1308                 qhangup(cv->wq, "shutdown");
1309                 if (cv->p->shutdown)
1310                         cv->p->shutdown(cv, SHUT_WR);
1311         } else if (!strcmp(cb->f[1], "rdwr")) {
1312                 qhangup(cv->rq, "shutdown");
1313                 qhangup(cv->wq, "shutdown");
1314                 if (cv->p->shutdown)
1315                         cv->p->shutdown(cv, SHUT_RDWR);
1316         } else {
1317                 goto err;
1318         }
1319         return;
1320 err:
1321         error(EINVAL, "shutdown [rx|tx|rxtx]");
1322 }
1323
1324 static void tosctlmsg(struct conv *c, struct cmdbuf *cb)
1325 {
1326         if (cb->nf < 2)
1327                 c->tos = 0;
1328         else
1329                 c->tos = atoi(cb->f[1]);
1330 }
1331
1332 static void ttlctlmsg(struct conv *c, struct cmdbuf *cb)
1333 {
1334         if (cb->nf < 2)
1335                 c->ttl = MAXTTL;
1336         else
1337                 c->ttl = atoi(cb->f[1]);
1338 }
1339
1340 /* Binds a conversation, as if the user wrote "bind *" into ctl. */
1341 static void autobind(struct conv *cv)
1342 {
1343         ERRSTACK(1);
1344         struct cmdbuf *cb;
1345
1346         cb = parsecmd("bind *", 7);
1347         if (waserror()) {
1348                 kfree(cb);
1349                 nexterror();
1350         }
1351         bindctlmsg(cv->p, cv, cb);
1352         poperror();
1353         kfree(cb);
1354 }
1355
1356 static long ipwrite(struct chan *ch, void *v, long n, int64_t off)
1357 {
1358         ERRSTACK(1);
1359         struct conv *c;
1360         struct Proto *x;
1361         char *p;
1362         struct cmdbuf *cb;
1363         uint8_t ia[IPaddrlen], ma[IPaddrlen];
1364         struct Fs *f;
1365         char *a;
1366
1367         a = v;
1368         f = ipfs[ch->dev];
1369
1370         switch (TYPE(ch->qid)) {
1371                 default:
1372                         error(EPERM, ERROR_FIXME);
1373                 case Qdata:
1374                         x = f->p[PROTO(ch->qid)];
1375                         c = x->conv[CONV(ch->qid)];
1376                         /* connection-less protocols (UDP) can write without manually
1377                          * binding. */
1378                         if (c->lport == 0)
1379                                 autobind(c);
1380                         if (ch->flag & O_NONBLOCK)
1381                                 qwrite_nonblock(c->wq, a, n);
1382                         else
1383                                 qwrite(c->wq, a, n);
1384                         break;
1385                 case Qarp:
1386                         return arpwrite(f, a, n);
1387                 case Qiproute:
1388                         return routewrite(f, ch, a, n);
1389                 case Qlog:
1390                         netlogctl(f, a, n);
1391                         return n;
1392                 case Qndb:
1393                         return ndbwrite(f, a, off, n);
1394                 case Qctl:
1395                         x = f->p[PROTO(ch->qid)];
1396                         c = x->conv[CONV(ch->qid)];
1397                         cb = parsecmd(a, n);
1398
1399                         qlock(&c->qlock);
1400                         if (waserror()) {
1401                                 qunlock(&c->qlock);
1402                                 kfree(cb);
1403                                 nexterror();
1404                         }
1405                         if (cb->nf < 1)
1406                                 error(EFAIL, "short control request");
1407                         if (strcmp(cb->f[0], "connect") == 0)
1408                                 connectctlmsg(x, c, cb);
1409                         else if (strcmp(cb->f[0], "announce") == 0)
1410                                 announcectlmsg(x, c, cb);
1411                         else if (strcmp(cb->f[0], "bind") == 0)
1412                                 bindctlmsg(x, c, cb);
1413                         else if (strcmp(cb->f[0], "bypass") == 0)
1414                                 bypassctlmsg(x, c, cb);
1415                         else if (strcmp(cb->f[0], "shutdown") == 0)
1416                                 shutdownctlmsg(c, cb);
1417                         else if (strcmp(cb->f[0], "ttl") == 0)
1418                                 ttlctlmsg(c, cb);
1419                         else if (strcmp(cb->f[0], "tos") == 0)
1420                                 tosctlmsg(c, cb);
1421                         else if (strcmp(cb->f[0], "ignoreadvice") == 0)
1422                                 c->ignoreadvice = 1;
1423                         else if (strcmp(cb->f[0], "addmulti") == 0) {
1424                                 if (cb->nf < 2)
1425                                         error(EFAIL, "addmulti needs interface address");
1426                                 if (cb->nf == 2) {
1427                                         if (!ipismulticast(c->raddr))
1428                                                 error(EFAIL, "addmulti for a non multicast address");
1429                                         parseip(ia, cb->f[1]);
1430                                         ipifcaddmulti(c, c->raddr, ia);
1431                                 } else {
1432                                         parseip(ma, cb->f[2]);
1433                                         if (!ipismulticast(ma))
1434                                                 error(EFAIL, "addmulti for a non multicast address");
1435                                         parseip(ia, cb->f[1]);
1436                                         ipifcaddmulti(c, ma, ia);
1437                                 }
1438                         } else if (strcmp(cb->f[0], "remmulti") == 0) {
1439                                 if (cb->nf < 2)
1440                                         error(EFAIL, "remmulti needs interface address");
1441                                 if (!ipismulticast(c->raddr))
1442                                         error(EFAIL, "remmulti for a non multicast address");
1443                                 parseip(ia, cb->f[1]);
1444                                 ipifcremmulti(c, c->raddr, ia);
1445                         } else if (x->ctl != NULL) {
1446                                 x->ctl(c, cb->f, cb->nf);
1447                         } else
1448                                 error(EFAIL, "unknown control request");
1449                         qunlock(&c->qlock);
1450                         kfree(cb);
1451                         poperror();
1452         }
1453         return n;
1454 }
1455
1456 static long ipbwrite(struct chan *ch, struct block *bp, uint32_t offset)
1457 {
1458         struct conv *c;
1459         int n;
1460
1461         switch (TYPE(ch->qid)) {
1462                 case Qdata:
1463                         c = chan2conv(ch);
1464                         if (bp->next)
1465                                 bp = concatblock(bp);
1466                         n = BLEN(bp);
1467                         if (ch->flag & O_NONBLOCK)
1468                                 qbwrite_nonblock(c->wq, bp);
1469                         else
1470                                 qbwrite(c->wq, bp);
1471                         return n;
1472                 default:
1473                         return devbwrite(ch, bp, offset);
1474         }
1475 }
1476
1477 static void ip_wake_cb(struct queue *q, void *data, int filter)
1478 {
1479         struct conv *conv = (struct conv*)data;
1480         struct fd_tap *tap_i;
1481         /* For these two, we want to ignore events on the opposite end of the
1482          * queues.  For instance, we want to know when the WQ is writable.  Our
1483          * writes will actually make it readable - we don't want to trigger a tap
1484          * for that.  However, qio doesn't know how/why we are using a queue, or
1485          * even who the ends are (hence the callbacks) */
1486         if ((filter & FDTAP_FILT_READABLE) && (q == conv->wq))
1487                 return;
1488         if ((filter & FDTAP_FILT_WRITABLE) && (q == conv->rq))
1489                 return;
1490         /* At this point, we have an event we want to send to our taps (if any).
1491          * The lock protects list integrity and the existence of the tap.
1492          *
1493          * Previously, I thought of using the conv qlock.  That actually breaks, due
1494          * to weird usages of the qlock (someone holds it for a long time, blocking
1495          * the inbound wakeup from etherread4).
1496          *
1497          * I opted for a spinlock for a couple reasons:
1498          * - fire_tap should not block.  ideally it'll be fast too (it's mostly a
1499          * send_event).
1500          * - our callers might not want to block.  A lot of network wakeups will
1501          * come network processes (etherread4) or otherwise unrelated to this
1502          * particular conversation.  I'd rather do something like fire off a KMSG
1503          * than block those.
1504          * - if fire_tap takes a while, holding the lock only slows down other
1505          * events on this *same* conversation, or other tap registration.  not a
1506          * huge deal. */
1507         spin_lock(&conv->tap_lock);
1508         SLIST_FOREACH(tap_i, &conv->data_taps, link)
1509                 fire_tap(tap_i, filter);
1510         spin_unlock(&conv->tap_lock);
1511 }
1512
1513 int iptapfd(struct chan *chan, struct fd_tap *tap, int cmd)
1514 {
1515         struct conv *conv = chan2conv(chan);
1516         int ret;
1517
1518         #define DEVIP_LEGAL_DATA_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE | \
1519                                        FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |   \
1520                                        FDTAP_FILT_ERROR)
1521         #define DEVIP_LEGAL_LISTEN_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_HANGUP)
1522
1523         switch (TYPE(chan->qid)) {
1524                 case Qdata:
1525                         if (tap->filter & ~DEVIP_LEGAL_DATA_TAPS) {
1526                                 set_errno(ENOSYS);
1527                                 set_errstr("Unsupported #%s data tap %p, must be %p", devname(),
1528                                            tap->filter, DEVIP_LEGAL_DATA_TAPS);
1529                                 return -1;
1530                         }
1531                         spin_lock(&conv->tap_lock);
1532                         switch (cmd) {
1533                                 case (FDTAP_CMD_ADD):
1534                                         if (SLIST_EMPTY(&conv->data_taps)) {
1535                                                 qio_set_wake_cb(conv->rq, ip_wake_cb, conv);
1536                                                 qio_set_wake_cb(conv->wq, ip_wake_cb, conv);
1537                                         }
1538                                         SLIST_INSERT_HEAD(&conv->data_taps, tap, link);
1539                                         ret = 0;
1540                                         break;
1541                                 case (FDTAP_CMD_REM):
1542                                         SLIST_REMOVE(&conv->data_taps, tap, fd_tap, link);
1543                                         if (SLIST_EMPTY(&conv->data_taps)) {
1544                                                 qio_set_wake_cb(conv->rq, 0, conv);
1545                                                 qio_set_wake_cb(conv->wq, 0, conv);
1546                                         }
1547                                         ret = 0;
1548                                         break;
1549                                 default:
1550                                         set_errno(ENOSYS);
1551                                         set_errstr("Unsupported #%s data tap command %p",
1552                                                    devname(), cmd);
1553                                         ret = -1;
1554                         }
1555                         spin_unlock(&conv->tap_lock);
1556                         return ret;
1557                 case Qlisten:
1558                         if (tap->filter & ~DEVIP_LEGAL_LISTEN_TAPS) {
1559                                 set_errno(ENOSYS);
1560                                 set_errstr("Unsupported #%s listen tap %p, must be %p",
1561                                            devname(), tap->filter, DEVIP_LEGAL_LISTEN_TAPS);
1562                                 return -1;
1563                         }
1564                         spin_lock(&conv->tap_lock);
1565                         switch (cmd) {
1566                                 case (FDTAP_CMD_ADD):
1567                                         SLIST_INSERT_HEAD(&conv->listen_taps, tap, link);
1568                                         ret = 0;
1569                                         break;
1570                                 case (FDTAP_CMD_REM):
1571                                         SLIST_REMOVE(&conv->listen_taps, tap, fd_tap, link);
1572                                         ret = 0;
1573                                         break;
1574                                 default:
1575                                         set_errno(ENOSYS);
1576                                         set_errstr("Unsupported #%s listen tap command %p",
1577                                                    devname(), cmd);
1578                                         ret = -1;
1579                         }
1580                         spin_unlock(&conv->tap_lock);
1581                         return ret;
1582                 default:
1583                         set_errno(ENOSYS);
1584                         set_errstr("Can't tap #%s file type %d", devname(),
1585                                    TYPE(chan->qid));
1586                         return -1;
1587         }
1588 }
1589
1590 struct dev ipdevtab __devtab = {
1591         .name = "ip",
1592
1593         .reset = ipreset,
1594         .init = ipinit,
1595         .shutdown = devshutdown,
1596         .attach = ipattach,
1597         .walk = ipwalk,
1598         .stat = ipstat,
1599         .open = ipopen,
1600         .create = devcreate,
1601         .close = ipclose,
1602         .read = ipread,
1603         .bread = ipbread,
1604         .write = ipwrite,
1605         .bwrite = ipbwrite,
1606         .remove = devremove,
1607         .wstat = ipwstat,
1608         .power = devpower,
1609         .chaninfo = ipchaninfo,
1610         .tapfd = iptapfd,
1611 };
1612
1613 int Fsproto(struct Fs *f, struct Proto *p)
1614 {
1615         if (f->np >= Maxproto)
1616                 return -1;
1617
1618         qlock_init(&p->qlock);
1619         p->f = f;
1620
1621         if (p->ipproto > 0) {
1622                 if (f->t2p[p->ipproto] != NULL)
1623                         return -1;
1624                 f->t2p[p->ipproto] = p;
1625         }
1626
1627         p->qid.type = QTDIR;
1628         p->qid.path = QID(f->np, 0, Qprotodir);
1629         p->conv = kzmalloc(sizeof(struct conv *) * (p->nc + 1), 0);
1630         if (p->conv == NULL)
1631                 panic("Fsproto");
1632
1633         p->x = f->np;
1634         p->nextport = 0;
1635         p->nextrport = 600;
1636         f->p[f->np++] = p;
1637
1638         return 0;
1639 }
1640
1641 /*
1642  *  return true if this protocol is
1643  *  built in
1644  */
1645 int Fsbuiltinproto(struct Fs *f, uint8_t proto)
1646 {
1647         return f->t2p[proto] != NULL;
1648 }
1649
1650 /*
1651  *  called with protocol locked
1652  */
1653 struct conv *Fsprotoclone(struct Proto *p, char *user)
1654 {
1655         struct conv *c, **pp, **ep;
1656
1657 retry:
1658         c = NULL;
1659         ep = &p->conv[p->nc];
1660         for (pp = p->conv; pp < ep; pp++) {
1661                 c = *pp;
1662                 if (c == NULL) {
1663                         c = kzmalloc(sizeof(struct conv), 0);
1664                         if (c == NULL)
1665                                 error(ENOMEM,
1666                                       "conv kzmalloc(%d, 0) failed in Fsprotoclone",
1667                                       sizeof(struct conv));
1668                         qlock_init(&c->qlock);
1669                         qlock_init(&c->listenq);
1670                         rendez_init(&c->cr);
1671                         rendez_init(&c->listenr);
1672                         SLIST_INIT(&c->data_taps);      /* already = 0; set to be futureproof */
1673                         SLIST_INIT(&c->listen_taps);
1674                         spinlock_init(&c->tap_lock);
1675                         qlock(&c->qlock);
1676                         c->p = p;
1677                         c->x = pp - p->conv;
1678                         if (p->ptclsize != 0) {
1679                                 c->ptcl = kzmalloc(p->ptclsize, 0);
1680                                 if (c->ptcl == NULL) {
1681                                         kfree(c);
1682                                         error(ENOMEM,
1683                                               "ptcl kzmalloc(%d, 0) failed in Fsprotoclone",
1684                                               p->ptclsize);
1685                                 }
1686                         }
1687                         *pp = c;
1688                         p->ac++;
1689                         c->eq = qopen(1024, Qmsg, 0, 0);
1690                         (*p->create) (c);
1691                         assert(c->rq && c->wq);
1692                         break;
1693                 }
1694                 if (canqlock(&c->qlock)) {
1695                         /*
1696                          *  make sure both processes and protocol
1697                          *  are done with this Conv
1698                          */
1699                         if (c->inuse == 0 && (p->inuse == NULL || (*p->inuse) (c) == 0))
1700                                 break;
1701
1702                         qunlock(&c->qlock);
1703                 }
1704         }
1705         if (pp >= ep) {
1706                 if (p->gc != NULL && (*p->gc) (p))
1707                         goto retry;
1708                 return NULL;
1709         }
1710
1711         c->inuse = 1;
1712         kstrdup(&c->owner, user);
1713         c->perm = 0660;
1714         c->state = Idle;
1715         ipmove(c->laddr, IPnoaddr);
1716         ipmove(c->raddr, IPnoaddr);
1717         c->r = NULL;
1718         c->rgen = 0;
1719         c->lport = 0;
1720         c->rport = 0;
1721         c->restricted = 0;
1722         c->ttl = MAXTTL;
1723         c->tos = DFLTTOS;
1724         qreopen(c->rq);
1725         qreopen(c->wq);
1726         qreopen(c->eq);
1727
1728         qunlock(&c->qlock);
1729         return c;
1730 }
1731
1732 int Fsconnected(struct conv *c, char *msg)
1733 {
1734         if (msg != NULL && *msg != '\0')
1735                 strlcpy(c->cerr, msg, sizeof(c->cerr));
1736
1737         switch (c->state) {
1738                 case Announcing:
1739                         c->state = Announced;
1740                         break;
1741
1742                 case Connecting:
1743                         c->state = Connected;
1744                         break;
1745         }
1746
1747         rendez_wakeup(&c->cr);
1748         return 0;
1749 }
1750
1751 struct Proto *Fsrcvpcol(struct Fs *f, uint8_t proto)
1752 {
1753         if (f->ipmux)
1754                 return f->ipmux;
1755         else
1756                 return f->t2p[proto];
1757 }
1758
1759 struct Proto *Fsrcvpcolx(struct Fs *f, uint8_t proto)
1760 {
1761         return f->t2p[proto];
1762 }
1763
1764 static void fire_listener_taps(struct conv *conv)
1765 {
1766         struct fd_tap *tap_i;
1767         if (SLIST_EMPTY(&conv->listen_taps))
1768                 return;
1769         spin_lock(&conv->tap_lock);
1770         SLIST_FOREACH(tap_i, &conv->listen_taps, link)
1771                 fire_tap(tap_i, FDTAP_FILT_READABLE);
1772         spin_unlock(&conv->tap_lock);
1773 }
1774
1775 /*
1776  *  called with protocol locked
1777  */
1778 struct conv *Fsnewcall(struct conv *c, uint8_t * raddr, uint16_t rport,
1779                                            uint8_t * laddr, uint16_t lport, uint8_t version)
1780 {
1781         struct conv *nc;
1782         struct conv **l;
1783         int i;
1784
1785         qlock(&c->qlock);
1786         i = 0;
1787         for (l = &c->incall; *l; l = &(*l)->next)
1788                 i++;
1789         if (i >= Maxincall) {
1790                 qunlock(&c->qlock);
1791                 return NULL;
1792         }
1793
1794         /* find a free conversation */
1795         nc = Fsprotoclone(c->p, network);
1796         if (nc == NULL) {
1797                 qunlock(&c->qlock);
1798                 return NULL;
1799         }
1800         ipmove(nc->raddr, raddr);
1801         nc->rport = rport;
1802         ipmove(nc->laddr, laddr);
1803         nc->lport = lport;
1804         nc->next = NULL;
1805         *l = nc;
1806         nc->state = Connected;
1807         nc->ipversion = version;
1808
1809         qunlock(&c->qlock);
1810
1811         rendez_wakeup(&c->listenr);
1812         fire_listener_taps(c);
1813
1814         return nc;
1815 }
1816
1817 static long ndbwrite(struct Fs *f, char *a, uint32_t off, int n)
1818 {
1819         if (off > strlen(f->ndb))
1820                 error(EIO, ERROR_FIXME);
1821         if (off + n >= sizeof(f->ndb) - 1)
1822                 error(EIO, ERROR_FIXME);
1823         memmove(f->ndb + off, a, n);
1824         f->ndb[off + n] = 0;
1825         f->ndbvers++;
1826         f->ndbmtime = seconds();
1827         return n;
1828 }
1829
1830 uint32_t scalednconv(void)
1831 {
1832         //if(conf.npage*BY2PG >= 128*MB)
1833         return Nchans * 4;
1834         //  return Nchans;
1835 }