410f94d58aa42f6bc9bc0b0fac1904b990ea2f5d
[akaros.git] / kern / src / net / devip.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 struct dev ipdevtab;
44
45 static char *devname(void)
46 {
47         return ipdevtab.name;
48 }
49
50 enum {
51         Qtopdir = 1,                            /* top level directory */
52         Qtopbase,
53         Qarp = Qtopbase,
54         Qndb,
55         Qiproute,
56         Qiprouter,
57         Qipselftab,
58         Qlog,
59
60         Qprotodir,      /* directory for a protocol */
61         Qprotobase,
62         Qclone = Qprotobase,
63         Qstats,
64
65         Qconvdir,       /* directory for a conversation */
66         Qconvbase,
67         Qctl = Qconvbase,
68         Qdata,
69         Qerr,
70         Qlisten,
71         Qlocal,
72         Qremote,
73         Qstatus,
74         Qsnoop,
75
76         Logtype = 5,
77         Masktype = (1 << Logtype) - 1,
78         Logconv = 12,
79         Maskconv = (1 << Logconv) - 1,
80         Shiftconv = Logtype,
81         Logproto = 8,
82         Maskproto = (1 << Logproto) - 1,
83         Shiftproto = Logtype + Logconv,
84
85         Nfs = 32,
86         BYPASS_QMAX = 64 * MiB,
87         IPROUTE_LEN = 2 * PGSIZE,
88 };
/* Accessors for the three fields packed into a qid path, and QID() to pack
 * a (protocol index, conversation index, file type) triple back into one. */
#define TYPE(x)		(((uint32_t)(x).path) & Masktype)
#define CONV(x)		((((uint32_t)(x).path) >> Shiftconv) & Maskconv)
#define PROTO(x)	((((uint32_t)(x).path) >> Shiftproto) & Maskproto)
#define QID(p, c, y)	(((p)<<(Shiftproto)) | ((c)<<Shiftconv) | (y))

/* Owner name reported for files that do not belong to a conversation */
static char network[] = "network";
94
95 qlock_t fslock;
96 struct Fs *ipfs[Nfs];                   /* attached fs's */
97 struct queue *qlog;
98
99 extern void nullmediumlink(void);
100 extern void pktmediumlink(void);
101 extern struct username eve;
102 static long ndbwrite(struct Fs *, char *unused_char_p_t, uint32_t, int);
103 static void closeconv(struct conv *);
104 static void setup_proto_qio_bypass(struct conv *cv);
105 static void undo_proto_qio_bypass(struct conv *cv);
106
107 static struct conv *chan2conv(struct chan *chan)
108 {
109         /* That's a lot of pointers to get to the conv! */
110         return ipfs[chan->dev]->p[PROTO(chan->qid)]->conv[CONV(chan->qid)];
111 }
112
113 static inline int founddevdir(struct chan *c, struct qid q, char *n,
114                                                           int64_t length, char *user, long perm,
115                                                           struct dir *db)
116 {
117         devdir(c, q, n, length, user, perm, db);
118         return 1;
119 }
120
121 static int topdirgen(struct chan *c, struct dir *dp)
122 {
123         struct qid q;
124         mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR);
125         snprintf(get_cur_genbuf(), GENBUF_SZ, "#%s%lu", devname(), c->dev);
126         return founddevdir(c, q, get_cur_genbuf(), 0, network, 0555, dp);
127 }
128
129
130 static int ip3gen(struct chan *c, int i, struct dir *dp)
131 {
132         struct qid q;
133         struct conv *cv;
134         char *p;
135         int perm;
136
137         cv = chan2conv(c);
138         if (cv->owner == NULL)
139                 kstrdup(&cv->owner, eve.name);
140         mkqid(&q, QID(PROTO(c->qid), CONV(c->qid), i), 0, QTFILE);
141
142         switch (i) {
143                 default:
144                         return -1;
145                 case Qctl:
146                         return founddevdir(c, q, "ctl", 0,
147                                                    cv->owner, cv->perm, dp);
148                 case Qdata:
149                         perm = cv->perm;
150                         perm |= qreadable(cv->rq) ? DMREADABLE : 0;
151                         perm |= qwritable(cv->wq) ? DMWRITABLE : 0;
152                         return founddevdir(c, q, "data", qlen(cv->rq),
153                                                            cv->owner, perm, dp);
154                 case Qerr:
155                         perm = cv->perm;
156                         perm |= qreadable(cv->eq) ? DMREADABLE : 0;
157                         return founddevdir(c, q, "err", qlen(cv->eq),
158                                                            cv->owner, perm, dp);
159                 case Qlisten:
160                         perm = cv->perm;
161                         perm |= cv->incall ? DMREADABLE : 0;
162                         return founddevdir(c, q, "listen", 0, cv->owner, perm, dp);
163                 case Qlocal:
164                         p = "local";
165                         break;
166                 case Qremote:
167                         p = "remote";
168                         break;
169                 case Qsnoop:
170                         if (strcmp(cv->p->name, "ipifc") != 0)
171                                 return -1;
172                         perm = 0400;
173                         perm |= qreadable(cv->sq) ? DMREADABLE : 0;
174                         return founddevdir(c, q, "snoop", qlen(cv->sq),
175                                                            cv->owner, perm, dp);
176                 case Qstatus:
177                         p = "status";
178                         break;
179         }
180         return founddevdir(c, q, p, 0, cv->owner, 0444, dp);
181 }
182
183 static int ip2gen(struct chan *c, int i, struct dir *dp)
184 {
185         struct qid q;
186         mkqid(&q, QID(PROTO(c->qid), 0, i), 0, QTFILE);
187         switch (i) {
188                 case Qclone:
189                         return founddevdir(c, q, "clone", 0, network, 0666, dp);
190                 case Qstats:
191                         return founddevdir(c, q, "stats", 0, network, 0444, dp);
192         }
193         return -1;
194 }
195
196 static int ip1gen(struct chan *c, int i, struct dir *dp)
197 {
198         struct qid q;
199         char *p;
200         int prot;
201         int len = 0;
202         struct Fs *f;
203         extern uint32_t kerndate;
204
205         f = ipfs[c->dev];
206
207         prot = 0666;
208         mkqid(&q, QID(0, 0, i), 0, QTFILE);
209         switch (i) {
210                 default:
211                         return -1;
212                 case Qarp:
213                         p = "arp";
214                         break;
215                 case Qndb:
216                         p = "ndb";
217                         len = strlen(f->ndb);
218                         q.vers = f->ndbvers;
219                         break;
220                 case Qiproute:
221                         p = "iproute";
222                         break;
223                 case Qipselftab:
224                         p = "ipselftab";
225                         prot = 0444;
226                         break;
227                 case Qiprouter:
228                         p = "iprouter";
229                         break;
230                 case Qlog:
231                         p = "log";
232                         break;
233         }
234         devdir(c, q, p, len, network, prot, dp);
235         if (i == Qndb && f->ndbmtime > kerndate)
236                 dp->mtime = f->ndbmtime;
237         return 1;
238 }
239
240 static int
241 ipgen(struct chan *c, char *unused_char_p_t, struct dirtab *d, int unused_int,
242           int s, struct dir *dp)
243 {
244         struct qid q;
245         struct conv *cv;
246         struct Fs *f;
247
248         f = ipfs[c->dev];
249
250         switch (TYPE(c->qid)) {
251                 case Qtopdir:
252                         if (s == DEVDOTDOT)
253                                 return topdirgen(c, dp);
254                         if (s < f->np) {
255                                 if (f->p[s]->connect == NULL)
256                                         return 0;       /* protocol with no user interface */
257                                 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
258                                 return founddevdir(c, q, f->p[s]->name, 0, network, 0555, dp);
259                         }
260                         s -= f->np;
261                         return ip1gen(c, s + Qtopbase, dp);
262                 case Qarp:
263                 case Qndb:
264                 case Qlog:
265                 case Qiproute:
266                 case Qiprouter:
267                 case Qipselftab:
268                         return ip1gen(c, TYPE(c->qid), dp);
269                 case Qprotodir:
270                         if (s == DEVDOTDOT)
271                                 return topdirgen(c, dp);
272                         else if (s < f->p[PROTO(c->qid)]->ac) {
273                                 cv = f->p[PROTO(c->qid)]->conv[s];
274                                 snprintf(get_cur_genbuf(), GENBUF_SZ, "%d", s);
275                                 mkqid(&q, QID(PROTO(c->qid), s, Qconvdir), 0, QTDIR);
276                                 return
277                                         founddevdir(c, q, get_cur_genbuf(), 0, cv->owner, 0555, dp);
278                         }
279                         s -= f->p[PROTO(c->qid)]->ac;
280                         return ip2gen(c, s + Qprotobase, dp);
281                 case Qclone:
282                 case Qstats:
283                         return ip2gen(c, TYPE(c->qid), dp);
284                 case Qconvdir:
285                         if (s == DEVDOTDOT) {
286                                 s = PROTO(c->qid);
287                                 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
288                                 devdir(c, q, f->p[s]->name, 0, network, 0555, dp);
289                                 return 1;
290                         }
291                         return ip3gen(c, s + Qconvbase, dp);
292                 case Qctl:
293                 case Qdata:
294                 case Qerr:
295                 case Qlisten:
296                 case Qlocal:
297                 case Qremote:
298                 case Qstatus:
299                 case Qsnoop:
300                         return ip3gen(c, TYPE(c->qid), dp);
301         }
302         return -1;
303 }
304
305 static void ipinit(void)
306 {
307         qlock_init(&fslock);
308         nullmediumlink();
309         pktmediumlink();
310 /* if only
311         fmtinstall('i', eipfmt);
312         fmtinstall('I', eipfmt);
313         fmtinstall('E', eipfmt);
314         fmtinstall('V', eipfmt);
315         fmtinstall('M', eipfmt);
316 */
317 }
318
/* Device reset hook; this device has nothing to reset. */
static void ipreset(void)
{
	/* intentionally empty */
}
322
323 static struct Fs *ipgetfs(int dev)
324 {
325         extern void (*ipprotoinit[]) (struct Fs *);
326         struct Fs *f;
327         int i;
328
329         if (dev >= Nfs)
330                 return NULL;
331
332         qlock(&fslock);
333         if (ipfs[dev] == NULL) {
334                 f = kzmalloc(sizeof(struct Fs), MEM_WAIT);
335                 rwinit(&f->rwlock);
336                 qlock_init(&f->iprouter.qlock);
337                 ip_init(f);
338                 arpinit(f);
339                 netloginit(f);
340                 for (i = 0; ipprotoinit[i]; i++)
341                         ipprotoinit[i] (f);
342                 f->dev = dev;
343                 ipfs[dev] = f;
344         }
345         qunlock(&fslock);
346
347         return ipfs[dev];
348 }
349
350 struct IPaux *newipaux(char *owner, char *tag)
351 {
352         struct IPaux *a;
353         int n;
354
355         a = kzmalloc(sizeof(*a), 0);
356         kstrdup(&a->owner, owner);
357         memset(a->tag, ' ', sizeof(a->tag));
358         n = strlen(tag);
359         if (n > sizeof(a->tag))
360                 n = sizeof(a->tag);
361         memmove(a->tag, tag, n);
362         return a;
363 }
364
/* Owner name recorded in a chan's IPaux (set up by newipaux()) */
#define ATTACHER(c)	(((struct IPaux*)((c)->aux))->owner)
366
367 static struct chan *ipattach(char *spec)
368 {
369         struct chan *c;
370         int dev;
371
372         dev = atoi(spec);
373         if (dev >= Nfs)
374                 error(EFAIL, "bad specification");
375
376         ipgetfs(dev);
377         c = devattach(devname(), spec);
378         mkqid(&c->qid, QID(0, 0, Qtopdir), 0, QTDIR);
379         c->dev = dev;
380
381         c->aux = newipaux(commonuser(), "none");
382
383         return c;
384 }
385
386 static struct walkqid *ipwalk(struct chan *c, struct chan *nc, char **name,
387                                                           int nname)
388 {
389         struct IPaux *a = c->aux;
390         struct walkqid *w;
391
392         w = devwalk(c, nc, name, nname, NULL, 0, ipgen);
393         if (w != NULL && w->clone != NULL)
394                 w->clone->aux = newipaux(a->owner, a->tag);
395         return w;
396 }
397
398 static int ipstat(struct chan *c, uint8_t * db, int n)
399 {
400         return devstat(c, db, n, NULL, 0, ipgen);
401 }
402
403 static int should_wake(void *arg)
404 {
405         struct conv *cv = arg;
406         /* signal that the conv is closed */
407         if (qisclosed(cv->rq))
408                 return TRUE;
409         return cv->incall != NULL;
410 }
411
412 static struct chan *ipopen(struct chan *c, int omode)
413 {
414         ERRSTACK(2);
415         struct conv *cv, *nc;
416         struct Proto *p;
417         int perm;
418         struct Fs *f;
419
420         /* perm is a lone rwx, not the rwx------ from the conversion */
421         perm = omode_to_rwx(omode) >> 6;
422
423         f = ipfs[c->dev];
424
425         switch (TYPE(c->qid)) {
426                 default:
427                         break;
428                 case Qndb:
429                         if (omode & (O_WRITE | O_TRUNC) && !iseve())
430                                 error(EPERM, ERROR_FIXME);
431                         if ((omode & (O_WRITE | O_TRUNC)) == (O_WRITE | O_TRUNC))
432                                 f->ndb[0] = 0;
433                         break;
434                 case Qlog:
435                         netlogopen(f);
436                         break;
437                 case Qiprouter:
438                         iprouteropen(f);
439                         break;
440                 case Qiproute:
441                         c->synth_buf = kpages_zalloc(IPROUTE_LEN, MEM_WAIT);
442                         routeread(f, c->synth_buf, 0, IPROUTE_LEN);
443                         break;
444                 case Qtopdir:
445                 case Qprotodir:
446                 case Qconvdir:
447                 case Qstatus:
448                 case Qremote:
449                 case Qlocal:
450                 case Qstats:
451                 case Qipselftab:
452                         if (omode & O_WRITE)
453                                 error(EPERM, ERROR_FIXME);
454                         break;
455                 case Qsnoop:
456                         if (omode & O_WRITE)
457                                 error(EPERM, ERROR_FIXME);
458                         /* might be racy.  note the lack of a proto lock, unlike Qdata */
459                         p = f->p[PROTO(c->qid)];
460                         cv = p->conv[CONV(c->qid)];
461                         if (strcmp(ATTACHER(c), cv->owner) != 0 && !iseve())
462                                 error(EPERM, ERROR_FIXME);
463                         atomic_inc(&cv->snoopers);
464                         break;
465                 case Qclone:
466                         p = f->p[PROTO(c->qid)];
467                         qlock(&p->qlock);
468                         if (waserror()) {
469                                 qunlock(&p->qlock);
470                                 nexterror();
471                         }
472                         cv = Fsprotoclone(p, ATTACHER(c));
473                         qunlock(&p->qlock);
474                         poperror();
475                         if (cv == NULL) {
476                                 error(ENODEV, "Null conversation from Fsprotoclone");
477                                 break;
478                         }
479                         mkqid(&c->qid, QID(p->x, cv->x, Qctl), 0, QTFILE);
480                         break;
481                 case Qdata:
482                 case Qctl:
483                 case Qerr:
484                         p = f->p[PROTO(c->qid)];
485                         qlock(&p->qlock);
486                         cv = p->conv[CONV(c->qid)];
487                         qlock(&cv->qlock);
488                         if (waserror()) {
489                                 qunlock(&cv->qlock);
490                                 qunlock(&p->qlock);
491                                 nexterror();
492                         }
493                         if ((perm & (cv->perm >> 6)) != perm) {
494                                 if (strcmp(ATTACHER(c), cv->owner) != 0)
495                                         error(EPERM, ERROR_FIXME);
496                                 if ((perm & cv->perm) != perm)
497                                         error(EPERM, ERROR_FIXME);
498
499                         }
500                         cv->inuse++;
501                         if (cv->inuse == 1) {
502                                 kstrdup(&cv->owner, ATTACHER(c));
503                                 cv->perm = 0660;
504                         }
505                         qunlock(&cv->qlock);
506                         qunlock(&p->qlock);
507                         poperror();
508                         break;
509                 case Qlisten:
510                         cv = f->p[PROTO(c->qid)]->conv[CONV(c->qid)];
511                         /* No permissions or Announce checks required.  We'll see if that's
512                          * a good idea or not. (the perm check would do nothing, as is,
513                          * since an O_PATH perm is 0).
514                          *
515                          * But we probably want to incref to keep the conversation around
516                          * until this FD/chan is closed.  #ip is a little weird in that
517                          * objects never really go away (high water mark for convs, you can
518                          * always find them in the ns).  I think it is possible to
519                          * namec/ipgen a chan, then have that conv close, then have that
520                          * chan be opened.  You can probably do this with a data file. */
521                         if (omode & O_PATH) {
522                                 qlock(&cv->qlock);
523                                 cv->inuse++;
524                                 qunlock(&cv->qlock);
525                                 break;
526                         }
527                         if ((perm & (cv->perm >> 6)) != perm) {
528                                 if (strcmp(ATTACHER(c), cv->owner) != 0)
529                                         error(EPERM, ERROR_FIXME);
530                                 if ((perm & cv->perm) != perm)
531                                         error(EPERM, ERROR_FIXME);
532
533                         }
534
535                         if (cv->state != Announced)
536                                 error(EFAIL, "not announced");
537
538                         if (waserror()) {
539                                 closeconv(cv);
540                                 nexterror();
541                         }
542                         qlock(&cv->qlock);
543                         cv->inuse++;
544                         qunlock(&cv->qlock);
545
546                         nc = NULL;
547                         while (nc == NULL) {
548                                 /* give up if we got a hangup */
549                                 if (qisclosed(cv->rq))
550                                         error(EFAIL, "listen hungup");
551
552                                 qlock(&cv->listenq);
553                                 if (waserror()) {
554                                         qunlock(&cv->listenq);
555                                         nexterror();
556                                 }
557                                 /* we can peek at incall without grabbing the cv qlock.  if
558                                  * anything is there, it'll remain there until we dequeue it.
559                                  * no one else can, since we hold the listenq lock */
560                                 if ((c->flag & O_NONBLOCK) && !cv->incall)
561                                         error(EAGAIN, "listen queue empty");
562                                 /* wait for a connect */
563                                 rendez_sleep(&cv->listenr, should_wake, cv);
564
565                                 /* if there is a concurrent hangup, they will hold the qlock
566                                  * until the hangup is complete, including closing the cv->rq */
567                                 qlock(&cv->qlock);
568                                 nc = cv->incall;
569                                 if (nc != NULL) {
570                                         cv->incall = nc->next;
571                                         mkqid(&c->qid, QID(PROTO(c->qid), nc->x, Qctl), 0, QTFILE);
572                                         kstrdup(&cv->owner, ATTACHER(c));
573                                 }
574                                 qunlock(&cv->qlock);
575
576                                 qunlock(&cv->listenq);
577                                 poperror();
578                         }
579                         closeconv(cv);
580                         poperror();
581                         break;
582         }
583         c->mode = openmode(omode);
584         c->flag |= COPEN;
585         c->offset = 0;
586         return c;
587 }
588
589 static int ipwstat(struct chan *c, uint8_t * dp, int n)
590 {
591         ERRSTACK(2);
592         struct dir *d;
593         struct conv *cv;
594         struct Fs *f;
595         struct Proto *p;
596
597         f = ipfs[c->dev];
598         switch (TYPE(c->qid)) {
599                 default:
600                         error(EPERM, ERROR_FIXME);
601                         break;
602                 case Qctl:
603                 case Qdata:
604                         break;
605         }
606
607         d = kzmalloc(sizeof(*d) + n, 0);
608         if (waserror()) {
609                 kfree(d);
610                 nexterror();
611         }
612         n = convM2D(dp, n, d, (char *)&d[1]);
613         if (n == 0)
614                 error(ENODATA, ERROR_FIXME);
615         p = f->p[PROTO(c->qid)];
616         cv = p->conv[CONV(c->qid)];
617         if (!iseve() && strcmp(ATTACHER(c), cv->owner) != 0)
618                 error(EPERM, ERROR_FIXME);
619         if (!emptystr(d->uid))
620                 kstrdup(&cv->owner, d->uid);
621         if (d->mode != -1)
622                 cv->perm = d->mode & 0777;
623         poperror();
624         kfree(d);
625         return n;
626 }
627
628 /* Should be able to handle any file type chan. Feel free to extend it. */
629 static char *ipchaninfo(struct chan *ch, char *ret, size_t ret_l)
630 {
631         struct conv *conv;
632         struct Proto *proto;
633         char *p;
634         struct Fs *f;
635
636         f = ipfs[ch->dev];
637
638         switch (TYPE(ch->qid)) {
639                 default:
640                         ret = "Unknown type";
641                         break;
642                 case Qdata:
643                         proto = f->p[PROTO(ch->qid)];
644                         conv = proto->conv[CONV(ch->qid)];
645                         snprintf(ret, ret_l,
646                                  "Qdata, %s, proto %s, conv idx %d, rq len %d, wq len %d, total read %llu",
647                                  SLIST_EMPTY(&conv->data_taps) ? "untapped" : "tapped",
648                                  proto->name, conv->x, qlen(conv->rq), qlen(conv->wq),
649                                          q_bytes_read(conv->rq));
650                         break;
651                 case Qarp:
652                         ret = "Qarp";
653                         break;
654                 case Qiproute:
655                         ret = "Qiproute";
656                         break;
657                 case Qlisten:
658                         proto = f->p[PROTO(ch->qid)];
659                         conv = proto->conv[CONV(ch->qid)];
660                         snprintf(ret, ret_l,
661                                  "Qlisten, %s proto %s, conv idx %d, has %sincalls",
662                                  SLIST_EMPTY(&conv->listen_taps) ? "untapped" : "tapped",
663                                  proto->name, conv->x, conv->incall ? "" : "no ");
664                         break;
665                 case Qlog:
666                         ret = "Qlog";
667                         break;
668                 case Qndb:
669                         ret = "Qndb";
670                         break;
671                 case Qctl:
672                         proto = f->p[PROTO(ch->qid)];
673                         conv = proto->conv[CONV(ch->qid)];
674                         snprintf(ret, ret_l, "Qctl, proto %s, conv idx %d", proto->name,
675                                          conv->x);
676                         break;
677         }
678         return ret;
679 }
680
681 static void closeconv(struct conv *cv)
682 {
683         ERRSTACK(1);
684         struct conv *nc;
685         struct Ipmulti *mp;
686
687         qlock(&cv->qlock);
688
689         if (--cv->inuse > 0) {
690                 qunlock(&cv->qlock);
691                 return;
692         }
693         if (waserror()) {
694                 qunlock(&cv->qlock);
695                 nexterror();
696         }
697         /* close all incoming calls since no listen will ever happen */
698         for (nc = cv->incall; nc; nc = cv->incall) {
699                 cv->incall = nc->next;
700                 closeconv(nc);
701         }
702         cv->incall = NULL;
703
704         kstrdup(&cv->owner, network);
705         cv->perm = 0660;
706
707         while ((mp = cv->multi) != NULL)
708                 ipifcremmulti(cv, mp->ma, mp->ia);
709
710         cv->r = NULL;
711         cv->rgen = 0;
712         if (cv->state == Bypass)
713                 undo_proto_qio_bypass(cv);
714         cv->p->close(cv);
715         cv->state = Idle;
716         qunlock(&cv->qlock);
717         poperror();
718 }
719
720 static void ipclose(struct chan *c)
721 {
722         struct Fs *f;
723
724         f = ipfs[c->dev];
725         switch (TYPE(c->qid)) {
726                 default:
727                         break;
728                 case Qlog:
729                         if (c->flag & COPEN)
730                                 netlogclose(f);
731                         break;
732                 case Qiprouter:
733                         if (c->flag & COPEN)
734                                 iprouterclose(f);
735                         break;
736                 case Qdata:
737                 case Qctl:
738                 case Qerr:
739                 case Qlisten:
740                         if (c->flag & COPEN)
741                                 closeconv(f->p[PROTO(c->qid)]->conv[CONV(c->qid)]);
742                         break;
743                 case Qsnoop:
744                         if (c->flag & COPEN)
745                                 atomic_dec(&f->p[PROTO(c->qid)]->conv[CONV(c->qid)]->snoopers);
746                         break;
747                 case Qiproute:
748                         if (c->flag & COPEN)
749                                 kpages_free(c->synth_buf, IPROUTE_LEN);
750                         break;
751         }
752         kfree(((struct IPaux *)c->aux)->owner);
753         kfree(c->aux);
754 }
755
/* Size of the scratch buffer ipread() allocates to format the remote, local
 * and status files */
enum {
	Statelen = 32 * 1024,
};
759
760 static long ipread(struct chan *ch, void *a, long n, int64_t off)
761 {
762         struct conv *c;
763         struct Proto *x;
764         char *buf, *p;
765         long rv;
766         struct Fs *f;
767         uint32_t offset = off;
768
769         f = ipfs[ch->dev];
770
771         p = a;
772         switch (TYPE(ch->qid)) {
773                 default:
774                         error(EPERM, ERROR_FIXME);
775                 case Qtopdir:
776                 case Qprotodir:
777                 case Qconvdir:
778                         return devdirread(ch, a, n, 0, 0, ipgen);
779                 case Qarp:
780                         return arpread(f->arp, a, offset, n);
781                 case Qndb:
782                         return readstr(offset, a, n, f->ndb);
783                 case Qiproute:
784                         return readmem(offset, a, n, ch->synth_buf, IPROUTE_LEN);
785                 case Qiprouter:
786                         return iprouterread(f, a, n);
787                 case Qipselftab:
788                         return ipselftabread(f, a, offset, n);
789                 case Qlog:
790                         return netlogread(f, a, offset, n);
791                 case Qctl:
792                         snprintf(get_cur_genbuf(), GENBUF_SZ, "%lu", CONV(ch->qid));
793                         return readstr(offset, p, n, get_cur_genbuf());
794                 case Qremote:
795                         buf = kzmalloc(Statelen, 0);
796                         x = f->p[PROTO(ch->qid)];
797                         c = x->conv[CONV(ch->qid)];
798                         if (x->remote == NULL) {
799                                 snprintf(buf, Statelen, "%I!%d\n", c->raddr, c->rport);
800                         } else {
801                                 (*x->remote) (c, buf, Statelen - 2);
802                         }
803                         rv = readstr(offset, p, n, buf);
804                         kfree(buf);
805                         return rv;
806                 case Qlocal:
807                         buf = kzmalloc(Statelen, 0);
808                         x = f->p[PROTO(ch->qid)];
809                         c = x->conv[CONV(ch->qid)];
810                         if (x->local == NULL) {
811                                 snprintf(buf, Statelen, "%I!%d\n", c->laddr, c->lport);
812                         } else {
813                                 (*x->local) (c, buf, Statelen - 2);
814                         }
815                         rv = readstr(offset, p, n, buf);
816                         kfree(buf);
817                         return rv;
818                 case Qstatus:
819                         /* this all is a bit screwed up since the size of some state's
820                          * buffers will change from one invocation to another.  a reader
821                          * will come in and read the entire buffer.  then it will come again
822                          * and read from the next offset, expecting EOF.  if the buffer
823                          * changed sizes, it'll reprint the end of the buffer slightly. */
824                         buf = kzmalloc(Statelen, 0);
825                         x = f->p[PROTO(ch->qid)];
826                         c = x->conv[CONV(ch->qid)];
827                         if (c->state == Bypass)
828                                 snprintf(buf, Statelen, "Bypassed\n");
829                         else
830                                 (*x->state)(c, buf, Statelen - 2);
831                         rv = readstr(offset, p, n, buf);
832                         kfree(buf);
833                         return rv;
834                 case Qdata:
835                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
836                         if (ch->flag & O_NONBLOCK)
837                                 return qread_nonblock(c->rq, a, n);
838                         else
839                                 return qread(c->rq, a, n);
840                 case Qerr:
841                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
842                         return qread(c->eq, a, n);
843                 case Qsnoop:
844                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
845                         return qread(c->sq, a, n);
846                 case Qstats:
847                         x = f->p[PROTO(ch->qid)];
848                         if (x->stats == NULL)
849                                 error(EFAIL, "stats not implemented");
850                         buf = kzmalloc(Statelen, 0);
851                         (*x->stats) (x, buf, Statelen);
852                         rv = readstr(offset, p, n, buf);
853                         kfree(buf);
854                         return rv;
855         }
856 }
857
858 static struct block *ipbread(struct chan *ch, long n, uint32_t offset)
859 {
860         struct conv *c;
861
862         switch (TYPE(ch->qid)) {
863                 case Qdata:
864                         c = chan2conv(ch);
865                         if (ch->flag & O_NONBLOCK)
866                                 return qbread_nonblock(c->rq, n);
867                         else
868                                 return qbread(c->rq, n);
869                 default:
870                         return devbread(ch, n, offset);
871         }
872 }
873
874 /*
875  *  set local address to be that of the ifc closest to remote address
876  */
877 static void setladdr(struct conv *c)
878 {
879         findlocalip(c->p->f, c->laddr, c->raddr);
880 }
881
882 /*
883  *  set a local port making sure the quad of raddr,rport,laddr,lport is unique
884  */
885 static void setluniqueport(struct conv *c, int lport)
886 {
887         struct Proto *p;
888         struct conv *xp;
889         int x;
890
891         p = c->p;
892
893         qlock(&p->qlock);
894         for (x = 0; x < p->nc; x++) {
895                 xp = p->conv[x];
896                 if (xp == NULL)
897                         break;
898                 if (xp == c)
899                         continue;
900                 if ((xp->state == Connected || xp->state == Announced
901                                             || xp->state == Bypass)
902                         && xp->lport == lport
903                         && xp->rport == c->rport
904                         && ipcmp(xp->raddr, c->raddr) == 0
905                         && ipcmp(xp->laddr, c->laddr) == 0) {
906                         qunlock(&p->qlock);
907                         error(EFAIL, "address in use");
908                 }
909         }
910         c->lport = lport;
911         qunlock(&p->qlock);
912 }
913
914 /*
915  *  pick a local port and set it
916  */
917 static void setlport(struct conv *c)
918 {
919         struct Proto *p;
920         uint16_t *pp;
921         int x, found;
922
923         p = c->p;
924         if (c->restricted)
925                 pp = &p->nextrport;
926         else
927                 pp = &p->nextport;
928         qlock(&p->qlock);
929         for (;; (*pp)++) {
930                 /*
931                  * Fsproto initialises p->nextport to 0 and the restricted
932                  * ports (p->nextrport) to 600.
933                  * Restricted ports must lie between 600 and 1024.
934                  * For the initial condition or if the unrestricted port number
935                  * has wrapped round, select a random port between 5000 and 1<<15
936                  * to start at.
937                  */
938                 if (c->restricted) {
939                         if (*pp >= 1024)
940                                 *pp = 600;
941                 } else
942                         while (*pp < 5000)
943                                 urandom_read(pp, sizeof(*pp));
944
945                 found = 0;
946                 for (x = 0; x < p->nc; x++) {
947                         if (p->conv[x] == NULL)
948                                 break;
949                         if (p->conv[x]->lport == *pp) {
950                                 found = 1;
951                                 break;
952                         }
953                 }
954                 if (!found)
955                         break;
956         }
957         c->lport = (*pp)++;
958         qunlock(&p->qlock);
959 }
960
961 /*
962  *  set a local address and port from a string of the form
963  *      [address!]port[!r]
964  */
965 static void setladdrport(struct conv *c, char *str, int announcing)
966 {
967         char *p;
968         uint16_t lport;
969         uint8_t addr[IPaddrlen];
970
971         /*
972          *  ignore restricted part if it exists.  it's
973          *  meaningless on local ports.
974          */
975         p = strchr(str, '!');
976         if (p != NULL) {
977                 *p++ = 0;
978                 if (strcmp(p, "r") == 0)
979                         p = NULL;
980         }
981
982         c->lport = 0;
983         if (p == NULL) {
984                 if (announcing)
985                         ipmove(c->laddr, IPnoaddr);
986                 else
987                         setladdr(c);
988                 p = str;
989         } else {
990                 if (strcmp(str, "*") == 0)
991                         ipmove(c->laddr, IPnoaddr);
992                 else {
993                         parseip(addr, str);
994                         if (ipforme(c->p->f, addr))
995                                 ipmove(c->laddr, addr);
996                         else
997                                 error(EFAIL, "not a local IP address");
998                 }
999         }
1000
1001         /* one process can get all connections */
1002         if (announcing && strcmp(p, "*") == 0) {
1003                 if (!iseve())
1004                         error(EPERM, ERROR_FIXME);
1005                 setluniqueport(c, 0);
1006         }
1007
1008         lport = atoi(p);
1009         if (lport <= 0)
1010                 setlport(c);
1011         else
1012                 setluniqueport(c, lport);
1013 }
1014
1015 static void setraddrport(struct conv *c, char *str)
1016 {
1017         char *p;
1018
1019         p = strchr(str, '!');
1020         if (p == NULL)
1021                 error(EFAIL, "malformed address");
1022         *p++ = 0;
1023         parseip(c->raddr, str);
1024         c->rport = atoi(p);
1025         p = strchr(p, '!');
1026         if (p) {
1027                 if (strstr(p, "!r") != NULL)
1028                         c->restricted = 1;
1029         }
1030 }
1031
1032 /*
1033  *  called by protocol connect routine to set addresses
1034  */
1035 void Fsstdconnect(struct conv *c, char *argv[], int argc)
1036 {
1037         switch (argc) {
1038                 default:
1039                         error(EINVAL, "bad args to %s", __func__);
1040                 case 2:
1041                         setraddrport(c, argv[1]);
1042                         setladdr(c);
1043                         setlport(c);
1044                         break;
1045                 case 3:
1046                         setraddrport(c, argv[1]);
1047                         setladdrport(c, argv[2], 0);
1048                         break;
1049         }
1050
1051         /* TODO: why is an IPnoaddr (in v6 format, equivalent to v6Unspecified),
1052          * a v4 format? */
1053         if ((memcmp(c->raddr, v4prefix, IPv4off) == 0 &&
1054                  memcmp(c->laddr, v4prefix, IPv4off) == 0)
1055                 || ipcmp(c->raddr, IPnoaddr) == 0)
1056                 c->ipversion = V4;
1057         else
1058                 c->ipversion = V6;
1059         /* Linux has taught people to use zeros for local interfaces.  TODO: We
1060          * might need this for v6 in the future. */
1061         if (!ipcmp(c->raddr, IPv4_zeroes))
1062                 ipmove(c->raddr, IPv4_loopback);
1063 }
1064
1065 /*
1066  *  initiate connection and sleep till its set up
1067  */
1068 static int connected(void *a)
1069 {
1070         return ((struct conv *)a)->state == Connected;
1071 }
1072
1073 static void connectctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1074 {
1075         ERRSTACK(1);
1076         char *p;
1077
1078         if (c->state != 0)
1079                 error(EBUSY, ERROR_FIXME);
1080         c->state = Connecting;
1081         c->cerr[0] = '\0';
1082         if (x->connect == NULL)
1083                 error(EFAIL, "connect not supported");
1084         x->connect(c, cb->f, cb->nf);
1085
1086         qunlock(&c->qlock);
1087         if (waserror()) {
1088                 qlock(&c->qlock);
1089                 nexterror();
1090         }
1091         rendez_sleep(&c->cr, connected, c);
1092         qlock(&c->qlock);
1093         poperror();
1094
1095         if (c->cerr[0] != '\0')
1096                 error(EFAIL, c->cerr);
1097 }
1098
1099 /*
1100  *  called by protocol announce routine to set addresses
1101  */
1102 void Fsstdannounce(struct conv *c, char *argv[], int argc)
1103 {
1104         memset(c->raddr, 0, sizeof(c->raddr));
1105         c->rport = 0;
1106         switch (argc) {
1107                 default:
1108                         error(EINVAL, "bad args to announce");
1109                 case 2:
1110                         setladdrport(c, argv[1], 1);
1111                         break;
1112         }
1113 }
1114
1115 /*
1116  *  initiate announcement and sleep till its set up
1117  */
1118 static int announced(void *a)
1119 {
1120         return ((struct conv *)a)->state == Announced;
1121 }
1122
1123 static void announcectlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1124 {
1125         ERRSTACK(1);
1126         char *p;
1127
1128         if (c->state != 0)
1129                 error(EBUSY, ERROR_FIXME);
1130         c->state = Announcing;
1131         c->cerr[0] = '\0';
1132         if (x->announce == NULL)
1133                 error(EFAIL, "announce not supported");
1134         x->announce(c, cb->f, cb->nf);
1135
1136         qunlock(&c->qlock);
1137         if (waserror()) {
1138                 qlock(&c->qlock);
1139                 nexterror();
1140         }
1141         rendez_sleep(&c->cr, announced, c);
1142         qlock(&c->qlock);
1143         poperror();
1144
1145         if (c->cerr[0] != '\0')
1146                 error(EFAIL, c->cerr);
1147 }
1148
1149 /*
1150  *  called by protocol bind routine to set addresses
1151  */
1152 void Fsstdbind(struct conv *c, char *argv[], int argc)
1153 {
1154         switch (argc) {
1155                 default:
1156                         error(EINVAL, "bad args to bind");
1157                 case 2:
1158                         setladdrport(c, argv[1], 0);
1159                         break;
1160         }
1161 }
1162
1163 static void bindctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1164 {
1165         if (x->bind == NULL)
1166                 Fsstdbind(c, cb->f, cb->nf);
1167         else
1168                 x->bind(c, cb->f, cb->nf);
1169 }
1170
1171 /* Helper, called by protocols to use the bypass.
1172  *
1173  * This is a bit nasty due to the overall nastiness of #ip.  We need to lock
1174  * before checking the state and hold the qlock throughout, because a concurrent
1175  * closeconv() could tear down the bypass.  Specifically, it could free the
1176  * bypass queues.  The root issue is that conversation lifetimes are not managed
1177  * well.
1178  *
1179  * If we fail, it's our responsibility to consume (free) the block(s). */
1180 void bypass_or_drop(struct conv *cv, struct block *bp)
1181 {
1182         qlock(&cv->qlock);
1183         if (cv->state == Bypass)
1184                 qpass(cv->rq, bp);
1185         else
1186                 freeblist(bp);
1187         qunlock(&cv->qlock);
1188 }
1189
1190 /* Push the block directly to the approprite ipoput function.
1191  *
1192  * It's the protocol's responsibility (and thus ours here) to make sure there is
1193  * at least the right amount of the IP header in the block (ipoput{4,6} assumes
1194  * it has the right amount, and the other protocols account for the IP header in
1195  * their own header).
1196  *
1197  * For the TTL and TOS, we just use the default ones.  If we want, we could look
1198  * into the actual block and see what the user wanted, though we're bypassing
1199  * the protocol layer, not the IP layer. */
1200 static void proto_bypass_kick(void *arg, struct block *bp)
1201 {
1202         struct conv *cv = (struct conv*)arg;
1203         uint8_t vers_nibble;
1204         struct Fs *f;
1205
1206         f = cv->p->f;
1207
1208         bp = pullupblock(bp, 1);
1209         if (!bp)
1210                 error(EINVAL, "Proto bypass unable to pullup a byte!");
1211         vers_nibble = *(uint8_t*)bp->rp & 0xf0;
1212         switch (vers_nibble) {
1213         case IP_VER4:
1214                 bp = pullupblock(bp, IPV4HDR_LEN);
1215                 if (!bp)
1216                         error(EINVAL, "Proto bypass unable to pullup v4 header");
1217                 ipoput4(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1218                 break;
1219         case IP_VER6:
1220                 bp = pullupblock(bp, IPV6HDR_LEN);
1221                 if (!bp)
1222                         error(EINVAL, "Proto bypass unable to pullup v6 header");
1223                 ipoput6(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1224                 break;
1225         default:
1226                 error(EINVAL, "Proto bypass block had unknown IP version 0x%x",
1227                       vers_nibble);
1228         }
1229 }
1230
1231 /* Sets up cv for the protocol bypass.  We use different queues for two reasons:
1232  * 1) To be protocol independent.  For instance, TCP and UDP could use very
1233  * different QIO styles.
1234  * 2) To set up our own kick/bypass method.  Note how udpcreate() and here uses
1235  * qbypass() (just blast it out), while TCP uses qopen() with a kick.  TCP still
1236  * follows queuing discipline.
1237  *
1238  * It's like we are our own protocol, the bypass protocol, when it comes to how
1239  * we interact with qio.  The conv still is of the real protocol type (e.g.
1240  * TCP).
1241  *
1242  * Note that we can't free the old queues.  The way #ip works, the queues are
1243  * created when the conv is created, but the conv is never freed.  It's like a
1244  * slab allocator that never frees objects, but just reinitializes them a
1245  * little.
1246  *
1247  * For the queues, we're basically like UDP:
1248  * - We take packets for rq and drop on overflow.
1249  * - rq is also Qmsg, but we also have Qcoalesce, to ignore out zero-len blocks
1250  * - We kick for our outbound (wq) messages.
1251  *
1252  * Note that Qmsg can drop parts of packets.  It's up to the user to read
1253  * enough.  If they didn't read enough, the extra is dropped.  This is similar
1254  * to SOCK_DGRAM and recvfrom().  Minus major changes, there's no nice way to
1255  * get individual messages with read().  Userspace using the bypass will need to
1256  * find out the MTU of the NIC the IP stack is attached to, and make sure to
1257  * read in at least that amount each time. */
1258 static void setup_proto_qio_bypass(struct conv *cv)
1259 {
1260         cv->rq_save = cv->rq;
1261         cv->wq_save = cv->wq;
1262         cv->rq = qopen(BYPASS_QMAX, Qmsg | Qcoalesce, 0, 0);
1263         cv->wq = qbypass(proto_bypass_kick, cv);
1264 }
1265
1266 static void undo_proto_qio_bypass(struct conv *cv)
1267 {
1268         qfree(cv->rq);
1269         qfree(cv->wq);
1270         cv->rq = cv->rq_save;
1271         cv->wq = cv->wq_save;
1272         cv->rq_save = NULL;
1273         cv->wq_save = NULL;
1274 }
1275
1276 void Fsstdbypass(struct conv *cv, char *argv[], int argc)
1277 {
1278         memset(cv->raddr, 0, sizeof(cv->raddr));
1279         cv->rport = 0;
1280         switch (argc) {
1281         case 2:
1282                 setladdrport(cv, argv[1], 1);
1283                 break;
1284         default:
1285                 error(EINVAL, "Bad args (was %d, need 2) to bypass", argc);
1286         }
1287 }
1288
1289 static void bypassctlmsg(struct Proto *x, struct conv *cv, struct cmdbuf *cb)
1290 {
1291         if (!x->bypass)
1292                 error(EFAIL, "Protocol %s does not support bypass", x->name);
1293         /* The protocol needs to set the port (usually by calling Fsstdbypass) and
1294          * then do whatever it needs to make sure it can find the conv again during
1295          * receive (usually by adding to a hash table). */
1296         x->bypass(cv, cb->f, cb->nf);
1297         setup_proto_qio_bypass(cv);
1298         cv->state = Bypass;
1299 }
1300
1301 static void shutdownctlmsg(struct conv *cv, struct cmdbuf *cb)
1302 {
1303         if (cb->nf < 2)
1304                 goto err;
1305         if (!strcmp(cb->f[1], "rd")) {
1306                 qhangup(cv->rq, "shutdown");
1307                 if (cv->p->shutdown)
1308                         cv->p->shutdown(cv, SHUT_RD);
1309         } else if (!strcmp(cb->f[1], "wr")) {
1310                 qhangup(cv->wq, "shutdown");
1311                 if (cv->p->shutdown)
1312                         cv->p->shutdown(cv, SHUT_WR);
1313         } else if (!strcmp(cb->f[1], "rdwr")) {
1314                 qhangup(cv->rq, "shutdown");
1315                 qhangup(cv->wq, "shutdown");
1316                 if (cv->p->shutdown)
1317                         cv->p->shutdown(cv, SHUT_RDWR);
1318         } else {
1319                 goto err;
1320         }
1321         return;
1322 err:
1323         error(EINVAL, "shutdown [rx|tx|rxtx]");
1324 }
1325
1326 static void tosctlmsg(struct conv *c, struct cmdbuf *cb)
1327 {
1328         if (cb->nf < 2)
1329                 c->tos = 0;
1330         else
1331                 c->tos = atoi(cb->f[1]);
1332 }
1333
1334 static void ttlctlmsg(struct conv *c, struct cmdbuf *cb)
1335 {
1336         if (cb->nf < 2)
1337                 c->ttl = MAXTTL;
1338         else
1339                 c->ttl = atoi(cb->f[1]);
1340 }
1341
1342 /* Binds a conversation, as if the user wrote "bind *" into ctl. */
1343 static void autobind(struct conv *cv)
1344 {
1345         ERRSTACK(1);
1346         struct cmdbuf *cb;
1347
1348         cb = parsecmd("bind *", 7);
1349         if (waserror()) {
1350                 kfree(cb);
1351                 nexterror();
1352         }
1353         bindctlmsg(cv->p, cv, cb);
1354         poperror();
1355         kfree(cb);
1356 }
1357
1358 static long ipwrite(struct chan *ch, void *v, long n, int64_t off)
1359 {
1360         ERRSTACK(1);
1361         struct conv *c;
1362         struct Proto *x;
1363         char *p;
1364         struct cmdbuf *cb;
1365         uint8_t ia[IPaddrlen], ma[IPaddrlen];
1366         struct Fs *f;
1367         char *a;
1368
1369         a = v;
1370         f = ipfs[ch->dev];
1371
1372         switch (TYPE(ch->qid)) {
1373                 default:
1374                         error(EPERM, ERROR_FIXME);
1375                 case Qdata:
1376                         x = f->p[PROTO(ch->qid)];
1377                         c = x->conv[CONV(ch->qid)];
1378                         /* connection-less protocols (UDP) can write without manually
1379                          * binding. */
1380                         if (c->lport == 0)
1381                                 autobind(c);
1382                         if (ch->flag & O_NONBLOCK)
1383                                 qwrite_nonblock(c->wq, a, n);
1384                         else
1385                                 qwrite(c->wq, a, n);
1386                         break;
1387                 case Qarp:
1388                         return arpwrite(f, a, n);
1389                 case Qiproute:
1390                         return routewrite(f, ch, a, n);
1391                 case Qlog:
1392                         netlogctl(f, a, n);
1393                         return n;
1394                 case Qndb:
1395                         return ndbwrite(f, a, off, n);
1396                 case Qctl:
1397                         x = f->p[PROTO(ch->qid)];
1398                         c = x->conv[CONV(ch->qid)];
1399                         cb = parsecmd(a, n);
1400
1401                         qlock(&c->qlock);
1402                         if (waserror()) {
1403                                 qunlock(&c->qlock);
1404                                 kfree(cb);
1405                                 nexterror();
1406                         }
1407                         if (cb->nf < 1)
1408                                 error(EFAIL, "short control request");
1409                         if (strcmp(cb->f[0], "connect") == 0)
1410                                 connectctlmsg(x, c, cb);
1411                         else if (strcmp(cb->f[0], "announce") == 0)
1412                                 announcectlmsg(x, c, cb);
1413                         else if (strcmp(cb->f[0], "bind") == 0)
1414                                 bindctlmsg(x, c, cb);
1415                         else if (strcmp(cb->f[0], "bypass") == 0)
1416                                 bypassctlmsg(x, c, cb);
1417                         else if (strcmp(cb->f[0], "shutdown") == 0)
1418                                 shutdownctlmsg(c, cb);
1419                         else if (strcmp(cb->f[0], "ttl") == 0)
1420                                 ttlctlmsg(c, cb);
1421                         else if (strcmp(cb->f[0], "tos") == 0)
1422                                 tosctlmsg(c, cb);
1423                         else if (strcmp(cb->f[0], "ignoreadvice") == 0)
1424                                 c->ignoreadvice = 1;
1425                         else if (strcmp(cb->f[0], "addmulti") == 0) {
1426                                 if (cb->nf < 2)
1427                                         error(EFAIL, "addmulti needs interface address");
1428                                 if (cb->nf == 2) {
1429                                         if (!ipismulticast(c->raddr))
1430                                                 error(EFAIL, "addmulti for a non multicast address");
1431                                         parseip(ia, cb->f[1]);
1432                                         ipifcaddmulti(c, c->raddr, ia);
1433                                 } else {
1434                                         parseip(ma, cb->f[2]);
1435                                         if (!ipismulticast(ma))
1436                                                 error(EFAIL, "addmulti for a non multicast address");
1437                                         parseip(ia, cb->f[1]);
1438                                         ipifcaddmulti(c, ma, ia);
1439                                 }
1440                         } else if (strcmp(cb->f[0], "remmulti") == 0) {
1441                                 if (cb->nf < 2)
1442                                         error(EFAIL, "remmulti needs interface address");
1443                                 if (!ipismulticast(c->raddr))
1444                                         error(EFAIL, "remmulti for a non multicast address");
1445                                 parseip(ia, cb->f[1]);
1446                                 ipifcremmulti(c, c->raddr, ia);
1447                         } else if (x->ctl != NULL) {
1448                                 x->ctl(c, cb->f, cb->nf);
1449                         } else
1450                                 error(EFAIL, "unknown control request");
1451                         qunlock(&c->qlock);
1452                         kfree(cb);
1453                         poperror();
1454         }
1455         return n;
1456 }
1457
1458 static long ipbwrite(struct chan *ch, struct block *bp, uint32_t offset)
1459 {
1460         struct conv *c;
1461         int n;
1462
1463         switch (TYPE(ch->qid)) {
1464                 case Qdata:
1465                         c = chan2conv(ch);
1466                         if (bp->next)
1467                                 bp = concatblock(bp);
1468                         n = BLEN(bp);
1469                         if (ch->flag & O_NONBLOCK)
1470                                 qbwrite_nonblock(c->wq, bp);
1471                         else
1472                                 qbwrite(c->wq, bp);
1473                         return n;
1474                 default:
1475                         return devbwrite(ch, bp, offset);
1476         }
1477 }
1478
1479 static void ip_wake_cb(struct queue *q, void *data, int filter)
1480 {
1481         struct conv *conv = (struct conv*)data;
1482         struct fd_tap *tap_i;
1483         /* For these two, we want to ignore events on the opposite end of the
1484          * queues.  For instance, we want to know when the WQ is writable.  Our
1485          * writes will actually make it readable - we don't want to trigger a tap
1486          * for that.  However, qio doesn't know how/why we are using a queue, or
1487          * even who the ends are (hence the callbacks) */
1488         if ((filter & FDTAP_FILT_READABLE) && (q == conv->wq))
1489                 return;
1490         if ((filter & FDTAP_FILT_WRITABLE) && (q == conv->rq))
1491                 return;
1492         /* At this point, we have an event we want to send to our taps (if any).
1493          * The lock protects list integrity and the existence of the tap.
1494          *
1495          * Previously, I thought of using the conv qlock.  That actually breaks, due
1496          * to weird usages of the qlock (someone holds it for a long time, blocking
1497          * the inbound wakeup from etherread4).
1498          *
1499          * I opted for a spinlock for a couple reasons:
1500          * - fire_tap should not block.  ideally it'll be fast too (it's mostly a
1501          * send_event).
1502          * - our callers might not want to block.  A lot of network wakeups will
1503          * come network processes (etherread4) or otherwise unrelated to this
1504          * particular conversation.  I'd rather do something like fire off a KMSG
1505          * than block those.
1506          * - if fire_tap takes a while, holding the lock only slows down other
1507          * events on this *same* conversation, or other tap registration.  not a
1508          * huge deal. */
1509         spin_lock(&conv->tap_lock);
1510         SLIST_FOREACH(tap_i, &conv->data_taps, link)
1511                 fire_tap(tap_i, filter);
1512         spin_unlock(&conv->tap_lock);
1513 }
1514
1515 int iptapfd(struct chan *chan, struct fd_tap *tap, int cmd)
1516 {
1517         struct conv *conv = chan2conv(chan);
1518         int ret;
1519
1520         #define DEVIP_LEGAL_DATA_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE | \
1521                                        FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |   \
1522                                        FDTAP_FILT_ERROR)
1523         #define DEVIP_LEGAL_LISTEN_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_HANGUP)
1524
1525         switch (TYPE(chan->qid)) {
1526                 case Qdata:
1527                         if (tap->filter & ~DEVIP_LEGAL_DATA_TAPS) {
1528                                 set_errno(ENOSYS);
1529                                 set_errstr("Unsupported #%s data tap %p, must be %p", devname(),
1530                                            tap->filter, DEVIP_LEGAL_DATA_TAPS);
1531                                 return -1;
1532                         }
1533                         spin_lock(&conv->tap_lock);
1534                         switch (cmd) {
1535                                 case (FDTAP_CMD_ADD):
1536                                         if (SLIST_EMPTY(&conv->data_taps)) {
1537                                                 qio_set_wake_cb(conv->rq, ip_wake_cb, conv);
1538                                                 qio_set_wake_cb(conv->wq, ip_wake_cb, conv);
1539                                         }
1540                                         SLIST_INSERT_HEAD(&conv->data_taps, tap, link);
1541                                         ret = 0;
1542                                         break;
1543                                 case (FDTAP_CMD_REM):
1544                                         SLIST_REMOVE(&conv->data_taps, tap, fd_tap, link);
1545                                         if (SLIST_EMPTY(&conv->data_taps)) {
1546                                                 qio_set_wake_cb(conv->rq, 0, conv);
1547                                                 qio_set_wake_cb(conv->wq, 0, conv);
1548                                         }
1549                                         ret = 0;
1550                                         break;
1551                                 default:
1552                                         set_errno(ENOSYS);
1553                                         set_errstr("Unsupported #%s data tap command %p",
1554                                                    devname(), cmd);
1555                                         ret = -1;
1556                         }
1557                         spin_unlock(&conv->tap_lock);
1558                         return ret;
1559                 case Qlisten:
1560                         if (tap->filter & ~DEVIP_LEGAL_LISTEN_TAPS) {
1561                                 set_errno(ENOSYS);
1562                                 set_errstr("Unsupported #%s listen tap %p, must be %p",
1563                                            devname(), tap->filter, DEVIP_LEGAL_LISTEN_TAPS);
1564                                 return -1;
1565                         }
1566                         spin_lock(&conv->tap_lock);
1567                         switch (cmd) {
1568                                 case (FDTAP_CMD_ADD):
1569                                         SLIST_INSERT_HEAD(&conv->listen_taps, tap, link);
1570                                         ret = 0;
1571                                         break;
1572                                 case (FDTAP_CMD_REM):
1573                                         SLIST_REMOVE(&conv->listen_taps, tap, fd_tap, link);
1574                                         ret = 0;
1575                                         break;
1576                                 default:
1577                                         set_errno(ENOSYS);
1578                                         set_errstr("Unsupported #%s listen tap command %p",
1579                                                    devname(), cmd);
1580                                         ret = -1;
1581                         }
1582                         spin_unlock(&conv->tap_lock);
1583                         return ret;
1584                 default:
1585                         set_errno(ENOSYS);
1586                         set_errstr("Can't tap #%s file type %d", devname(),
1587                                    TYPE(chan->qid));
1588                         return -1;
1589         }
1590 }
1591
1592 struct dev ipdevtab __devtab = {
1593         .name = "ip",
1594
1595         .reset = ipreset,
1596         .init = ipinit,
1597         .shutdown = devshutdown,
1598         .attach = ipattach,
1599         .walk = ipwalk,
1600         .stat = ipstat,
1601         .open = ipopen,
1602         .create = devcreate,
1603         .close = ipclose,
1604         .read = ipread,
1605         .bread = ipbread,
1606         .write = ipwrite,
1607         .bwrite = ipbwrite,
1608         .remove = devremove,
1609         .wstat = ipwstat,
1610         .power = devpower,
1611         .chaninfo = ipchaninfo,
1612         .tapfd = iptapfd,
1613 };
1614
1615 int Fsproto(struct Fs *f, struct Proto *p)
1616 {
1617         if (f->np >= Maxproto)
1618                 return -1;
1619
1620         qlock_init(&p->qlock);
1621         p->f = f;
1622
1623         if (p->ipproto > 0) {
1624                 if (f->t2p[p->ipproto] != NULL)
1625                         return -1;
1626                 f->t2p[p->ipproto] = p;
1627         }
1628
1629         p->qid.type = QTDIR;
1630         p->qid.path = QID(f->np, 0, Qprotodir);
1631         p->conv = kzmalloc(sizeof(struct conv *) * (p->nc + 1), 0);
1632         if (p->conv == NULL)
1633                 panic("Fsproto");
1634
1635         p->x = f->np;
1636         p->nextport = 0;
1637         p->nextrport = 600;
1638         f->p[f->np++] = p;
1639
1640         return 0;
1641 }
1642
1643 /*
1644  *  return true if this protocol is
1645  *  built in
1646  */
1647 int Fsbuiltinproto(struct Fs *f, uint8_t proto)
1648 {
1649         return f->t2p[proto] != NULL;
1650 }
1651
1652 /*
1653  *  called with protocol locked
1654  */
1655 struct conv *Fsprotoclone(struct Proto *p, char *user)
1656 {
1657         struct conv *c, **pp, **ep;
1658
1659 retry:
1660         c = NULL;
1661         ep = &p->conv[p->nc];
1662         for (pp = p->conv; pp < ep; pp++) {
1663                 c = *pp;
1664                 if (c == NULL) {
1665                         c = kzmalloc(sizeof(struct conv), 0);
1666                         if (c == NULL)
1667                                 error(ENOMEM,
1668                                       "conv kzmalloc(%d, 0) failed in Fsprotoclone",
1669                                       sizeof(struct conv));
1670                         qlock_init(&c->qlock);
1671                         qlock_init(&c->listenq);
1672                         rendez_init(&c->cr);
1673                         rendez_init(&c->listenr);
1674                         SLIST_INIT(&c->data_taps);      /* already = 0; set to be futureproof */
1675                         SLIST_INIT(&c->listen_taps);
1676                         spinlock_init(&c->tap_lock);
1677                         qlock(&c->qlock);
1678                         c->p = p;
1679                         c->x = pp - p->conv;
1680                         if (p->ptclsize != 0) {
1681                                 c->ptcl = kzmalloc(p->ptclsize, 0);
1682                                 if (c->ptcl == NULL) {
1683                                         kfree(c);
1684                                         error(ENOMEM,
1685                                               "ptcl kzmalloc(%d, 0) failed in Fsprotoclone",
1686                                               p->ptclsize);
1687                                 }
1688                         }
1689                         *pp = c;
1690                         p->ac++;
1691                         c->eq = qopen(1024, Qmsg, 0, 0);
1692                         (*p->create) (c);
1693                         assert(c->rq && c->wq);
1694                         break;
1695                 }
1696                 if (canqlock(&c->qlock)) {
1697                         /*
1698                          *  make sure both processes and protocol
1699                          *  are done with this Conv
1700                          */
1701                         if (c->inuse == 0 && (p->inuse == NULL || (*p->inuse) (c) == 0))
1702                                 break;
1703
1704                         qunlock(&c->qlock);
1705                 }
1706         }
1707         if (pp >= ep) {
1708                 if (p->gc != NULL && (*p->gc) (p))
1709                         goto retry;
1710                 return NULL;
1711         }
1712
1713         c->inuse = 1;
1714         kstrdup(&c->owner, user);
1715         c->perm = 0660;
1716         c->state = Idle;
1717         ipmove(c->laddr, IPnoaddr);
1718         ipmove(c->raddr, IPnoaddr);
1719         c->r = NULL;
1720         c->rgen = 0;
1721         c->lport = 0;
1722         c->rport = 0;
1723         c->restricted = 0;
1724         c->ttl = MAXTTL;
1725         c->tos = DFLTTOS;
1726         qreopen(c->rq);
1727         qreopen(c->wq);
1728         qreopen(c->eq);
1729
1730         qunlock(&c->qlock);
1731         return c;
1732 }
1733
1734 int Fsconnected(struct conv *c, char *msg)
1735 {
1736         if (msg != NULL && *msg != '\0')
1737                 strlcpy(c->cerr, msg, sizeof(c->cerr));
1738
1739         switch (c->state) {
1740                 case Announcing:
1741                         c->state = Announced;
1742                         break;
1743
1744                 case Connecting:
1745                         c->state = Connected;
1746                         break;
1747         }
1748
1749         rendez_wakeup(&c->cr);
1750         return 0;
1751 }
1752
1753 struct Proto *Fsrcvpcol(struct Fs *f, uint8_t proto)
1754 {
1755         if (f->ipmux)
1756                 return f->ipmux;
1757         else
1758                 return f->t2p[proto];
1759 }
1760
1761 struct Proto *Fsrcvpcolx(struct Fs *f, uint8_t proto)
1762 {
1763         return f->t2p[proto];
1764 }
1765
1766 static void fire_listener_taps(struct conv *conv)
1767 {
1768         struct fd_tap *tap_i;
1769         if (SLIST_EMPTY(&conv->listen_taps))
1770                 return;
1771         spin_lock(&conv->tap_lock);
1772         SLIST_FOREACH(tap_i, &conv->listen_taps, link)
1773                 fire_tap(tap_i, FDTAP_FILT_READABLE);
1774         spin_unlock(&conv->tap_lock);
1775 }
1776
1777 /*
1778  *  called with protocol locked
1779  */
1780 struct conv *Fsnewcall(struct conv *c, uint8_t * raddr, uint16_t rport,
1781                                            uint8_t * laddr, uint16_t lport, uint8_t version)
1782 {
1783         struct conv *nc;
1784         struct conv **l;
1785         int i;
1786
1787         qlock(&c->qlock);
1788         i = 0;
1789         for (l = &c->incall; *l; l = &(*l)->next)
1790                 i++;
1791         if (i >= Maxincall) {
1792                 qunlock(&c->qlock);
1793                 return NULL;
1794         }
1795
1796         /* find a free conversation */
1797         nc = Fsprotoclone(c->p, network);
1798         if (nc == NULL) {
1799                 qunlock(&c->qlock);
1800                 return NULL;
1801         }
1802         ipmove(nc->raddr, raddr);
1803         nc->rport = rport;
1804         ipmove(nc->laddr, laddr);
1805         nc->lport = lport;
1806         nc->next = NULL;
1807         *l = nc;
1808         nc->state = Connected;
1809         nc->ipversion = version;
1810
1811         qunlock(&c->qlock);
1812
1813         rendez_wakeup(&c->listenr);
1814         fire_listener_taps(c);
1815
1816         return nc;
1817 }
1818
1819 static long ndbwrite(struct Fs *f, char *a, uint32_t off, int n)
1820 {
1821         if (off > strlen(f->ndb))
1822                 error(EIO, ERROR_FIXME);
1823         if (off + n >= sizeof(f->ndb) - 1)
1824                 error(EIO, ERROR_FIXME);
1825         memmove(f->ndb + off, a, n);
1826         f->ndb[off + n] = 0;
1827         f->ndbvers++;
1828         f->ndbmtime = seconds();
1829         return n;
1830 }
1831
1832 uint32_t scalednconv(void)
1833 {
1834         //if(conf.npage*BY2PG >= 128*MB)
1835         return Nchans * 4;
1836         //  return Nchans;
1837 }