net: Allow connectionless convs to auto bind
[akaros.git] / kern / src / net / devip.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <ip.h>
42
43 struct dev ipdevtab;
44
45 static char *devname(void)
46 {
47         return ipdevtab.name;
48 }
49
/* File types served by this device, plus the layout of the qid path.
 *
 * The Q* values identify which file a chan refers to.  They come in three
 * groups, each with a directory entry followed by a "base" alias marking the
 * first file generated inside that directory: top level (Qtopdir/Qtopbase),
 * per protocol (Qprotodir/Qprotobase) and per conversation
 * (Qconvdir/Qconvbase).  The generators index into a group by adding the
 * directory offset to the base value.
 *
 * A qid path packs three fields, from the low bits upward: the file type
 * (Logtype bits), the conversation index (Logconv bits) and the protocol index
 * (Logproto bits). */
enum {
        Qtopdir = 1,                            /* top level directory */
        Qtopbase,
        Qarp = Qtopbase,
        Qndb,
        Qiproute,
        Qiprouter,
        Qipselftab,
        Qlog,

        Qprotodir,      /* directory for a protocol */
        Qprotobase,
        Qclone = Qprotobase,
        Qstats,

        Qconvdir,       /* directory for a conversation */
        Qconvbase,
        Qctl = Qconvbase,
        Qdata,
        Qerr,
        Qlisten,
        Qlocal,
        Qremote,
        Qstatus,
        Qsnoop,

        /* Widths, masks and shifts of the fields packed into a qid path */
        Logtype = 5,
        Masktype = (1 << Logtype) - 1,
        Logconv = 12,
        Maskconv = (1 << Logconv) - 1,
        Shiftconv = Logtype,
        Logproto = 8,
        Maskproto = (1 << Logproto) - 1,
        Shiftproto = Logtype + Logconv,

        Nfs = 32,                       /* number of slots in ipfs[] */
        BYPASS_QMAX = 64 * MiB,         /* NOTE(review): presumably a queue limit
                                         * for bypass mode; not used in the part
                                         * of the file visible here */
        IPROUTE_LEN = 2 * PGSIZE,       /* size of the Qiproute synth_buf */
};
/* Field extractors: take a qid and return its type / conv index / proto index */
#define TYPE(x)         ( ((uint32_t)(x).path) & Masktype )
#define CONV(x)         ( (((uint32_t)(x).path) >> Shiftconv) & Maskconv )
#define PROTO(x)        ( (((uint32_t)(x).path) >> Shiftproto) & Maskproto )
/* Builds a qid path from a proto index p, a conv index c and a file type y */
#define QID(p, c, y)    ( ((p)<<(Shiftproto)) | ((c)<<Shiftconv) | (y))
/* Owner name reported for files that do not belong to a conversation owner */
static char network[] = "network";
94
/* Held by ipgetfs() while it checks for, and lazily creates, an ipfs[] entry */
qlock_t fslock;
struct Fs *ipfs[Nfs];                   /* attached fs's */
/* NOTE(review): not referenced in the part of the file visible here */
struct queue *qlog;

/* Medium setup hooks, called once from ipinit() */
extern void nullmediumlink(void);
extern void pktmediumlink(void);
/* eve.name is the default owner given to a conversation that has none */
extern struct username eve;
/* Forward declarations for helpers defined later in this file */
static long ndbwrite(struct Fs *, char *unused_char_p_t, uint32_t, int);
static void closeconv(struct conv *);
static void setup_proto_qio_bypass(struct conv *cv);
static void undo_proto_qio_bypass(struct conv *cv);
106
107 static struct conv *chan2conv(struct chan *chan)
108 {
109         /* That's a lot of pointers to get to the conv! */
110         return ipfs[chan->dev]->p[PROTO(chan->qid)]->conv[CONV(chan->qid)];
111 }
112
113 static inline int founddevdir(struct chan *c, struct qid q, char *n,
114                                                           int64_t length, char *user, long perm,
115                                                           struct dir *db)
116 {
117         devdir(c, q, n, length, user, perm, db);
118         return 1;
119 }
120
121 static int topdirgen(struct chan *c, struct dir *dp)
122 {
123         struct qid q;
124         mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR);
125         snprintf(get_cur_genbuf(), GENBUF_SZ, "#%s%lu", devname(), c->dev);
126         return founddevdir(c, q, get_cur_genbuf(), 0, network, 0555, dp);
127 }
128
129
130 static int ip3gen(struct chan *c, int i, struct dir *dp)
131 {
132         struct qid q;
133         struct conv *cv;
134         char *p;
135         int perm;
136
137         cv = chan2conv(c);
138         if (cv->owner == NULL)
139                 kstrdup(&cv->owner, eve.name);
140         mkqid(&q, QID(PROTO(c->qid), CONV(c->qid), i), 0, QTFILE);
141
142         switch (i) {
143                 default:
144                         return -1;
145                 case Qctl:
146                         return founddevdir(c, q, "ctl", 0,
147                                                    cv->owner, cv->perm, dp);
148                 case Qdata:
149                         perm = cv->perm;
150                         perm |= qreadable(cv->rq) ? DMREADABLE : 0;
151                         perm |= qwritable(cv->wq) ? DMWRITABLE : 0;
152                         return founddevdir(c, q, "data", qlen(cv->rq),
153                                                            cv->owner, perm, dp);
154                 case Qerr:
155                         perm = cv->perm;
156                         perm |= qreadable(cv->eq) ? DMREADABLE : 0;
157                         return founddevdir(c, q, "err", qlen(cv->eq),
158                                                            cv->owner, perm, dp);
159                 case Qlisten:
160                         return founddevdir(c, q, "listen", 0, cv->owner, cv->perm, dp);
161                 case Qlocal:
162                         p = "local";
163                         break;
164                 case Qremote:
165                         p = "remote";
166                         break;
167                 case Qsnoop:
168                         if (strcmp(cv->p->name, "ipifc") != 0)
169                                 return -1;
170                         perm = 0400;
171                         perm |= qreadable(cv->sq) ? DMREADABLE : 0;
172                         return founddevdir(c, q, "snoop", qlen(cv->sq),
173                                                            cv->owner, perm, dp);
174                 case Qstatus:
175                         p = "status";
176                         break;
177         }
178         return founddevdir(c, q, p, 0, cv->owner, 0444, dp);
179 }
180
181 static int ip2gen(struct chan *c, int i, struct dir *dp)
182 {
183         struct qid q;
184         mkqid(&q, QID(PROTO(c->qid), 0, i), 0, QTFILE);
185         switch (i) {
186                 case Qclone:
187                         return founddevdir(c, q, "clone", 0, network, 0666, dp);
188                 case Qstats:
189                         return founddevdir(c, q, "stats", 0, network, 0444, dp);
190         }
191         return -1;
192 }
193
194 static int ip1gen(struct chan *c, int i, struct dir *dp)
195 {
196         struct qid q;
197         char *p;
198         int prot;
199         int len = 0;
200         struct Fs *f;
201         extern uint32_t kerndate;
202
203         f = ipfs[c->dev];
204
205         prot = 0666;
206         mkqid(&q, QID(0, 0, i), 0, QTFILE);
207         switch (i) {
208                 default:
209                         return -1;
210                 case Qarp:
211                         p = "arp";
212                         break;
213                 case Qndb:
214                         p = "ndb";
215                         len = strlen(f->ndb);
216                         q.vers = f->ndbvers;
217                         break;
218                 case Qiproute:
219                         p = "iproute";
220                         break;
221                 case Qipselftab:
222                         p = "ipselftab";
223                         prot = 0444;
224                         break;
225                 case Qiprouter:
226                         p = "iprouter";
227                         break;
228                 case Qlog:
229                         p = "log";
230                         break;
231         }
232         devdir(c, q, p, len, network, prot, dp);
233         if (i == Qndb && f->ndbmtime > kerndate)
234                 dp->mtime = f->ndbmtime;
235         return 1;
236 }
237
238 static int
239 ipgen(struct chan *c, char *unused_char_p_t, struct dirtab *d, int unused_int,
240           int s, struct dir *dp)
241 {
242         struct qid q;
243         struct conv *cv;
244         struct Fs *f;
245
246         f = ipfs[c->dev];
247
248         switch (TYPE(c->qid)) {
249                 case Qtopdir:
250                         if (s == DEVDOTDOT)
251                                 return topdirgen(c, dp);
252                         if (s < f->np) {
253                                 if (f->p[s]->connect == NULL)
254                                         return 0;       /* protocol with no user interface */
255                                 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
256                                 return founddevdir(c, q, f->p[s]->name, 0, network, 0555, dp);
257                         }
258                         s -= f->np;
259                         return ip1gen(c, s + Qtopbase, dp);
260                 case Qarp:
261                 case Qndb:
262                 case Qlog:
263                 case Qiproute:
264                 case Qiprouter:
265                 case Qipselftab:
266                         return ip1gen(c, TYPE(c->qid), dp);
267                 case Qprotodir:
268                         if (s == DEVDOTDOT)
269                                 return topdirgen(c, dp);
270                         else if (s < f->p[PROTO(c->qid)]->ac) {
271                                 cv = f->p[PROTO(c->qid)]->conv[s];
272                                 snprintf(get_cur_genbuf(), GENBUF_SZ, "%d", s);
273                                 mkqid(&q, QID(PROTO(c->qid), s, Qconvdir), 0, QTDIR);
274                                 return
275                                         founddevdir(c, q, get_cur_genbuf(), 0, cv->owner, 0555, dp);
276                         }
277                         s -= f->p[PROTO(c->qid)]->ac;
278                         return ip2gen(c, s + Qprotobase, dp);
279                 case Qclone:
280                 case Qstats:
281                         return ip2gen(c, TYPE(c->qid), dp);
282                 case Qconvdir:
283                         if (s == DEVDOTDOT) {
284                                 s = PROTO(c->qid);
285                                 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
286                                 devdir(c, q, f->p[s]->name, 0, network, 0555, dp);
287                                 return 1;
288                         }
289                         return ip3gen(c, s + Qconvbase, dp);
290                 case Qctl:
291                 case Qdata:
292                 case Qerr:
293                 case Qlisten:
294                 case Qlocal:
295                 case Qremote:
296                 case Qstatus:
297                 case Qsnoop:
298                         return ip3gen(c, TYPE(c->qid), dp);
299         }
300         return -1;
301 }
302
/* One-time device init: sets up fslock (taken by ipgetfs) and calls the
 * null and pkt medium link hooks. */
static void ipinit(void)
{
        qlock_init(&fslock);
        nullmediumlink();
        pktmediumlink();
/* if only
        fmtinstall('i', eipfmt);
        fmtinstall('I', eipfmt);
        fmtinstall('E', eipfmt);
        fmtinstall('V', eipfmt);
        fmtinstall('M', eipfmt);
*/
}
316
/* Device reset hook; intentionally does nothing. */
static void ipreset(void)
{
}
320
321 static struct Fs *ipgetfs(int dev)
322 {
323         extern void (*ipprotoinit[]) (struct Fs *);
324         struct Fs *f;
325         int i;
326
327         if (dev >= Nfs)
328                 return NULL;
329
330         qlock(&fslock);
331         if (ipfs[dev] == NULL) {
332                 f = kzmalloc(sizeof(struct Fs), MEM_WAIT);
333                 rwinit(&f->rwlock);
334                 qlock_init(&f->iprouter.qlock);
335                 ip_init(f);
336                 arpinit(f);
337                 netloginit(f);
338                 for (i = 0; ipprotoinit[i]; i++)
339                         ipprotoinit[i] (f);
340                 f->dev = dev;
341                 ipfs[dev] = f;
342         }
343         qunlock(&fslock);
344
345         return ipfs[dev];
346 }
347
348 struct IPaux *newipaux(char *owner, char *tag)
349 {
350         struct IPaux *a;
351         int n;
352
353         a = kzmalloc(sizeof(*a), 0);
354         kstrdup(&a->owner, owner);
355         memset(a->tag, ' ', sizeof(a->tag));
356         n = strlen(tag);
357         if (n > sizeof(a->tag))
358                 n = sizeof(a->tag);
359         memmove(a->tag, tag, n);
360         return a;
361 }
362
363 #define ATTACHER(c) (((struct IPaux*)((c)->aux))->owner)
364
365 static struct chan *ipattach(char *spec)
366 {
367         struct chan *c;
368         int dev;
369
370         dev = atoi(spec);
371         if (dev >= Nfs)
372                 error(EFAIL, "bad specification");
373
374         ipgetfs(dev);
375         c = devattach(devname(), spec);
376         mkqid(&c->qid, QID(0, 0, Qtopdir), 0, QTDIR);
377         c->dev = dev;
378
379         c->aux = newipaux(commonuser(), "none");
380
381         return c;
382 }
383
384 static struct walkqid *ipwalk(struct chan *c, struct chan *nc, char **name,
385                                                           int nname)
386 {
387         struct IPaux *a = c->aux;
388         struct walkqid *w;
389
390         w = devwalk(c, nc, name, nname, NULL, 0, ipgen);
391         if (w != NULL && w->clone != NULL)
392                 w->clone->aux = newipaux(a->owner, a->tag);
393         return w;
394 }
395
396 static int ipstat(struct chan *c, uint8_t * db, int n)
397 {
398         return devstat(c, db, n, NULL, 0, ipgen);
399 }
400
401 static int should_wake(void *arg)
402 {
403         struct conv *cv = arg;
404         /* signal that the conv is closed */
405         if (qisclosed(cv->rq))
406                 return TRUE;
407         return cv->incall != NULL;
408 }
409
/* Open hook for every file served by this device.
 *
 * Runs the checks and side effects specific to the type of file c refers to,
 * then marks the chan open (mode from omode, COPEN set, offset 0) and returns
 * it.  Failures are raised with error().
 *
 *  - ndb: writing or truncating requires iseve(); opening with both O_WRITE
 *    and O_TRUNC empties the ndb string.
 *  - log, iprouter: call the matching open helper.
 *  - iproute: takes a snapshot of the routes into c->synth_buf.
 *  - directories, status, remote, local, stats, ipselftab: read only.
 *  - snoop: read only, restricted to the conversation owner or eve; counts
 *    the opener in cv->snoopers.
 *  - clone: allocates a conversation and turns c into that conversation's ctl.
 *  - data, ctl, err: permission check, then takes a reference on the
 *    conversation; the first opener becomes its owner.
 *  - listen: with O_PATH just takes a reference.  Otherwise waits for an
 *    incoming call on an announced conversation and turns c into the new
 *    conversation's ctl. */
static struct chan *ipopen(struct chan *c, int omode)
{
        ERRSTACK(2);
        struct conv *cv, *nc;
        struct Proto *p;
        int perm;
        struct Fs *f;

        /* perm is a lone rwx, not the rwx------ from the conversion */
        perm = omode_to_rwx(omode) >> 6;

        f = ipfs[c->dev];

        switch (TYPE(c->qid)) {
                default:
                        break;
                case Qndb:
                        if (omode & (O_WRITE | O_TRUNC) && !iseve())
                                error(EPERM, ERROR_FIXME);
                        if ((omode & (O_WRITE | O_TRUNC)) == (O_WRITE | O_TRUNC))
                                f->ndb[0] = 0;
                        break;
                case Qlog:
                        netlogopen(f);
                        break;
                case Qiprouter:
                        iprouteropen(f);
                        break;
                case Qiproute:
                        /* reads are served from this buffer; ipclose() frees it */
                        c->synth_buf = kpages_zalloc(IPROUTE_LEN, MEM_WAIT);
                        routeread(f, c->synth_buf, 0, IPROUTE_LEN);
                        break;
                case Qtopdir:
                case Qprotodir:
                case Qconvdir:
                case Qstatus:
                case Qremote:
                case Qlocal:
                case Qstats:
                case Qipselftab:
                        if (omode & O_WRITE)
                                error(EPERM, ERROR_FIXME);
                        break;
                case Qsnoop:
                        if (omode & O_WRITE)
                                error(EPERM, ERROR_FIXME);
                        /* might be racy.  note the lack of a proto lock, unlike Qdata */
                        p = f->p[PROTO(c->qid)];
                        cv = p->conv[CONV(c->qid)];
                        if (strcmp(ATTACHER(c), cv->owner) != 0 && !iseve())
                                error(EPERM, ERROR_FIXME);
                        atomic_inc(&cv->snoopers);
                        break;
                case Qclone:
                        p = f->p[PROTO(c->qid)];
                        qlock(&p->qlock);
                        /* drop the proto lock if Fsprotoclone throws */
                        if (waserror()) {
                                qunlock(&p->qlock);
                                nexterror();
                        }
                        cv = Fsprotoclone(p, ATTACHER(c));
                        qunlock(&p->qlock);
                        poperror();
                        if (cv == NULL) {
                                error(ENODEV, "Null conversation from Fsprotoclone");
                                break;
                        }
                        /* the chan now refers to the new conversation's ctl file */
                        mkqid(&c->qid, QID(p->x, cv->x, Qctl), 0, QTFILE);
                        break;
                case Qdata:
                case Qctl:
                case Qerr:
                        p = f->p[PROTO(c->qid)];
                        qlock(&p->qlock);
                        cv = p->conv[CONV(c->qid)];
                        qlock(&cv->qlock);
                        /* on error, release both locks in reverse order */
                        if (waserror()) {
                                qunlock(&cv->qlock);
                                qunlock(&p->qlock);
                                nexterror();
                        }
                        /* if the bits from cv->perm >> 6 do not grant the request, the
                         * opener must be the owner and the unshifted perm bits must
                         * grant it */
                        if ((perm & (cv->perm >> 6)) != perm) {
                                if (strcmp(ATTACHER(c), cv->owner) != 0)
                                        error(EPERM, ERROR_FIXME);
                                if ((perm & cv->perm) != perm)
                                        error(EPERM, ERROR_FIXME);

                        }
                        cv->inuse++;
                        /* first opener takes ownership and resets the permissions */
                        if (cv->inuse == 1) {
                                kstrdup(&cv->owner, ATTACHER(c));
                                cv->perm = 0660;
                        }
                        qunlock(&cv->qlock);
                        qunlock(&p->qlock);
                        poperror();
                        break;
                case Qlisten:
                        cv = f->p[PROTO(c->qid)]->conv[CONV(c->qid)];
                        /* No permissions or Announce checks required.  We'll see if that's
                         * a good idea or not. (the perm check would do nothing, as is,
                         * since an O_PATH perm is 0).
                         *
                         * But we probably want to incref to keep the conversation around
                         * until this FD/chan is closed.  #ip is a little weird in that
                         * objects never really go away (high water mark for convs, you can
                         * always find them in the ns).  I think it is possible to
                         * namec/ipgen a chan, then have that conv close, then have that
                         * chan be opened.  You can probably do this with a data file. */
                        if (omode & O_PATH) {
                                qlock(&cv->qlock);
                                cv->inuse++;
                                qunlock(&cv->qlock);
                                break;
                        }
                        if ((perm & (cv->perm >> 6)) != perm) {
                                if (strcmp(ATTACHER(c), cv->owner) != 0)
                                        error(EPERM, ERROR_FIXME);
                                if ((perm & cv->perm) != perm)
                                        error(EPERM, ERROR_FIXME);

                        }

                        if (cv->state != Announced)
                                error(EFAIL, "not announced");

                        /* from here on, any error drops the reference taken below */
                        if (waserror()) {
                                closeconv(cv);
                                nexterror();
                        }
                        qlock(&cv->qlock);
                        cv->inuse++;
                        qunlock(&cv->qlock);

                        nc = NULL;
                        while (nc == NULL) {
                                /* give up if we got a hangup */
                                if (qisclosed(cv->rq))
                                        error(EFAIL, "listen hungup");

                                qlock(&cv->listenq);
                                if (waserror()) {
                                        qunlock(&cv->listenq);
                                        nexterror();
                                }
                                /* we can peek at incall without grabbing the cv qlock.  if
                                 * anything is there, it'll remain there until we dequeue it.
                                 * no one else can, since we hold the listenq lock */
                                if ((c->flag & O_NONBLOCK) && !cv->incall)
                                        error(EAGAIN, "listen queue empty");
                                /* wait for a connect */
                                rendez_sleep(&cv->listenr, should_wake, cv);

                                /* if there is a concurrent hangup, they will hold the qlock
                                 * until the hangup is complete, including closing the cv->rq */
                                qlock(&cv->qlock);
                                nc = cv->incall;
                                if (nc != NULL) {
                                        /* dequeue the call; the chan becomes its ctl file */
                                        cv->incall = nc->next;
                                        mkqid(&c->qid, QID(PROTO(c->qid), nc->x, Qctl), 0, QTFILE);
                                        kstrdup(&cv->owner, ATTACHER(c));
                                }
                                qunlock(&cv->qlock);

                                qunlock(&cv->listenq);
                                poperror();
                        }
                        /* done listening: release the reference on the listening conv */
                        closeconv(cv);
                        poperror();
                        break;
        }
        c->mode = openmode(omode);
        c->flag |= COPEN;
        c->offset = 0;
        return c;
}
586
587 static int ipwstat(struct chan *c, uint8_t * dp, int n)
588 {
589         ERRSTACK(2);
590         struct dir *d;
591         struct conv *cv;
592         struct Fs *f;
593         struct Proto *p;
594
595         f = ipfs[c->dev];
596         switch (TYPE(c->qid)) {
597                 default:
598                         error(EPERM, ERROR_FIXME);
599                         break;
600                 case Qctl:
601                 case Qdata:
602                         break;
603         }
604
605         d = kzmalloc(sizeof(*d) + n, 0);
606         if (waserror()) {
607                 kfree(d);
608                 nexterror();
609         }
610         n = convM2D(dp, n, d, (char *)&d[1]);
611         if (n == 0)
612                 error(ENODATA, ERROR_FIXME);
613         p = f->p[PROTO(c->qid)];
614         cv = p->conv[CONV(c->qid)];
615         if (!iseve() && strcmp(ATTACHER(c), cv->owner) != 0)
616                 error(EPERM, ERROR_FIXME);
617         if (!emptystr(d->uid))
618                 kstrdup(&cv->owner, d->uid);
619         if (d->mode != ~0UL)
620                 cv->perm = d->mode & 0777;
621         poperror();
622         kfree(d);
623         return n;
624 }
625
626 /* Should be able to handle any file type chan. Feel free to extend it. */
627 static char *ipchaninfo(struct chan *ch, char *ret, size_t ret_l)
628 {
629         struct conv *conv;
630         struct Proto *proto;
631         char *p;
632         struct Fs *f;
633
634         f = ipfs[ch->dev];
635
636         switch (TYPE(ch->qid)) {
637                 default:
638                         ret = "Unknown type";
639                         break;
640                 case Qdata:
641                         proto = f->p[PROTO(ch->qid)];
642                         conv = proto->conv[CONV(ch->qid)];
643                         snprintf(ret, ret_l, "Qdata, %s, proto %s, conv idx %d, rq len %d, wq len %d",
644                                  SLIST_EMPTY(&conv->data_taps) ? "untapped" : "tapped",
645                                  proto->name, conv->x, qlen(conv->rq), qlen(conv->wq));
646                         break;
647                 case Qarp:
648                         ret = "Qarp";
649                         break;
650                 case Qiproute:
651                         ret = "Qiproute";
652                         break;
653                 case Qlisten:
654                         proto = f->p[PROTO(ch->qid)];
655                         conv = proto->conv[CONV(ch->qid)];
656                         snprintf(ret, ret_l, "Qlisten, %s proto %s, conv idx %d",
657                                  SLIST_EMPTY(&conv->listen_taps) ? "untapped" : "tapped",
658                                  proto->name, conv->x);
659                         break;
660                 case Qlog:
661                         ret = "Qlog";
662                         break;
663                 case Qndb:
664                         ret = "Qndb";
665                         break;
666                 case Qctl:
667                         proto = f->p[PROTO(ch->qid)];
668                         conv = proto->conv[CONV(ch->qid)];
669                         snprintf(ret, ret_l, "Qctl, proto %s, conv idx %d", proto->name,
670                                          conv->x);
671                         break;
672         }
673         return ret;
674 }
675
676 static void closeconv(struct conv *cv)
677 {
678         ERRSTACK(1);
679         struct conv *nc;
680         struct Ipmulti *mp;
681
682         qlock(&cv->qlock);
683
684         if (--cv->inuse > 0) {
685                 qunlock(&cv->qlock);
686                 return;
687         }
688         if (waserror()) {
689                 qunlock(&cv->qlock);
690                 nexterror();
691         }
692         /* close all incoming calls since no listen will ever happen */
693         for (nc = cv->incall; nc; nc = cv->incall) {
694                 cv->incall = nc->next;
695                 closeconv(nc);
696         }
697         cv->incall = NULL;
698
699         kstrdup(&cv->owner, network);
700         cv->perm = 0660;
701
702         while ((mp = cv->multi) != NULL)
703                 ipifcremmulti(cv, mp->ma, mp->ia);
704
705         cv->r = NULL;
706         cv->rgen = 0;
707         if (cv->state == Bypass)
708                 undo_proto_qio_bypass(cv);
709         cv->p->close(cv);
710         cv->state = Idle;
711         qunlock(&cv->qlock);
712         poperror();
713 }
714
715 static void ipclose(struct chan *c)
716 {
717         struct Fs *f;
718
719         f = ipfs[c->dev];
720         switch (TYPE(c->qid)) {
721                 default:
722                         break;
723                 case Qlog:
724                         if (c->flag & COPEN)
725                                 netlogclose(f);
726                         break;
727                 case Qiprouter:
728                         if (c->flag & COPEN)
729                                 iprouterclose(f);
730                         break;
731                 case Qdata:
732                 case Qctl:
733                 case Qerr:
734                 case Qlisten:
735                         if (c->flag & COPEN)
736                                 closeconv(f->p[PROTO(c->qid)]->conv[CONV(c->qid)]);
737                         break;
738                 case Qsnoop:
739                         if (c->flag & COPEN)
740                                 atomic_dec(&f->p[PROTO(c->qid)]->conv[CONV(c->qid)]->snoopers);
741                         break;
742                 case Qiproute:
743                         if (c->flag & COPEN)
744                                 kpages_free(c->synth_buf, IPROUTE_LEN);
745                         break;
746         }
747         kfree(((struct IPaux *)c->aux)->owner);
748         kfree(c->aux);
749 }
750
/* Size in bytes of the scratch buffer ipread() allocates to format the
 * remote, local and status files. */
enum {
        Statelen = 32 * 1024,
};
754
755 static long ipread(struct chan *ch, void *a, long n, int64_t off)
756 {
757         struct conv *c;
758         struct Proto *x;
759         char *buf, *p;
760         long rv;
761         struct Fs *f;
762         uint32_t offset = off;
763
764         f = ipfs[ch->dev];
765
766         p = a;
767         switch (TYPE(ch->qid)) {
768                 default:
769                         error(EPERM, ERROR_FIXME);
770                 case Qtopdir:
771                 case Qprotodir:
772                 case Qconvdir:
773                         return devdirread(ch, a, n, 0, 0, ipgen);
774                 case Qarp:
775                         return arpread(f->arp, a, offset, n);
776                 case Qndb:
777                         return readstr(offset, a, n, f->ndb);
778                 case Qiproute:
779                         return readmem(offset, a, n, ch->synth_buf, IPROUTE_LEN);
780                 case Qiprouter:
781                         return iprouterread(f, a, n);
782                 case Qipselftab:
783                         return ipselftabread(f, a, offset, n);
784                 case Qlog:
785                         return netlogread(f, a, offset, n);
786                 case Qctl:
787                         snprintf(get_cur_genbuf(), GENBUF_SZ, "%lu", CONV(ch->qid));
788                         return readstr(offset, p, n, get_cur_genbuf());
789                 case Qremote:
790                         buf = kzmalloc(Statelen, 0);
791                         x = f->p[PROTO(ch->qid)];
792                         c = x->conv[CONV(ch->qid)];
793                         if (x->remote == NULL) {
794                                 snprintf(buf, Statelen, "%I!%d\n", c->raddr, c->rport);
795                         } else {
796                                 (*x->remote) (c, buf, Statelen - 2);
797                         }
798                         rv = readstr(offset, p, n, buf);
799                         kfree(buf);
800                         return rv;
801                 case Qlocal:
802                         buf = kzmalloc(Statelen, 0);
803                         x = f->p[PROTO(ch->qid)];
804                         c = x->conv[CONV(ch->qid)];
805                         if (x->local == NULL) {
806                                 snprintf(buf, Statelen, "%I!%d\n", c->laddr, c->lport);
807                         } else {
808                                 (*x->local) (c, buf, Statelen - 2);
809                         }
810                         rv = readstr(offset, p, n, buf);
811                         kfree(buf);
812                         return rv;
813                 case Qstatus:
814                         /* this all is a bit screwed up since the size of some state's
815                          * buffers will change from one invocation to another.  a reader
816                          * will come in and read the entire buffer.  then it will come again
817                          * and read from the next offset, expecting EOF.  if the buffer
818                          * changed sizes, it'll reprint the end of the buffer slightly. */
819                         buf = kzmalloc(Statelen, 0);
820                         x = f->p[PROTO(ch->qid)];
821                         c = x->conv[CONV(ch->qid)];
822                         if (c->state == Bypass)
823                                 snprintf(buf, Statelen, "Bypassed\n");
824                         else
825                                 (*x->state)(c, buf, Statelen - 2);
826                         rv = readstr(offset, p, n, buf);
827                         kfree(buf);
828                         return rv;
829                 case Qdata:
830                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
831                         if (ch->flag & O_NONBLOCK)
832                                 return qread_nonblock(c->rq, a, n);
833                         else
834                                 return qread(c->rq, a, n);
835                 case Qerr:
836                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
837                         return qread(c->eq, a, n);
838                 case Qsnoop:
839                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
840                         return qread(c->sq, a, n);
841                 case Qstats:
842                         x = f->p[PROTO(ch->qid)];
843                         if (x->stats == NULL)
844                                 error(EFAIL, "stats not implemented");
845                         buf = kzmalloc(Statelen, 0);
846                         (*x->stats) (x, buf, Statelen);
847                         rv = readstr(offset, p, n, buf);
848                         kfree(buf);
849                         return rv;
850         }
851 }
852
853 static struct block *ipbread(struct chan *ch, long n, uint32_t offset)
854 {
855         struct conv *c;
856
857         switch (TYPE(ch->qid)) {
858                 case Qdata:
859                         c = chan2conv(ch);
860                         if (ch->flag & O_NONBLOCK)
861                                 return qbread_nonblock(c->rq, n);
862                         else
863                                 return qbread(c->rq, n);
864                 default:
865                         return devbread(ch, n, offset);
866         }
867 }
868
869 /*
870  *  set local address to be that of the ifc closest to remote address
871  */
872 static void setladdr(struct conv *c)
873 {
874         findlocalip(c->p->f, c->laddr, c->raddr);
875 }
876
877 /*
878  *  set a local port making sure the quad of raddr,rport,laddr,lport is unique
879  */
880 static void setluniqueport(struct conv *c, int lport)
881 {
882         struct Proto *p;
883         struct conv *xp;
884         int x;
885
886         p = c->p;
887
888         qlock(&p->qlock);
889         for (x = 0; x < p->nc; x++) {
890                 xp = p->conv[x];
891                 if (xp == NULL)
892                         break;
893                 if (xp == c)
894                         continue;
895                 if ((xp->state == Connected || xp->state == Announced
896                                             || xp->state == Bypass)
897                         && xp->lport == lport
898                         && xp->rport == c->rport
899                         && ipcmp(xp->raddr, c->raddr) == 0
900                         && ipcmp(xp->laddr, c->laddr) == 0) {
901                         qunlock(&p->qlock);
902                         error(EFAIL, "address in use");
903                 }
904         }
905         c->lport = lport;
906         qunlock(&p->qlock);
907 }
908
909 /*
910  *  pick a local port and set it
911  */
912 static void setlport(struct conv *c)
913 {
914         struct Proto *p;
915         uint16_t *pp;
916         int x, found;
917
918         p = c->p;
919         if (c->restricted)
920                 pp = &p->nextrport;
921         else
922                 pp = &p->nextport;
923         qlock(&p->qlock);
924         for (;; (*pp)++) {
925                 /*
926                  * Fsproto initialises p->nextport to 0 and the restricted
927                  * ports (p->nextrport) to 600.
928                  * Restricted ports must lie between 600 and 1024.
929                  * For the initial condition or if the unrestricted port number
930                  * has wrapped round, select a random port between 5000 and 1<<15
931                  * to start at.
932                  */
933                 if (c->restricted) {
934                         if (*pp >= 1024)
935                                 *pp = 600;
936                 } else
937                         while (*pp < 5000)
938                                 urandom_read(pp, sizeof(*pp));
939
940                 found = 0;
941                 for (x = 0; x < p->nc; x++) {
942                         if (p->conv[x] == NULL)
943                                 break;
944                         if (p->conv[x]->lport == *pp) {
945                                 found = 1;
946                                 break;
947                         }
948                 }
949                 if (!found)
950                         break;
951         }
952         c->lport = (*pp)++;
953         qunlock(&p->qlock);
954 }
955
956 /*
957  *  set a local address and port from a string of the form
958  *      [address!]port[!r]
959  */
960 static void setladdrport(struct conv *c, char *str, int announcing)
961 {
962         char *p;
963         uint16_t lport;
964         uint8_t addr[IPaddrlen];
965
966         /*
967          *  ignore restricted part if it exists.  it's
968          *  meaningless on local ports.
969          */
970         p = strchr(str, '!');
971         if (p != NULL) {
972                 *p++ = 0;
973                 if (strcmp(p, "r") == 0)
974                         p = NULL;
975         }
976
977         c->lport = 0;
978         if (p == NULL) {
979                 if (announcing)
980                         ipmove(c->laddr, IPnoaddr);
981                 else
982                         setladdr(c);
983                 p = str;
984         } else {
985                 if (strcmp(str, "*") == 0)
986                         ipmove(c->laddr, IPnoaddr);
987                 else {
988                         parseip(addr, str);
989                         if (ipforme(c->p->f, addr))
990                                 ipmove(c->laddr, addr);
991                         else
992                                 error(EFAIL, "not a local IP address");
993                 }
994         }
995
996         /* one process can get all connections */
997         if (announcing && strcmp(p, "*") == 0) {
998                 if (!iseve())
999                         error(EPERM, ERROR_FIXME);
1000                 setluniqueport(c, 0);
1001         }
1002
1003         lport = atoi(p);
1004         if (lport <= 0)
1005                 setlport(c);
1006         else
1007                 setluniqueport(c, lport);
1008 }
1009
1010 static void setraddrport(struct conv *c, char *str)
1011 {
1012         char *p;
1013
1014         p = strchr(str, '!');
1015         if (p == NULL)
1016                 error(EFAIL, "malformed address");
1017         *p++ = 0;
1018         parseip(c->raddr, str);
1019         c->rport = atoi(p);
1020         p = strchr(p, '!');
1021         if (p) {
1022                 if (strstr(p, "!r") != NULL)
1023                         c->restricted = 1;
1024         }
1025 }
1026
1027 /*
1028  *  called by protocol connect routine to set addresses
1029  */
1030 void Fsstdconnect(struct conv *c, char *argv[], int argc)
1031 {
1032         switch (argc) {
1033                 default:
1034                         error(EINVAL, "bad args to %s", __func__);
1035                 case 2:
1036                         setraddrport(c, argv[1]);
1037                         setladdr(c);
1038                         setlport(c);
1039                         break;
1040                 case 3:
1041                         setraddrport(c, argv[1]);
1042                         setladdrport(c, argv[2], 0);
1043                         break;
1044         }
1045
1046         if ((memcmp(c->raddr, v4prefix, IPv4off) == 0 &&
1047                  memcmp(c->laddr, v4prefix, IPv4off) == 0)
1048                 || ipcmp(c->raddr, IPnoaddr) == 0)
1049                 c->ipversion = V4;
1050         else
1051                 c->ipversion = V6;
1052 }
1053
1054 /*
1055  *  initiate connection and sleep till its set up
1056  */
1057 static int connected(void *a)
1058 {
1059         return ((struct conv *)a)->state == Connected;
1060 }
1061
1062 static void connectctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1063 {
1064         ERRSTACK(1);
1065         char *p;
1066
1067         if (c->state != 0)
1068                 error(EBUSY, ERROR_FIXME);
1069         c->state = Connecting;
1070         c->cerr[0] = '\0';
1071         if (x->connect == NULL)
1072                 error(EFAIL, "connect not supported");
1073         x->connect(c, cb->f, cb->nf);
1074
1075         qunlock(&c->qlock);
1076         if (waserror()) {
1077                 qlock(&c->qlock);
1078                 nexterror();
1079         }
1080         rendez_sleep(&c->cr, connected, c);
1081         qlock(&c->qlock);
1082         poperror();
1083
1084         if (c->cerr[0] != '\0')
1085                 error(EFAIL, c->cerr);
1086 }
1087
1088 /*
1089  *  called by protocol announce routine to set addresses
1090  */
1091 void Fsstdannounce(struct conv *c, char *argv[], int argc)
1092 {
1093         memset(c->raddr, 0, sizeof(c->raddr));
1094         c->rport = 0;
1095         switch (argc) {
1096                 default:
1097                         error(EINVAL, "bad args to announce");
1098                 case 2:
1099                         setladdrport(c, argv[1], 1);
1100                         break;
1101         }
1102 }
1103
1104 /*
1105  *  initiate announcement and sleep till its set up
1106  */
1107 static int announced(void *a)
1108 {
1109         return ((struct conv *)a)->state == Announced;
1110 }
1111
1112 static void announcectlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1113 {
1114         ERRSTACK(1);
1115         char *p;
1116
1117         if (c->state != 0)
1118                 error(EBUSY, ERROR_FIXME);
1119         c->state = Announcing;
1120         c->cerr[0] = '\0';
1121         if (x->announce == NULL)
1122                 error(EFAIL, "announce not supported");
1123         x->announce(c, cb->f, cb->nf);
1124
1125         qunlock(&c->qlock);
1126         if (waserror()) {
1127                 qlock(&c->qlock);
1128                 nexterror();
1129         }
1130         rendez_sleep(&c->cr, announced, c);
1131         qlock(&c->qlock);
1132         poperror();
1133
1134         if (c->cerr[0] != '\0')
1135                 error(EFAIL, c->cerr);
1136 }
1137
1138 /*
1139  *  called by protocol bind routine to set addresses
1140  */
1141 void Fsstdbind(struct conv *c, char *argv[], int argc)
1142 {
1143         switch (argc) {
1144                 default:
1145                         error(EINVAL, "bad args to bind");
1146                 case 2:
1147                         setladdrport(c, argv[1], 0);
1148                         break;
1149         }
1150 }
1151
1152 static void bindctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1153 {
1154         if (x->bind == NULL)
1155                 Fsstdbind(c, cb->f, cb->nf);
1156         else
1157                 x->bind(c, cb->f, cb->nf);
1158 }
1159
1160 /* Helper, called by protocols to use the bypass.
1161  *
1162  * This is a bit nasty due to the overall nastiness of #ip.  We need to lock
1163  * before checking the state and hold the qlock throughout, because a concurrent
1164  * closeconv() could tear down the bypass.  Specifically, it could free the
1165  * bypass queues.  The root issue is that conversation lifetimes are not managed
1166  * well.
1167  *
1168  * If we fail, it's our responsibility to consume (free) the block(s). */
1169 void bypass_or_drop(struct conv *cv, struct block *bp)
1170 {
1171         qlock(&cv->qlock);
1172         if (cv->state == Bypass)
1173                 qpass(cv->rq, bp);
1174         else
1175                 freeblist(bp);
1176         qunlock(&cv->qlock);
1177 }
1178
1179 /* Push the block directly to the approprite ipoput function.
1180  *
1181  * It's the protocol's responsibility (and thus ours here) to make sure there is
1182  * at least the right amount of the IP header in the block (ipoput{4,6} assumes
1183  * it has the right amount, and the other protocols account for the IP header in
1184  * their own header).
1185  *
1186  * For the TTL and TOS, we just use the default ones.  If we want, we could look
1187  * into the actual block and see what the user wanted, though we're bypassing
1188  * the protocol layer, not the IP layer. */
1189 static void proto_bypass_kick(void *arg, struct block *bp)
1190 {
1191         struct conv *cv = (struct conv*)arg;
1192         uint8_t vers_nibble;
1193         struct Fs *f;
1194
1195         f = cv->p->f;
1196
1197         bp = pullupblock(bp, 1);
1198         if (!bp)
1199                 error(EINVAL, "Proto bypass unable to pullup a byte!");
1200         vers_nibble = *(uint8_t*)bp->rp & 0xf0;
1201         switch (vers_nibble) {
1202         case IP_VER4:
1203                 bp = pullupblock(bp, IPV4HDR_LEN);
1204                 if (!bp)
1205                         error(EINVAL, "Proto bypass unable to pullup v4 header");
1206                 ipoput4(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1207                 break;
1208         case IP_VER6:
1209                 bp = pullupblock(bp, IPV6HDR_LEN);
1210                 if (!bp)
1211                         error(EINVAL, "Proto bypass unable to pullup v6 header");
1212                 ipoput6(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1213                 break;
1214         default:
1215                 error(EINVAL, "Proto bypass block had unknown IP version 0x%x",
1216                       vers_nibble);
1217         }
1218 }
1219
1220 /* Sets up cv for the protocol bypass.  We use different queues for two reasons:
1221  * 1) To be protocol independent.  For instance, TCP and UDP could use very
1222  * different QIO styles.
1223  * 2) To set up our own kick/bypass method.  Note how udpcreate() and here uses
1224  * qbypass() (just blast it out), while TCP uses qopen() with a kick.  TCP still
1225  * follows queuing discipline.
1226  *
1227  * It's like we are our own protocol, the bypass protocol, when it comes to how
1228  * we interact with qio.  The conv still is of the real protocol type (e.g.
1229  * TCP).
1230  *
1231  * Note that we can't free the old queues.  The way #ip works, the queues are
1232  * created when the conv is created, but the conv is never freed.  It's like a
1233  * slab allocator that never frees objects, but just reinitializes them a
1234  * little.
1235  *
1236  * For the queues, we're basically like UDP:
1237  * - We take packets for rq and drop on overflow.
1238  * - rq is also Qmsg, but we also have Qcoalesce, to ignore out zero-len blocks
1239  * - We kick for our outbound (wq) messages.
1240  *
1241  * Note that Qmsg can drop parts of packets.  It's up to the user to read
1242  * enough.  If they didn't read enough, the extra is dropped.  This is similar
1243  * to SOCK_DGRAM and recvfrom().  Minus major changes, there's no nice way to
1244  * get individual messages with read().  Userspace using the bypass will need to
1245  * find out the MTU of the NIC the IP stack is attached to, and make sure to
1246  * read in at least that amount each time. */
1247 static void setup_proto_qio_bypass(struct conv *cv)
1248 {
1249         cv->rq_save = cv->rq;
1250         cv->wq_save = cv->wq;
1251         cv->rq = qopen(BYPASS_QMAX, Qmsg | Qcoalesce, 0, 0);
1252         cv->wq = qbypass(proto_bypass_kick, cv);
1253 }
1254
1255 static void undo_proto_qio_bypass(struct conv *cv)
1256 {
1257         qfree(cv->rq);
1258         qfree(cv->wq);
1259         cv->rq = cv->rq_save;
1260         cv->wq = cv->wq_save;
1261         cv->rq_save = NULL;
1262         cv->wq_save = NULL;
1263 }
1264
1265 void Fsstdbypass(struct conv *cv, char *argv[], int argc)
1266 {
1267         memset(cv->raddr, 0, sizeof(cv->raddr));
1268         cv->rport = 0;
1269         switch (argc) {
1270         case 2:
1271                 setladdrport(cv, argv[1], 1);
1272                 break;
1273         default:
1274                 error(EINVAL, "Bad args (was %d, need 2) to bypass", argc);
1275         }
1276 }
1277
1278 static void bypassctlmsg(struct Proto *x, struct conv *cv, struct cmdbuf *cb)
1279 {
1280         if (!x->bypass)
1281                 error(EFAIL, "Protocol %s does not support bypass", x->name);
1282         /* The protocol needs to set the port (usually by calling Fsstdbypass) and
1283          * then do whatever it needs to make sure it can find the conv again during
1284          * receive (usually by adding to a hash table). */
1285         x->bypass(cv, cb->f, cb->nf);
1286         setup_proto_qio_bypass(cv);
1287         cv->state = Bypass;
1288 }
1289
1290 static void shutdownctlmsg(struct conv *cv, struct cmdbuf *cb)
1291 {
1292         if (cb->nf < 2)
1293                 goto err;
1294         if (!strcmp(cb->f[1], "rd")) {
1295                 qhangup(cv->rq, "shutdown");
1296                 if (cv->p->shutdown)
1297                         cv->p->shutdown(cv, SHUT_RD);
1298         } else if (!strcmp(cb->f[1], "wr")) {
1299                 qhangup(cv->wq, "shutdown");
1300                 if (cv->p->shutdown)
1301                         cv->p->shutdown(cv, SHUT_WR);
1302         } else if (!strcmp(cb->f[1], "rdwr")) {
1303                 qhangup(cv->rq, "shutdown");
1304                 qhangup(cv->wq, "shutdown");
1305                 if (cv->p->shutdown)
1306                         cv->p->shutdown(cv, SHUT_RDWR);
1307         } else {
1308                 goto err;
1309         }
1310         return;
1311 err:
1312         error(EINVAL, "shutdown [rx|tx|rxtx]");
1313 }
1314
1315 static void tosctlmsg(struct conv *c, struct cmdbuf *cb)
1316 {
1317         if (cb->nf < 2)
1318                 c->tos = 0;
1319         else
1320                 c->tos = atoi(cb->f[1]);
1321 }
1322
1323 static void ttlctlmsg(struct conv *c, struct cmdbuf *cb)
1324 {
1325         if (cb->nf < 2)
1326                 c->ttl = MAXTTL;
1327         else
1328                 c->ttl = atoi(cb->f[1]);
1329 }
1330
1331 /* Binds a conversation, as if the user wrote "bind *" into ctl. */
1332 static void autobind(struct conv *cv)
1333 {
1334         ERRSTACK(1);
1335         struct cmdbuf *cb;
1336
1337         cb = parsecmd("bind *", 7);
1338         if (waserror()) {
1339                 kfree(cb);
1340                 nexterror();
1341         }
1342         bindctlmsg(cv->p, cv, cb);
1343         poperror();
1344         kfree(cb);
1345 }
1346
1347 static long ipwrite(struct chan *ch, void *v, long n, int64_t off)
1348 {
1349         ERRSTACK(1);
1350         struct conv *c;
1351         struct Proto *x;
1352         char *p;
1353         struct cmdbuf *cb;
1354         uint8_t ia[IPaddrlen], ma[IPaddrlen];
1355         struct Fs *f;
1356         char *a;
1357
1358         a = v;
1359         f = ipfs[ch->dev];
1360
1361         switch (TYPE(ch->qid)) {
1362                 default:
1363                         error(EPERM, ERROR_FIXME);
1364                 case Qdata:
1365                         x = f->p[PROTO(ch->qid)];
1366                         c = x->conv[CONV(ch->qid)];
1367                         /* connection-less protocols (UDP) can write without manually
1368                          * binding. */
1369                         if (c->lport == 0)
1370                                 autobind(c);
1371                         if (ch->flag & O_NONBLOCK)
1372                                 qwrite_nonblock(c->wq, a, n);
1373                         else
1374                                 qwrite(c->wq, a, n);
1375                         break;
1376                 case Qarp:
1377                         return arpwrite(f, a, n);
1378                 case Qiproute:
1379                         return routewrite(f, ch, a, n);
1380                 case Qlog:
1381                         netlogctl(f, a, n);
1382                         return n;
1383                 case Qndb:
1384                         return ndbwrite(f, a, off, n);
1385                 case Qctl:
1386                         x = f->p[PROTO(ch->qid)];
1387                         c = x->conv[CONV(ch->qid)];
1388                         cb = parsecmd(a, n);
1389
1390                         qlock(&c->qlock);
1391                         if (waserror()) {
1392                                 qunlock(&c->qlock);
1393                                 kfree(cb);
1394                                 nexterror();
1395                         }
1396                         if (cb->nf < 1)
1397                                 error(EFAIL, "short control request");
1398                         if (strcmp(cb->f[0], "connect") == 0)
1399                                 connectctlmsg(x, c, cb);
1400                         else if (strcmp(cb->f[0], "announce") == 0)
1401                                 announcectlmsg(x, c, cb);
1402                         else if (strcmp(cb->f[0], "bind") == 0)
1403                                 bindctlmsg(x, c, cb);
1404                         else if (strcmp(cb->f[0], "bypass") == 0)
1405                                 bypassctlmsg(x, c, cb);
1406                         else if (strcmp(cb->f[0], "shutdown") == 0)
1407                                 shutdownctlmsg(c, cb);
1408                         else if (strcmp(cb->f[0], "ttl") == 0)
1409                                 ttlctlmsg(c, cb);
1410                         else if (strcmp(cb->f[0], "tos") == 0)
1411                                 tosctlmsg(c, cb);
1412                         else if (strcmp(cb->f[0], "ignoreadvice") == 0)
1413                                 c->ignoreadvice = 1;
1414                         else if (strcmp(cb->f[0], "addmulti") == 0) {
1415                                 if (cb->nf < 2)
1416                                         error(EFAIL, "addmulti needs interface address");
1417                                 if (cb->nf == 2) {
1418                                         if (!ipismulticast(c->raddr))
1419                                                 error(EFAIL, "addmulti for a non multicast address");
1420                                         parseip(ia, cb->f[1]);
1421                                         ipifcaddmulti(c, c->raddr, ia);
1422                                 } else {
1423                                         parseip(ma, cb->f[2]);
1424                                         if (!ipismulticast(ma))
1425                                                 error(EFAIL, "addmulti for a non multicast address");
1426                                         parseip(ia, cb->f[1]);
1427                                         ipifcaddmulti(c, ma, ia);
1428                                 }
1429                         } else if (strcmp(cb->f[0], "remmulti") == 0) {
1430                                 if (cb->nf < 2)
1431                                         error(EFAIL, "remmulti needs interface address");
1432                                 if (!ipismulticast(c->raddr))
1433                                         error(EFAIL, "remmulti for a non multicast address");
1434                                 parseip(ia, cb->f[1]);
1435                                 ipifcremmulti(c, c->raddr, ia);
1436                         } else if (x->ctl != NULL) {
1437                                 x->ctl(c, cb->f, cb->nf);
1438                         } else
1439                                 error(EFAIL, "unknown control request");
1440                         qunlock(&c->qlock);
1441                         kfree(cb);
1442                         poperror();
1443         }
1444         return n;
1445 }
1446
1447 static long ipbwrite(struct chan *ch, struct block *bp, uint32_t offset)
1448 {
1449         struct conv *c;
1450         int n;
1451
1452         switch (TYPE(ch->qid)) {
1453                 case Qdata:
1454                         c = chan2conv(ch);
1455                         if (bp->next)
1456                                 bp = concatblock(bp);
1457                         n = BLEN(bp);
1458                         if (ch->flag & O_NONBLOCK)
1459                                 qbwrite_nonblock(c->wq, bp);
1460                         else
1461                                 qbwrite(c->wq, bp);
1462                         return n;
1463                 default:
1464                         return devbwrite(ch, bp, offset);
1465         }
1466 }
1467
1468 static void ip_wake_cb(struct queue *q, void *data, int filter)
1469 {
1470         struct conv *conv = (struct conv*)data;
1471         struct fd_tap *tap_i;
1472         /* For these two, we want to ignore events on the opposite end of the
1473          * queues.  For instance, we want to know when the WQ is writable.  Our
1474          * writes will actually make it readable - we don't want to trigger a tap
1475          * for that.  However, qio doesn't know how/why we are using a queue, or
1476          * even who the ends are (hence the callbacks) */
1477         if ((filter & FDTAP_FILT_READABLE) && (q == conv->wq))
1478                 return;
1479         if ((filter & FDTAP_FILT_WRITABLE) && (q == conv->rq))
1480                 return;
1481         /* At this point, we have an event we want to send to our taps (if any).
1482          * The lock protects list integrity and the existence of the tap.
1483          *
1484          * Previously, I thought of using the conv qlock.  That actually breaks, due
1485          * to weird usages of the qlock (someone holds it for a long time, blocking
1486          * the inbound wakeup from etherread4).
1487          *
1488          * I opted for a spinlock for a couple reasons:
1489          * - fire_tap should not block.  ideally it'll be fast too (it's mostly a
1490          * send_event).
1491          * - our callers might not want to block.  A lot of network wakeups will
1492          * come network processes (etherread4) or otherwise unrelated to this
1493          * particular conversation.  I'd rather do something like fire off a KMSG
1494          * than block those.
1495          * - if fire_tap takes a while, holding the lock only slows down other
1496          * events on this *same* conversation, or other tap registration.  not a
1497          * huge deal. */
1498         spin_lock(&conv->tap_lock);
1499         SLIST_FOREACH(tap_i, &conv->data_taps, link)
1500                 fire_tap(tap_i, filter);
1501         spin_unlock(&conv->tap_lock);
1502 }
1503
1504 int iptapfd(struct chan *chan, struct fd_tap *tap, int cmd)
1505 {
1506         struct conv *conv = chan2conv(chan);
1507         int ret;
1508
1509         #define DEVIP_LEGAL_DATA_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE | \
1510                                        FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |   \
1511                                        FDTAP_FILT_ERROR)
1512         #define DEVIP_LEGAL_LISTEN_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_HANGUP)
1513
1514         switch (TYPE(chan->qid)) {
1515                 case Qdata:
1516                         if (tap->filter & ~DEVIP_LEGAL_DATA_TAPS) {
1517                                 set_errno(ENOSYS);
1518                                 set_errstr("Unsupported #%s data tap %p, must be %p", devname(),
1519                                            tap->filter, DEVIP_LEGAL_DATA_TAPS);
1520                                 return -1;
1521                         }
1522                         spin_lock(&conv->tap_lock);
1523                         switch (cmd) {
1524                                 case (FDTAP_CMD_ADD):
1525                                         if (SLIST_EMPTY(&conv->data_taps)) {
1526                                                 qio_set_wake_cb(conv->rq, ip_wake_cb, conv);
1527                                                 qio_set_wake_cb(conv->wq, ip_wake_cb, conv);
1528                                         }
1529                                         SLIST_INSERT_HEAD(&conv->data_taps, tap, link);
1530                                         ret = 0;
1531                                         break;
1532                                 case (FDTAP_CMD_REM):
1533                                         SLIST_REMOVE(&conv->data_taps, tap, fd_tap, link);
1534                                         if (SLIST_EMPTY(&conv->data_taps)) {
1535                                                 qio_set_wake_cb(conv->rq, 0, conv);
1536                                                 qio_set_wake_cb(conv->wq, 0, conv);
1537                                         }
1538                                         ret = 0;
1539                                         break;
1540                                 default:
1541                                         set_errno(ENOSYS);
1542                                         set_errstr("Unsupported #%s data tap command %p",
1543                                                    devname(), cmd);
1544                                         ret = -1;
1545                         }
1546                         spin_unlock(&conv->tap_lock);
1547                         return ret;
1548                 case Qlisten:
1549                         if (tap->filter & ~DEVIP_LEGAL_LISTEN_TAPS) {
1550                                 set_errno(ENOSYS);
1551                                 set_errstr("Unsupported #%s listen tap %p, must be %p",
1552                                            devname(), tap->filter, DEVIP_LEGAL_LISTEN_TAPS);
1553                                 return -1;
1554                         }
1555                         spin_lock(&conv->tap_lock);
1556                         switch (cmd) {
1557                                 case (FDTAP_CMD_ADD):
1558                                         SLIST_INSERT_HEAD(&conv->listen_taps, tap, link);
1559                                         ret = 0;
1560                                         break;
1561                                 case (FDTAP_CMD_REM):
1562                                         SLIST_REMOVE(&conv->listen_taps, tap, fd_tap, link);
1563                                         ret = 0;
1564                                         break;
1565                                 default:
1566                                         set_errno(ENOSYS);
1567                                         set_errstr("Unsupported #%s listen tap command %p",
1568                                                    devname(), cmd);
1569                                         ret = -1;
1570                         }
1571                         spin_unlock(&conv->tap_lock);
1572                         return ret;
1573                 default:
1574                         set_errno(ENOSYS);
1575                         set_errstr("Can't tap #%s file type %d", devname(),
1576                                    TYPE(chan->qid));
1577                         return -1;
1578         }
1579 }
1580
1581 struct dev ipdevtab __devtab = {
1582         .name = "ip",
1583
1584         .reset = ipreset,
1585         .init = ipinit,
1586         .shutdown = devshutdown,
1587         .attach = ipattach,
1588         .walk = ipwalk,
1589         .stat = ipstat,
1590         .open = ipopen,
1591         .create = devcreate,
1592         .close = ipclose,
1593         .read = ipread,
1594         .bread = ipbread,
1595         .write = ipwrite,
1596         .bwrite = ipbwrite,
1597         .remove = devremove,
1598         .wstat = ipwstat,
1599         .power = devpower,
1600         .chaninfo = ipchaninfo,
1601         .tapfd = iptapfd,
1602 };
1603
1604 int Fsproto(struct Fs *f, struct Proto *p)
1605 {
1606         if (f->np >= Maxproto)
1607                 return -1;
1608
1609         qlock_init(&p->qlock);
1610         p->f = f;
1611
1612         if (p->ipproto > 0) {
1613                 if (f->t2p[p->ipproto] != NULL)
1614                         return -1;
1615                 f->t2p[p->ipproto] = p;
1616         }
1617
1618         p->qid.type = QTDIR;
1619         p->qid.path = QID(f->np, 0, Qprotodir);
1620         p->conv = kzmalloc(sizeof(struct conv *) * (p->nc + 1), 0);
1621         if (p->conv == NULL)
1622                 panic("Fsproto");
1623
1624         p->x = f->np;
1625         p->nextport = 0;
1626         p->nextrport = 600;
1627         f->p[f->np++] = p;
1628
1629         return 0;
1630 }
1631
1632 /*
1633  *  return true if this protocol is
1634  *  built in
1635  */
1636 int Fsbuiltinproto(struct Fs *f, uint8_t proto)
1637 {
1638         return f->t2p[proto] != NULL;
1639 }
1640
1641 /*
1642  *  called with protocol locked
1643  */
1644 struct conv *Fsprotoclone(struct Proto *p, char *user)
1645 {
1646         struct conv *c, **pp, **ep;
1647
1648 retry:
1649         c = NULL;
1650         ep = &p->conv[p->nc];
1651         for (pp = p->conv; pp < ep; pp++) {
1652                 c = *pp;
1653                 if (c == NULL) {
1654                         c = kzmalloc(sizeof(struct conv), 0);
1655                         if (c == NULL)
1656                                 error(ENOMEM,
1657                                       "conv kzmalloc(%d, 0) failed in Fsprotoclone",
1658                                       sizeof(struct conv));
1659                         qlock_init(&c->qlock);
1660                         qlock_init(&c->listenq);
1661                         rendez_init(&c->cr);
1662                         rendez_init(&c->listenr);
1663                         SLIST_INIT(&c->data_taps);      /* already = 0; set to be futureproof */
1664                         SLIST_INIT(&c->listen_taps);
1665                         spinlock_init(&c->tap_lock);
1666                         qlock(&c->qlock);
1667                         c->p = p;
1668                         c->x = pp - p->conv;
1669                         if (p->ptclsize != 0) {
1670                                 c->ptcl = kzmalloc(p->ptclsize, 0);
1671                                 if (c->ptcl == NULL) {
1672                                         kfree(c);
1673                                         error(ENOMEM,
1674                                               "ptcl kzmalloc(%d, 0) failed in Fsprotoclone",
1675                                               p->ptclsize);
1676                                 }
1677                         }
1678                         *pp = c;
1679                         p->ac++;
1680                         c->eq = qopen(1024, Qmsg, 0, 0);
1681                         (*p->create) (c);
1682                         assert(c->rq && c->wq);
1683                         break;
1684                 }
1685                 if (canqlock(&c->qlock)) {
1686                         /*
1687                          *  make sure both processes and protocol
1688                          *  are done with this Conv
1689                          */
1690                         if (c->inuse == 0 && (p->inuse == NULL || (*p->inuse) (c) == 0))
1691                                 break;
1692
1693                         qunlock(&c->qlock);
1694                 }
1695         }
1696         if (pp >= ep) {
1697                 if (p->gc != NULL && (*p->gc) (p))
1698                         goto retry;
1699                 return NULL;
1700         }
1701
1702         c->inuse = 1;
1703         kstrdup(&c->owner, user);
1704         c->perm = 0660;
1705         c->state = Idle;
1706         ipmove(c->laddr, IPnoaddr);
1707         ipmove(c->raddr, IPnoaddr);
1708         c->r = NULL;
1709         c->rgen = 0;
1710         c->lport = 0;
1711         c->rport = 0;
1712         c->restricted = 0;
1713         c->ttl = MAXTTL;
1714         c->tos = DFLTTOS;
1715         qreopen(c->rq);
1716         qreopen(c->wq);
1717         qreopen(c->eq);
1718
1719         qunlock(&c->qlock);
1720         return c;
1721 }
1722
1723 int Fsconnected(struct conv *c, char *msg)
1724 {
1725         if (msg != NULL && *msg != '\0')
1726                 strlcpy(c->cerr, msg, sizeof(c->cerr));
1727
1728         switch (c->state) {
1729                 case Announcing:
1730                         c->state = Announced;
1731                         break;
1732
1733                 case Connecting:
1734                         c->state = Connected;
1735                         break;
1736         }
1737
1738         rendez_wakeup(&c->cr);
1739         return 0;
1740 }
1741
1742 struct Proto *Fsrcvpcol(struct Fs *f, uint8_t proto)
1743 {
1744         if (f->ipmux)
1745                 return f->ipmux;
1746         else
1747                 return f->t2p[proto];
1748 }
1749
1750 struct Proto *Fsrcvpcolx(struct Fs *f, uint8_t proto)
1751 {
1752         return f->t2p[proto];
1753 }
1754
1755 static void fire_listener_taps(struct conv *conv)
1756 {
1757         struct fd_tap *tap_i;
1758         if (SLIST_EMPTY(&conv->listen_taps))
1759                 return;
1760         spin_lock(&conv->tap_lock);
1761         SLIST_FOREACH(tap_i, &conv->listen_taps, link)
1762                 fire_tap(tap_i, FDTAP_FILT_READABLE);
1763         spin_unlock(&conv->tap_lock);
1764 }
1765
1766 /*
1767  *  called with protocol locked
1768  */
1769 struct conv *Fsnewcall(struct conv *c, uint8_t * raddr, uint16_t rport,
1770                                            uint8_t * laddr, uint16_t lport, uint8_t version)
1771 {
1772         struct conv *nc;
1773         struct conv **l;
1774         int i;
1775
1776         qlock(&c->qlock);
1777         i = 0;
1778         for (l = &c->incall; *l; l = &(*l)->next)
1779                 i++;
1780         if (i >= Maxincall) {
1781                 qunlock(&c->qlock);
1782                 return NULL;
1783         }
1784
1785         /* find a free conversation */
1786         nc = Fsprotoclone(c->p, network);
1787         if (nc == NULL) {
1788                 qunlock(&c->qlock);
1789                 return NULL;
1790         }
1791         ipmove(nc->raddr, raddr);
1792         nc->rport = rport;
1793         ipmove(nc->laddr, laddr);
1794         nc->lport = lport;
1795         nc->next = NULL;
1796         *l = nc;
1797         nc->state = Connected;
1798         nc->ipversion = version;
1799
1800         qunlock(&c->qlock);
1801
1802         rendez_wakeup(&c->listenr);
1803         fire_listener_taps(c);
1804
1805         return nc;
1806 }
1807
1808 static long ndbwrite(struct Fs *f, char *a, uint32_t off, int n)
1809 {
1810         if (off > strlen(f->ndb))
1811                 error(EIO, ERROR_FIXME);
1812         if (off + n >= sizeof(f->ndb) - 1)
1813                 error(EIO, ERROR_FIXME);
1814         memmove(f->ndb + off, a, n);
1815         f->ndb[off + n] = 0;
1816         f->ndbvers++;
1817         f->ndbmtime = seconds();
1818         return n;
1819 }
1820
1821 uint32_t scalednconv(void)
1822 {
1823         //if(conf.npage*BY2PG >= 128*MB)
1824         return Nchans * 4;
1825         //  return Nchans;
1826 }