vfs: Remove KFS, blockdev and devfs
[akaros.git] / kern / src / net / devip.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <slab.h>
31 #include <kmalloc.h>
32 #include <kref.h>
33 #include <string.h>
34 #include <stdio.h>
35 #include <assert.h>
36 #include <error.h>
37 #include <cpio.h>
38 #include <pmap.h>
39 #include <smp.h>
40 #include <net/ip.h>
41
42 struct dev ipdevtab;
43
44 static char *devname(void)
45 {
46         return ipdevtab.name;
47 }
48
49 enum {
50         Qtopdir = 1,                            /* top level directory */
51         Qtopbase,
52         Qarp = Qtopbase,
53         Qndb,
54         Qiproute,
55         Qiprouter,
56         Qipselftab,
57         Qlog,
58
59         Qprotodir,      /* directory for a protocol */
60         Qprotobase,
61         Qclone = Qprotobase,
62         Qstats,
63
64         Qconvdir,       /* directory for a conversation */
65         Qconvbase,
66         Qctl = Qconvbase,
67         Qdata,
68         Qerr,
69         Qlisten,
70         Qlocal,
71         Qremote,
72         Qstatus,
73         Qsnoop,
74
75         Logtype = 5,
76         Masktype = (1 << Logtype) - 1,
77         Logconv = 12,
78         Maskconv = (1 << Logconv) - 1,
79         Shiftconv = Logtype,
80         Logproto = 8,
81         Maskproto = (1 << Logproto) - 1,
82         Shiftproto = Logtype + Logconv,
83
84         Nfs = 32,
85         BYPASS_QMAX = 64 * MiB,
86         IPROUTE_LEN = 2 * PGSIZE,
87 };
/* Accessors for the fields packed into a qid path, and the matching
 * constructor.  TYPE/CONV/PROTO take a struct qid (not a pointer); QID takes
 * the protocol index, conversation index and file type. */
#define TYPE(x)         (((uint32_t)(x).path) & Masktype)
#define CONV(x)         ((((uint32_t)(x).path) >> Shiftconv) & Maskconv)
#define PROTO(x)        ((((uint32_t)(x).path) >> Shiftproto) & Maskproto)
#define QID(p, c, y)    (((p) << (Shiftproto)) | ((c) << Shiftconv) | (y))

/* Owner name reported for files that do not belong to a conversation owner. */
static char network[] = "network";
93
94 qlock_t fslock;
95 struct Fs *ipfs[Nfs];                   /* attached fs's */
96 struct queue *qlog;
97
98 extern void nullmediumlink(void);
99 extern void pktmediumlink(void);
100 extern struct username eve;
101 static long ndbwrite(struct Fs *, char *unused_char_p_t, uint32_t, int);
102 static void closeconv(struct conv *);
103 static void setup_proto_qio_bypass(struct conv *cv);
104 static void undo_proto_qio_bypass(struct conv *cv);
105 static int connected(void *a);
106
107 static struct conv *chan2conv(struct chan *chan)
108 {
109         /* That's a lot of pointers to get to the conv! */
110         return ipfs[chan->dev]->p[PROTO(chan->qid)]->conv[CONV(chan->qid)];
111 }
112
113 static inline int founddevdir(struct chan *c, struct qid q, char *n,
114                                                           int64_t length, char *user, long perm,
115                                                           struct dir *db)
116 {
117         devdir(c, q, n, length, user, perm, db);
118         return 1;
119 }
120
121 static int topdirgen(struct chan *c, struct dir *dp)
122 {
123         struct qid q;
124         mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR);
125         snprintf(get_cur_genbuf(), GENBUF_SZ, "#%s%lu", devname(), c->dev);
126         return founddevdir(c, q, get_cur_genbuf(), 0, network, 0555, dp);
127 }
128
129 /* Computes the perm field for a stat for Qdata.  Since select() polls the
130  * 'actionability' of a socket via the qdata FD, we'll also report listenable
131  * and connected conversations.  It's a minor hack.  =( */
132 static int qdata_stat_perm(struct conv *cv)
133 {
134         int perm;
135
136         perm = cv->perm;
137         /* If there is ever a listener, then it's readable.  Ideally, we'd only
138          * report this on the Qlisten file (which we also do).  The socket crap
139          * should never use a listening socket for data, so there shouldn't be any
140          * confusion when a Qdata shows up as readable. */
141         perm |= cv->incall ? DMREADABLE : 0;
142         /* For connectable convs, they need to be both connected and qio
143          * readable/writable.  The way to think about this is that the convs are not
144          * truly writable/readable until they are connected.  Conveniently, this
145          * means that when select polls Qdata for non-blocking connect(), a
146          * connected conversation pops up as writable (the qio is writable too).
147          *
148          * Note that a conversation can be 'Connected' even if it failed to connect.
149          * At least that's what the 9ns TCP code does.  It's more like "the protocol
150          * did what it needed and the connectctlmsg call (or its non-blocking
151          * equivalent) is done".  For instance, TCP has a few reasons to call
152          * Fsconnected, such as when we send the SYN and get a RST. */
153         if (!cv->p->connect || connected(cv)) {
154                 perm |= qreadable(cv->rq) ? DMREADABLE : 0;
155                 perm |= qwritable(cv->wq) ? DMWRITABLE : 0;
156         }
157         return perm;
158 }
159
160 static int ip3gen(struct chan *c, int i, struct dir *dp)
161 {
162         struct qid q;
163         struct conv *cv;
164         char *p;
165         int perm;
166
167         cv = chan2conv(c);
168         if (cv->owner == NULL)
169                 kstrdup(&cv->owner, eve.name);
170         mkqid(&q, QID(PROTO(c->qid), CONV(c->qid), i), 0, QTFILE);
171
172         switch (i) {
173                 default:
174                         return -1;
175                 case Qctl:
176                         return founddevdir(c, q, "ctl", 0,
177                                                    cv->owner, cv->perm, dp);
178                 case Qdata:
179                         perm = qdata_stat_perm(cv);
180                         return founddevdir(c, q, "data", qlen(cv->rq),
181                                                            cv->owner, perm, dp);
182                 case Qerr:
183                         perm = cv->perm;
184                         perm |= qreadable(cv->eq) ? DMREADABLE : 0;
185                         return founddevdir(c, q, "err", qlen(cv->eq),
186                                                            cv->owner, perm, dp);
187                 case Qlisten:
188                         perm = cv->perm;
189                         perm |= cv->incall ? DMREADABLE : 0;
190                         return founddevdir(c, q, "listen", 0, cv->owner, perm, dp);
191                 case Qlocal:
192                         p = "local";
193                         break;
194                 case Qremote:
195                         p = "remote";
196                         break;
197                 case Qsnoop:
198                         if (strcmp(cv->p->name, "ipifc") != 0)
199                                 return -1;
200                         perm = 0400;
201                         perm |= qreadable(cv->sq) ? DMREADABLE : 0;
202                         return founddevdir(c, q, "snoop", qlen(cv->sq),
203                                                            cv->owner, perm, dp);
204                 case Qstatus:
205                         p = "status";
206                         break;
207         }
208         return founddevdir(c, q, p, 0, cv->owner, 0444, dp);
209 }
210
211 static int ip2gen(struct chan *c, int i, struct dir *dp)
212 {
213         struct qid q;
214         mkqid(&q, QID(PROTO(c->qid), 0, i), 0, QTFILE);
215         switch (i) {
216                 case Qclone:
217                         return founddevdir(c, q, "clone", 0, network, 0666, dp);
218                 case Qstats:
219                         return founddevdir(c, q, "stats", 0, network, 0444, dp);
220         }
221         return -1;
222 }
223
224 static int ip1gen(struct chan *c, int i, struct dir *dp)
225 {
226         struct qid q;
227         char *p;
228         int prot;
229         int len = 0;
230         struct Fs *f;
231         extern uint32_t kerndate;
232
233         f = ipfs[c->dev];
234
235         prot = 0666;
236         mkqid(&q, QID(0, 0, i), 0, QTFILE);
237         switch (i) {
238                 default:
239                         return -1;
240                 case Qarp:
241                         p = "arp";
242                         break;
243                 case Qndb:
244                         p = "ndb";
245                         len = strlen(f->ndb);
246                         q.vers = f->ndbvers;
247                         break;
248                 case Qiproute:
249                         p = "iproute";
250                         break;
251                 case Qipselftab:
252                         p = "ipselftab";
253                         prot = 0444;
254                         break;
255                 case Qiprouter:
256                         p = "iprouter";
257                         break;
258                 case Qlog:
259                         p = "log";
260                         break;
261         }
262         devdir(c, q, p, len, network, prot, dp);
263         if (i == Qndb && f->ndbmtime > kerndate)
264                 dp->mtime.tv_sec = f->ndbmtime;
265         return 1;
266 }
267
268 static int
269 ipgen(struct chan *c, char *unused_char_p_t, struct dirtab *d, int unused_int,
270           int s, struct dir *dp)
271 {
272         struct qid q;
273         struct conv *cv;
274         struct Fs *f;
275
276         f = ipfs[c->dev];
277
278         switch (TYPE(c->qid)) {
279                 case Qtopdir:
280                         if (s == DEVDOTDOT)
281                                 return topdirgen(c, dp);
282                         if (s < f->np) {
283                                 if (f->p[s]->connect == NULL)
284                                         return 0;       /* protocol with no user interface */
285                                 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
286                                 return founddevdir(c, q, f->p[s]->name, 0, network, 0555, dp);
287                         }
288                         s -= f->np;
289                         return ip1gen(c, s + Qtopbase, dp);
290                 case Qarp:
291                 case Qndb:
292                 case Qlog:
293                 case Qiproute:
294                 case Qiprouter:
295                 case Qipselftab:
296                         return ip1gen(c, TYPE(c->qid), dp);
297                 case Qprotodir:
298                         if (s == DEVDOTDOT)
299                                 return topdirgen(c, dp);
300                         else if (s < f->p[PROTO(c->qid)]->ac) {
301                                 cv = f->p[PROTO(c->qid)]->conv[s];
302                                 snprintf(get_cur_genbuf(), GENBUF_SZ, "%d", s);
303                                 mkqid(&q, QID(PROTO(c->qid), s, Qconvdir), 0, QTDIR);
304                                 return
305                                         founddevdir(c, q, get_cur_genbuf(), 0, cv->owner, 0555, dp);
306                         }
307                         s -= f->p[PROTO(c->qid)]->ac;
308                         return ip2gen(c, s + Qprotobase, dp);
309                 case Qclone:
310                 case Qstats:
311                         return ip2gen(c, TYPE(c->qid), dp);
312                 case Qconvdir:
313                         if (s == DEVDOTDOT) {
314                                 s = PROTO(c->qid);
315                                 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
316                                 devdir(c, q, f->p[s]->name, 0, network, 0555, dp);
317                                 return 1;
318                         }
319                         return ip3gen(c, s + Qconvbase, dp);
320                 case Qctl:
321                 case Qdata:
322                 case Qerr:
323                 case Qlisten:
324                 case Qlocal:
325                 case Qremote:
326                 case Qstatus:
327                 case Qsnoop:
328                         return ip3gen(c, TYPE(c->qid), dp);
329         }
330         return -1;
331 }
332
333 static void ipinit(void)
334 {
335         qlock_init(&fslock);
336         nullmediumlink();
337         pktmediumlink();
338 /* if only
339         fmtinstall('i', eipfmt);
340         fmtinstall('I', eipfmt);
341         fmtinstall('E', eipfmt);
342         fmtinstall('V', eipfmt);
343         fmtinstall('M', eipfmt);
344 */
345 }
346
/* Device reset hook: nothing to do for this device. */
static void ipreset(void)
{
        /* intentionally empty */
}
350
351 static struct Fs *ipgetfs(int dev)
352 {
353         extern void (*ipprotoinit[]) (struct Fs *);
354         struct Fs *f;
355         int i;
356
357         if (dev >= Nfs)
358                 return NULL;
359
360         qlock(&fslock);
361         if (ipfs[dev] == NULL) {
362                 f = kzmalloc(sizeof(struct Fs), MEM_WAIT);
363                 rwinit(&f->rwlock);
364                 qlock_init(&f->iprouter.qlock);
365                 ip_init(f);
366                 arpinit(f);
367                 netloginit(f);
368                 for (i = 0; ipprotoinit[i]; i++)
369                         ipprotoinit[i] (f);
370                 f->dev = dev;
371                 ipfs[dev] = f;
372         }
373         qunlock(&fslock);
374
375         return ipfs[dev];
376 }
377
378 struct IPaux *newipaux(char *owner, char *tag)
379 {
380         struct IPaux *a;
381         int n;
382
383         a = kzmalloc(sizeof(*a), 0);
384         kstrdup(&a->owner, owner);
385         memset(a->tag, ' ', sizeof(a->tag));
386         n = strlen(tag);
387         if (n > sizeof(a->tag))
388                 n = sizeof(a->tag);
389         memmove(a->tag, tag, n);
390         return a;
391 }
392
393 #define ATTACHER(c) (((struct IPaux*)((c)->aux))->owner)
394
395 static struct chan *ipattach(char *spec)
396 {
397         struct chan *c;
398         int dev;
399
400         dev = atoi(spec);
401         if (dev >= Nfs)
402                 error(EFAIL, "bad specification");
403
404         ipgetfs(dev);
405         c = devattach(devname(), spec);
406         mkqid(&c->qid, QID(0, 0, Qtopdir), 0, QTDIR);
407         c->dev = dev;
408
409         c->aux = newipaux(commonuser(), "none");
410
411         return c;
412 }
413
414 static struct walkqid *ipwalk(struct chan *c, struct chan *nc, char **name,
415                                                           unsigned int nname)
416 {
417         struct IPaux *a = c->aux;
418         struct walkqid *w;
419
420         w = devwalk(c, nc, name, nname, NULL, 0, ipgen);
421         if (w != NULL && w->clone != NULL)
422                 w->clone->aux = newipaux(a->owner, a->tag);
423         return w;
424 }
425
426 static size_t ipstat(struct chan *c, uint8_t *db, size_t n)
427 {
428         return devstat(c, db, n, NULL, 0, ipgen);
429 }
430
431 static int should_wake(void *arg)
432 {
433         struct conv *cv = arg;
434         /* signal that the conv is closed */
435         if (qisclosed(cv->rq))
436                 return TRUE;
437         return cv->incall != NULL;
438 }
439
440 static struct chan *ipopen(struct chan *c, int omode)
441 {
442         ERRSTACK(2);
443         struct conv *cv, *nc;
444         struct Proto *p;
445         int perm;
446         struct Fs *f;
447
448         /* perm is a lone rwx, not the rwx------ from the conversion */
449         perm = omode_to_rwx(omode) >> 6;
450
451         f = ipfs[c->dev];
452
453         switch (TYPE(c->qid)) {
454                 default:
455                         break;
456                 case Qndb:
457                         if (omode & (O_WRITE | O_TRUNC) && !iseve())
458                                 error(EPERM, ERROR_FIXME);
459                         if ((omode & (O_WRITE | O_TRUNC)) == (O_WRITE | O_TRUNC))
460                                 f->ndb[0] = 0;
461                         break;
462                 case Qlog:
463                         netlogopen(f);
464                         break;
465                 case Qiprouter:
466                         iprouteropen(f);
467                         break;
468                 case Qiproute:
469                         c->synth_buf = kpages_zalloc(IPROUTE_LEN, MEM_WAIT);
470                         routeread(f, c->synth_buf, 0, IPROUTE_LEN);
471                         break;
472                 case Qtopdir:
473                 case Qprotodir:
474                 case Qconvdir:
475                 case Qstatus:
476                 case Qremote:
477                 case Qlocal:
478                 case Qstats:
479                 case Qipselftab:
480                         if (omode & O_WRITE)
481                                 error(EPERM, ERROR_FIXME);
482                         break;
483                 case Qsnoop:
484                         if (omode & O_WRITE)
485                                 error(EPERM, ERROR_FIXME);
486                         /* might be racy.  note the lack of a proto lock, unlike Qdata */
487                         p = f->p[PROTO(c->qid)];
488                         cv = p->conv[CONV(c->qid)];
489                         if (strcmp(ATTACHER(c), cv->owner) != 0 && !iseve())
490                                 error(EPERM, ERROR_FIXME);
491                         atomic_inc(&cv->snoopers);
492                         break;
493                 case Qclone:
494                         p = f->p[PROTO(c->qid)];
495                         qlock(&p->qlock);
496                         if (waserror()) {
497                                 qunlock(&p->qlock);
498                                 nexterror();
499                         }
500                         cv = Fsprotoclone(p, ATTACHER(c));
501                         qunlock(&p->qlock);
502                         poperror();
503                         if (cv == NULL) {
504                                 error(ENODEV, "Null conversation from Fsprotoclone");
505                                 break;
506                         }
507                         mkqid(&c->qid, QID(p->x, cv->x, Qctl), 0, QTFILE);
508                         break;
509                 case Qdata:
510                 case Qctl:
511                 case Qerr:
512                         p = f->p[PROTO(c->qid)];
513                         qlock(&p->qlock);
514                         cv = p->conv[CONV(c->qid)];
515                         qlock(&cv->qlock);
516                         if (waserror()) {
517                                 qunlock(&cv->qlock);
518                                 qunlock(&p->qlock);
519                                 nexterror();
520                         }
521                         if ((perm & (cv->perm >> 6)) != perm) {
522                                 if (strcmp(ATTACHER(c), cv->owner) != 0)
523                                         error(EPERM, ERROR_FIXME);
524                                 if ((perm & cv->perm) != perm)
525                                         error(EPERM, ERROR_FIXME);
526
527                         }
528                         cv->inuse++;
529                         if (cv->inuse == 1) {
530                                 kstrdup(&cv->owner, ATTACHER(c));
531                                 cv->perm = 0660;
532                         }
533                         qunlock(&cv->qlock);
534                         qunlock(&p->qlock);
535                         poperror();
536                         break;
537                 case Qlisten:
538                         cv = f->p[PROTO(c->qid)]->conv[CONV(c->qid)];
539                         /* No permissions or Announce checks required.  We'll see if that's
540                          * a good idea or not. (the perm check would do nothing, as is,
541                          * since an O_PATH perm is 0).
542                          *
543                          * But we probably want to incref to keep the conversation around
544                          * until this FD/chan is closed.  #ip is a little weird in that
545                          * objects never really go away (high water mark for convs, you can
546                          * always find them in the ns).  I think it is possible to
547                          * namec/ipgen a chan, then have that conv close, then have that
548                          * chan be opened.  You can probably do this with a data file. */
549                         if (omode & O_PATH) {
550                                 qlock(&cv->qlock);
551                                 cv->inuse++;
552                                 qunlock(&cv->qlock);
553                                 break;
554                         }
555                         if ((perm & (cv->perm >> 6)) != perm) {
556                                 if (strcmp(ATTACHER(c), cv->owner) != 0)
557                                         error(EPERM, ERROR_FIXME);
558                                 if ((perm & cv->perm) != perm)
559                                         error(EPERM, ERROR_FIXME);
560
561                         }
562
563                         if (cv->state != Announced)
564                                 error(EFAIL, "not announced");
565
566                         if (waserror()) {
567                                 closeconv(cv);
568                                 nexterror();
569                         }
570                         qlock(&cv->qlock);
571                         cv->inuse++;
572                         qunlock(&cv->qlock);
573
574                         nc = NULL;
575                         while (nc == NULL) {
576                                 /* give up if we got a hangup */
577                                 if (qisclosed(cv->rq))
578                                         error(EFAIL, "listen hungup");
579
580                                 qlock(&cv->listenq);
581                                 if (waserror()) {
582                                         qunlock(&cv->listenq);
583                                         nexterror();
584                                 }
585                                 /* we can peek at incall without grabbing the cv qlock.  if
586                                  * anything is there, it'll remain there until we dequeue it.
587                                  * no one else can, since we hold the listenq lock */
588                                 if ((c->flag & O_NONBLOCK) && !cv->incall)
589                                         error(EAGAIN, "listen queue empty");
590                                 /* wait for a connect */
591                                 rendez_sleep(&cv->listenr, should_wake, cv);
592
593                                 /* if there is a concurrent hangup, they will hold the qlock
594                                  * until the hangup is complete, including closing the cv->rq */
595                                 qlock(&cv->qlock);
596                                 nc = cv->incall;
597                                 if (nc != NULL) {
598                                         cv->incall = nc->next;
599                                         mkqid(&c->qid, QID(PROTO(c->qid), nc->x, Qctl), 0, QTFILE);
600                                         kstrdup(&cv->owner, ATTACHER(c));
601                                 }
602                                 qunlock(&cv->qlock);
603
604                                 qunlock(&cv->listenq);
605                                 poperror();
606                         }
607                         closeconv(cv);
608                         poperror();
609                         break;
610         }
611         c->mode = openmode(omode);
612         c->flag |= COPEN;
613         c->offset = 0;
614         return c;
615 }
616
617 static size_t ipwstat(struct chan *c, uint8_t *dp, size_t n)
618 {
619         ERRSTACK(2);
620         struct dir *d;
621         struct conv *cv;
622         struct Fs *f;
623         struct Proto *p;
624
625         f = ipfs[c->dev];
626         switch (TYPE(c->qid)) {
627                 default:
628                         error(EPERM, ERROR_FIXME);
629                         break;
630                 case Qctl:
631                 case Qdata:
632                         break;
633         }
634
635         d = kzmalloc(sizeof(*d) + n, 0);
636         if (waserror()) {
637                 kfree(d);
638                 nexterror();
639         }
640         n = convM2D(dp, n, d, (char *)&d[1]);
641         if (n == 0)
642                 error(ENODATA, ERROR_FIXME);
643         p = f->p[PROTO(c->qid)];
644         cv = p->conv[CONV(c->qid)];
645         if (!iseve() && strcmp(ATTACHER(c), cv->owner) != 0)
646                 error(EPERM, ERROR_FIXME);
647         if (!emptystr(d->uid))
648                 kstrdup(&cv->owner, d->uid);
649         if (d->mode != -1)
650                 cv->perm = d->mode & 0777;
651         poperror();
652         kfree(d);
653         return n;
654 }
655
656 /* Should be able to handle any file type chan. Feel free to extend it. */
657 static char *ipchaninfo(struct chan *ch, char *ret, size_t ret_l)
658 {
659         struct conv *conv;
660         struct Proto *proto;
661         char *p;
662         struct Fs *f;
663
664         f = ipfs[ch->dev];
665
666         switch (TYPE(ch->qid)) {
667                 default:
668                         ret = "Unknown type";
669                         break;
670                 case Qdata:
671                         proto = f->p[PROTO(ch->qid)];
672                         conv = proto->conv[CONV(ch->qid)];
673                         snprintf(ret, ret_l,
674                                  "Qdata, %s, proto %s, conv idx %d, rq len %d, wq len %d, total read %llu",
675                                  SLIST_EMPTY(&conv->data_taps) ? "untapped" : "tapped",
676                                  proto->name, conv->x, qlen(conv->rq), qlen(conv->wq),
677                                          q_bytes_read(conv->rq));
678                         break;
679                 case Qarp:
680                         ret = "Qarp";
681                         break;
682                 case Qiproute:
683                         ret = "Qiproute";
684                         break;
685                 case Qlisten:
686                         proto = f->p[PROTO(ch->qid)];
687                         conv = proto->conv[CONV(ch->qid)];
688                         snprintf(ret, ret_l,
689                                  "Qlisten, %s proto %s, conv idx %d, has %sincalls",
690                                  SLIST_EMPTY(&conv->listen_taps) ? "untapped" : "tapped",
691                                  proto->name, conv->x, conv->incall ? "" : "no ");
692                         break;
693                 case Qlog:
694                         ret = "Qlog";
695                         break;
696                 case Qndb:
697                         ret = "Qndb";
698                         break;
699                 case Qctl:
700                         proto = f->p[PROTO(ch->qid)];
701                         conv = proto->conv[CONV(ch->qid)];
702                         snprintf(ret, ret_l, "Qctl, proto %s, conv idx %d", proto->name,
703                                          conv->x);
704                         break;
705         }
706         return ret;
707 }
708
709 static void closeconv(struct conv *cv)
710 {
711         ERRSTACK(1);
712         struct conv *nc;
713         struct Ipmulti *mp;
714
715         qlock(&cv->qlock);
716
717         if (--cv->inuse > 0) {
718                 qunlock(&cv->qlock);
719                 return;
720         }
721         if (waserror()) {
722                 qunlock(&cv->qlock);
723                 nexterror();
724         }
725         /* close all incoming calls since no listen will ever happen */
726         for (nc = cv->incall; nc; nc = cv->incall) {
727                 cv->incall = nc->next;
728                 closeconv(nc);
729         }
730         cv->incall = NULL;
731
732         kstrdup(&cv->owner, network);
733         cv->perm = 0660;
734
735         while ((mp = cv->multi) != NULL)
736                 ipifcremmulti(cv, mp->ma, mp->ia);
737
738         cv->r = NULL;
739         cv->rgen = 0;
740         if (cv->state == Bypass)
741                 undo_proto_qio_bypass(cv);
742         cv->p->close(cv);
743         cv->state = Idle;
744         qunlock(&cv->qlock);
745         poperror();
746 }
747
748 static void ipclose(struct chan *c)
749 {
750         struct Fs *f;
751
752         f = ipfs[c->dev];
753         switch (TYPE(c->qid)) {
754                 default:
755                         break;
756                 case Qlog:
757                         if (c->flag & COPEN)
758                                 netlogclose(f);
759                         break;
760                 case Qiprouter:
761                         if (c->flag & COPEN)
762                                 iprouterclose(f);
763                         break;
764                 case Qdata:
765                 case Qctl:
766                 case Qerr:
767                 case Qlisten:
768                         if (c->flag & COPEN)
769                                 closeconv(f->p[PROTO(c->qid)]->conv[CONV(c->qid)]);
770                         break;
771                 case Qsnoop:
772                         if (c->flag & COPEN)
773                                 atomic_dec(&f->p[PROTO(c->qid)]->conv[CONV(c->qid)]->snoopers);
774                         break;
775                 case Qiproute:
776                         if (c->flag & COPEN)
777                                 kpages_free(c->synth_buf, IPROUTE_LEN);
778                         break;
779         }
780         kfree(((struct IPaux *)c->aux)->owner);
781         kfree(c->aux);
782 }
783
/* Size, in bytes, of the scratch buffers ipread allocates for formatting. */
enum { Statelen = 32 * 1024 };
787
788 static size_t ipread(struct chan *ch, void *a, size_t n, off64_t off)
789 {
790         struct conv *c;
791         struct Proto *x;
792         char *buf, *p;
793         long rv;
794         struct Fs *f;
795         uint32_t offset = off;
796
797         f = ipfs[ch->dev];
798
799         p = a;
800         switch (TYPE(ch->qid)) {
801                 default:
802                         error(EPERM, ERROR_FIXME);
803                 case Qtopdir:
804                 case Qprotodir:
805                 case Qconvdir:
806                         return devdirread(ch, a, n, 0, 0, ipgen);
807                 case Qarp:
808                         return arpread(f->arp, a, offset, n);
809                 case Qndb:
810                         return readstr(offset, a, n, f->ndb);
811                 case Qiproute:
812                         return readmem(offset, a, n, ch->synth_buf, IPROUTE_LEN);
813                 case Qiprouter:
814                         return iprouterread(f, a, n);
815                 case Qipselftab:
816                         return ipselftabread(f, a, offset, n);
817                 case Qlog:
818                         return netlogread(f, a, offset, n);
819                 case Qctl:
820                         snprintf(get_cur_genbuf(), GENBUF_SZ, "%lu", CONV(ch->qid));
821                         return readstr(offset, p, n, get_cur_genbuf());
822                 case Qremote:
823                         buf = kzmalloc(Statelen, 0);
824                         x = f->p[PROTO(ch->qid)];
825                         c = x->conv[CONV(ch->qid)];
826                         if (x->remote == NULL) {
827                                 snprintf(buf, Statelen, "%I!%d\n", c->raddr, c->rport);
828                         } else {
829                                 (*x->remote) (c, buf, Statelen - 2);
830                         }
831                         rv = readstr(offset, p, n, buf);
832                         kfree(buf);
833                         return rv;
834                 case Qlocal:
835                         buf = kzmalloc(Statelen, 0);
836                         x = f->p[PROTO(ch->qid)];
837                         c = x->conv[CONV(ch->qid)];
838                         if (x->local == NULL) {
839                                 snprintf(buf, Statelen, "%I!%d\n", c->laddr, c->lport);
840                         } else {
841                                 (*x->local) (c, buf, Statelen - 2);
842                         }
843                         rv = readstr(offset, p, n, buf);
844                         kfree(buf);
845                         return rv;
846                 case Qstatus:
847                         /* this all is a bit screwed up since the size of some state's
848                          * buffers will change from one invocation to another.  a reader
849                          * will come in and read the entire buffer.  then it will come again
850                          * and read from the next offset, expecting EOF.  if the buffer
851                          * changed sizes, it'll reprint the end of the buffer slightly. */
852                         buf = kzmalloc(Statelen, 0);
853                         x = f->p[PROTO(ch->qid)];
854                         c = x->conv[CONV(ch->qid)];
855                         if (c->state == Bypass)
856                                 snprintf(buf, Statelen, "Bypassed\n");
857                         else
858                                 (*x->state)(c, buf, Statelen - 2);
859                         rv = readstr(offset, p, n, buf);
860                         kfree(buf);
861                         return rv;
862                 case Qdata:
863                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
864                         if (ch->flag & O_NONBLOCK)
865                                 return qread_nonblock(c->rq, a, n);
866                         else
867                                 return qread(c->rq, a, n);
868                 case Qerr:
869                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
870                         return qread(c->eq, a, n);
871                 case Qsnoop:
872                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
873                         return qread(c->sq, a, n);
874                 case Qstats:
875                         x = f->p[PROTO(ch->qid)];
876                         if (x->stats == NULL)
877                                 error(EFAIL, "stats not implemented");
878                         buf = kzmalloc(Statelen, 0);
879                         (*x->stats) (x, buf, Statelen);
880                         rv = readstr(offset, p, n, buf);
881                         kfree(buf);
882                         return rv;
883         }
884 }
885
886 static struct block *ipbread(struct chan *ch, size_t n, off64_t offset)
887 {
888         struct conv *c;
889
890         switch (TYPE(ch->qid)) {
891                 case Qdata:
892                         c = chan2conv(ch);
893                         if (ch->flag & O_NONBLOCK)
894                                 return qbread_nonblock(c->rq, n);
895                         else
896                                 return qbread(c->rq, n);
897                 default:
898                         return devbread(ch, n, offset);
899         }
900 }
901
902 /*
903  *  set local address to be that of the ifc closest to remote address
904  */
905 static void setladdr(struct conv *c)
906 {
907         findlocalip(c->p->f, c->laddr, c->raddr);
908 }
909
910 /*
911  *  set a local port making sure the quad of raddr,rport,laddr,lport is unique
912  */
913 static void setluniqueport(struct conv *c, int lport)
914 {
915         struct Proto *p;
916         struct conv *xp;
917         int x;
918
919         p = c->p;
920
921         qlock(&p->qlock);
922         for (x = 0; x < p->nc; x++) {
923                 xp = p->conv[x];
924                 if (xp == NULL)
925                         break;
926                 if (xp == c)
927                         continue;
928                 if ((xp->state == Connected || xp->state == Announced
929                                             || xp->state == Bypass)
930                         && xp->lport == lport
931                         && xp->rport == c->rport
932                         && ipcmp(xp->raddr, c->raddr) == 0
933                         && ipcmp(xp->laddr, c->laddr) == 0) {
934                         qunlock(&p->qlock);
935                         error(EFAIL, "address in use");
936                 }
937         }
938         c->lport = lport;
939         qunlock(&p->qlock);
940 }
941
942 /*
943  *  pick a local port and set it
944  */
945 static void setlport(struct conv *c)
946 {
947         struct Proto *p;
948         uint16_t *pp;
949         int x, found;
950
951         p = c->p;
952         if (c->restricted)
953                 pp = &p->nextrport;
954         else
955                 pp = &p->nextport;
956         qlock(&p->qlock);
957         for (;; (*pp)++) {
958                 /*
959                  * Fsproto initialises p->nextport to 0 and the restricted
960                  * ports (p->nextrport) to 600.
961                  * Restricted ports must lie between 600 and 1024.
962                  * For the initial condition or if the unrestricted port number
963                  * has wrapped round, select a random port between 5000 and 1<<15
964                  * to start at.
965                  */
966                 if (c->restricted) {
967                         if (*pp >= 1024)
968                                 *pp = 600;
969                 } else
970                         while (*pp < 5000)
971                                 urandom_read(pp, sizeof(*pp));
972
973                 found = 0;
974                 for (x = 0; x < p->nc; x++) {
975                         if (p->conv[x] == NULL)
976                                 break;
977                         if (p->conv[x]->lport == *pp) {
978                                 found = 1;
979                                 break;
980                         }
981                 }
982                 if (!found)
983                         break;
984         }
985         c->lport = (*pp)++;
986         qunlock(&p->qlock);
987 }
988
989 /*
990  *  set a local address and port from a string of the form
991  *      [address!]port[!r]
992  */
993 static void setladdrport(struct conv *c, char *str, int announcing)
994 {
995         char *p;
996         uint16_t lport;
997         uint8_t addr[IPaddrlen];
998
999         /*
1000          *  ignore restricted part if it exists.  it's
1001          *  meaningless on local ports.
1002          */
1003         p = strchr(str, '!');
1004         if (p != NULL) {
1005                 *p++ = 0;
1006                 if (strcmp(p, "r") == 0)
1007                         p = NULL;
1008         }
1009
1010         c->lport = 0;
1011         if (p == NULL) {
1012                 if (announcing)
1013                         ipmove(c->laddr, IPnoaddr);
1014                 else
1015                         setladdr(c);
1016                 p = str;
1017         } else {
1018                 if (strcmp(str, "*") == 0)
1019                         ipmove(c->laddr, IPnoaddr);
1020                 else {
1021                         parseip(addr, str);
1022                         if (ipforme(c->p->f, addr))
1023                                 ipmove(c->laddr, addr);
1024                         else
1025                                 error(EFAIL, "not a local IP address");
1026                 }
1027         }
1028
1029         /* one process can get all connections */
1030         if (announcing && strcmp(p, "*") == 0) {
1031                 if (!iseve())
1032                         error(EPERM, ERROR_FIXME);
1033                 setluniqueport(c, 0);
1034         }
1035
1036         lport = atoi(p);
1037         if (lport <= 0)
1038                 setlport(c);
1039         else
1040                 setluniqueport(c, lport);
1041 }
1042
1043 static void setraddrport(struct conv *c, char *str)
1044 {
1045         char *p;
1046
1047         p = strchr(str, '!');
1048         if (p == NULL)
1049                 error(EFAIL, "malformed address");
1050         *p++ = 0;
1051         parseip(c->raddr, str);
1052         c->rport = atoi(p);
1053         p = strchr(p, '!');
1054         if (p) {
1055                 if (strstr(p, "!r") != NULL)
1056                         c->restricted = 1;
1057         }
1058 }
1059
1060 /*
1061  *  called by protocol connect routine to set addresses
1062  */
1063 void Fsstdconnect(struct conv *c, char *argv[], int argc)
1064 {
1065         switch (argc) {
1066                 default:
1067                         error(EINVAL, "bad args to %s", __func__);
1068                 case 2:
1069                         setraddrport(c, argv[1]);
1070                         setladdr(c);
1071                         setlport(c);
1072                         break;
1073                 case 3:
1074                         setraddrport(c, argv[1]);
1075                         setladdrport(c, argv[2], 0);
1076                         break;
1077         }
1078
1079         /* TODO: why is an IPnoaddr (in v6 format, equivalent to v6Unspecified),
1080          * a v4 format? */
1081         if ((memcmp(c->raddr, v4prefix, IPv4off) == 0 &&
1082                  memcmp(c->laddr, v4prefix, IPv4off) == 0)
1083                 || ipcmp(c->raddr, IPnoaddr) == 0)
1084                 c->ipversion = V4;
1085         else
1086                 c->ipversion = V6;
1087         /* Linux has taught people to use zeros for local interfaces.  TODO: We
1088          * might need this for v6 in the future. */
1089         if (!ipcmp(c->raddr, IPv4_zeroes))
1090                 ipmove(c->raddr, IPv4_loopback);
1091 }
1092
1093 /*
1094  *  initiate connection and sleep till its set up
1095  */
1096 static int connected(void *a)
1097 {
1098         return ((struct conv *)a)->state == Connected;
1099 }
1100
1101 static void connectctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb,
1102                           struct chan *chan)
1103 {
1104         ERRSTACK(1);
1105         char *p;
1106
1107         if (c->state != 0)
1108                 error(EBUSY, ERROR_FIXME);
1109         c->state = Connecting;
1110         c->cerr[0] = '\0';
1111         if (x->connect == NULL)
1112                 error(EFAIL, "connect not supported");
1113         /* It's up to the proto connect method to not block the kthread.  This is
1114          * currently the case for e.g. TCP. */
1115         x->connect(c, cb->f, cb->nf);
1116         /* This is notionally right before the rendez_sleep: either we block or we
1117          * kick back to userspace.  We do this before the unlock to avoid races with
1118          * c->state (rendez's internal lock deals with its race with the waker) and
1119          * to avoid the excessive unlock and relock.
1120          *
1121          * Also, it's important that we don't do anything important for the
1122          * functionality of the conv after the rendez sleep.  The non-blocking style
1123          * won't call back into the kernel - it just wants the event.  I considered
1124          * allowing multiple connect calls, where we just return if it was already
1125          * connected, but that would break UDP, which allows multiple different
1126          * connect calls. */
1127         if ((chan->flag & O_NONBLOCK) && !connected(c))
1128                 error(EINPROGRESS, "connection not ready yet");
1129         qunlock(&c->qlock);
1130         if (waserror()) {
1131                 qlock(&c->qlock);
1132                 nexterror();
1133         }
1134         rendez_sleep(&c->cr, connected, c);
1135         qlock(&c->qlock);
1136         poperror();
1137
1138         if (c->cerr[0] != '\0')
1139                 error(EFAIL, c->cerr);
1140 }
1141
1142 /*
1143  *  called by protocol announce routine to set addresses
1144  */
1145 void Fsstdannounce(struct conv *c, char *argv[], int argc)
1146 {
1147         memset(c->raddr, 0, sizeof(c->raddr));
1148         c->rport = 0;
1149         switch (argc) {
1150                 default:
1151                         error(EINVAL, "bad args to announce");
1152                 case 2:
1153                         setladdrport(c, argv[1], 1);
1154                         break;
1155         }
1156 }
1157
1158 /*
1159  *  initiate announcement and sleep till its set up
1160  */
1161 static int announced(void *a)
1162 {
1163         return ((struct conv *)a)->state == Announced;
1164 }
1165
1166 static void announcectlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1167 {
1168         ERRSTACK(1);
1169         char *p;
1170
1171         if (c->state != 0)
1172                 error(EBUSY, ERROR_FIXME);
1173         c->state = Announcing;
1174         c->cerr[0] = '\0';
1175         if (x->announce == NULL)
1176                 error(EFAIL, "announce not supported");
1177         x->announce(c, cb->f, cb->nf);
1178
1179         qunlock(&c->qlock);
1180         if (waserror()) {
1181                 qlock(&c->qlock);
1182                 nexterror();
1183         }
1184         rendez_sleep(&c->cr, announced, c);
1185         qlock(&c->qlock);
1186         poperror();
1187
1188         if (c->cerr[0] != '\0')
1189                 error(EFAIL, c->cerr);
1190 }
1191
1192 /*
1193  *  called by protocol bind routine to set addresses
1194  */
1195 void Fsstdbind(struct conv *c, char *argv[], int argc)
1196 {
1197         switch (argc) {
1198                 default:
1199                         error(EINVAL, "bad args to bind");
1200                 case 2:
1201                         setladdrport(c, argv[1], 0);
1202                         break;
1203         }
1204 }
1205
1206 static void bindctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1207 {
1208         if (x->bind == NULL)
1209                 Fsstdbind(c, cb->f, cb->nf);
1210         else
1211                 x->bind(c, cb->f, cb->nf);
1212 }
1213
1214 /* Helper, called by protocols to use the bypass.
1215  *
1216  * This is a bit nasty due to the overall nastiness of #ip.  We need to lock
1217  * before checking the state and hold the qlock throughout, because a concurrent
1218  * closeconv() could tear down the bypass.  Specifically, it could free the
1219  * bypass queues.  The root issue is that conversation lifetimes are not managed
1220  * well.
1221  *
1222  * If we fail, it's our responsibility to consume (free) the block(s). */
1223 void bypass_or_drop(struct conv *cv, struct block *bp)
1224 {
1225         qlock(&cv->qlock);
1226         if (cv->state == Bypass)
1227                 qpass(cv->rq, bp);
1228         else
1229                 freeblist(bp);
1230         qunlock(&cv->qlock);
1231 }
1232
1233 /* Push the block directly to the approprite ipoput function.
1234  *
1235  * It's the protocol's responsibility (and thus ours here) to make sure there is
1236  * at least the right amount of the IP header in the block (ipoput{4,6} assumes
1237  * it has the right amount, and the other protocols account for the IP header in
1238  * their own header).
1239  *
1240  * For the TTL and TOS, we just use the default ones.  If we want, we could look
1241  * into the actual block and see what the user wanted, though we're bypassing
1242  * the protocol layer, not the IP layer. */
1243 static void proto_bypass_kick(void *arg, struct block *bp)
1244 {
1245         struct conv *cv = (struct conv*)arg;
1246         uint8_t vers_nibble;
1247         struct Fs *f;
1248
1249         f = cv->p->f;
1250
1251         bp = pullupblock(bp, 1);
1252         if (!bp)
1253                 error(EINVAL, "Proto bypass unable to pullup a byte!");
1254         vers_nibble = *(uint8_t*)bp->rp & 0xf0;
1255         switch (vers_nibble) {
1256         case IP_VER4:
1257                 bp = pullupblock(bp, IPV4HDR_LEN);
1258                 if (!bp)
1259                         error(EINVAL, "Proto bypass unable to pullup v4 header");
1260                 ipoput4(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1261                 break;
1262         case IP_VER6:
1263                 bp = pullupblock(bp, IPV6HDR_LEN);
1264                 if (!bp)
1265                         error(EINVAL, "Proto bypass unable to pullup v6 header");
1266                 ipoput6(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1267                 break;
1268         default:
1269                 error(EINVAL, "Proto bypass block had unknown IP version 0x%x",
1270                       vers_nibble);
1271         }
1272 }
1273
1274 /* Sets up cv for the protocol bypass.  We use different queues for two reasons:
1275  * 1) To be protocol independent.  For instance, TCP and UDP could use very
1276  * different QIO styles.
1277  * 2) To set up our own kick/bypass method.  Note how udpcreate() and here uses
1278  * qbypass() (just blast it out), while TCP uses qopen() with a kick.  TCP still
1279  * follows queuing discipline.
1280  *
1281  * It's like we are our own protocol, the bypass protocol, when it comes to how
1282  * we interact with qio.  The conv still is of the real protocol type (e.g.
1283  * TCP).
1284  *
1285  * Note that we can't free the old queues.  The way #ip works, the queues are
1286  * created when the conv is created, but the conv is never freed.  It's like a
1287  * slab allocator that never frees objects, but just reinitializes them a
1288  * little.
1289  *
1290  * For the queues, we're basically like UDP:
1291  * - We take packets for rq and drop on overflow.
1292  * - rq is also Qmsg, but we also have Qcoalesce, to ignore out zero-len blocks
1293  * - We kick for our outbound (wq) messages.
1294  *
1295  * Note that Qmsg can drop parts of packets.  It's up to the user to read
1296  * enough.  If they didn't read enough, the extra is dropped.  This is similar
1297  * to SOCK_DGRAM and recvfrom().  Minus major changes, there's no nice way to
1298  * get individual messages with read().  Userspace using the bypass will need to
1299  * find out the MTU of the NIC the IP stack is attached to, and make sure to
1300  * read in at least that amount each time. */
1301 static void setup_proto_qio_bypass(struct conv *cv)
1302 {
1303         cv->rq_save = cv->rq;
1304         cv->wq_save = cv->wq;
1305         cv->rq = qopen(BYPASS_QMAX, Qmsg | Qcoalesce, 0, 0);
1306         cv->wq = qbypass(proto_bypass_kick, cv);
1307 }
1308
1309 static void undo_proto_qio_bypass(struct conv *cv)
1310 {
1311         qfree(cv->rq);
1312         qfree(cv->wq);
1313         cv->rq = cv->rq_save;
1314         cv->wq = cv->wq_save;
1315         cv->rq_save = NULL;
1316         cv->wq_save = NULL;
1317 }
1318
1319 void Fsstdbypass(struct conv *cv, char *argv[], int argc)
1320 {
1321         memset(cv->raddr, 0, sizeof(cv->raddr));
1322         cv->rport = 0;
1323         switch (argc) {
1324         case 2:
1325                 setladdrport(cv, argv[1], 1);
1326                 break;
1327         default:
1328                 error(EINVAL, "Bad args (was %d, need 2) to bypass", argc);
1329         }
1330 }
1331
1332 static void bypassctlmsg(struct Proto *x, struct conv *cv, struct cmdbuf *cb)
1333 {
1334         if (!x->bypass)
1335                 error(EFAIL, "Protocol %s does not support bypass", x->name);
1336         /* The protocol needs to set the port (usually by calling Fsstdbypass) and
1337          * then do whatever it needs to make sure it can find the conv again during
1338          * receive (usually by adding to a hash table). */
1339         x->bypass(cv, cb->f, cb->nf);
1340         setup_proto_qio_bypass(cv);
1341         cv->state = Bypass;
1342 }
1343
1344 static void shutdownctlmsg(struct conv *cv, struct cmdbuf *cb)
1345 {
1346         if (cb->nf < 2)
1347                 goto err;
1348         if (!strcmp(cb->f[1], "rd")) {
1349                 qhangup(cv->rq, "shutdown");
1350                 if (cv->p->shutdown)
1351                         cv->p->shutdown(cv, SHUT_RD);
1352         } else if (!strcmp(cb->f[1], "wr")) {
1353                 qhangup(cv->wq, "shutdown");
1354                 if (cv->p->shutdown)
1355                         cv->p->shutdown(cv, SHUT_WR);
1356         } else if (!strcmp(cb->f[1], "rdwr")) {
1357                 qhangup(cv->rq, "shutdown");
1358                 qhangup(cv->wq, "shutdown");
1359                 if (cv->p->shutdown)
1360                         cv->p->shutdown(cv, SHUT_RDWR);
1361         } else {
1362                 goto err;
1363         }
1364         return;
1365 err:
1366         error(EINVAL, "shutdown [rx|tx|rxtx]");
1367 }
1368
1369 static void tosctlmsg(struct conv *c, struct cmdbuf *cb)
1370 {
1371         if (cb->nf < 2)
1372                 c->tos = 0;
1373         else
1374                 c->tos = atoi(cb->f[1]);
1375 }
1376
1377 static void ttlctlmsg(struct conv *c, struct cmdbuf *cb)
1378 {
1379         if (cb->nf < 2)
1380                 c->ttl = MAXTTL;
1381         else
1382                 c->ttl = atoi(cb->f[1]);
1383 }
1384
1385 /* Binds a conversation, as if the user wrote "bind *" into ctl. */
1386 static void autobind(struct conv *cv)
1387 {
1388         ERRSTACK(1);
1389         struct cmdbuf *cb;
1390
1391         cb = parsecmd("bind *", 7);
1392         if (waserror()) {
1393                 kfree(cb);
1394                 nexterror();
1395         }
1396         bindctlmsg(cv->p, cv, cb);
1397         poperror();
1398         kfree(cb);
1399 }
1400
1401 static size_t ipwrite(struct chan *ch, void *v, size_t n, off64_t off)
1402 {
1403         ERRSTACK(1);
1404         struct conv *c;
1405         struct Proto *x;
1406         char *p;
1407         struct cmdbuf *cb;
1408         uint8_t ia[IPaddrlen], ma[IPaddrlen];
1409         struct Fs *f;
1410         char *a;
1411
1412         a = v;
1413         f = ipfs[ch->dev];
1414
1415         switch (TYPE(ch->qid)) {
1416                 default:
1417                         error(EPERM, ERROR_FIXME);
1418                 case Qdata:
1419                         x = f->p[PROTO(ch->qid)];
1420                         c = x->conv[CONV(ch->qid)];
1421                         /* connection-less protocols (UDP) can write without manually
1422                          * binding. */
1423                         if (c->lport == 0)
1424                                 autobind(c);
1425                         if (ch->flag & O_NONBLOCK)
1426                                 qwrite_nonblock(c->wq, a, n);
1427                         else
1428                                 qwrite(c->wq, a, n);
1429                         break;
1430                 case Qarp:
1431                         return arpwrite(f, a, n);
1432                 case Qiproute:
1433                         return routewrite(f, ch, a, n);
1434                 case Qlog:
1435                         netlogctl(f, a, n);
1436                         return n;
1437                 case Qndb:
1438                         return ndbwrite(f, a, off, n);
1439                 case Qctl:
1440                         x = f->p[PROTO(ch->qid)];
1441                         c = x->conv[CONV(ch->qid)];
1442                         cb = parsecmd(a, n);
1443
1444                         qlock(&c->qlock);
1445                         if (waserror()) {
1446                                 qunlock(&c->qlock);
1447                                 kfree(cb);
1448                                 nexterror();
1449                         }
1450                         if (cb->nf < 1)
1451                                 error(EFAIL, "short control request");
1452                         if (strcmp(cb->f[0], "connect") == 0)
1453                                 connectctlmsg(x, c, cb, ch);
1454                         else if (strcmp(cb->f[0], "announce") == 0)
1455                                 announcectlmsg(x, c, cb);
1456                         else if (strcmp(cb->f[0], "bind") == 0)
1457                                 bindctlmsg(x, c, cb);
1458                         else if (strcmp(cb->f[0], "bypass") == 0)
1459                                 bypassctlmsg(x, c, cb);
1460                         else if (strcmp(cb->f[0], "shutdown") == 0)
1461                                 shutdownctlmsg(c, cb);
1462                         else if (strcmp(cb->f[0], "ttl") == 0)
1463                                 ttlctlmsg(c, cb);
1464                         else if (strcmp(cb->f[0], "tos") == 0)
1465                                 tosctlmsg(c, cb);
1466                         else if (strcmp(cb->f[0], "ignoreadvice") == 0)
1467                                 c->ignoreadvice = 1;
1468                         else if (strcmp(cb->f[0], "addmulti") == 0) {
1469                                 if (cb->nf < 2)
1470                                         error(EFAIL, "addmulti needs interface address");
1471                                 if (cb->nf == 2) {
1472                                         if (!ipismulticast(c->raddr))
1473                                                 error(EFAIL, "addmulti for a non multicast address");
1474                                         parseip(ia, cb->f[1]);
1475                                         ipifcaddmulti(c, c->raddr, ia);
1476                                 } else {
1477                                         parseip(ma, cb->f[2]);
1478                                         if (!ipismulticast(ma))
1479                                                 error(EFAIL, "addmulti for a non multicast address");
1480                                         parseip(ia, cb->f[1]);
1481                                         ipifcaddmulti(c, ma, ia);
1482                                 }
1483                         } else if (strcmp(cb->f[0], "remmulti") == 0) {
1484                                 if (cb->nf < 2)
1485                                         error(EFAIL, "remmulti needs interface address");
1486                                 if (!ipismulticast(c->raddr))
1487                                         error(EFAIL, "remmulti for a non multicast address");
1488                                 parseip(ia, cb->f[1]);
1489                                 ipifcremmulti(c, c->raddr, ia);
1490                         } else if (x->ctl != NULL) {
1491                                 x->ctl(c, cb->f, cb->nf);
1492                         } else
1493                                 error(EFAIL, "unknown control request");
1494                         qunlock(&c->qlock);
1495                         kfree(cb);
1496                         poperror();
1497         }
1498         return n;
1499 }
1500
1501 static size_t ipbwrite(struct chan *ch, struct block *bp, off64_t offset)
1502 {
1503         struct conv *c;
1504         size_t n;
1505
1506         switch (TYPE(ch->qid)) {
1507                 case Qdata:
1508                         c = chan2conv(ch);
1509                         if (bp->next)
1510                                 bp = concatblock(bp);
1511                         n = BLEN(bp);
1512                         if (ch->flag & O_NONBLOCK)
1513                                 qbwrite_nonblock(c->wq, bp);
1514                         else
1515                                 qbwrite(c->wq, bp);
1516                         return n;
1517                 default:
1518                         return devbwrite(ch, bp, offset);
1519         }
1520 }
1521
1522 static void fire_data_taps(struct conv *conv, int filter)
1523 {
1524         struct fd_tap *tap_i;
1525
1526         /* At this point, we have an event we want to send to our taps (if any).
1527          * The lock protects list integrity and the existence of the tap.
1528          *
1529          * Previously, I thought of using the conv qlock.  That actually breaks, due
1530          * to weird usages of the qlock (someone holds it for a long time, blocking
1531          * the inbound wakeup from etherread4).
1532          *
1533          * I opted for a spinlock for a couple reasons:
1534          * - fire_tap should not block.  ideally it'll be fast too (it's mostly a
1535          * send_event).
1536          * - our callers might not want to block.  A lot of network wakeups will
1537          * come network processes (etherread4) or otherwise unrelated to this
1538          * particular conversation.  I'd rather do something like fire off a KMSG
1539          * than block those.
1540          * - if fire_tap takes a while, holding the lock only slows down other
1541          * events on this *same* conversation, or other tap registration.  not a
1542          * huge deal. */
1543         spin_lock(&conv->tap_lock);
1544         SLIST_FOREACH(tap_i, &conv->data_taps, link)
1545                 fire_tap(tap_i, filter);
1546         spin_unlock(&conv->tap_lock);
1547 }
1548
1549 static void ip_wake_cb(struct queue *q, void *data, int filter)
1550 {
1551         struct conv *conv = (struct conv*)data;
1552
1553         /* For these two, we want to ignore events on the opposite end of the
1554          * queues.  For instance, we want to know when the WQ is writable.  Our
1555          * writes will actually make it readable - we don't want to trigger a tap
1556          * for that.  However, qio doesn't know how/why we are using a queue, or
1557          * even who the ends are (hence the callbacks) */
1558         if ((filter & FDTAP_FILT_READABLE) && (q == conv->wq))
1559                 return;
1560         if ((filter & FDTAP_FILT_WRITABLE) && (q == conv->rq))
1561                 return;
1562         fire_data_taps(conv, filter);
1563 }
1564
1565 int iptapfd(struct chan *chan, struct fd_tap *tap, int cmd)
1566 {
1567         struct conv *conv = chan2conv(chan);
1568         int ret;
1569
1570         #define DEVIP_LEGAL_DATA_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE | \
1571                                        FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |   \
1572                                        FDTAP_FILT_ERROR)
1573         #define DEVIP_LEGAL_LISTEN_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_HANGUP)
1574
1575         switch (TYPE(chan->qid)) {
1576                 case Qdata:
1577                         if (tap->filter & ~DEVIP_LEGAL_DATA_TAPS) {
1578                                 set_errno(ENOSYS);
1579                                 set_errstr("Unsupported #%s data tap %p, must be %p", devname(),
1580                                            tap->filter, DEVIP_LEGAL_DATA_TAPS);
1581                                 return -1;
1582                         }
1583                         spin_lock(&conv->tap_lock);
1584                         switch (cmd) {
1585                                 case (FDTAP_CMD_ADD):
1586                                         if (SLIST_EMPTY(&conv->data_taps)) {
1587                                                 qio_set_wake_cb(conv->rq, ip_wake_cb, conv);
1588                                                 qio_set_wake_cb(conv->wq, ip_wake_cb, conv);
1589                                         }
1590                                         SLIST_INSERT_HEAD(&conv->data_taps, tap, link);
1591                                         ret = 0;
1592                                         break;
1593                                 case (FDTAP_CMD_REM):
1594                                         SLIST_REMOVE(&conv->data_taps, tap, fd_tap, link);
1595                                         if (SLIST_EMPTY(&conv->data_taps)) {
1596                                                 qio_set_wake_cb(conv->rq, 0, conv);
1597                                                 qio_set_wake_cb(conv->wq, 0, conv);
1598                                         }
1599                                         ret = 0;
1600                                         break;
1601                                 default:
1602                                         set_errno(ENOSYS);
1603                                         set_errstr("Unsupported #%s data tap command %p",
1604                                                    devname(), cmd);
1605                                         ret = -1;
1606                         }
1607                         spin_unlock(&conv->tap_lock);
1608                         return ret;
1609                 case Qlisten:
1610                         if (tap->filter & ~DEVIP_LEGAL_LISTEN_TAPS) {
1611                                 set_errno(ENOSYS);
1612                                 set_errstr("Unsupported #%s listen tap %p, must be %p",
1613                                            devname(), tap->filter, DEVIP_LEGAL_LISTEN_TAPS);
1614                                 return -1;
1615                         }
1616                         spin_lock(&conv->tap_lock);
1617                         switch (cmd) {
1618                                 case (FDTAP_CMD_ADD):
1619                                         SLIST_INSERT_HEAD(&conv->listen_taps, tap, link);
1620                                         ret = 0;
1621                                         break;
1622                                 case (FDTAP_CMD_REM):
1623                                         SLIST_REMOVE(&conv->listen_taps, tap, fd_tap, link);
1624                                         ret = 0;
1625                                         break;
1626                                 default:
1627                                         set_errno(ENOSYS);
1628                                         set_errstr("Unsupported #%s listen tap command %p",
1629                                                    devname(), cmd);
1630                                         ret = -1;
1631                         }
1632                         spin_unlock(&conv->tap_lock);
1633                         return ret;
1634                 default:
1635                         set_errno(ENOSYS);
1636                         set_errstr("Can't tap #%s file type %d", devname(),
1637                                    TYPE(chan->qid));
1638                         return -1;
1639         }
1640 }
1641
1642 struct dev ipdevtab __devtab = {
1643         .name = "ip",
1644
1645         .reset = ipreset,
1646         .init = ipinit,
1647         .shutdown = devshutdown,
1648         .attach = ipattach,
1649         .walk = ipwalk,
1650         .stat = ipstat,
1651         .open = ipopen,
1652         .create = devcreate,
1653         .close = ipclose,
1654         .read = ipread,
1655         .bread = ipbread,
1656         .write = ipwrite,
1657         .bwrite = ipbwrite,
1658         .remove = devremove,
1659         .wstat = ipwstat,
1660         .power = devpower,
1661         .chaninfo = ipchaninfo,
1662         .tapfd = iptapfd,
1663 };
1664
1665 int Fsproto(struct Fs *f, struct Proto *p)
1666 {
1667         if (f->np >= Maxproto)
1668                 return -1;
1669
1670         qlock_init(&p->qlock);
1671         p->f = f;
1672
1673         if (p->ipproto > 0) {
1674                 if (f->t2p[p->ipproto] != NULL)
1675                         return -1;
1676                 f->t2p[p->ipproto] = p;
1677         }
1678
1679         p->qid.type = QTDIR;
1680         p->qid.path = QID(f->np, 0, Qprotodir);
1681         p->conv = kzmalloc(sizeof(struct conv *) * (p->nc + 1), 0);
1682         if (p->conv == NULL)
1683                 panic("Fsproto");
1684
1685         p->x = f->np;
1686         p->nextport = 0;
1687         p->nextrport = 600;
1688         f->p[f->np++] = p;
1689
1690         return 0;
1691 }
1692
1693 /*
1694  *  return true if this protocol is
1695  *  built in
1696  */
1697 int Fsbuiltinproto(struct Fs *f, uint8_t proto)
1698 {
1699         return f->t2p[proto] != NULL;
1700 }
1701
1702 /*
1703  *  called with protocol locked
1704  */
1705 struct conv *Fsprotoclone(struct Proto *p, char *user)
1706 {
1707         struct conv *c, **pp, **ep;
1708
1709 retry:
1710         c = NULL;
1711         ep = &p->conv[p->nc];
1712         for (pp = p->conv; pp < ep; pp++) {
1713                 c = *pp;
1714                 if (c == NULL) {
1715                         c = kzmalloc(sizeof(struct conv), 0);
1716                         if (c == NULL)
1717                                 error(ENOMEM,
1718                                       "conv kzmalloc(%d, 0) failed in Fsprotoclone",
1719                                       sizeof(struct conv));
1720                         qlock_init(&c->qlock);
1721                         qlock_init(&c->listenq);
1722                         rendez_init(&c->cr);
1723                         rendez_init(&c->listenr);
1724                         SLIST_INIT(&c->data_taps);      /* already = 0; set to be futureproof */
1725                         SLIST_INIT(&c->listen_taps);
1726                         spinlock_init(&c->tap_lock);
1727                         qlock(&c->qlock);
1728                         c->p = p;
1729                         c->x = pp - p->conv;
1730                         if (p->ptclsize != 0) {
1731                                 c->ptcl = kzmalloc(p->ptclsize, 0);
1732                                 if (c->ptcl == NULL) {
1733                                         kfree(c);
1734                                         error(ENOMEM,
1735                                               "ptcl kzmalloc(%d, 0) failed in Fsprotoclone",
1736                                               p->ptclsize);
1737                                 }
1738                         }
1739                         *pp = c;
1740                         p->ac++;
1741                         c->eq = qopen(1024, Qmsg, 0, 0);
1742                         (*p->create) (c);
1743                         assert(c->rq && c->wq);
1744                         break;
1745                 }
1746                 if (canqlock(&c->qlock)) {
1747                         /*
1748                          *  make sure both processes and protocol
1749                          *  are done with this Conv
1750                          */
1751                         if (c->inuse == 0 && (p->inuse == NULL || (*p->inuse) (c) == 0))
1752                                 break;
1753
1754                         qunlock(&c->qlock);
1755                 }
1756         }
1757         if (pp >= ep) {
1758                 if (p->gc != NULL && (*p->gc) (p))
1759                         goto retry;
1760                 return NULL;
1761         }
1762
1763         c->inuse = 1;
1764         kstrdup(&c->owner, user);
1765         c->perm = 0660;
1766         c->state = Idle;
1767         ipmove(c->laddr, IPnoaddr);
1768         ipmove(c->raddr, IPnoaddr);
1769         c->r = NULL;
1770         c->rgen = 0;
1771         c->lport = 0;
1772         c->rport = 0;
1773         c->restricted = 0;
1774         c->ttl = MAXTTL;
1775         c->tos = DFLTTOS;
1776         qreopen(c->rq);
1777         qreopen(c->wq);
1778         qreopen(c->eq);
1779
1780         qunlock(&c->qlock);
1781         return c;
1782 }
1783
1784 int Fsconnected(struct conv *c, char *msg)
1785 {
1786         if (msg != NULL && *msg != '\0')
1787                 strlcpy(c->cerr, msg, sizeof(c->cerr));
1788
1789         switch (c->state) {
1790                 case Announcing:
1791                         c->state = Announced;
1792                         break;
1793
1794                 case Connecting:
1795                         c->state = Connected;
1796                         break;
1797         }
1798
1799         rendez_wakeup(&c->cr);
1800         /* The user can poll or tap the connection status via Qdata */
1801         fire_data_taps(c, FDTAP_FILT_WRITABLE);
1802         return 0;
1803 }
1804
1805 struct Proto *Fsrcvpcol(struct Fs *f, uint8_t proto)
1806 {
1807         if (f->ipmux)
1808                 return f->ipmux;
1809         else
1810                 return f->t2p[proto];
1811 }
1812
1813 struct Proto *Fsrcvpcolx(struct Fs *f, uint8_t proto)
1814 {
1815         return f->t2p[proto];
1816 }
1817
1818 static void fire_listener_taps(struct conv *conv)
1819 {
1820         struct fd_tap *tap_i;
1821         if (SLIST_EMPTY(&conv->listen_taps))
1822                 return;
1823         spin_lock(&conv->tap_lock);
1824         SLIST_FOREACH(tap_i, &conv->listen_taps, link)
1825                 fire_tap(tap_i, FDTAP_FILT_READABLE);
1826         spin_unlock(&conv->tap_lock);
1827 }
1828
1829 /*
1830  *  called with protocol locked
1831  */
1832 struct conv *Fsnewcall(struct conv *c, uint8_t * raddr, uint16_t rport,
1833                                            uint8_t * laddr, uint16_t lport, uint8_t version)
1834 {
1835         struct conv *nc;
1836         struct conv **l;
1837         int i;
1838
1839         qlock(&c->qlock);
1840         i = 0;
1841         for (l = &c->incall; *l; l = &(*l)->next)
1842                 i++;
1843         if (i >= Maxincall) {
1844                 qunlock(&c->qlock);
1845                 return NULL;
1846         }
1847
1848         /* find a free conversation */
1849         nc = Fsprotoclone(c->p, network);
1850         if (nc == NULL) {
1851                 qunlock(&c->qlock);
1852                 return NULL;
1853         }
1854         ipmove(nc->raddr, raddr);
1855         nc->rport = rport;
1856         ipmove(nc->laddr, laddr);
1857         nc->lport = lport;
1858         nc->next = NULL;
1859         *l = nc;
1860         nc->state = Connected;
1861         nc->ipversion = version;
1862
1863         qunlock(&c->qlock);
1864
1865         rendez_wakeup(&c->listenr);
1866         fire_listener_taps(c);
1867
1868         return nc;
1869 }
1870
1871 static long ndbwrite(struct Fs *f, char *a, uint32_t off, int n)
1872 {
1873         if (off > strlen(f->ndb))
1874                 error(EIO, ERROR_FIXME);
1875         if (off + n >= sizeof(f->ndb) - 1)
1876                 error(EIO, ERROR_FIXME);
1877         memmove(f->ndb + off, a, n);
1878         f->ndb[off + n] = 0;
1879         f->ndbvers++;
1880         f->ndbmtime = seconds();
1881         return n;
1882 }
1883
1884 uint32_t scalednconv(void)
1885 {
1886         //if(conf.npage*BY2PG >= 128*MB)
1887         return Nchans * 4;
1888         //  return Nchans;
1889 }