9ns: Make chan_ctl() mandatory
[akaros.git] / kern / src / net / devip.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <slab.h>
30 #include <kmalloc.h>
31 #include <kref.h>
32 #include <string.h>
33 #include <stdio.h>
34 #include <assert.h>
35 #include <error.h>
36 #include <cpio.h>
37 #include <pmap.h>
38 #include <smp.h>
39 #include <net/ip.h>
40
41 struct dev ipdevtab;
42
43 static char *devname(void)
44 {
45         return ipdevtab.name;
46 }
47
48 enum {
49         Qtopdir = 1,                            /* top level directory */
50         Qtopbase,
51         Qarp = Qtopbase,
52         Qndb,
53         Qiproute,
54         Qiprouter,
55         Qipselftab,
56         Qlog,
57
58         Qprotodir,      /* directory for a protocol */
59         Qprotobase,
60         Qclone = Qprotobase,
61         Qstats,
62
63         Qconvdir,       /* directory for a conversation */
64         Qconvbase,
65         Qctl = Qconvbase,
66         Qdata,
67         Qerr,
68         Qlisten,
69         Qlocal,
70         Qremote,
71         Qstatus,
72         Qsnoop,
73
74         Logtype = 5,
75         Masktype = (1 << Logtype) - 1,
76         Logconv = 12,
77         Maskconv = (1 << Logconv) - 1,
78         Shiftconv = Logtype,
79         Logproto = 8,
80         Maskproto = (1 << Logproto) - 1,
81         Shiftproto = Logtype + Logconv,
82
83         Nfs = 32,
84         BYPASS_QMAX = 64 * MiB,
85         IPROUTE_LEN = 2 * PGSIZE,
86 };
/* Accessors for the fields packed into a qid's path; 'x' is a struct qid. */
#define TYPE(x)		(((uint32_t)(x).path) & Masktype)
#define CONV(x)		((((uint32_t)(x).path) >> Shiftconv) & Maskconv)
#define PROTO(x)	((((uint32_t)(x).path) >> Shiftproto) & Maskproto)
/* Builds a qid path from protocol index p, conversation index c and type y. */
#define QID(p, c, y)	(((p) << (Shiftproto)) | ((c) << Shiftconv) | (y))

/* User name passed as the owner of files that have no per-conversation
 * owner. */
static char network[] = "network";
92
93 qlock_t fslock;
94 struct Fs *ipfs[Nfs];                   /* attached fs's */
95 struct queue *qlog;
96
97 extern void nullmediumlink(void);
98 extern void pktmediumlink(void);
99 extern struct username eve;
100 static long ndbwrite(struct Fs *, char *unused_char_p_t, uint32_t, int);
101 static void closeconv(struct conv *);
102 static void setup_proto_qio_bypass(struct conv *cv);
103 static void undo_proto_qio_bypass(struct conv *cv);
104 static int connected(void *a);
105
106 static struct conv *chan2conv(struct chan *chan)
107 {
108         /* That's a lot of pointers to get to the conv! */
109         return ipfs[chan->dev]->p[PROTO(chan->qid)]->conv[CONV(chan->qid)];
110 }
111
112 static inline int founddevdir(struct chan *c, struct qid q, char *n,
113                                                           int64_t length, char *user, long perm,
114                                                           struct dir *db)
115 {
116         devdir(c, q, n, length, user, perm, db);
117         return 1;
118 }
119
120 static int topdirgen(struct chan *c, struct dir *dp)
121 {
122         struct qid q;
123         mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR);
124         snprintf(get_cur_genbuf(), GENBUF_SZ, "#%s%lu", devname(), c->dev);
125         return founddevdir(c, q, get_cur_genbuf(), 0, network, 0555, dp);
126 }
127
128 /* Computes the perm field for a stat for Qdata.  Since select() polls the
129  * 'actionability' of a socket via the qdata FD, we'll also report listenable
130  * and connected conversations.  It's a minor hack.  =( */
131 static int qdata_stat_perm(struct conv *cv)
132 {
133         int perm;
134
135         perm = cv->perm;
136         /* If there is ever a listener, then it's readable.  Ideally, we'd only
137          * report this on the Qlisten file (which we also do).  The socket crap
138          * should never use a listening socket for data, so there shouldn't be any
139          * confusion when a Qdata shows up as readable. */
140         perm |= cv->incall ? DMREADABLE : 0;
141         /* For connectable convs, they need to be both connected and qio
142          * readable/writable.  The way to think about this is that the convs are not
143          * truly writable/readable until they are connected.  Conveniently, this
144          * means that when select polls Qdata for non-blocking connect(), a
145          * connected conversation pops up as writable (the qio is writable too).
146          *
147          * Note that a conversation can be 'Connected' even if it failed to connect.
148          * At least that's what the 9ns TCP code does.  It's more like "the protocol
149          * did what it needed and the connectctlmsg call (or its non-blocking
150          * equivalent) is done".  For instance, TCP has a few reasons to call
151          * Fsconnected, such as when we send the SYN and get a RST. */
152         if (!cv->p->connect || connected(cv)) {
153                 perm |= qreadable(cv->rq) ? DMREADABLE : 0;
154                 perm |= qwritable(cv->wq) ? DMWRITABLE : 0;
155         }
156         return perm;
157 }
158
159 static int ip3gen(struct chan *c, int i, struct dir *dp)
160 {
161         struct qid q;
162         struct conv *cv;
163         char *p;
164         int perm;
165
166         cv = chan2conv(c);
167         if (cv->owner == NULL)
168                 kstrdup(&cv->owner, eve.name);
169         mkqid(&q, QID(PROTO(c->qid), CONV(c->qid), i), 0, QTFILE);
170
171         switch (i) {
172                 default:
173                         return -1;
174                 case Qctl:
175                         return founddevdir(c, q, "ctl", 0,
176                                                    cv->owner, cv->perm, dp);
177                 case Qdata:
178                         perm = qdata_stat_perm(cv);
179                         return founddevdir(c, q, "data", qlen(cv->rq),
180                                                            cv->owner, perm, dp);
181                 case Qerr:
182                         perm = cv->perm;
183                         perm |= qreadable(cv->eq) ? DMREADABLE : 0;
184                         return founddevdir(c, q, "err", qlen(cv->eq),
185                                                            cv->owner, perm, dp);
186                 case Qlisten:
187                         perm = cv->perm;
188                         perm |= cv->incall ? DMREADABLE : 0;
189                         return founddevdir(c, q, "listen", 0, cv->owner, perm, dp);
190                 case Qlocal:
191                         p = "local";
192                         break;
193                 case Qremote:
194                         p = "remote";
195                         break;
196                 case Qsnoop:
197                         if (strcmp(cv->p->name, "ipifc") != 0)
198                                 return -1;
199                         perm = 0400;
200                         perm |= qreadable(cv->sq) ? DMREADABLE : 0;
201                         return founddevdir(c, q, "snoop", qlen(cv->sq),
202                                                            cv->owner, perm, dp);
203                 case Qstatus:
204                         p = "status";
205                         break;
206         }
207         return founddevdir(c, q, p, 0, cv->owner, 0444, dp);
208 }
209
210 static int ip2gen(struct chan *c, int i, struct dir *dp)
211 {
212         struct qid q;
213         mkqid(&q, QID(PROTO(c->qid), 0, i), 0, QTFILE);
214         switch (i) {
215                 case Qclone:
216                         return founddevdir(c, q, "clone", 0, network, 0666, dp);
217                 case Qstats:
218                         return founddevdir(c, q, "stats", 0, network, 0444, dp);
219         }
220         return -1;
221 }
222
223 static int ip1gen(struct chan *c, int i, struct dir *dp)
224 {
225         struct qid q;
226         char *p;
227         int prot;
228         int len = 0;
229         struct Fs *f;
230         extern uint32_t kerndate;
231
232         f = ipfs[c->dev];
233
234         prot = 0666;
235         mkqid(&q, QID(0, 0, i), 0, QTFILE);
236         switch (i) {
237                 default:
238                         return -1;
239                 case Qarp:
240                         p = "arp";
241                         break;
242                 case Qndb:
243                         p = "ndb";
244                         len = strlen(f->ndb);
245                         q.vers = f->ndbvers;
246                         break;
247                 case Qiproute:
248                         p = "iproute";
249                         break;
250                 case Qipselftab:
251                         p = "ipselftab";
252                         prot = 0444;
253                         break;
254                 case Qiprouter:
255                         p = "iprouter";
256                         break;
257                 case Qlog:
258                         p = "log";
259                         break;
260         }
261         devdir(c, q, p, len, network, prot, dp);
262         if (i == Qndb && f->ndbmtime > kerndate)
263                 dp->mtime.tv_sec = f->ndbmtime;
264         return 1;
265 }
266
267 static int
268 ipgen(struct chan *c, char *unused_char_p_t, struct dirtab *d, int unused_int,
269           int s, struct dir *dp)
270 {
271         struct qid q;
272         struct conv *cv;
273         struct Fs *f;
274
275         f = ipfs[c->dev];
276
277         switch (TYPE(c->qid)) {
278                 case Qtopdir:
279                         if (s == DEVDOTDOT)
280                                 return topdirgen(c, dp);
281                         if (s < f->np) {
282                                 if (f->p[s]->connect == NULL)
283                                         return 0;       /* protocol with no user interface */
284                                 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
285                                 return founddevdir(c, q, f->p[s]->name, 0, network, 0555, dp);
286                         }
287                         s -= f->np;
288                         return ip1gen(c, s + Qtopbase, dp);
289                 case Qarp:
290                 case Qndb:
291                 case Qlog:
292                 case Qiproute:
293                 case Qiprouter:
294                 case Qipselftab:
295                         return ip1gen(c, TYPE(c->qid), dp);
296                 case Qprotodir:
297                         if (s == DEVDOTDOT)
298                                 return topdirgen(c, dp);
299                         else if (s < f->p[PROTO(c->qid)]->ac) {
300                                 cv = f->p[PROTO(c->qid)]->conv[s];
301                                 snprintf(get_cur_genbuf(), GENBUF_SZ, "%d", s);
302                                 mkqid(&q, QID(PROTO(c->qid), s, Qconvdir), 0, QTDIR);
303                                 return
304                                         founddevdir(c, q, get_cur_genbuf(), 0, cv->owner, 0555, dp);
305                         }
306                         s -= f->p[PROTO(c->qid)]->ac;
307                         return ip2gen(c, s + Qprotobase, dp);
308                 case Qclone:
309                 case Qstats:
310                         return ip2gen(c, TYPE(c->qid), dp);
311                 case Qconvdir:
312                         if (s == DEVDOTDOT) {
313                                 s = PROTO(c->qid);
314                                 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
315                                 devdir(c, q, f->p[s]->name, 0, network, 0555, dp);
316                                 return 1;
317                         }
318                         return ip3gen(c, s + Qconvbase, dp);
319                 case Qctl:
320                 case Qdata:
321                 case Qerr:
322                 case Qlisten:
323                 case Qlocal:
324                 case Qremote:
325                 case Qstatus:
326                 case Qsnoop:
327                         return ip3gen(c, TYPE(c->qid), dp);
328         }
329         return -1;
330 }
331
332 static void ipinit(void)
333 {
334         qlock_init(&fslock);
335         nullmediumlink();
336         pktmediumlink();
337 /* if only
338         fmtinstall('i', eipfmt);
339         fmtinstall('I', eipfmt);
340         fmtinstall('E', eipfmt);
341         fmtinstall('V', eipfmt);
342         fmtinstall('M', eipfmt);
343 */
344 }
345
/* Device reset hook; #ip has nothing to do here. */
static void ipreset(void)
{
	/* intentionally empty */
}
349
350 static struct Fs *ipgetfs(int dev)
351 {
352         extern void (*ipprotoinit[]) (struct Fs *);
353         struct Fs *f;
354         int i;
355
356         if (dev >= Nfs)
357                 return NULL;
358
359         qlock(&fslock);
360         if (ipfs[dev] == NULL) {
361                 f = kzmalloc(sizeof(struct Fs), MEM_WAIT);
362                 rwinit(&f->rwlock);
363                 qlock_init(&f->iprouter.qlock);
364                 ip_init(f);
365                 arpinit(f);
366                 netloginit(f);
367                 for (i = 0; ipprotoinit[i]; i++)
368                         ipprotoinit[i] (f);
369                 f->dev = dev;
370                 ipfs[dev] = f;
371         }
372         qunlock(&fslock);
373
374         return ipfs[dev];
375 }
376
377 struct IPaux *newipaux(char *owner, char *tag)
378 {
379         struct IPaux *a;
380         int n;
381
382         a = kzmalloc(sizeof(*a), 0);
383         kstrdup(&a->owner, owner);
384         memset(a->tag, ' ', sizeof(a->tag));
385         n = strlen(tag);
386         if (n > sizeof(a->tag))
387                 n = sizeof(a->tag);
388         memmove(a->tag, tag, n);
389         return a;
390 }
391
392 #define ATTACHER(c) (((struct IPaux*)((c)->aux))->owner)
393
394 static struct chan *ipattach(char *spec)
395 {
396         struct chan *c;
397         int dev;
398
399         dev = atoi(spec);
400         if (dev >= Nfs)
401                 error(EFAIL, "bad specification");
402
403         ipgetfs(dev);
404         c = devattach(devname(), spec);
405         mkqid(&c->qid, QID(0, 0, Qtopdir), 0, QTDIR);
406         c->dev = dev;
407
408         c->aux = newipaux(commonuser(), "none");
409
410         return c;
411 }
412
413 static struct walkqid *ipwalk(struct chan *c, struct chan *nc, char **name,
414                                                           unsigned int nname)
415 {
416         struct IPaux *a = c->aux;
417         struct walkqid *w;
418
419         w = devwalk(c, nc, name, nname, NULL, 0, ipgen);
420         if (w != NULL && w->clone != NULL)
421                 w->clone->aux = newipaux(a->owner, a->tag);
422         return w;
423 }
424
425 static size_t ipstat(struct chan *c, uint8_t *db, size_t n)
426 {
427         return devstat(c, db, n, NULL, 0, ipgen);
428 }
429
430 static int should_wake(void *arg)
431 {
432         struct conv *cv = arg;
433         /* signal that the conv is closed */
434         if (qisclosed(cv->rq))
435                 return TRUE;
436         return cv->incall != NULL;
437 }
438
439 static struct chan *ipopen(struct chan *c, int omode)
440 {
441         ERRSTACK(2);
442         struct conv *cv, *nc;
443         struct Proto *p;
444         int perm;
445         struct Fs *f;
446
447         /* perm is a lone rwx, not the rwx------ from the conversion */
448         perm = omode_to_rwx(omode) >> 6;
449
450         f = ipfs[c->dev];
451
452         switch (TYPE(c->qid)) {
453                 default:
454                         break;
455                 case Qndb:
456                         if (omode & (O_WRITE | O_TRUNC) && !iseve())
457                                 error(EPERM, ERROR_FIXME);
458                         if ((omode & (O_WRITE | O_TRUNC)) == (O_WRITE | O_TRUNC))
459                                 f->ndb[0] = 0;
460                         break;
461                 case Qlog:
462                         netlogopen(f);
463                         break;
464                 case Qiprouter:
465                         iprouteropen(f);
466                         break;
467                 case Qiproute:
468                         c->synth_buf = kpages_zalloc(IPROUTE_LEN, MEM_WAIT);
469                         routeread(f, c->synth_buf, 0, IPROUTE_LEN);
470                         break;
471                 case Qtopdir:
472                 case Qprotodir:
473                 case Qconvdir:
474                 case Qstatus:
475                 case Qremote:
476                 case Qlocal:
477                 case Qstats:
478                 case Qipselftab:
479                         if (omode & O_WRITE)
480                                 error(EPERM, ERROR_FIXME);
481                         break;
482                 case Qsnoop:
483                         if (omode & O_WRITE)
484                                 error(EPERM, ERROR_FIXME);
485                         /* might be racy.  note the lack of a proto lock, unlike Qdata */
486                         p = f->p[PROTO(c->qid)];
487                         cv = p->conv[CONV(c->qid)];
488                         if (strcmp(ATTACHER(c), cv->owner) != 0 && !iseve())
489                                 error(EPERM, ERROR_FIXME);
490                         atomic_inc(&cv->snoopers);
491                         break;
492                 case Qclone:
493                         p = f->p[PROTO(c->qid)];
494                         qlock(&p->qlock);
495                         if (waserror()) {
496                                 qunlock(&p->qlock);
497                                 nexterror();
498                         }
499                         cv = Fsprotoclone(p, ATTACHER(c));
500                         qunlock(&p->qlock);
501                         poperror();
502                         if (cv == NULL) {
503                                 error(ENODEV, "Null conversation from Fsprotoclone");
504                                 break;
505                         }
506                         mkqid(&c->qid, QID(p->x, cv->x, Qctl), 0, QTFILE);
507                         break;
508                 case Qdata:
509                 case Qctl:
510                 case Qerr:
511                         p = f->p[PROTO(c->qid)];
512                         qlock(&p->qlock);
513                         cv = p->conv[CONV(c->qid)];
514                         qlock(&cv->qlock);
515                         if (waserror()) {
516                                 qunlock(&cv->qlock);
517                                 qunlock(&p->qlock);
518                                 nexterror();
519                         }
520                         if ((perm & (cv->perm >> 6)) != perm) {
521                                 if (strcmp(ATTACHER(c), cv->owner) != 0)
522                                         error(EPERM, ERROR_FIXME);
523                                 if ((perm & cv->perm) != perm)
524                                         error(EPERM, ERROR_FIXME);
525
526                         }
527                         cv->inuse++;
528                         if (cv->inuse == 1) {
529                                 kstrdup(&cv->owner, ATTACHER(c));
530                                 cv->perm = 0660;
531                         }
532                         qunlock(&cv->qlock);
533                         qunlock(&p->qlock);
534                         poperror();
535                         break;
536                 case Qlisten:
537                         cv = f->p[PROTO(c->qid)]->conv[CONV(c->qid)];
538                         /* No permissions or Announce checks required.  We'll see if that's
539                          * a good idea or not. (the perm check would do nothing, as is,
540                          * since an O_PATH perm is 0).
541                          *
542                          * But we probably want to incref to keep the conversation around
543                          * until this FD/chan is closed.  #ip is a little weird in that
544                          * objects never really go away (high water mark for convs, you can
545                          * always find them in the ns).  I think it is possible to
546                          * namec/ipgen a chan, then have that conv close, then have that
547                          * chan be opened.  You can probably do this with a data file. */
548                         if (omode & O_PATH) {
549                                 qlock(&cv->qlock);
550                                 cv->inuse++;
551                                 qunlock(&cv->qlock);
552                                 break;
553                         }
554                         if ((perm & (cv->perm >> 6)) != perm) {
555                                 if (strcmp(ATTACHER(c), cv->owner) != 0)
556                                         error(EPERM, ERROR_FIXME);
557                                 if ((perm & cv->perm) != perm)
558                                         error(EPERM, ERROR_FIXME);
559
560                         }
561
562                         if (cv->state != Announced)
563                                 error(EFAIL, "not announced");
564
565                         if (waserror()) {
566                                 closeconv(cv);
567                                 nexterror();
568                         }
569                         qlock(&cv->qlock);
570                         cv->inuse++;
571                         qunlock(&cv->qlock);
572
573                         nc = NULL;
574                         while (nc == NULL) {
575                                 /* give up if we got a hangup */
576                                 if (qisclosed(cv->rq))
577                                         error(EFAIL, "listen hungup");
578
579                                 qlock(&cv->listenq);
580                                 if (waserror()) {
581                                         qunlock(&cv->listenq);
582                                         nexterror();
583                                 }
584                                 /* we can peek at incall without grabbing the cv qlock.  if
585                                  * anything is there, it'll remain there until we dequeue it.
586                                  * no one else can, since we hold the listenq lock */
587                                 if ((c->flag & O_NONBLOCK) && !cv->incall)
588                                         error(EAGAIN, "listen queue empty");
589                                 /* wait for a connect */
590                                 rendez_sleep(&cv->listenr, should_wake, cv);
591
592                                 /* if there is a concurrent hangup, they will hold the qlock
593                                  * until the hangup is complete, including closing the cv->rq */
594                                 qlock(&cv->qlock);
595                                 nc = cv->incall;
596                                 if (nc != NULL) {
597                                         cv->incall = nc->next;
598                                         mkqid(&c->qid, QID(PROTO(c->qid), nc->x, Qctl), 0, QTFILE);
599                                         kstrdup(&cv->owner, ATTACHER(c));
600                                 }
601                                 qunlock(&cv->qlock);
602
603                                 qunlock(&cv->listenq);
604                                 poperror();
605                         }
606                         closeconv(cv);
607                         poperror();
608                         break;
609         }
610         c->mode = openmode(omode);
611         c->flag |= COPEN;
612         c->offset = 0;
613         return c;
614 }
615
616 static size_t ipwstat(struct chan *c, uint8_t *dp, size_t n)
617 {
618         ERRSTACK(2);
619         struct dir *d;
620         struct conv *cv;
621         struct Fs *f;
622         struct Proto *p;
623
624         f = ipfs[c->dev];
625         switch (TYPE(c->qid)) {
626                 default:
627                         error(EPERM, ERROR_FIXME);
628                         break;
629                 case Qctl:
630                 case Qdata:
631                         break;
632         }
633
634         d = kzmalloc(sizeof(*d) + n, 0);
635         if (waserror()) {
636                 kfree(d);
637                 nexterror();
638         }
639         n = convM2D(dp, n, d, (char *)&d[1]);
640         if (n == 0)
641                 error(ENODATA, ERROR_FIXME);
642         p = f->p[PROTO(c->qid)];
643         cv = p->conv[CONV(c->qid)];
644         if (!iseve() && strcmp(ATTACHER(c), cv->owner) != 0)
645                 error(EPERM, ERROR_FIXME);
646         if (!emptystr(d->uid))
647                 kstrdup(&cv->owner, d->uid);
648         if (d->mode != -1)
649                 cv->perm = d->mode & 0777;
650         poperror();
651         kfree(d);
652         return n;
653 }
654
655 /* Should be able to handle any file type chan. Feel free to extend it. */
656 static char *ipchaninfo(struct chan *ch, char *ret, size_t ret_l)
657 {
658         struct conv *conv;
659         struct Proto *proto;
660         char *p;
661         struct Fs *f;
662
663         f = ipfs[ch->dev];
664
665         switch (TYPE(ch->qid)) {
666                 default:
667                         ret = "Unknown type";
668                         break;
669                 case Qdata:
670                         proto = f->p[PROTO(ch->qid)];
671                         conv = proto->conv[CONV(ch->qid)];
672                         snprintf(ret, ret_l,
673                                  "Qdata, %s, proto %s, conv idx %d, rq len %d, wq len %d, total read %llu",
674                                  SLIST_EMPTY(&conv->data_taps) ? "untapped" : "tapped",
675                                  proto->name, conv->x, qlen(conv->rq), qlen(conv->wq),
676                                          q_bytes_read(conv->rq));
677                         break;
678                 case Qarp:
679                         ret = "Qarp";
680                         break;
681                 case Qiproute:
682                         ret = "Qiproute";
683                         break;
684                 case Qlisten:
685                         proto = f->p[PROTO(ch->qid)];
686                         conv = proto->conv[CONV(ch->qid)];
687                         snprintf(ret, ret_l,
688                                  "Qlisten, %s proto %s, conv idx %d, has %sincalls",
689                                  SLIST_EMPTY(&conv->listen_taps) ? "untapped" : "tapped",
690                                  proto->name, conv->x, conv->incall ? "" : "no ");
691                         break;
692                 case Qlog:
693                         ret = "Qlog";
694                         break;
695                 case Qndb:
696                         ret = "Qndb";
697                         break;
698                 case Qctl:
699                         proto = f->p[PROTO(ch->qid)];
700                         conv = proto->conv[CONV(ch->qid)];
701                         snprintf(ret, ret_l, "Qctl, proto %s, conv idx %d", proto->name,
702                                          conv->x);
703                         break;
704         }
705         return ret;
706 }
707
708 static void closeconv(struct conv *cv)
709 {
710         ERRSTACK(1);
711         struct conv *nc;
712         struct Ipmulti *mp;
713
714         qlock(&cv->qlock);
715
716         if (--cv->inuse > 0) {
717                 qunlock(&cv->qlock);
718                 return;
719         }
720         if (waserror()) {
721                 qunlock(&cv->qlock);
722                 nexterror();
723         }
724         /* close all incoming calls since no listen will ever happen */
725         for (nc = cv->incall; nc; nc = cv->incall) {
726                 cv->incall = nc->next;
727                 closeconv(nc);
728         }
729         cv->incall = NULL;
730
731         kstrdup(&cv->owner, network);
732         cv->perm = 0660;
733
734         while ((mp = cv->multi) != NULL)
735                 ipifcremmulti(cv, mp->ma, mp->ia);
736
737         cv->r = NULL;
738         cv->rgen = 0;
739         if (cv->state == Bypass)
740                 undo_proto_qio_bypass(cv);
741         cv->p->close(cv);
742         cv->state = Idle;
743         qunlock(&cv->qlock);
744         poperror();
745 }
746
747 static void ipclose(struct chan *c)
748 {
749         struct Fs *f;
750
751         f = ipfs[c->dev];
752         switch (TYPE(c->qid)) {
753                 default:
754                         break;
755                 case Qlog:
756                         if (c->flag & COPEN)
757                                 netlogclose(f);
758                         break;
759                 case Qiprouter:
760                         if (c->flag & COPEN)
761                                 iprouterclose(f);
762                         break;
763                 case Qdata:
764                 case Qctl:
765                 case Qerr:
766                 case Qlisten:
767                         if (c->flag & COPEN)
768                                 closeconv(f->p[PROTO(c->qid)]->conv[CONV(c->qid)]);
769                         break;
770                 case Qsnoop:
771                         if (c->flag & COPEN)
772                                 atomic_dec(&f->p[PROTO(c->qid)]->conv[CONV(c->qid)]->snoopers);
773                         break;
774                 case Qiproute:
775                         if (c->flag & COPEN)
776                                 kpages_free(c->synth_buf, IPROUTE_LEN);
777                         break;
778         }
779         kfree(((struct IPaux *)c->aux)->owner);
780         kfree(c->aux);
781 }
782
/* Size, in bytes, of the scratch buffers ipread() allocates for formatting
 * conversation text. */
enum {
	Statelen = 32 << 10,
};
786
787 static size_t ipread(struct chan *ch, void *a, size_t n, off64_t off)
788 {
789         struct conv *c;
790         struct Proto *x;
791         char *buf, *p;
792         long rv;
793         struct Fs *f;
794         uint32_t offset = off;
795
796         f = ipfs[ch->dev];
797
798         p = a;
799         switch (TYPE(ch->qid)) {
800                 default:
801                         error(EPERM, ERROR_FIXME);
802                 case Qtopdir:
803                 case Qprotodir:
804                 case Qconvdir:
805                         return devdirread(ch, a, n, 0, 0, ipgen);
806                 case Qarp:
807                         return arpread(f->arp, a, offset, n);
808                 case Qndb:
809                         return readstr(offset, a, n, f->ndb);
810                 case Qiproute:
811                         return readmem(offset, a, n, ch->synth_buf, IPROUTE_LEN);
812                 case Qiprouter:
813                         return iprouterread(f, a, n);
814                 case Qipselftab:
815                         return ipselftabread(f, a, offset, n);
816                 case Qlog:
817                         return netlogread(f, a, offset, n);
818                 case Qctl:
819                         snprintf(get_cur_genbuf(), GENBUF_SZ, "%lu", CONV(ch->qid));
820                         return readstr(offset, p, n, get_cur_genbuf());
821                 case Qremote:
822                         buf = kzmalloc(Statelen, 0);
823                         x = f->p[PROTO(ch->qid)];
824                         c = x->conv[CONV(ch->qid)];
825                         if (x->remote == NULL) {
826                                 snprintf(buf, Statelen, "%I!%d\n", c->raddr, c->rport);
827                         } else {
828                                 (*x->remote) (c, buf, Statelen - 2);
829                         }
830                         rv = readstr(offset, p, n, buf);
831                         kfree(buf);
832                         return rv;
833                 case Qlocal:
834                         buf = kzmalloc(Statelen, 0);
835                         x = f->p[PROTO(ch->qid)];
836                         c = x->conv[CONV(ch->qid)];
837                         if (x->local == NULL) {
838                                 snprintf(buf, Statelen, "%I!%d\n", c->laddr, c->lport);
839                         } else {
840                                 (*x->local) (c, buf, Statelen - 2);
841                         }
842                         rv = readstr(offset, p, n, buf);
843                         kfree(buf);
844                         return rv;
845                 case Qstatus:
846                         /* this all is a bit screwed up since the size of some state's
847                          * buffers will change from one invocation to another.  a reader
848                          * will come in and read the entire buffer.  then it will come again
849                          * and read from the next offset, expecting EOF.  if the buffer
850                          * changed sizes, it'll reprint the end of the buffer slightly. */
851                         buf = kzmalloc(Statelen, 0);
852                         x = f->p[PROTO(ch->qid)];
853                         c = x->conv[CONV(ch->qid)];
854                         if (c->state == Bypass)
855                                 snprintf(buf, Statelen, "Bypassed\n");
856                         else
857                                 (*x->state)(c, buf, Statelen - 2);
858                         rv = readstr(offset, p, n, buf);
859                         kfree(buf);
860                         return rv;
861                 case Qdata:
862                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
863                         if (ch->flag & O_NONBLOCK)
864                                 return qread_nonblock(c->rq, a, n);
865                         else
866                                 return qread(c->rq, a, n);
867                 case Qerr:
868                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
869                         return qread(c->eq, a, n);
870                 case Qsnoop:
871                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
872                         return qread(c->sq, a, n);
873                 case Qstats:
874                         x = f->p[PROTO(ch->qid)];
875                         if (x->stats == NULL)
876                                 error(EFAIL, "stats not implemented");
877                         buf = kzmalloc(Statelen, 0);
878                         (*x->stats) (x, buf, Statelen);
879                         rv = readstr(offset, p, n, buf);
880                         kfree(buf);
881                         return rv;
882         }
883 }
884
885 static struct block *ipbread(struct chan *ch, size_t n, off64_t offset)
886 {
887         struct conv *c;
888
889         switch (TYPE(ch->qid)) {
890                 case Qdata:
891                         c = chan2conv(ch);
892                         if (ch->flag & O_NONBLOCK)
893                                 return qbread_nonblock(c->rq, n);
894                         else
895                                 return qbread(c->rq, n);
896                 default:
897                         return devbread(ch, n, offset);
898         }
899 }
900
901 /*
902  *  set local address to be that of the ifc closest to remote address
903  */
904 static void setladdr(struct conv *c)
905 {
906         findlocalip(c->p->f, c->laddr, c->raddr);
907 }
908
909 /*
910  *  set a local port making sure the quad of raddr,rport,laddr,lport is unique
911  */
912 static void setluniqueport(struct conv *c, int lport)
913 {
914         struct Proto *p;
915         struct conv *xp;
916         int x;
917
918         p = c->p;
919
920         qlock(&p->qlock);
921         for (x = 0; x < p->nc; x++) {
922                 xp = p->conv[x];
923                 if (xp == NULL)
924                         break;
925                 if (xp == c)
926                         continue;
927                 if ((xp->state == Connected || xp->state == Announced
928                                             || xp->state == Bypass)
929                         && xp->lport == lport
930                         && xp->rport == c->rport
931                         && ipcmp(xp->raddr, c->raddr) == 0
932                         && ipcmp(xp->laddr, c->laddr) == 0) {
933                         qunlock(&p->qlock);
934                         error(EFAIL, "address in use");
935                 }
936         }
937         c->lport = lport;
938         qunlock(&p->qlock);
939 }
940
941 /*
942  *  pick a local port and set it
943  */
944 static void setlport(struct conv *c)
945 {
946         struct Proto *p;
947         uint16_t *pp;
948         int x, found;
949
950         p = c->p;
951         if (c->restricted)
952                 pp = &p->nextrport;
953         else
954                 pp = &p->nextport;
955         qlock(&p->qlock);
956         for (;; (*pp)++) {
957                 /*
958                  * Fsproto initialises p->nextport to 0 and the restricted
959                  * ports (p->nextrport) to 600.
960                  * Restricted ports must lie between 600 and 1024.
961                  * For the initial condition or if the unrestricted port number
962                  * has wrapped round, select a random port between 5000 and 1<<15
963                  * to start at.
964                  */
965                 if (c->restricted) {
966                         if (*pp >= 1024)
967                                 *pp = 600;
968                 } else
969                         while (*pp < 5000)
970                                 urandom_read(pp, sizeof(*pp));
971
972                 found = 0;
973                 for (x = 0; x < p->nc; x++) {
974                         if (p->conv[x] == NULL)
975                                 break;
976                         if (p->conv[x]->lport == *pp) {
977                                 found = 1;
978                                 break;
979                         }
980                 }
981                 if (!found)
982                         break;
983         }
984         c->lport = (*pp)++;
985         qunlock(&p->qlock);
986 }
987
988 /*
989  *  set a local address and port from a string of the form
990  *      [address!]port[!r]
991  */
992 static void setladdrport(struct conv *c, char *str, int announcing)
993 {
994         char *p;
995         uint16_t lport;
996         uint8_t addr[IPaddrlen];
997
998         /*
999          *  ignore restricted part if it exists.  it's
1000          *  meaningless on local ports.
1001          */
1002         p = strchr(str, '!');
1003         if (p != NULL) {
1004                 *p++ = 0;
1005                 if (strcmp(p, "r") == 0)
1006                         p = NULL;
1007         }
1008
1009         c->lport = 0;
1010         if (p == NULL) {
1011                 if (announcing)
1012                         ipmove(c->laddr, IPnoaddr);
1013                 else
1014                         setladdr(c);
1015                 p = str;
1016         } else {
1017                 if (strcmp(str, "*") == 0)
1018                         ipmove(c->laddr, IPnoaddr);
1019                 else {
1020                         parseip(addr, str);
1021                         if (ipforme(c->p->f, addr))
1022                                 ipmove(c->laddr, addr);
1023                         else
1024                                 error(EFAIL, "not a local IP address");
1025                 }
1026         }
1027
1028         /* one process can get all connections */
1029         if (announcing && strcmp(p, "*") == 0) {
1030                 if (!iseve())
1031                         error(EPERM, ERROR_FIXME);
1032                 setluniqueport(c, 0);
1033         }
1034
1035         lport = atoi(p);
1036         if (lport <= 0)
1037                 setlport(c);
1038         else
1039                 setluniqueport(c, lport);
1040 }
1041
1042 static void setraddrport(struct conv *c, char *str)
1043 {
1044         char *p;
1045
1046         p = strchr(str, '!');
1047         if (p == NULL)
1048                 error(EFAIL, "malformed address");
1049         *p++ = 0;
1050         parseip(c->raddr, str);
1051         c->rport = atoi(p);
1052         p = strchr(p, '!');
1053         if (p) {
1054                 if (strstr(p, "!r") != NULL)
1055                         c->restricted = 1;
1056         }
1057 }
1058
1059 /*
1060  *  called by protocol connect routine to set addresses
1061  */
1062 void Fsstdconnect(struct conv *c, char *argv[], int argc)
1063 {
1064         switch (argc) {
1065                 default:
1066                         error(EINVAL, "bad args to %s", __func__);
1067                 case 2:
1068                         setraddrport(c, argv[1]);
1069                         setladdr(c);
1070                         setlport(c);
1071                         break;
1072                 case 3:
1073                         setraddrport(c, argv[1]);
1074                         setladdrport(c, argv[2], 0);
1075                         break;
1076         }
1077
1078         /* TODO: why is an IPnoaddr (in v6 format, equivalent to v6Unspecified),
1079          * a v4 format? */
1080         if ((memcmp(c->raddr, v4prefix, IPv4off) == 0 &&
1081                  memcmp(c->laddr, v4prefix, IPv4off) == 0)
1082                 || ipcmp(c->raddr, IPnoaddr) == 0)
1083                 c->ipversion = V4;
1084         else
1085                 c->ipversion = V6;
1086         /* Linux has taught people to use zeros for local interfaces.  TODO: We
1087          * might need this for v6 in the future. */
1088         if (!ipcmp(c->raddr, IPv4_zeroes))
1089                 ipmove(c->raddr, IPv4_loopback);
1090 }
1091
1092 /*
1093  *  initiate connection and sleep till its set up
1094  */
1095 static int connected(void *a)
1096 {
1097         return ((struct conv *)a)->state == Connected;
1098 }
1099
1100 static void connectctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb,
1101                           struct chan *chan)
1102 {
1103         ERRSTACK(1);
1104         char *p;
1105
1106         if (c->state != 0)
1107                 error(EBUSY, ERROR_FIXME);
1108         c->state = Connecting;
1109         c->cerr[0] = '\0';
1110         if (x->connect == NULL)
1111                 error(EFAIL, "connect not supported");
1112         /* It's up to the proto connect method to not block the kthread.  This is
1113          * currently the case for e.g. TCP. */
1114         x->connect(c, cb->f, cb->nf);
1115         /* This is notionally right before the rendez_sleep: either we block or we
1116          * kick back to userspace.  We do this before the unlock to avoid races with
1117          * c->state (rendez's internal lock deals with its race with the waker) and
1118          * to avoid the excessive unlock and relock.
1119          *
1120          * Also, it's important that we don't do anything important for the
1121          * functionality of the conv after the rendez sleep.  The non-blocking style
1122          * won't call back into the kernel - it just wants the event.  I considered
1123          * allowing multiple connect calls, where we just return if it was already
1124          * connected, but that would break UDP, which allows multiple different
1125          * connect calls. */
1126         if ((chan->flag & O_NONBLOCK) && !connected(c))
1127                 error(EINPROGRESS, "connection not ready yet");
1128         qunlock(&c->qlock);
1129         if (waserror()) {
1130                 qlock(&c->qlock);
1131                 nexterror();
1132         }
1133         rendez_sleep(&c->cr, connected, c);
1134         qlock(&c->qlock);
1135         poperror();
1136
1137         if (c->cerr[0] != '\0')
1138                 error(EFAIL, c->cerr);
1139 }
1140
1141 /*
1142  *  called by protocol announce routine to set addresses
1143  */
1144 void Fsstdannounce(struct conv *c, char *argv[], int argc)
1145 {
1146         memset(c->raddr, 0, sizeof(c->raddr));
1147         c->rport = 0;
1148         switch (argc) {
1149                 default:
1150                         error(EINVAL, "bad args to announce");
1151                 case 2:
1152                         setladdrport(c, argv[1], 1);
1153                         break;
1154         }
1155 }
1156
1157 /*
1158  *  initiate announcement and sleep till its set up
1159  */
1160 static int announced(void *a)
1161 {
1162         return ((struct conv *)a)->state == Announced;
1163 }
1164
1165 static void announcectlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1166 {
1167         ERRSTACK(1);
1168         char *p;
1169
1170         if (c->state != 0)
1171                 error(EBUSY, ERROR_FIXME);
1172         c->state = Announcing;
1173         c->cerr[0] = '\0';
1174         if (x->announce == NULL)
1175                 error(EFAIL, "announce not supported");
1176         x->announce(c, cb->f, cb->nf);
1177
1178         qunlock(&c->qlock);
1179         if (waserror()) {
1180                 qlock(&c->qlock);
1181                 nexterror();
1182         }
1183         rendez_sleep(&c->cr, announced, c);
1184         qlock(&c->qlock);
1185         poperror();
1186
1187         if (c->cerr[0] != '\0')
1188                 error(EFAIL, c->cerr);
1189 }
1190
1191 /*
1192  *  called by protocol bind routine to set addresses
1193  */
1194 void Fsstdbind(struct conv *c, char *argv[], int argc)
1195 {
1196         switch (argc) {
1197                 default:
1198                         error(EINVAL, "bad args to bind");
1199                 case 2:
1200                         setladdrport(c, argv[1], 0);
1201                         break;
1202         }
1203 }
1204
1205 static void bindctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1206 {
1207         if (x->bind == NULL)
1208                 Fsstdbind(c, cb->f, cb->nf);
1209         else
1210                 x->bind(c, cb->f, cb->nf);
1211 }
1212
1213 /* Helper, called by protocols to use the bypass.
1214  *
1215  * This is a bit nasty due to the overall nastiness of #ip.  We need to lock
1216  * before checking the state and hold the qlock throughout, because a concurrent
1217  * closeconv() could tear down the bypass.  Specifically, it could free the
1218  * bypass queues.  The root issue is that conversation lifetimes are not managed
1219  * well.
1220  *
1221  * If we fail, it's our responsibility to consume (free) the block(s). */
1222 void bypass_or_drop(struct conv *cv, struct block *bp)
1223 {
1224         qlock(&cv->qlock);
1225         if (cv->state == Bypass)
1226                 qpass(cv->rq, bp);
1227         else
1228                 freeblist(bp);
1229         qunlock(&cv->qlock);
1230 }
1231
1232 /* Push the block directly to the approprite ipoput function.
1233  *
1234  * It's the protocol's responsibility (and thus ours here) to make sure there is
1235  * at least the right amount of the IP header in the block (ipoput{4,6} assumes
1236  * it has the right amount, and the other protocols account for the IP header in
1237  * their own header).
1238  *
1239  * For the TTL and TOS, we just use the default ones.  If we want, we could look
1240  * into the actual block and see what the user wanted, though we're bypassing
1241  * the protocol layer, not the IP layer. */
1242 static void proto_bypass_kick(void *arg, struct block *bp)
1243 {
1244         struct conv *cv = (struct conv*)arg;
1245         uint8_t vers_nibble;
1246         struct Fs *f;
1247
1248         f = cv->p->f;
1249
1250         bp = pullupblock(bp, 1);
1251         if (!bp)
1252                 error(EINVAL, "Proto bypass unable to pullup a byte!");
1253         vers_nibble = *(uint8_t*)bp->rp & 0xf0;
1254         switch (vers_nibble) {
1255         case IP_VER4:
1256                 bp = pullupblock(bp, IPV4HDR_LEN);
1257                 if (!bp)
1258                         error(EINVAL, "Proto bypass unable to pullup v4 header");
1259                 ipoput4(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1260                 break;
1261         case IP_VER6:
1262                 bp = pullupblock(bp, IPV6HDR_LEN);
1263                 if (!bp)
1264                         error(EINVAL, "Proto bypass unable to pullup v6 header");
1265                 ipoput6(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1266                 break;
1267         default:
1268                 error(EINVAL, "Proto bypass block had unknown IP version 0x%x",
1269                       vers_nibble);
1270         }
1271 }
1272
1273 /* Sets up cv for the protocol bypass.  We use different queues for two reasons:
1274  * 1) To be protocol independent.  For instance, TCP and UDP could use very
1275  * different QIO styles.
1276  * 2) To set up our own kick/bypass method.  Note how udpcreate() and here uses
1277  * qbypass() (just blast it out), while TCP uses qopen() with a kick.  TCP still
1278  * follows queuing discipline.
1279  *
1280  * It's like we are our own protocol, the bypass protocol, when it comes to how
1281  * we interact with qio.  The conv still is of the real protocol type (e.g.
1282  * TCP).
1283  *
1284  * Note that we can't free the old queues.  The way #ip works, the queues are
1285  * created when the conv is created, but the conv is never freed.  It's like a
1286  * slab allocator that never frees objects, but just reinitializes them a
1287  * little.
1288  *
1289  * For the queues, we're basically like UDP:
1290  * - We take packets for rq and drop on overflow.
1291  * - rq is also Qmsg, but we also have Qcoalesce, to ignore out zero-len blocks
1292  * - We kick for our outbound (wq) messages.
1293  *
1294  * Note that Qmsg can drop parts of packets.  It's up to the user to read
1295  * enough.  If they didn't read enough, the extra is dropped.  This is similar
1296  * to SOCK_DGRAM and recvfrom().  Minus major changes, there's no nice way to
1297  * get individual messages with read().  Userspace using the bypass will need to
1298  * find out the MTU of the NIC the IP stack is attached to, and make sure to
1299  * read in at least that amount each time. */
1300 static void setup_proto_qio_bypass(struct conv *cv)
1301 {
1302         cv->rq_save = cv->rq;
1303         cv->wq_save = cv->wq;
1304         cv->rq = qopen(BYPASS_QMAX, Qmsg | Qcoalesce, 0, 0);
1305         cv->wq = qbypass(proto_bypass_kick, cv);
1306 }
1307
1308 static void undo_proto_qio_bypass(struct conv *cv)
1309 {
1310         qfree(cv->rq);
1311         qfree(cv->wq);
1312         cv->rq = cv->rq_save;
1313         cv->wq = cv->wq_save;
1314         cv->rq_save = NULL;
1315         cv->wq_save = NULL;
1316 }
1317
1318 void Fsstdbypass(struct conv *cv, char *argv[], int argc)
1319 {
1320         memset(cv->raddr, 0, sizeof(cv->raddr));
1321         cv->rport = 0;
1322         switch (argc) {
1323         case 2:
1324                 setladdrport(cv, argv[1], 1);
1325                 break;
1326         default:
1327                 error(EINVAL, "Bad args (was %d, need 2) to bypass", argc);
1328         }
1329 }
1330
1331 static void bypassctlmsg(struct Proto *x, struct conv *cv, struct cmdbuf *cb)
1332 {
1333         if (!x->bypass)
1334                 error(EFAIL, "Protocol %s does not support bypass", x->name);
1335         /* The protocol needs to set the port (usually by calling Fsstdbypass) and
1336          * then do whatever it needs to make sure it can find the conv again during
1337          * receive (usually by adding to a hash table). */
1338         x->bypass(cv, cb->f, cb->nf);
1339         setup_proto_qio_bypass(cv);
1340         cv->state = Bypass;
1341 }
1342
1343 static void shutdownctlmsg(struct conv *cv, struct cmdbuf *cb)
1344 {
1345         if (cb->nf < 2)
1346                 goto err;
1347         if (!strcmp(cb->f[1], "rd")) {
1348                 qhangup(cv->rq, "shutdown");
1349                 if (cv->p->shutdown)
1350                         cv->p->shutdown(cv, SHUT_RD);
1351         } else if (!strcmp(cb->f[1], "wr")) {
1352                 qhangup(cv->wq, "shutdown");
1353                 if (cv->p->shutdown)
1354                         cv->p->shutdown(cv, SHUT_WR);
1355         } else if (!strcmp(cb->f[1], "rdwr")) {
1356                 qhangup(cv->rq, "shutdown");
1357                 qhangup(cv->wq, "shutdown");
1358                 if (cv->p->shutdown)
1359                         cv->p->shutdown(cv, SHUT_RDWR);
1360         } else {
1361                 goto err;
1362         }
1363         return;
1364 err:
1365         error(EINVAL, "shutdown [rx|tx|rxtx]");
1366 }
1367
1368 static void tosctlmsg(struct conv *c, struct cmdbuf *cb)
1369 {
1370         if (cb->nf < 2)
1371                 c->tos = 0;
1372         else
1373                 c->tos = atoi(cb->f[1]);
1374 }
1375
1376 static void ttlctlmsg(struct conv *c, struct cmdbuf *cb)
1377 {
1378         if (cb->nf < 2)
1379                 c->ttl = MAXTTL;
1380         else
1381                 c->ttl = atoi(cb->f[1]);
1382 }
1383
1384 /* Binds a conversation, as if the user wrote "bind *" into ctl. */
1385 static void autobind(struct conv *cv)
1386 {
1387         ERRSTACK(1);
1388         struct cmdbuf *cb;
1389
1390         cb = parsecmd("bind *", 7);
1391         if (waserror()) {
1392                 kfree(cb);
1393                 nexterror();
1394         }
1395         bindctlmsg(cv->p, cv, cb);
1396         poperror();
1397         kfree(cb);
1398 }
1399
1400 static size_t ipwrite(struct chan *ch, void *v, size_t n, off64_t off)
1401 {
1402         ERRSTACK(1);
1403         struct conv *c;
1404         struct Proto *x;
1405         char *p;
1406         struct cmdbuf *cb;
1407         uint8_t ia[IPaddrlen], ma[IPaddrlen];
1408         struct Fs *f;
1409         char *a;
1410
1411         a = v;
1412         f = ipfs[ch->dev];
1413
1414         switch (TYPE(ch->qid)) {
1415                 default:
1416                         error(EPERM, ERROR_FIXME);
1417                 case Qdata:
1418                         x = f->p[PROTO(ch->qid)];
1419                         c = x->conv[CONV(ch->qid)];
1420                         /* connection-less protocols (UDP) can write without manually
1421                          * binding. */
1422                         if (c->lport == 0)
1423                                 autobind(c);
1424                         if (ch->flag & O_NONBLOCK)
1425                                 qwrite_nonblock(c->wq, a, n);
1426                         else
1427                                 qwrite(c->wq, a, n);
1428                         break;
1429                 case Qarp:
1430                         return arpwrite(f, a, n);
1431                 case Qiproute:
1432                         return routewrite(f, ch, a, n);
1433                 case Qlog:
1434                         netlogctl(f, a, n);
1435                         return n;
1436                 case Qndb:
1437                         return ndbwrite(f, a, off, n);
1438                 case Qctl:
1439                         x = f->p[PROTO(ch->qid)];
1440                         c = x->conv[CONV(ch->qid)];
1441                         cb = parsecmd(a, n);
1442
1443                         qlock(&c->qlock);
1444                         if (waserror()) {
1445                                 qunlock(&c->qlock);
1446                                 kfree(cb);
1447                                 nexterror();
1448                         }
1449                         if (cb->nf < 1)
1450                                 error(EFAIL, "short control request");
1451                         if (strcmp(cb->f[0], "connect") == 0)
1452                                 connectctlmsg(x, c, cb, ch);
1453                         else if (strcmp(cb->f[0], "announce") == 0)
1454                                 announcectlmsg(x, c, cb);
1455                         else if (strcmp(cb->f[0], "bind") == 0)
1456                                 bindctlmsg(x, c, cb);
1457                         else if (strcmp(cb->f[0], "bypass") == 0)
1458                                 bypassctlmsg(x, c, cb);
1459                         else if (strcmp(cb->f[0], "shutdown") == 0)
1460                                 shutdownctlmsg(c, cb);
1461                         else if (strcmp(cb->f[0], "ttl") == 0)
1462                                 ttlctlmsg(c, cb);
1463                         else if (strcmp(cb->f[0], "tos") == 0)
1464                                 tosctlmsg(c, cb);
1465                         else if (strcmp(cb->f[0], "ignoreadvice") == 0)
1466                                 c->ignoreadvice = 1;
1467                         else if (strcmp(cb->f[0], "addmulti") == 0) {
1468                                 if (cb->nf < 2)
1469                                         error(EFAIL, "addmulti needs interface address");
1470                                 if (cb->nf == 2) {
1471                                         if (!ipismulticast(c->raddr))
1472                                                 error(EFAIL, "addmulti for a non multicast address");
1473                                         parseip(ia, cb->f[1]);
1474                                         ipifcaddmulti(c, c->raddr, ia);
1475                                 } else {
1476                                         parseip(ma, cb->f[2]);
1477                                         if (!ipismulticast(ma))
1478                                                 error(EFAIL, "addmulti for a non multicast address");
1479                                         parseip(ia, cb->f[1]);
1480                                         ipifcaddmulti(c, ma, ia);
1481                                 }
1482                         } else if (strcmp(cb->f[0], "remmulti") == 0) {
1483                                 if (cb->nf < 2)
1484                                         error(EFAIL, "remmulti needs interface address");
1485                                 if (!ipismulticast(c->raddr))
1486                                         error(EFAIL, "remmulti for a non multicast address");
1487                                 parseip(ia, cb->f[1]);
1488                                 ipifcremmulti(c, c->raddr, ia);
1489                         } else if (x->ctl != NULL) {
1490                                 x->ctl(c, cb->f, cb->nf);
1491                         } else
1492                                 error(EFAIL, "unknown control request");
1493                         qunlock(&c->qlock);
1494                         kfree(cb);
1495                         poperror();
1496         }
1497         return n;
1498 }
1499
1500 static size_t ipbwrite(struct chan *ch, struct block *bp, off64_t offset)
1501 {
1502         struct conv *c;
1503         size_t n;
1504
1505         switch (TYPE(ch->qid)) {
1506                 case Qdata:
1507                         c = chan2conv(ch);
1508                         if (bp->next)
1509                                 bp = concatblock(bp);
1510                         n = BLEN(bp);
1511                         if (ch->flag & O_NONBLOCK)
1512                                 qbwrite_nonblock(c->wq, bp);
1513                         else
1514                                 qbwrite(c->wq, bp);
1515                         return n;
1516                 default:
1517                         return devbwrite(ch, bp, offset);
1518         }
1519 }
1520
1521 static void fire_data_taps(struct conv *conv, int filter)
1522 {
1523         struct fd_tap *tap_i;
1524
1525         /* At this point, we have an event we want to send to our taps (if any).
1526          * The lock protects list integrity and the existence of the tap.
1527          *
1528          * Previously, I thought of using the conv qlock.  That actually breaks, due
1529          * to weird usages of the qlock (someone holds it for a long time, blocking
1530          * the inbound wakeup from etherread4).
1531          *
1532          * I opted for a spinlock for a couple reasons:
1533          * - fire_tap should not block.  ideally it'll be fast too (it's mostly a
1534          * send_event).
1535          * - our callers might not want to block.  A lot of network wakeups will
1536          * come network processes (etherread4) or otherwise unrelated to this
1537          * particular conversation.  I'd rather do something like fire off a KMSG
1538          * than block those.
1539          * - if fire_tap takes a while, holding the lock only slows down other
1540          * events on this *same* conversation, or other tap registration.  not a
1541          * huge deal. */
1542         spin_lock(&conv->tap_lock);
1543         SLIST_FOREACH(tap_i, &conv->data_taps, link)
1544                 fire_tap(tap_i, filter);
1545         spin_unlock(&conv->tap_lock);
1546 }
1547
1548 static void ip_wake_cb(struct queue *q, void *data, int filter)
1549 {
1550         struct conv *conv = (struct conv*)data;
1551
1552         /* For these two, we want to ignore events on the opposite end of the
1553          * queues.  For instance, we want to know when the WQ is writable.  Our
1554          * writes will actually make it readable - we don't want to trigger a tap
1555          * for that.  However, qio doesn't know how/why we are using a queue, or
1556          * even who the ends are (hence the callbacks) */
1557         if ((filter & FDTAP_FILT_READABLE) && (q == conv->wq))
1558                 return;
1559         if ((filter & FDTAP_FILT_WRITABLE) && (q == conv->rq))
1560                 return;
1561         fire_data_taps(conv, filter);
1562 }
1563
1564 int iptapfd(struct chan *chan, struct fd_tap *tap, int cmd)
1565 {
1566         struct conv *conv = chan2conv(chan);
1567         int ret;
1568
1569         #define DEVIP_LEGAL_DATA_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE | \
1570                                        FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |   \
1571                                        FDTAP_FILT_ERROR)
1572         #define DEVIP_LEGAL_LISTEN_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_HANGUP)
1573
1574         switch (TYPE(chan->qid)) {
1575                 case Qdata:
1576                         if (tap->filter & ~DEVIP_LEGAL_DATA_TAPS) {
1577                                 set_errno(ENOSYS);
1578                                 set_errstr("Unsupported #%s data tap %p, must be %p", devname(),
1579                                            tap->filter, DEVIP_LEGAL_DATA_TAPS);
1580                                 return -1;
1581                         }
1582                         spin_lock(&conv->tap_lock);
1583                         switch (cmd) {
1584                                 case (FDTAP_CMD_ADD):
1585                                         if (SLIST_EMPTY(&conv->data_taps)) {
1586                                                 qio_set_wake_cb(conv->rq, ip_wake_cb, conv);
1587                                                 qio_set_wake_cb(conv->wq, ip_wake_cb, conv);
1588                                         }
1589                                         SLIST_INSERT_HEAD(&conv->data_taps, tap, link);
1590                                         ret = 0;
1591                                         break;
1592                                 case (FDTAP_CMD_REM):
1593                                         SLIST_REMOVE(&conv->data_taps, tap, fd_tap, link);
1594                                         if (SLIST_EMPTY(&conv->data_taps)) {
1595                                                 qio_set_wake_cb(conv->rq, 0, conv);
1596                                                 qio_set_wake_cb(conv->wq, 0, conv);
1597                                         }
1598                                         ret = 0;
1599                                         break;
1600                                 default:
1601                                         set_errno(ENOSYS);
1602                                         set_errstr("Unsupported #%s data tap command %p",
1603                                                    devname(), cmd);
1604                                         ret = -1;
1605                         }
1606                         spin_unlock(&conv->tap_lock);
1607                         return ret;
1608                 case Qlisten:
1609                         if (tap->filter & ~DEVIP_LEGAL_LISTEN_TAPS) {
1610                                 set_errno(ENOSYS);
1611                                 set_errstr("Unsupported #%s listen tap %p, must be %p",
1612                                            devname(), tap->filter, DEVIP_LEGAL_LISTEN_TAPS);
1613                                 return -1;
1614                         }
1615                         spin_lock(&conv->tap_lock);
1616                         switch (cmd) {
1617                                 case (FDTAP_CMD_ADD):
1618                                         SLIST_INSERT_HEAD(&conv->listen_taps, tap, link);
1619                                         ret = 0;
1620                                         break;
1621                                 case (FDTAP_CMD_REM):
1622                                         SLIST_REMOVE(&conv->listen_taps, tap, fd_tap, link);
1623                                         ret = 0;
1624                                         break;
1625                                 default:
1626                                         set_errno(ENOSYS);
1627                                         set_errstr("Unsupported #%s listen tap command %p",
1628                                                    devname(), cmd);
1629                                         ret = -1;
1630                         }
1631                         spin_unlock(&conv->tap_lock);
1632                         return ret;
1633                 default:
1634                         set_errno(ENOSYS);
1635                         set_errstr("Can't tap #%s file type %d", devname(),
1636                                    TYPE(chan->qid));
1637                         return -1;
1638         }
1639 }
1640
1641 static unsigned long ip_chan_ctl(struct chan *c, int op, unsigned long a1,
1642                                  unsigned long a2, unsigned long a3,
1643                                  unsigned long a4)
1644 {
1645         switch (op) {
1646         case CCTL_SET_FL:
1647                 return 0;
1648         default:
1649                 error(EINVAL, "%s does not support %d", __func__, op);
1650         }
1651 }
1652
1653 struct dev ipdevtab __devtab = {
1654         .name = "ip",
1655
1656         .reset = ipreset,
1657         .init = ipinit,
1658         .shutdown = devshutdown,
1659         .attach = ipattach,
1660         .walk = ipwalk,
1661         .stat = ipstat,
1662         .open = ipopen,
1663         .create = devcreate,
1664         .close = ipclose,
1665         .read = ipread,
1666         .bread = ipbread,
1667         .write = ipwrite,
1668         .bwrite = ipbwrite,
1669         .remove = devremove,
1670         .wstat = ipwstat,
1671         .power = devpower,
1672         .chaninfo = ipchaninfo,
1673         .tapfd = iptapfd,
1674         .chan_ctl = ip_chan_ctl,
1675 };
1676
1677 int Fsproto(struct Fs *f, struct Proto *p)
1678 {
1679         if (f->np >= Maxproto)
1680                 return -1;
1681
1682         qlock_init(&p->qlock);
1683         p->f = f;
1684
1685         if (p->ipproto > 0) {
1686                 if (f->t2p[p->ipproto] != NULL)
1687                         return -1;
1688                 f->t2p[p->ipproto] = p;
1689         }
1690
1691         p->qid.type = QTDIR;
1692         p->qid.path = QID(f->np, 0, Qprotodir);
1693         p->conv = kzmalloc(sizeof(struct conv *) * (p->nc + 1), 0);
1694         if (p->conv == NULL)
1695                 panic("Fsproto");
1696
1697         p->x = f->np;
1698         p->nextport = 0;
1699         p->nextrport = 600;
1700         f->p[f->np++] = p;
1701
1702         return 0;
1703 }
1704
1705 /*
1706  *  return true if this protocol is
1707  *  built in
1708  */
1709 int Fsbuiltinproto(struct Fs *f, uint8_t proto)
1710 {
1711         return f->t2p[proto] != NULL;
1712 }
1713
1714 /*
1715  *  called with protocol locked
1716  */
1717 struct conv *Fsprotoclone(struct Proto *p, char *user)
1718 {
1719         struct conv *c, **pp, **ep;
1720
1721 retry:
1722         c = NULL;
1723         ep = &p->conv[p->nc];
1724         for (pp = p->conv; pp < ep; pp++) {
1725                 c = *pp;
1726                 if (c == NULL) {
1727                         c = kzmalloc(sizeof(struct conv), 0);
1728                         if (c == NULL)
1729                                 error(ENOMEM,
1730                                       "conv kzmalloc(%d, 0) failed in Fsprotoclone",
1731                                       sizeof(struct conv));
1732                         qlock_init(&c->qlock);
1733                         qlock_init(&c->listenq);
1734                         rendez_init(&c->cr);
1735                         rendez_init(&c->listenr);
1736                         SLIST_INIT(&c->data_taps);      /* already = 0; set to be futureproof */
1737                         SLIST_INIT(&c->listen_taps);
1738                         spinlock_init(&c->tap_lock);
1739                         qlock(&c->qlock);
1740                         c->p = p;
1741                         c->x = pp - p->conv;
1742                         if (p->ptclsize != 0) {
1743                                 c->ptcl = kzmalloc(p->ptclsize, 0);
1744                                 if (c->ptcl == NULL) {
1745                                         kfree(c);
1746                                         error(ENOMEM,
1747                                               "ptcl kzmalloc(%d, 0) failed in Fsprotoclone",
1748                                               p->ptclsize);
1749                                 }
1750                         }
1751                         *pp = c;
1752                         p->ac++;
1753                         c->eq = qopen(1024, Qmsg, 0, 0);
1754                         (*p->create) (c);
1755                         assert(c->rq && c->wq);
1756                         break;
1757                 }
1758                 if (canqlock(&c->qlock)) {
1759                         /*
1760                          *  make sure both processes and protocol
1761                          *  are done with this Conv
1762                          */
1763                         if (c->inuse == 0 && (p->inuse == NULL || (*p->inuse) (c) == 0))
1764                                 break;
1765
1766                         qunlock(&c->qlock);
1767                 }
1768         }
1769         if (pp >= ep) {
1770                 if (p->gc != NULL && (*p->gc) (p))
1771                         goto retry;
1772                 return NULL;
1773         }
1774
1775         c->inuse = 1;
1776         kstrdup(&c->owner, user);
1777         c->perm = 0660;
1778         c->state = Idle;
1779         ipmove(c->laddr, IPnoaddr);
1780         ipmove(c->raddr, IPnoaddr);
1781         c->r = NULL;
1782         c->rgen = 0;
1783         c->lport = 0;
1784         c->rport = 0;
1785         c->restricted = 0;
1786         c->ttl = MAXTTL;
1787         c->tos = DFLTTOS;
1788         qreopen(c->rq);
1789         qreopen(c->wq);
1790         qreopen(c->eq);
1791
1792         qunlock(&c->qlock);
1793         return c;
1794 }
1795
1796 int Fsconnected(struct conv *c, char *msg)
1797 {
1798         if (msg != NULL && *msg != '\0')
1799                 strlcpy(c->cerr, msg, sizeof(c->cerr));
1800
1801         switch (c->state) {
1802                 case Announcing:
1803                         c->state = Announced;
1804                         break;
1805
1806                 case Connecting:
1807                         c->state = Connected;
1808                         break;
1809         }
1810
1811         rendez_wakeup(&c->cr);
1812         /* The user can poll or tap the connection status via Qdata */
1813         fire_data_taps(c, FDTAP_FILT_WRITABLE);
1814         return 0;
1815 }
1816
1817 struct Proto *Fsrcvpcol(struct Fs *f, uint8_t proto)
1818 {
1819         if (f->ipmux)
1820                 return f->ipmux;
1821         else
1822                 return f->t2p[proto];
1823 }
1824
1825 struct Proto *Fsrcvpcolx(struct Fs *f, uint8_t proto)
1826 {
1827         return f->t2p[proto];
1828 }
1829
1830 static void fire_listener_taps(struct conv *conv)
1831 {
1832         struct fd_tap *tap_i;
1833         if (SLIST_EMPTY(&conv->listen_taps))
1834                 return;
1835         spin_lock(&conv->tap_lock);
1836         SLIST_FOREACH(tap_i, &conv->listen_taps, link)
1837                 fire_tap(tap_i, FDTAP_FILT_READABLE);
1838         spin_unlock(&conv->tap_lock);
1839 }
1840
1841 /*
1842  *  called with protocol locked
1843  */
1844 struct conv *Fsnewcall(struct conv *c, uint8_t * raddr, uint16_t rport,
1845                                            uint8_t * laddr, uint16_t lport, uint8_t version)
1846 {
1847         struct conv *nc;
1848         struct conv **l;
1849         int i;
1850
1851         qlock(&c->qlock);
1852         i = 0;
1853         for (l = &c->incall; *l; l = &(*l)->next)
1854                 i++;
1855         if (i >= Maxincall) {
1856                 qunlock(&c->qlock);
1857                 return NULL;
1858         }
1859
1860         /* find a free conversation */
1861         nc = Fsprotoclone(c->p, network);
1862         if (nc == NULL) {
1863                 qunlock(&c->qlock);
1864                 return NULL;
1865         }
1866         ipmove(nc->raddr, raddr);
1867         nc->rport = rport;
1868         ipmove(nc->laddr, laddr);
1869         nc->lport = lport;
1870         nc->next = NULL;
1871         *l = nc;
1872         nc->state = Connected;
1873         nc->ipversion = version;
1874
1875         qunlock(&c->qlock);
1876
1877         rendez_wakeup(&c->listenr);
1878         fire_listener_taps(c);
1879
1880         return nc;
1881 }
1882
1883 static long ndbwrite(struct Fs *f, char *a, uint32_t off, int n)
1884 {
1885         if (off > strlen(f->ndb))
1886                 error(EIO, ERROR_FIXME);
1887         if (off + n >= sizeof(f->ndb) - 1)
1888                 error(EIO, ERROR_FIXME);
1889         memmove(f->ndb + off, a, n);
1890         f->ndb[off + n] = 0;
1891         f->ndbvers++;
1892         f->ndbmtime = seconds();
1893         return n;
1894 }
1895
1896 uint32_t scalednconv(void)
1897 {
1898         //if(conf.npage*BY2PG >= 128*MB)
1899         return Nchans * 4;
1900         //  return Nchans;
1901 }