9ns: Extend struct dir and the stat M bufs
[akaros.git] / kern / src / net / devip.c
1 /* Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
2  * Portions Copyright © 1997-1999 Vita Nuova Limited
3  * Portions Copyright © 2000-2007 Vita Nuova Holdings Limited
4  *                                (www.vitanuova.com)
5  * Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
6  *
7  * Modified for the Akaros operating system:
8  * Copyright (c) 2013-2014 The Regents of the University of California
9  * Copyright (c) 2013-2015 Google Inc.
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27  * SOFTWARE. */
28
29 #include <vfs.h>
30 #include <kfs.h>
31 #include <slab.h>
32 #include <kmalloc.h>
33 #include <kref.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <assert.h>
37 #include <error.h>
38 #include <cpio.h>
39 #include <pmap.h>
40 #include <smp.h>
41 #include <net/ip.h>
42
43 struct dev ipdevtab;
44
45 static char *devname(void)
46 {
47         return ipdevtab.name;
48 }
49
50 enum {
51         Qtopdir = 1,                            /* top level directory */
52         Qtopbase,
53         Qarp = Qtopbase,
54         Qndb,
55         Qiproute,
56         Qiprouter,
57         Qipselftab,
58         Qlog,
59
60         Qprotodir,      /* directory for a protocol */
61         Qprotobase,
62         Qclone = Qprotobase,
63         Qstats,
64
65         Qconvdir,       /* directory for a conversation */
66         Qconvbase,
67         Qctl = Qconvbase,
68         Qdata,
69         Qerr,
70         Qlisten,
71         Qlocal,
72         Qremote,
73         Qstatus,
74         Qsnoop,
75
76         Logtype = 5,
77         Masktype = (1 << Logtype) - 1,
78         Logconv = 12,
79         Maskconv = (1 << Logconv) - 1,
80         Shiftconv = Logtype,
81         Logproto = 8,
82         Maskproto = (1 << Logproto) - 1,
83         Shiftproto = Logtype + Logconv,
84
85         Nfs = 32,
86         BYPASS_QMAX = 64 * MiB,
87         IPROUTE_LEN = 2 * PGSIZE,
88 };
89 #define TYPE(x)         ( ((uint32_t)(x).path) & Masktype )
90 #define CONV(x)         ( (((uint32_t)(x).path) >> Shiftconv) & Maskconv )
91 #define PROTO(x)        ( (((uint32_t)(x).path) >> Shiftproto) & Maskproto )
92 #define QID(p, c, y)    ( ((p)<<(Shiftproto)) | ((c)<<Shiftconv) | (y))
93 static char network[] = "network";
94
95 qlock_t fslock;
96 struct Fs *ipfs[Nfs];                   /* attached fs's */
97 struct queue *qlog;
98
99 extern void nullmediumlink(void);
100 extern void pktmediumlink(void);
101 extern struct username eve;
102 static long ndbwrite(struct Fs *, char *unused_char_p_t, uint32_t, int);
103 static void closeconv(struct conv *);
104 static void setup_proto_qio_bypass(struct conv *cv);
105 static void undo_proto_qio_bypass(struct conv *cv);
106 static int connected(void *a);
107
108 static struct conv *chan2conv(struct chan *chan)
109 {
110         /* That's a lot of pointers to get to the conv! */
111         return ipfs[chan->dev]->p[PROTO(chan->qid)]->conv[CONV(chan->qid)];
112 }
113
114 static inline int founddevdir(struct chan *c, struct qid q, char *n,
115                                                           int64_t length, char *user, long perm,
116                                                           struct dir *db)
117 {
118         devdir(c, q, n, length, user, perm, db);
119         return 1;
120 }
121
122 static int topdirgen(struct chan *c, struct dir *dp)
123 {
124         struct qid q;
125         mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR);
126         snprintf(get_cur_genbuf(), GENBUF_SZ, "#%s%lu", devname(), c->dev);
127         return founddevdir(c, q, get_cur_genbuf(), 0, network, 0555, dp);
128 }
129
130 /* Computes the perm field for a stat for Qdata.  Since select() polls the
131  * 'actionability' of a socket via the qdata FD, we'll also report listenable
132  * and connected conversations.  It's a minor hack.  =( */
133 static int qdata_stat_perm(struct conv *cv)
134 {
135         int perm;
136
137         perm = cv->perm;
138         /* If there is ever a listener, then it's readable.  Ideally, we'd only
139          * report this on the Qlisten file (which we also do).  The socket crap
140          * should never use a listening socket for data, so there shouldn't be any
141          * confusion when a Qdata shows up as readable. */
142         perm |= cv->incall ? DMREADABLE : 0;
143         /* For connectable convs, they need to be both connected and qio
144          * readable/writable.  The way to think about this is that the convs are not
145          * truly writable/readable until they are connected.  Conveniently, this
146          * means that when select polls Qdata for non-blocking connect(), a
147          * connected conversation pops up as writable (the qio is writable too).
148          *
149          * Note that a conversation can be 'Connected' even if it failed to connect.
150          * At least that's what the 9ns TCP code does.  It's more like "the protocol
151          * did what it needed and the connectctlmsg call (or its non-blocking
152          * equivalent) is done".  For instance, TCP has a few reasons to call
153          * Fsconnected, such as when we send the SYN and get a RST. */
154         if (!cv->p->connect || connected(cv)) {
155                 perm |= qreadable(cv->rq) ? DMREADABLE : 0;
156                 perm |= qwritable(cv->wq) ? DMWRITABLE : 0;
157         }
158         return perm;
159 }
160
161 static int ip3gen(struct chan *c, int i, struct dir *dp)
162 {
163         struct qid q;
164         struct conv *cv;
165         char *p;
166         int perm;
167
168         cv = chan2conv(c);
169         if (cv->owner == NULL)
170                 kstrdup(&cv->owner, eve.name);
171         mkqid(&q, QID(PROTO(c->qid), CONV(c->qid), i), 0, QTFILE);
172
173         switch (i) {
174                 default:
175                         return -1;
176                 case Qctl:
177                         return founddevdir(c, q, "ctl", 0,
178                                                    cv->owner, cv->perm, dp);
179                 case Qdata:
180                         perm = qdata_stat_perm(cv);
181                         return founddevdir(c, q, "data", qlen(cv->rq),
182                                                            cv->owner, perm, dp);
183                 case Qerr:
184                         perm = cv->perm;
185                         perm |= qreadable(cv->eq) ? DMREADABLE : 0;
186                         return founddevdir(c, q, "err", qlen(cv->eq),
187                                                            cv->owner, perm, dp);
188                 case Qlisten:
189                         perm = cv->perm;
190                         perm |= cv->incall ? DMREADABLE : 0;
191                         return founddevdir(c, q, "listen", 0, cv->owner, perm, dp);
192                 case Qlocal:
193                         p = "local";
194                         break;
195                 case Qremote:
196                         p = "remote";
197                         break;
198                 case Qsnoop:
199                         if (strcmp(cv->p->name, "ipifc") != 0)
200                                 return -1;
201                         perm = 0400;
202                         perm |= qreadable(cv->sq) ? DMREADABLE : 0;
203                         return founddevdir(c, q, "snoop", qlen(cv->sq),
204                                                            cv->owner, perm, dp);
205                 case Qstatus:
206                         p = "status";
207                         break;
208         }
209         return founddevdir(c, q, p, 0, cv->owner, 0444, dp);
210 }
211
212 static int ip2gen(struct chan *c, int i, struct dir *dp)
213 {
214         struct qid q;
215         mkqid(&q, QID(PROTO(c->qid), 0, i), 0, QTFILE);
216         switch (i) {
217                 case Qclone:
218                         return founddevdir(c, q, "clone", 0, network, 0666, dp);
219                 case Qstats:
220                         return founddevdir(c, q, "stats", 0, network, 0444, dp);
221         }
222         return -1;
223 }
224
225 static int ip1gen(struct chan *c, int i, struct dir *dp)
226 {
227         struct qid q;
228         char *p;
229         int prot;
230         int len = 0;
231         struct Fs *f;
232         extern uint32_t kerndate;
233
234         f = ipfs[c->dev];
235
236         prot = 0666;
237         mkqid(&q, QID(0, 0, i), 0, QTFILE);
238         switch (i) {
239                 default:
240                         return -1;
241                 case Qarp:
242                         p = "arp";
243                         break;
244                 case Qndb:
245                         p = "ndb";
246                         len = strlen(f->ndb);
247                         q.vers = f->ndbvers;
248                         break;
249                 case Qiproute:
250                         p = "iproute";
251                         break;
252                 case Qipselftab:
253                         p = "ipselftab";
254                         prot = 0444;
255                         break;
256                 case Qiprouter:
257                         p = "iprouter";
258                         break;
259                 case Qlog:
260                         p = "log";
261                         break;
262         }
263         devdir(c, q, p, len, network, prot, dp);
264         if (i == Qndb && f->ndbmtime > kerndate)
265                 dp->mtime.tv_sec = f->ndbmtime;
266         return 1;
267 }
268
269 static int
270 ipgen(struct chan *c, char *unused_char_p_t, struct dirtab *d, int unused_int,
271           int s, struct dir *dp)
272 {
273         struct qid q;
274         struct conv *cv;
275         struct Fs *f;
276
277         f = ipfs[c->dev];
278
279         switch (TYPE(c->qid)) {
280                 case Qtopdir:
281                         if (s == DEVDOTDOT)
282                                 return topdirgen(c, dp);
283                         if (s < f->np) {
284                                 if (f->p[s]->connect == NULL)
285                                         return 0;       /* protocol with no user interface */
286                                 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
287                                 return founddevdir(c, q, f->p[s]->name, 0, network, 0555, dp);
288                         }
289                         s -= f->np;
290                         return ip1gen(c, s + Qtopbase, dp);
291                 case Qarp:
292                 case Qndb:
293                 case Qlog:
294                 case Qiproute:
295                 case Qiprouter:
296                 case Qipselftab:
297                         return ip1gen(c, TYPE(c->qid), dp);
298                 case Qprotodir:
299                         if (s == DEVDOTDOT)
300                                 return topdirgen(c, dp);
301                         else if (s < f->p[PROTO(c->qid)]->ac) {
302                                 cv = f->p[PROTO(c->qid)]->conv[s];
303                                 snprintf(get_cur_genbuf(), GENBUF_SZ, "%d", s);
304                                 mkqid(&q, QID(PROTO(c->qid), s, Qconvdir), 0, QTDIR);
305                                 return
306                                         founddevdir(c, q, get_cur_genbuf(), 0, cv->owner, 0555, dp);
307                         }
308                         s -= f->p[PROTO(c->qid)]->ac;
309                         return ip2gen(c, s + Qprotobase, dp);
310                 case Qclone:
311                 case Qstats:
312                         return ip2gen(c, TYPE(c->qid), dp);
313                 case Qconvdir:
314                         if (s == DEVDOTDOT) {
315                                 s = PROTO(c->qid);
316                                 mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR);
317                                 devdir(c, q, f->p[s]->name, 0, network, 0555, dp);
318                                 return 1;
319                         }
320                         return ip3gen(c, s + Qconvbase, dp);
321                 case Qctl:
322                 case Qdata:
323                 case Qerr:
324                 case Qlisten:
325                 case Qlocal:
326                 case Qremote:
327                 case Qstatus:
328                 case Qsnoop:
329                         return ip3gen(c, TYPE(c->qid), dp);
330         }
331         return -1;
332 }
333
334 static void ipinit(void)
335 {
336         qlock_init(&fslock);
337         nullmediumlink();
338         pktmediumlink();
339 /* if only
340         fmtinstall('i', eipfmt);
341         fmtinstall('I', eipfmt);
342         fmtinstall('E', eipfmt);
343         fmtinstall('V', eipfmt);
344         fmtinstall('M', eipfmt);
345 */
346 }
347
/* Device reset hook; this device has nothing to reset. */
static void ipreset(void)
{
	/* intentionally empty */
}
351
352 static struct Fs *ipgetfs(int dev)
353 {
354         extern void (*ipprotoinit[]) (struct Fs *);
355         struct Fs *f;
356         int i;
357
358         if (dev >= Nfs)
359                 return NULL;
360
361         qlock(&fslock);
362         if (ipfs[dev] == NULL) {
363                 f = kzmalloc(sizeof(struct Fs), MEM_WAIT);
364                 rwinit(&f->rwlock);
365                 qlock_init(&f->iprouter.qlock);
366                 ip_init(f);
367                 arpinit(f);
368                 netloginit(f);
369                 for (i = 0; ipprotoinit[i]; i++)
370                         ipprotoinit[i] (f);
371                 f->dev = dev;
372                 ipfs[dev] = f;
373         }
374         qunlock(&fslock);
375
376         return ipfs[dev];
377 }
378
379 struct IPaux *newipaux(char *owner, char *tag)
380 {
381         struct IPaux *a;
382         int n;
383
384         a = kzmalloc(sizeof(*a), 0);
385         kstrdup(&a->owner, owner);
386         memset(a->tag, ' ', sizeof(a->tag));
387         n = strlen(tag);
388         if (n > sizeof(a->tag))
389                 n = sizeof(a->tag);
390         memmove(a->tag, tag, n);
391         return a;
392 }
393
/* Owner string recorded in the IPaux that hangs off chan c's aux pointer */
#define ATTACHER(c)	(((struct IPaux *)((c)->aux))->owner)
395
396 static struct chan *ipattach(char *spec)
397 {
398         struct chan *c;
399         int dev;
400
401         dev = atoi(spec);
402         if (dev >= Nfs)
403                 error(EFAIL, "bad specification");
404
405         ipgetfs(dev);
406         c = devattach(devname(), spec);
407         mkqid(&c->qid, QID(0, 0, Qtopdir), 0, QTDIR);
408         c->dev = dev;
409
410         c->aux = newipaux(commonuser(), "none");
411
412         return c;
413 }
414
415 static struct walkqid *ipwalk(struct chan *c, struct chan *nc, char **name,
416                                                           int nname)
417 {
418         struct IPaux *a = c->aux;
419         struct walkqid *w;
420
421         w = devwalk(c, nc, name, nname, NULL, 0, ipgen);
422         if (w != NULL && w->clone != NULL)
423                 w->clone->aux = newipaux(a->owner, a->tag);
424         return w;
425 }
426
427 static int ipstat(struct chan *c, uint8_t * db, int n)
428 {
429         return devstat(c, db, n, NULL, 0, ipgen);
430 }
431
432 static int should_wake(void *arg)
433 {
434         struct conv *cv = arg;
435         /* signal that the conv is closed */
436         if (qisclosed(cv->rq))
437                 return TRUE;
438         return cv->incall != NULL;
439 }
440
441 static struct chan *ipopen(struct chan *c, int omode)
442 {
443         ERRSTACK(2);
444         struct conv *cv, *nc;
445         struct Proto *p;
446         int perm;
447         struct Fs *f;
448
449         /* perm is a lone rwx, not the rwx------ from the conversion */
450         perm = omode_to_rwx(omode) >> 6;
451
452         f = ipfs[c->dev];
453
454         switch (TYPE(c->qid)) {
455                 default:
456                         break;
457                 case Qndb:
458                         if (omode & (O_WRITE | O_TRUNC) && !iseve())
459                                 error(EPERM, ERROR_FIXME);
460                         if ((omode & (O_WRITE | O_TRUNC)) == (O_WRITE | O_TRUNC))
461                                 f->ndb[0] = 0;
462                         break;
463                 case Qlog:
464                         netlogopen(f);
465                         break;
466                 case Qiprouter:
467                         iprouteropen(f);
468                         break;
469                 case Qiproute:
470                         c->synth_buf = kpages_zalloc(IPROUTE_LEN, MEM_WAIT);
471                         routeread(f, c->synth_buf, 0, IPROUTE_LEN);
472                         break;
473                 case Qtopdir:
474                 case Qprotodir:
475                 case Qconvdir:
476                 case Qstatus:
477                 case Qremote:
478                 case Qlocal:
479                 case Qstats:
480                 case Qipselftab:
481                         if (omode & O_WRITE)
482                                 error(EPERM, ERROR_FIXME);
483                         break;
484                 case Qsnoop:
485                         if (omode & O_WRITE)
486                                 error(EPERM, ERROR_FIXME);
487                         /* might be racy.  note the lack of a proto lock, unlike Qdata */
488                         p = f->p[PROTO(c->qid)];
489                         cv = p->conv[CONV(c->qid)];
490                         if (strcmp(ATTACHER(c), cv->owner) != 0 && !iseve())
491                                 error(EPERM, ERROR_FIXME);
492                         atomic_inc(&cv->snoopers);
493                         break;
494                 case Qclone:
495                         p = f->p[PROTO(c->qid)];
496                         qlock(&p->qlock);
497                         if (waserror()) {
498                                 qunlock(&p->qlock);
499                                 nexterror();
500                         }
501                         cv = Fsprotoclone(p, ATTACHER(c));
502                         qunlock(&p->qlock);
503                         poperror();
504                         if (cv == NULL) {
505                                 error(ENODEV, "Null conversation from Fsprotoclone");
506                                 break;
507                         }
508                         mkqid(&c->qid, QID(p->x, cv->x, Qctl), 0, QTFILE);
509                         break;
510                 case Qdata:
511                 case Qctl:
512                 case Qerr:
513                         p = f->p[PROTO(c->qid)];
514                         qlock(&p->qlock);
515                         cv = p->conv[CONV(c->qid)];
516                         qlock(&cv->qlock);
517                         if (waserror()) {
518                                 qunlock(&cv->qlock);
519                                 qunlock(&p->qlock);
520                                 nexterror();
521                         }
522                         if ((perm & (cv->perm >> 6)) != perm) {
523                                 if (strcmp(ATTACHER(c), cv->owner) != 0)
524                                         error(EPERM, ERROR_FIXME);
525                                 if ((perm & cv->perm) != perm)
526                                         error(EPERM, ERROR_FIXME);
527
528                         }
529                         cv->inuse++;
530                         if (cv->inuse == 1) {
531                                 kstrdup(&cv->owner, ATTACHER(c));
532                                 cv->perm = 0660;
533                         }
534                         qunlock(&cv->qlock);
535                         qunlock(&p->qlock);
536                         poperror();
537                         break;
538                 case Qlisten:
539                         cv = f->p[PROTO(c->qid)]->conv[CONV(c->qid)];
540                         /* No permissions or Announce checks required.  We'll see if that's
541                          * a good idea or not. (the perm check would do nothing, as is,
542                          * since an O_PATH perm is 0).
543                          *
544                          * But we probably want to incref to keep the conversation around
545                          * until this FD/chan is closed.  #ip is a little weird in that
546                          * objects never really go away (high water mark for convs, you can
547                          * always find them in the ns).  I think it is possible to
548                          * namec/ipgen a chan, then have that conv close, then have that
549                          * chan be opened.  You can probably do this with a data file. */
550                         if (omode & O_PATH) {
551                                 qlock(&cv->qlock);
552                                 cv->inuse++;
553                                 qunlock(&cv->qlock);
554                                 break;
555                         }
556                         if ((perm & (cv->perm >> 6)) != perm) {
557                                 if (strcmp(ATTACHER(c), cv->owner) != 0)
558                                         error(EPERM, ERROR_FIXME);
559                                 if ((perm & cv->perm) != perm)
560                                         error(EPERM, ERROR_FIXME);
561
562                         }
563
564                         if (cv->state != Announced)
565                                 error(EFAIL, "not announced");
566
567                         if (waserror()) {
568                                 closeconv(cv);
569                                 nexterror();
570                         }
571                         qlock(&cv->qlock);
572                         cv->inuse++;
573                         qunlock(&cv->qlock);
574
575                         nc = NULL;
576                         while (nc == NULL) {
577                                 /* give up if we got a hangup */
578                                 if (qisclosed(cv->rq))
579                                         error(EFAIL, "listen hungup");
580
581                                 qlock(&cv->listenq);
582                                 if (waserror()) {
583                                         qunlock(&cv->listenq);
584                                         nexterror();
585                                 }
586                                 /* we can peek at incall without grabbing the cv qlock.  if
587                                  * anything is there, it'll remain there until we dequeue it.
588                                  * no one else can, since we hold the listenq lock */
589                                 if ((c->flag & O_NONBLOCK) && !cv->incall)
590                                         error(EAGAIN, "listen queue empty");
591                                 /* wait for a connect */
592                                 rendez_sleep(&cv->listenr, should_wake, cv);
593
594                                 /* if there is a concurrent hangup, they will hold the qlock
595                                  * until the hangup is complete, including closing the cv->rq */
596                                 qlock(&cv->qlock);
597                                 nc = cv->incall;
598                                 if (nc != NULL) {
599                                         cv->incall = nc->next;
600                                         mkqid(&c->qid, QID(PROTO(c->qid), nc->x, Qctl), 0, QTFILE);
601                                         kstrdup(&cv->owner, ATTACHER(c));
602                                 }
603                                 qunlock(&cv->qlock);
604
605                                 qunlock(&cv->listenq);
606                                 poperror();
607                         }
608                         closeconv(cv);
609                         poperror();
610                         break;
611         }
612         c->mode = openmode(omode);
613         c->flag |= COPEN;
614         c->offset = 0;
615         return c;
616 }
617
618 static int ipwstat(struct chan *c, uint8_t * dp, int n)
619 {
620         ERRSTACK(2);
621         struct dir *d;
622         struct conv *cv;
623         struct Fs *f;
624         struct Proto *p;
625
626         f = ipfs[c->dev];
627         switch (TYPE(c->qid)) {
628                 default:
629                         error(EPERM, ERROR_FIXME);
630                         break;
631                 case Qctl:
632                 case Qdata:
633                         break;
634         }
635
636         d = kzmalloc(sizeof(*d) + n, 0);
637         if (waserror()) {
638                 kfree(d);
639                 nexterror();
640         }
641         n = convM2D(dp, n, d, (char *)&d[1]);
642         if (n == 0)
643                 error(ENODATA, ERROR_FIXME);
644         p = f->p[PROTO(c->qid)];
645         cv = p->conv[CONV(c->qid)];
646         if (!iseve() && strcmp(ATTACHER(c), cv->owner) != 0)
647                 error(EPERM, ERROR_FIXME);
648         if (!emptystr(d->uid))
649                 kstrdup(&cv->owner, d->uid);
650         if (d->mode != -1)
651                 cv->perm = d->mode & 0777;
652         poperror();
653         kfree(d);
654         return n;
655 }
656
657 /* Should be able to handle any file type chan. Feel free to extend it. */
658 static char *ipchaninfo(struct chan *ch, char *ret, size_t ret_l)
659 {
660         struct conv *conv;
661         struct Proto *proto;
662         char *p;
663         struct Fs *f;
664
665         f = ipfs[ch->dev];
666
667         switch (TYPE(ch->qid)) {
668                 default:
669                         ret = "Unknown type";
670                         break;
671                 case Qdata:
672                         proto = f->p[PROTO(ch->qid)];
673                         conv = proto->conv[CONV(ch->qid)];
674                         snprintf(ret, ret_l,
675                                  "Qdata, %s, proto %s, conv idx %d, rq len %d, wq len %d, total read %llu",
676                                  SLIST_EMPTY(&conv->data_taps) ? "untapped" : "tapped",
677                                  proto->name, conv->x, qlen(conv->rq), qlen(conv->wq),
678                                          q_bytes_read(conv->rq));
679                         break;
680                 case Qarp:
681                         ret = "Qarp";
682                         break;
683                 case Qiproute:
684                         ret = "Qiproute";
685                         break;
686                 case Qlisten:
687                         proto = f->p[PROTO(ch->qid)];
688                         conv = proto->conv[CONV(ch->qid)];
689                         snprintf(ret, ret_l,
690                                  "Qlisten, %s proto %s, conv idx %d, has %sincalls",
691                                  SLIST_EMPTY(&conv->listen_taps) ? "untapped" : "tapped",
692                                  proto->name, conv->x, conv->incall ? "" : "no ");
693                         break;
694                 case Qlog:
695                         ret = "Qlog";
696                         break;
697                 case Qndb:
698                         ret = "Qndb";
699                         break;
700                 case Qctl:
701                         proto = f->p[PROTO(ch->qid)];
702                         conv = proto->conv[CONV(ch->qid)];
703                         snprintf(ret, ret_l, "Qctl, proto %s, conv idx %d", proto->name,
704                                          conv->x);
705                         break;
706         }
707         return ret;
708 }
709
710 static void closeconv(struct conv *cv)
711 {
712         ERRSTACK(1);
713         struct conv *nc;
714         struct Ipmulti *mp;
715
716         qlock(&cv->qlock);
717
718         if (--cv->inuse > 0) {
719                 qunlock(&cv->qlock);
720                 return;
721         }
722         if (waserror()) {
723                 qunlock(&cv->qlock);
724                 nexterror();
725         }
726         /* close all incoming calls since no listen will ever happen */
727         for (nc = cv->incall; nc; nc = cv->incall) {
728                 cv->incall = nc->next;
729                 closeconv(nc);
730         }
731         cv->incall = NULL;
732
733         kstrdup(&cv->owner, network);
734         cv->perm = 0660;
735
736         while ((mp = cv->multi) != NULL)
737                 ipifcremmulti(cv, mp->ma, mp->ia);
738
739         cv->r = NULL;
740         cv->rgen = 0;
741         if (cv->state == Bypass)
742                 undo_proto_qio_bypass(cv);
743         cv->p->close(cv);
744         cv->state = Idle;
745         qunlock(&cv->qlock);
746         poperror();
747 }
748
749 static void ipclose(struct chan *c)
750 {
751         struct Fs *f;
752
753         f = ipfs[c->dev];
754         switch (TYPE(c->qid)) {
755                 default:
756                         break;
757                 case Qlog:
758                         if (c->flag & COPEN)
759                                 netlogclose(f);
760                         break;
761                 case Qiprouter:
762                         if (c->flag & COPEN)
763                                 iprouterclose(f);
764                         break;
765                 case Qdata:
766                 case Qctl:
767                 case Qerr:
768                 case Qlisten:
769                         if (c->flag & COPEN)
770                                 closeconv(f->p[PROTO(c->qid)]->conv[CONV(c->qid)]);
771                         break;
772                 case Qsnoop:
773                         if (c->flag & COPEN)
774                                 atomic_dec(&f->p[PROTO(c->qid)]->conv[CONV(c->qid)]->snoopers);
775                         break;
776                 case Qiproute:
777                         if (c->flag & COPEN)
778                                 kpages_free(c->synth_buf, IPROUTE_LEN);
779                         break;
780         }
781         kfree(((struct IPaux *)c->aux)->owner);
782         kfree(c->aux);
783 }
784
/* Size, in bytes, of the scratch buffers ipread() allocates to format the
 * contents of the local and remote files. */
enum { Statelen = 32 * 1024 };
788
789 static long ipread(struct chan *ch, void *a, long n, int64_t off)
790 {
791         struct conv *c;
792         struct Proto *x;
793         char *buf, *p;
794         long rv;
795         struct Fs *f;
796         uint32_t offset = off;
797
798         f = ipfs[ch->dev];
799
800         p = a;
801         switch (TYPE(ch->qid)) {
802                 default:
803                         error(EPERM, ERROR_FIXME);
804                 case Qtopdir:
805                 case Qprotodir:
806                 case Qconvdir:
807                         return devdirread(ch, a, n, 0, 0, ipgen);
808                 case Qarp:
809                         return arpread(f->arp, a, offset, n);
810                 case Qndb:
811                         return readstr(offset, a, n, f->ndb);
812                 case Qiproute:
813                         return readmem(offset, a, n, ch->synth_buf, IPROUTE_LEN);
814                 case Qiprouter:
815                         return iprouterread(f, a, n);
816                 case Qipselftab:
817                         return ipselftabread(f, a, offset, n);
818                 case Qlog:
819                         return netlogread(f, a, offset, n);
820                 case Qctl:
821                         snprintf(get_cur_genbuf(), GENBUF_SZ, "%lu", CONV(ch->qid));
822                         return readstr(offset, p, n, get_cur_genbuf());
823                 case Qremote:
824                         buf = kzmalloc(Statelen, 0);
825                         x = f->p[PROTO(ch->qid)];
826                         c = x->conv[CONV(ch->qid)];
827                         if (x->remote == NULL) {
828                                 snprintf(buf, Statelen, "%I!%d\n", c->raddr, c->rport);
829                         } else {
830                                 (*x->remote) (c, buf, Statelen - 2);
831                         }
832                         rv = readstr(offset, p, n, buf);
833                         kfree(buf);
834                         return rv;
835                 case Qlocal:
836                         buf = kzmalloc(Statelen, 0);
837                         x = f->p[PROTO(ch->qid)];
838                         c = x->conv[CONV(ch->qid)];
839                         if (x->local == NULL) {
840                                 snprintf(buf, Statelen, "%I!%d\n", c->laddr, c->lport);
841                         } else {
842                                 (*x->local) (c, buf, Statelen - 2);
843                         }
844                         rv = readstr(offset, p, n, buf);
845                         kfree(buf);
846                         return rv;
847                 case Qstatus:
848                         /* this all is a bit screwed up since the size of some state's
849                          * buffers will change from one invocation to another.  a reader
850                          * will come in and read the entire buffer.  then it will come again
851                          * and read from the next offset, expecting EOF.  if the buffer
852                          * changed sizes, it'll reprint the end of the buffer slightly. */
853                         buf = kzmalloc(Statelen, 0);
854                         x = f->p[PROTO(ch->qid)];
855                         c = x->conv[CONV(ch->qid)];
856                         if (c->state == Bypass)
857                                 snprintf(buf, Statelen, "Bypassed\n");
858                         else
859                                 (*x->state)(c, buf, Statelen - 2);
860                         rv = readstr(offset, p, n, buf);
861                         kfree(buf);
862                         return rv;
863                 case Qdata:
864                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
865                         if (ch->flag & O_NONBLOCK)
866                                 return qread_nonblock(c->rq, a, n);
867                         else
868                                 return qread(c->rq, a, n);
869                 case Qerr:
870                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
871                         return qread(c->eq, a, n);
872                 case Qsnoop:
873                         c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)];
874                         return qread(c->sq, a, n);
875                 case Qstats:
876                         x = f->p[PROTO(ch->qid)];
877                         if (x->stats == NULL)
878                                 error(EFAIL, "stats not implemented");
879                         buf = kzmalloc(Statelen, 0);
880                         (*x->stats) (x, buf, Statelen);
881                         rv = readstr(offset, p, n, buf);
882                         kfree(buf);
883                         return rv;
884         }
885 }
886
887 static struct block *ipbread(struct chan *ch, long n, uint32_t offset)
888 {
889         struct conv *c;
890
891         switch (TYPE(ch->qid)) {
892                 case Qdata:
893                         c = chan2conv(ch);
894                         if (ch->flag & O_NONBLOCK)
895                                 return qbread_nonblock(c->rq, n);
896                         else
897                                 return qbread(c->rq, n);
898                 default:
899                         return devbread(ch, n, offset);
900         }
901 }
902
903 /*
904  *  set local address to be that of the ifc closest to remote address
905  */
906 static void setladdr(struct conv *c)
907 {
908         findlocalip(c->p->f, c->laddr, c->raddr);
909 }
910
911 /*
912  *  set a local port making sure the quad of raddr,rport,laddr,lport is unique
913  */
914 static void setluniqueport(struct conv *c, int lport)
915 {
916         struct Proto *p;
917         struct conv *xp;
918         int x;
919
920         p = c->p;
921
922         qlock(&p->qlock);
923         for (x = 0; x < p->nc; x++) {
924                 xp = p->conv[x];
925                 if (xp == NULL)
926                         break;
927                 if (xp == c)
928                         continue;
929                 if ((xp->state == Connected || xp->state == Announced
930                                             || xp->state == Bypass)
931                         && xp->lport == lport
932                         && xp->rport == c->rport
933                         && ipcmp(xp->raddr, c->raddr) == 0
934                         && ipcmp(xp->laddr, c->laddr) == 0) {
935                         qunlock(&p->qlock);
936                         error(EFAIL, "address in use");
937                 }
938         }
939         c->lport = lport;
940         qunlock(&p->qlock);
941 }
942
943 /*
944  *  pick a local port and set it
945  */
946 static void setlport(struct conv *c)
947 {
948         struct Proto *p;
949         uint16_t *pp;
950         int x, found;
951
952         p = c->p;
953         if (c->restricted)
954                 pp = &p->nextrport;
955         else
956                 pp = &p->nextport;
957         qlock(&p->qlock);
958         for (;; (*pp)++) {
959                 /*
960                  * Fsproto initialises p->nextport to 0 and the restricted
961                  * ports (p->nextrport) to 600.
962                  * Restricted ports must lie between 600 and 1024.
963                  * For the initial condition or if the unrestricted port number
964                  * has wrapped round, select a random port between 5000 and 1<<15
965                  * to start at.
966                  */
967                 if (c->restricted) {
968                         if (*pp >= 1024)
969                                 *pp = 600;
970                 } else
971                         while (*pp < 5000)
972                                 urandom_read(pp, sizeof(*pp));
973
974                 found = 0;
975                 for (x = 0; x < p->nc; x++) {
976                         if (p->conv[x] == NULL)
977                                 break;
978                         if (p->conv[x]->lport == *pp) {
979                                 found = 1;
980                                 break;
981                         }
982                 }
983                 if (!found)
984                         break;
985         }
986         c->lport = (*pp)++;
987         qunlock(&p->qlock);
988 }
989
990 /*
991  *  set a local address and port from a string of the form
992  *      [address!]port[!r]
993  */
994 static void setladdrport(struct conv *c, char *str, int announcing)
995 {
996         char *p;
997         uint16_t lport;
998         uint8_t addr[IPaddrlen];
999
1000         /*
1001          *  ignore restricted part if it exists.  it's
1002          *  meaningless on local ports.
1003          */
1004         p = strchr(str, '!');
1005         if (p != NULL) {
1006                 *p++ = 0;
1007                 if (strcmp(p, "r") == 0)
1008                         p = NULL;
1009         }
1010
1011         c->lport = 0;
1012         if (p == NULL) {
1013                 if (announcing)
1014                         ipmove(c->laddr, IPnoaddr);
1015                 else
1016                         setladdr(c);
1017                 p = str;
1018         } else {
1019                 if (strcmp(str, "*") == 0)
1020                         ipmove(c->laddr, IPnoaddr);
1021                 else {
1022                         parseip(addr, str);
1023                         if (ipforme(c->p->f, addr))
1024                                 ipmove(c->laddr, addr);
1025                         else
1026                                 error(EFAIL, "not a local IP address");
1027                 }
1028         }
1029
1030         /* one process can get all connections */
1031         if (announcing && strcmp(p, "*") == 0) {
1032                 if (!iseve())
1033                         error(EPERM, ERROR_FIXME);
1034                 setluniqueport(c, 0);
1035         }
1036
1037         lport = atoi(p);
1038         if (lport <= 0)
1039                 setlport(c);
1040         else
1041                 setluniqueport(c, lport);
1042 }
1043
1044 static void setraddrport(struct conv *c, char *str)
1045 {
1046         char *p;
1047
1048         p = strchr(str, '!');
1049         if (p == NULL)
1050                 error(EFAIL, "malformed address");
1051         *p++ = 0;
1052         parseip(c->raddr, str);
1053         c->rport = atoi(p);
1054         p = strchr(p, '!');
1055         if (p) {
1056                 if (strstr(p, "!r") != NULL)
1057                         c->restricted = 1;
1058         }
1059 }
1060
1061 /*
1062  *  called by protocol connect routine to set addresses
1063  */
1064 void Fsstdconnect(struct conv *c, char *argv[], int argc)
1065 {
1066         switch (argc) {
1067                 default:
1068                         error(EINVAL, "bad args to %s", __func__);
1069                 case 2:
1070                         setraddrport(c, argv[1]);
1071                         setladdr(c);
1072                         setlport(c);
1073                         break;
1074                 case 3:
1075                         setraddrport(c, argv[1]);
1076                         setladdrport(c, argv[2], 0);
1077                         break;
1078         }
1079
1080         /* TODO: why is an IPnoaddr (in v6 format, equivalent to v6Unspecified),
1081          * a v4 format? */
1082         if ((memcmp(c->raddr, v4prefix, IPv4off) == 0 &&
1083                  memcmp(c->laddr, v4prefix, IPv4off) == 0)
1084                 || ipcmp(c->raddr, IPnoaddr) == 0)
1085                 c->ipversion = V4;
1086         else
1087                 c->ipversion = V6;
1088         /* Linux has taught people to use zeros for local interfaces.  TODO: We
1089          * might need this for v6 in the future. */
1090         if (!ipcmp(c->raddr, IPv4_zeroes))
1091                 ipmove(c->raddr, IPv4_loopback);
1092 }
1093
1094 /*
1095  *  initiate connection and sleep till its set up
1096  */
1097 static int connected(void *a)
1098 {
1099         return ((struct conv *)a)->state == Connected;
1100 }
1101
1102 static void connectctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb,
1103                           struct chan *chan)
1104 {
1105         ERRSTACK(1);
1106         char *p;
1107
1108         if (c->state != 0)
1109                 error(EBUSY, ERROR_FIXME);
1110         c->state = Connecting;
1111         c->cerr[0] = '\0';
1112         if (x->connect == NULL)
1113                 error(EFAIL, "connect not supported");
1114         /* It's up to the proto connect method to not block the kthread.  This is
1115          * currently the case for e.g. TCP. */
1116         x->connect(c, cb->f, cb->nf);
1117         /* This is notionally right before the rendez_sleep: either we block or we
1118          * kick back to userspace.  We do this before the unlock to avoid races with
1119          * c->state (rendez's internal lock deals with its race with the waker) and
1120          * to avoid the excessive unlock and relock.
1121          *
1122          * Also, it's important that we don't do anything important for the
1123          * functionality of the conv after the rendez sleep.  The non-blocking style
1124          * won't call back into the kernel - it just wants the event.  I considered
1125          * allowing multiple connect calls, where we just return if it was already
1126          * connected, but that would break UDP, which allows multiple different
1127          * connect calls. */
1128         if ((chan->flag & O_NONBLOCK) && !connected(c))
1129                 error(EINPROGRESS, "connection not ready yet");
1130         qunlock(&c->qlock);
1131         if (waserror()) {
1132                 qlock(&c->qlock);
1133                 nexterror();
1134         }
1135         rendez_sleep(&c->cr, connected, c);
1136         qlock(&c->qlock);
1137         poperror();
1138
1139         if (c->cerr[0] != '\0')
1140                 error(EFAIL, c->cerr);
1141 }
1142
1143 /*
1144  *  called by protocol announce routine to set addresses
1145  */
1146 void Fsstdannounce(struct conv *c, char *argv[], int argc)
1147 {
1148         memset(c->raddr, 0, sizeof(c->raddr));
1149         c->rport = 0;
1150         switch (argc) {
1151                 default:
1152                         error(EINVAL, "bad args to announce");
1153                 case 2:
1154                         setladdrport(c, argv[1], 1);
1155                         break;
1156         }
1157 }
1158
1159 /*
1160  *  initiate announcement and sleep till its set up
1161  */
1162 static int announced(void *a)
1163 {
1164         return ((struct conv *)a)->state == Announced;
1165 }
1166
1167 static void announcectlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1168 {
1169         ERRSTACK(1);
1170         char *p;
1171
1172         if (c->state != 0)
1173                 error(EBUSY, ERROR_FIXME);
1174         c->state = Announcing;
1175         c->cerr[0] = '\0';
1176         if (x->announce == NULL)
1177                 error(EFAIL, "announce not supported");
1178         x->announce(c, cb->f, cb->nf);
1179
1180         qunlock(&c->qlock);
1181         if (waserror()) {
1182                 qlock(&c->qlock);
1183                 nexterror();
1184         }
1185         rendez_sleep(&c->cr, announced, c);
1186         qlock(&c->qlock);
1187         poperror();
1188
1189         if (c->cerr[0] != '\0')
1190                 error(EFAIL, c->cerr);
1191 }
1192
1193 /*
1194  *  called by protocol bind routine to set addresses
1195  */
1196 void Fsstdbind(struct conv *c, char *argv[], int argc)
1197 {
1198         switch (argc) {
1199                 default:
1200                         error(EINVAL, "bad args to bind");
1201                 case 2:
1202                         setladdrport(c, argv[1], 0);
1203                         break;
1204         }
1205 }
1206
1207 static void bindctlmsg(struct Proto *x, struct conv *c, struct cmdbuf *cb)
1208 {
1209         if (x->bind == NULL)
1210                 Fsstdbind(c, cb->f, cb->nf);
1211         else
1212                 x->bind(c, cb->f, cb->nf);
1213 }
1214
1215 /* Helper, called by protocols to use the bypass.
1216  *
1217  * This is a bit nasty due to the overall nastiness of #ip.  We need to lock
1218  * before checking the state and hold the qlock throughout, because a concurrent
1219  * closeconv() could tear down the bypass.  Specifically, it could free the
1220  * bypass queues.  The root issue is that conversation lifetimes are not managed
1221  * well.
1222  *
1223  * If we fail, it's our responsibility to consume (free) the block(s). */
1224 void bypass_or_drop(struct conv *cv, struct block *bp)
1225 {
1226         qlock(&cv->qlock);
1227         if (cv->state == Bypass)
1228                 qpass(cv->rq, bp);
1229         else
1230                 freeblist(bp);
1231         qunlock(&cv->qlock);
1232 }
1233
1234 /* Push the block directly to the approprite ipoput function.
1235  *
1236  * It's the protocol's responsibility (and thus ours here) to make sure there is
1237  * at least the right amount of the IP header in the block (ipoput{4,6} assumes
1238  * it has the right amount, and the other protocols account for the IP header in
1239  * their own header).
1240  *
1241  * For the TTL and TOS, we just use the default ones.  If we want, we could look
1242  * into the actual block and see what the user wanted, though we're bypassing
1243  * the protocol layer, not the IP layer. */
1244 static void proto_bypass_kick(void *arg, struct block *bp)
1245 {
1246         struct conv *cv = (struct conv*)arg;
1247         uint8_t vers_nibble;
1248         struct Fs *f;
1249
1250         f = cv->p->f;
1251
1252         bp = pullupblock(bp, 1);
1253         if (!bp)
1254                 error(EINVAL, "Proto bypass unable to pullup a byte!");
1255         vers_nibble = *(uint8_t*)bp->rp & 0xf0;
1256         switch (vers_nibble) {
1257         case IP_VER4:
1258                 bp = pullupblock(bp, IPV4HDR_LEN);
1259                 if (!bp)
1260                         error(EINVAL, "Proto bypass unable to pullup v4 header");
1261                 ipoput4(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1262                 break;
1263         case IP_VER6:
1264                 bp = pullupblock(bp, IPV6HDR_LEN);
1265                 if (!bp)
1266                         error(EINVAL, "Proto bypass unable to pullup v6 header");
1267                 ipoput6(f, bp, FALSE, MAXTTL, DFLTTOS, NULL);
1268                 break;
1269         default:
1270                 error(EINVAL, "Proto bypass block had unknown IP version 0x%x",
1271                       vers_nibble);
1272         }
1273 }
1274
1275 /* Sets up cv for the protocol bypass.  We use different queues for two reasons:
1276  * 1) To be protocol independent.  For instance, TCP and UDP could use very
1277  * different QIO styles.
1278  * 2) To set up our own kick/bypass method.  Note how udpcreate() and here uses
1279  * qbypass() (just blast it out), while TCP uses qopen() with a kick.  TCP still
1280  * follows queuing discipline.
1281  *
1282  * It's like we are our own protocol, the bypass protocol, when it comes to how
1283  * we interact with qio.  The conv still is of the real protocol type (e.g.
1284  * TCP).
1285  *
1286  * Note that we can't free the old queues.  The way #ip works, the queues are
1287  * created when the conv is created, but the conv is never freed.  It's like a
1288  * slab allocator that never frees objects, but just reinitializes them a
1289  * little.
1290  *
1291  * For the queues, we're basically like UDP:
1292  * - We take packets for rq and drop on overflow.
1293  * - rq is also Qmsg, but we also have Qcoalesce, to ignore out zero-len blocks
1294  * - We kick for our outbound (wq) messages.
1295  *
1296  * Note that Qmsg can drop parts of packets.  It's up to the user to read
1297  * enough.  If they didn't read enough, the extra is dropped.  This is similar
1298  * to SOCK_DGRAM and recvfrom().  Minus major changes, there's no nice way to
1299  * get individual messages with read().  Userspace using the bypass will need to
1300  * find out the MTU of the NIC the IP stack is attached to, and make sure to
1301  * read in at least that amount each time. */
1302 static void setup_proto_qio_bypass(struct conv *cv)
1303 {
1304         cv->rq_save = cv->rq;
1305         cv->wq_save = cv->wq;
1306         cv->rq = qopen(BYPASS_QMAX, Qmsg | Qcoalesce, 0, 0);
1307         cv->wq = qbypass(proto_bypass_kick, cv);
1308 }
1309
1310 static void undo_proto_qio_bypass(struct conv *cv)
1311 {
1312         qfree(cv->rq);
1313         qfree(cv->wq);
1314         cv->rq = cv->rq_save;
1315         cv->wq = cv->wq_save;
1316         cv->rq_save = NULL;
1317         cv->wq_save = NULL;
1318 }
1319
1320 void Fsstdbypass(struct conv *cv, char *argv[], int argc)
1321 {
1322         memset(cv->raddr, 0, sizeof(cv->raddr));
1323         cv->rport = 0;
1324         switch (argc) {
1325         case 2:
1326                 setladdrport(cv, argv[1], 1);
1327                 break;
1328         default:
1329                 error(EINVAL, "Bad args (was %d, need 2) to bypass", argc);
1330         }
1331 }
1332
1333 static void bypassctlmsg(struct Proto *x, struct conv *cv, struct cmdbuf *cb)
1334 {
1335         if (!x->bypass)
1336                 error(EFAIL, "Protocol %s does not support bypass", x->name);
1337         /* The protocol needs to set the port (usually by calling Fsstdbypass) and
1338          * then do whatever it needs to make sure it can find the conv again during
1339          * receive (usually by adding to a hash table). */
1340         x->bypass(cv, cb->f, cb->nf);
1341         setup_proto_qio_bypass(cv);
1342         cv->state = Bypass;
1343 }
1344
1345 static void shutdownctlmsg(struct conv *cv, struct cmdbuf *cb)
1346 {
1347         if (cb->nf < 2)
1348                 goto err;
1349         if (!strcmp(cb->f[1], "rd")) {
1350                 qhangup(cv->rq, "shutdown");
1351                 if (cv->p->shutdown)
1352                         cv->p->shutdown(cv, SHUT_RD);
1353         } else if (!strcmp(cb->f[1], "wr")) {
1354                 qhangup(cv->wq, "shutdown");
1355                 if (cv->p->shutdown)
1356                         cv->p->shutdown(cv, SHUT_WR);
1357         } else if (!strcmp(cb->f[1], "rdwr")) {
1358                 qhangup(cv->rq, "shutdown");
1359                 qhangup(cv->wq, "shutdown");
1360                 if (cv->p->shutdown)
1361                         cv->p->shutdown(cv, SHUT_RDWR);
1362         } else {
1363                 goto err;
1364         }
1365         return;
1366 err:
1367         error(EINVAL, "shutdown [rx|tx|rxtx]");
1368 }
1369
1370 static void tosctlmsg(struct conv *c, struct cmdbuf *cb)
1371 {
1372         if (cb->nf < 2)
1373                 c->tos = 0;
1374         else
1375                 c->tos = atoi(cb->f[1]);
1376 }
1377
1378 static void ttlctlmsg(struct conv *c, struct cmdbuf *cb)
1379 {
1380         if (cb->nf < 2)
1381                 c->ttl = MAXTTL;
1382         else
1383                 c->ttl = atoi(cb->f[1]);
1384 }
1385
1386 /* Binds a conversation, as if the user wrote "bind *" into ctl. */
1387 static void autobind(struct conv *cv)
1388 {
1389         ERRSTACK(1);
1390         struct cmdbuf *cb;
1391
1392         cb = parsecmd("bind *", 7);
1393         if (waserror()) {
1394                 kfree(cb);
1395                 nexterror();
1396         }
1397         bindctlmsg(cv->p, cv, cb);
1398         poperror();
1399         kfree(cb);
1400 }
1401
1402 static long ipwrite(struct chan *ch, void *v, long n, int64_t off)
1403 {
1404         ERRSTACK(1);
1405         struct conv *c;
1406         struct Proto *x;
1407         char *p;
1408         struct cmdbuf *cb;
1409         uint8_t ia[IPaddrlen], ma[IPaddrlen];
1410         struct Fs *f;
1411         char *a;
1412
1413         a = v;
1414         f = ipfs[ch->dev];
1415
1416         switch (TYPE(ch->qid)) {
1417                 default:
1418                         error(EPERM, ERROR_FIXME);
1419                 case Qdata:
1420                         x = f->p[PROTO(ch->qid)];
1421                         c = x->conv[CONV(ch->qid)];
1422                         /* connection-less protocols (UDP) can write without manually
1423                          * binding. */
1424                         if (c->lport == 0)
1425                                 autobind(c);
1426                         if (ch->flag & O_NONBLOCK)
1427                                 qwrite_nonblock(c->wq, a, n);
1428                         else
1429                                 qwrite(c->wq, a, n);
1430                         break;
1431                 case Qarp:
1432                         return arpwrite(f, a, n);
1433                 case Qiproute:
1434                         return routewrite(f, ch, a, n);
1435                 case Qlog:
1436                         netlogctl(f, a, n);
1437                         return n;
1438                 case Qndb:
1439                         return ndbwrite(f, a, off, n);
1440                 case Qctl:
1441                         x = f->p[PROTO(ch->qid)];
1442                         c = x->conv[CONV(ch->qid)];
1443                         cb = parsecmd(a, n);
1444
1445                         qlock(&c->qlock);
1446                         if (waserror()) {
1447                                 qunlock(&c->qlock);
1448                                 kfree(cb);
1449                                 nexterror();
1450                         }
1451                         if (cb->nf < 1)
1452                                 error(EFAIL, "short control request");
1453                         if (strcmp(cb->f[0], "connect") == 0)
1454                                 connectctlmsg(x, c, cb, ch);
1455                         else if (strcmp(cb->f[0], "announce") == 0)
1456                                 announcectlmsg(x, c, cb);
1457                         else if (strcmp(cb->f[0], "bind") == 0)
1458                                 bindctlmsg(x, c, cb);
1459                         else if (strcmp(cb->f[0], "bypass") == 0)
1460                                 bypassctlmsg(x, c, cb);
1461                         else if (strcmp(cb->f[0], "shutdown") == 0)
1462                                 shutdownctlmsg(c, cb);
1463                         else if (strcmp(cb->f[0], "ttl") == 0)
1464                                 ttlctlmsg(c, cb);
1465                         else if (strcmp(cb->f[0], "tos") == 0)
1466                                 tosctlmsg(c, cb);
1467                         else if (strcmp(cb->f[0], "ignoreadvice") == 0)
1468                                 c->ignoreadvice = 1;
1469                         else if (strcmp(cb->f[0], "addmulti") == 0) {
1470                                 if (cb->nf < 2)
1471                                         error(EFAIL, "addmulti needs interface address");
1472                                 if (cb->nf == 2) {
1473                                         if (!ipismulticast(c->raddr))
1474                                                 error(EFAIL, "addmulti for a non multicast address");
1475                                         parseip(ia, cb->f[1]);
1476                                         ipifcaddmulti(c, c->raddr, ia);
1477                                 } else {
1478                                         parseip(ma, cb->f[2]);
1479                                         if (!ipismulticast(ma))
1480                                                 error(EFAIL, "addmulti for a non multicast address");
1481                                         parseip(ia, cb->f[1]);
1482                                         ipifcaddmulti(c, ma, ia);
1483                                 }
1484                         } else if (strcmp(cb->f[0], "remmulti") == 0) {
1485                                 if (cb->nf < 2)
1486                                         error(EFAIL, "remmulti needs interface address");
1487                                 if (!ipismulticast(c->raddr))
1488                                         error(EFAIL, "remmulti for a non multicast address");
1489                                 parseip(ia, cb->f[1]);
1490                                 ipifcremmulti(c, c->raddr, ia);
1491                         } else if (x->ctl != NULL) {
1492                                 x->ctl(c, cb->f, cb->nf);
1493                         } else
1494                                 error(EFAIL, "unknown control request");
1495                         qunlock(&c->qlock);
1496                         kfree(cb);
1497                         poperror();
1498         }
1499         return n;
1500 }
1501
1502 static long ipbwrite(struct chan *ch, struct block *bp, uint32_t offset)
1503 {
1504         struct conv *c;
1505         int n;
1506
1507         switch (TYPE(ch->qid)) {
1508                 case Qdata:
1509                         c = chan2conv(ch);
1510                         if (bp->next)
1511                                 bp = concatblock(bp);
1512                         n = BLEN(bp);
1513                         if (ch->flag & O_NONBLOCK)
1514                                 qbwrite_nonblock(c->wq, bp);
1515                         else
1516                                 qbwrite(c->wq, bp);
1517                         return n;
1518                 default:
1519                         return devbwrite(ch, bp, offset);
1520         }
1521 }
1522
1523 static void fire_data_taps(struct conv *conv, int filter)
1524 {
1525         struct fd_tap *tap_i;
1526
1527         /* At this point, we have an event we want to send to our taps (if any).
1528          * The lock protects list integrity and the existence of the tap.
1529          *
1530          * Previously, I thought of using the conv qlock.  That actually breaks, due
1531          * to weird usages of the qlock (someone holds it for a long time, blocking
1532          * the inbound wakeup from etherread4).
1533          *
1534          * I opted for a spinlock for a couple reasons:
1535          * - fire_tap should not block.  ideally it'll be fast too (it's mostly a
1536          * send_event).
1537          * - our callers might not want to block.  A lot of network wakeups will
1538          * come network processes (etherread4) or otherwise unrelated to this
1539          * particular conversation.  I'd rather do something like fire off a KMSG
1540          * than block those.
1541          * - if fire_tap takes a while, holding the lock only slows down other
1542          * events on this *same* conversation, or other tap registration.  not a
1543          * huge deal. */
1544         spin_lock(&conv->tap_lock);
1545         SLIST_FOREACH(tap_i, &conv->data_taps, link)
1546                 fire_tap(tap_i, filter);
1547         spin_unlock(&conv->tap_lock);
1548 }
1549
1550 static void ip_wake_cb(struct queue *q, void *data, int filter)
1551 {
1552         struct conv *conv = (struct conv*)data;
1553
1554         /* For these two, we want to ignore events on the opposite end of the
1555          * queues.  For instance, we want to know when the WQ is writable.  Our
1556          * writes will actually make it readable - we don't want to trigger a tap
1557          * for that.  However, qio doesn't know how/why we are using a queue, or
1558          * even who the ends are (hence the callbacks) */
1559         if ((filter & FDTAP_FILT_READABLE) && (q == conv->wq))
1560                 return;
1561         if ((filter & FDTAP_FILT_WRITABLE) && (q == conv->rq))
1562                 return;
1563         fire_data_taps(conv, filter);
1564 }
1565
1566 int iptapfd(struct chan *chan, struct fd_tap *tap, int cmd)
1567 {
1568         struct conv *conv = chan2conv(chan);
1569         int ret;
1570
1571         #define DEVIP_LEGAL_DATA_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE | \
1572                                        FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |   \
1573                                        FDTAP_FILT_ERROR)
1574         #define DEVIP_LEGAL_LISTEN_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_HANGUP)
1575
1576         switch (TYPE(chan->qid)) {
1577                 case Qdata:
1578                         if (tap->filter & ~DEVIP_LEGAL_DATA_TAPS) {
1579                                 set_errno(ENOSYS);
1580                                 set_errstr("Unsupported #%s data tap %p, must be %p", devname(),
1581                                            tap->filter, DEVIP_LEGAL_DATA_TAPS);
1582                                 return -1;
1583                         }
1584                         spin_lock(&conv->tap_lock);
1585                         switch (cmd) {
1586                                 case (FDTAP_CMD_ADD):
1587                                         if (SLIST_EMPTY(&conv->data_taps)) {
1588                                                 qio_set_wake_cb(conv->rq, ip_wake_cb, conv);
1589                                                 qio_set_wake_cb(conv->wq, ip_wake_cb, conv);
1590                                         }
1591                                         SLIST_INSERT_HEAD(&conv->data_taps, tap, link);
1592                                         ret = 0;
1593                                         break;
1594                                 case (FDTAP_CMD_REM):
1595                                         SLIST_REMOVE(&conv->data_taps, tap, fd_tap, link);
1596                                         if (SLIST_EMPTY(&conv->data_taps)) {
1597                                                 qio_set_wake_cb(conv->rq, 0, conv);
1598                                                 qio_set_wake_cb(conv->wq, 0, conv);
1599                                         }
1600                                         ret = 0;
1601                                         break;
1602                                 default:
1603                                         set_errno(ENOSYS);
1604                                         set_errstr("Unsupported #%s data tap command %p",
1605                                                    devname(), cmd);
1606                                         ret = -1;
1607                         }
1608                         spin_unlock(&conv->tap_lock);
1609                         return ret;
1610                 case Qlisten:
1611                         if (tap->filter & ~DEVIP_LEGAL_LISTEN_TAPS) {
1612                                 set_errno(ENOSYS);
1613                                 set_errstr("Unsupported #%s listen tap %p, must be %p",
1614                                            devname(), tap->filter, DEVIP_LEGAL_LISTEN_TAPS);
1615                                 return -1;
1616                         }
1617                         spin_lock(&conv->tap_lock);
1618                         switch (cmd) {
1619                                 case (FDTAP_CMD_ADD):
1620                                         SLIST_INSERT_HEAD(&conv->listen_taps, tap, link);
1621                                         ret = 0;
1622                                         break;
1623                                 case (FDTAP_CMD_REM):
1624                                         SLIST_REMOVE(&conv->listen_taps, tap, fd_tap, link);
1625                                         ret = 0;
1626                                         break;
1627                                 default:
1628                                         set_errno(ENOSYS);
1629                                         set_errstr("Unsupported #%s listen tap command %p",
1630                                                    devname(), cmd);
1631                                         ret = -1;
1632                         }
1633                         spin_unlock(&conv->tap_lock);
1634                         return ret;
1635                 default:
1636                         set_errno(ENOSYS);
1637                         set_errstr("Can't tap #%s file type %d", devname(),
1638                                    TYPE(chan->qid));
1639                         return -1;
1640         }
1641 }
1642
1643 struct dev ipdevtab __devtab = {
1644         .name = "ip",
1645
1646         .reset = ipreset,
1647         .init = ipinit,
1648         .shutdown = devshutdown,
1649         .attach = ipattach,
1650         .walk = ipwalk,
1651         .stat = ipstat,
1652         .open = ipopen,
1653         .create = devcreate,
1654         .close = ipclose,
1655         .read = ipread,
1656         .bread = ipbread,
1657         .write = ipwrite,
1658         .bwrite = ipbwrite,
1659         .remove = devremove,
1660         .wstat = ipwstat,
1661         .power = devpower,
1662         .chaninfo = ipchaninfo,
1663         .tapfd = iptapfd,
1664 };
1665
1666 int Fsproto(struct Fs *f, struct Proto *p)
1667 {
1668         if (f->np >= Maxproto)
1669                 return -1;
1670
1671         qlock_init(&p->qlock);
1672         p->f = f;
1673
1674         if (p->ipproto > 0) {
1675                 if (f->t2p[p->ipproto] != NULL)
1676                         return -1;
1677                 f->t2p[p->ipproto] = p;
1678         }
1679
1680         p->qid.type = QTDIR;
1681         p->qid.path = QID(f->np, 0, Qprotodir);
1682         p->conv = kzmalloc(sizeof(struct conv *) * (p->nc + 1), 0);
1683         if (p->conv == NULL)
1684                 panic("Fsproto");
1685
1686         p->x = f->np;
1687         p->nextport = 0;
1688         p->nextrport = 600;
1689         f->p[f->np++] = p;
1690
1691         return 0;
1692 }
1693
1694 /*
1695  *  return true if this protocol is
1696  *  built in
1697  */
1698 int Fsbuiltinproto(struct Fs *f, uint8_t proto)
1699 {
1700         return f->t2p[proto] != NULL;
1701 }
1702
1703 /*
1704  *  called with protocol locked
1705  */
1706 struct conv *Fsprotoclone(struct Proto *p, char *user)
1707 {
1708         struct conv *c, **pp, **ep;
1709
1710 retry:
1711         c = NULL;
1712         ep = &p->conv[p->nc];
1713         for (pp = p->conv; pp < ep; pp++) {
1714                 c = *pp;
1715                 if (c == NULL) {
1716                         c = kzmalloc(sizeof(struct conv), 0);
1717                         if (c == NULL)
1718                                 error(ENOMEM,
1719                                       "conv kzmalloc(%d, 0) failed in Fsprotoclone",
1720                                       sizeof(struct conv));
1721                         qlock_init(&c->qlock);
1722                         qlock_init(&c->listenq);
1723                         rendez_init(&c->cr);
1724                         rendez_init(&c->listenr);
1725                         SLIST_INIT(&c->data_taps);      /* already = 0; set to be futureproof */
1726                         SLIST_INIT(&c->listen_taps);
1727                         spinlock_init(&c->tap_lock);
1728                         qlock(&c->qlock);
1729                         c->p = p;
1730                         c->x = pp - p->conv;
1731                         if (p->ptclsize != 0) {
1732                                 c->ptcl = kzmalloc(p->ptclsize, 0);
1733                                 if (c->ptcl == NULL) {
1734                                         kfree(c);
1735                                         error(ENOMEM,
1736                                               "ptcl kzmalloc(%d, 0) failed in Fsprotoclone",
1737                                               p->ptclsize);
1738                                 }
1739                         }
1740                         *pp = c;
1741                         p->ac++;
1742                         c->eq = qopen(1024, Qmsg, 0, 0);
1743                         (*p->create) (c);
1744                         assert(c->rq && c->wq);
1745                         break;
1746                 }
1747                 if (canqlock(&c->qlock)) {
1748                         /*
1749                          *  make sure both processes and protocol
1750                          *  are done with this Conv
1751                          */
1752                         if (c->inuse == 0 && (p->inuse == NULL || (*p->inuse) (c) == 0))
1753                                 break;
1754
1755                         qunlock(&c->qlock);
1756                 }
1757         }
1758         if (pp >= ep) {
1759                 if (p->gc != NULL && (*p->gc) (p))
1760                         goto retry;
1761                 return NULL;
1762         }
1763
1764         c->inuse = 1;
1765         kstrdup(&c->owner, user);
1766         c->perm = 0660;
1767         c->state = Idle;
1768         ipmove(c->laddr, IPnoaddr);
1769         ipmove(c->raddr, IPnoaddr);
1770         c->r = NULL;
1771         c->rgen = 0;
1772         c->lport = 0;
1773         c->rport = 0;
1774         c->restricted = 0;
1775         c->ttl = MAXTTL;
1776         c->tos = DFLTTOS;
1777         qreopen(c->rq);
1778         qreopen(c->wq);
1779         qreopen(c->eq);
1780
1781         qunlock(&c->qlock);
1782         return c;
1783 }
1784
1785 int Fsconnected(struct conv *c, char *msg)
1786 {
1787         if (msg != NULL && *msg != '\0')
1788                 strlcpy(c->cerr, msg, sizeof(c->cerr));
1789
1790         switch (c->state) {
1791                 case Announcing:
1792                         c->state = Announced;
1793                         break;
1794
1795                 case Connecting:
1796                         c->state = Connected;
1797                         break;
1798         }
1799
1800         rendez_wakeup(&c->cr);
1801         /* The user can poll or tap the connection status via Qdata */
1802         fire_data_taps(c, FDTAP_FILT_WRITABLE);
1803         return 0;
1804 }
1805
1806 struct Proto *Fsrcvpcol(struct Fs *f, uint8_t proto)
1807 {
1808         if (f->ipmux)
1809                 return f->ipmux;
1810         else
1811                 return f->t2p[proto];
1812 }
1813
1814 struct Proto *Fsrcvpcolx(struct Fs *f, uint8_t proto)
1815 {
1816         return f->t2p[proto];
1817 }
1818
1819 static void fire_listener_taps(struct conv *conv)
1820 {
1821         struct fd_tap *tap_i;
1822         if (SLIST_EMPTY(&conv->listen_taps))
1823                 return;
1824         spin_lock(&conv->tap_lock);
1825         SLIST_FOREACH(tap_i, &conv->listen_taps, link)
1826                 fire_tap(tap_i, FDTAP_FILT_READABLE);
1827         spin_unlock(&conv->tap_lock);
1828 }
1829
1830 /*
1831  *  called with protocol locked
1832  */
1833 struct conv *Fsnewcall(struct conv *c, uint8_t * raddr, uint16_t rport,
1834                                            uint8_t * laddr, uint16_t lport, uint8_t version)
1835 {
1836         struct conv *nc;
1837         struct conv **l;
1838         int i;
1839
1840         qlock(&c->qlock);
1841         i = 0;
1842         for (l = &c->incall; *l; l = &(*l)->next)
1843                 i++;
1844         if (i >= Maxincall) {
1845                 qunlock(&c->qlock);
1846                 return NULL;
1847         }
1848
1849         /* find a free conversation */
1850         nc = Fsprotoclone(c->p, network);
1851         if (nc == NULL) {
1852                 qunlock(&c->qlock);
1853                 return NULL;
1854         }
1855         ipmove(nc->raddr, raddr);
1856         nc->rport = rport;
1857         ipmove(nc->laddr, laddr);
1858         nc->lport = lport;
1859         nc->next = NULL;
1860         *l = nc;
1861         nc->state = Connected;
1862         nc->ipversion = version;
1863
1864         qunlock(&c->qlock);
1865
1866         rendez_wakeup(&c->listenr);
1867         fire_listener_taps(c);
1868
1869         return nc;
1870 }
1871
1872 static long ndbwrite(struct Fs *f, char *a, uint32_t off, int n)
1873 {
1874         if (off > strlen(f->ndb))
1875                 error(EIO, ERROR_FIXME);
1876         if (off + n >= sizeof(f->ndb) - 1)
1877                 error(EIO, ERROR_FIXME);
1878         memmove(f->ndb + off, a, n);
1879         f->ndb[off + n] = 0;
1880         f->ndbvers++;
1881         f->ndbmtime = seconds();
1882         return n;
1883 }
1884
1885 uint32_t scalednconv(void)
1886 {
1887         //if(conf.npage*BY2PG >= 128*MB)
1888         return Nchans * 4;
1889         //  return Nchans;
1890 }