Add #eventfd [1/2]
author Barret Rhoden <brho@cs.berkeley.edu>
Tue, 22 Sep 2015 15:23:40 +0000 (11:23 -0400)
committer Barret Rhoden <brho@cs.berkeley.edu>
Mon, 28 Sep 2015 19:14:00 +0000 (15:14 -0400)
This is a kernel device for eventfd().  It differs slightly from Linux
in that the kernel interface is a string instead of a host-endian u64.

You can play with the raw interface by attaching an instance of #eventfd
to the namespace (just like #pipe).  For example:

/ $ bind \#eventfd /prog
bind #eventfd -> /prog flag 0
( attach an eventfd in counter mode )
/ $ echo 33 > /prog/efd
/ $ read_once /prog/efd
33/ $
( put in and extract 33 (decimal) )
/ $ read_once /prog/efd &
( read in the background.  will exit after one read syscall. )
/ $
/ $ echo 33 > /prog/efd
33/ $
( woke up the blocked read_once )

kern/drivers/dev/Kbuild
kern/drivers/dev/eventfd.c [new file with mode: 0644]

index 6a40e30..57fd936 100644 (file)
@@ -3,6 +3,7 @@ obj-y                                           += alarm.o
 obj-y                                          += coreboot.o
 obj-y                                          += cons.o
 obj-y                                          += ether.o
+obj-y                                          += eventfd.o
 obj-y                                          += kprof.o
 obj-y                                          += mnt.o
 #obj-y                                         += pci.o
diff --git a/kern/drivers/dev/eventfd.c b/kern/drivers/dev/eventfd.c
new file mode 100644 (file)
index 0000000..92a7194
--- /dev/null
@@ -0,0 +1,340 @@
+/* Copyright (c) 2015 Google Inc
+ * Barret Rhoden <brho@cs.berkeley.edu>
+ * See LICENSE for details.
+ *
+ * #eventfd device, the kernel-side implementation of man 2 eventfd.
+ *
+ * Unlike the Linux interface, which takes host-endian u64s, we read and write
+ * strings.  It's a little slower, but it maintains the distributed-system
+ * nature of Plan 9 devices. */
+
+#include <ns.h>
+#include <kmalloc.h>
+#include <kref.h>
+#include <atomic.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <error.h>
+#include <sys/queue.h>
+#include <fdtap.h>
+
+struct dev efd_devtab;
+
+/* Returns the name stored in this device's devtab entry.  Used when attaching
+ * and when building error strings. */
+static char *devname(void)
+{
+       return efd_devtab.name;
+}
+
+/* Qid paths of the files served by each #eventfd instance.  The values are also
+ * used to index efd_dir[] (see efd_chaninfo). */
+enum {
+       Qdir,
+       Qctl,
+       Qefd,
+};
+
+/* Directory table passed to the generic dev helpers (devwalk, devstat, devopen,
+ * devdirread): the directory itself, the ctl file, and the efd counter file. */
+static struct dirtab efd_dir[] = {
+       {".", {Qdir, 0, QTDIR}, 0, DMDIR | 0555},
+       {"ctl", {Qctl, 0, QTFILE}, 0, 0666},
+       {"efd", {Qefd, 0, QTFILE}, 8, 0666},
+};
+
+/* EFD_SEMAPHORE is a bit in eventfd->flags that selects semaphore-style reads
+ * (each read takes 1).  EFD_MAX_VAL is the largest value the counter may hold;
+ * writes that would exceed it block or fail. */
+enum {
+       EFD_SEMAPHORE =                         1 << 0,
+       EFD_MAX_VAL =                           (unsigned long)(-2), // i.e. 0xfffffffffffffffe
+};
+
+
+/* State of one attached #eventfd instance, reached through chan->aux by every
+ * chan of that instance.
+ *   flags:      EFD_* mode bits (EFD_SEMAPHORE)
+ *   counter:    the eventfd value, read and updated with atomic ops
+ *   fd_taps:    list of registered FD taps, modified under tap_lock
+ *   tap_lock:   protects fd_taps
+ *   rv_readers: readers sleep here while the counter is 0
+ *   rv_writers: writers sleep here while their add does not fit
+ *   refcnt:     one reference per distinct chan; efd_release is its release
+ *               method */
+struct eventfd {
+       int                                             flags;
+       atomic_t                                        counter;
+       struct fdtap_slist                      fd_taps;
+       spinlock_t                                      tap_lock;
+       struct rendez                           rv_readers;
+       struct rendez                           rv_writers;
+       struct kref                                     refcnt;
+};
+
+
+/* Release method registered with kref_init() in efd_attach().  Recovers the
+ * eventfd that embeds the given kref and frees it. */
+static void efd_release(struct kref *kref)
+{
+       struct eventfd *dead_efd;
+
+       dead_efd = container_of(kref, struct eventfd, refcnt);
+       /* Chans are what hold our references, and any FD with a tap should have
+        * been closed before the last chan went away, so no tap may remain. */
+       assert(SLIST_EMPTY(&dead_efd->fd_taps));
+       kfree(dead_efd);
+}
+
+/* Attach handler: builds a brand new eventfd instance and returns the root
+ * (directory) chan that refers to it through chan->aux.
+ *
+ * spec: attach specifier.  The string "sem" starts the instance in semaphore
+ * mode, which saves the caller a separate configuration step.
+ *
+ * The returned chan holds the instance's initial reference. */
+static struct chan *efd_attach(char *spec)
+{
+       struct chan *root;
+       struct eventfd *new_efd;
+
+       root = devattach(devname(), spec);
+       new_efd = kzmalloc(sizeof(struct eventfd), KMALLOC_WAIT);
+
+       /* Chans come from attach and from walk; each of them (mostly) carries
+        * one counted reference on the object.  This is the one for attach. */
+       kref_init(&new_efd->refcnt, efd_release, 1);
+       rendez_init(&new_efd->rv_readers);
+       rendez_init(&new_efd->rv_writers);
+       spinlock_init(&new_efd->tap_lock);
+       SLIST_INIT(&new_efd->fd_taps);
+
+       /* The qid carries nothing that identifies this particular eventfd; the
+        * aux pointer does that job.  A debugging ID could go in the path, like
+        * pipe does. */
+       mkqid(&root->qid, Qdir, 0, QTDIR);
+       root->aux = new_efd;
+
+       if (strcmp(spec, "sem") == 0)
+               new_efd->flags |= EFD_SEMAPHORE;
+       return root;
+}
+
+/* Walk handler: defers to devwalk() over efd_dir and, when the walk produced a
+ * new distinct chan, takes a reference on the shared eventfd for it.
+ *
+ * Returns whatever devwalk() returned (possibly NULL). */
+static struct walkqid *efd_walk(struct chan *c, struct chan *nc, char **name,
+                                int nname)
+{
+       struct eventfd *shared_efd = c->aux;
+       struct walkqid *result;
+
+       result = devwalk(c, nc, name, nname, efd_dir, ARRAY_SIZE(efd_dir), devgen);
+       /* No walkqid at all: nothing was found, so no chan was created. */
+       if (result == NULL)
+               return result;
+       /* A walkqid without a clone is only a partial success: we did not reach
+        * the end of the path, and the "big walk" does not count it. */
+       if (result->clone == NULL)
+               return result;
+       /* If we were called with nc == c, the "new" chan is the original one.
+        * There is still only one chan, so there is nothing more to count. */
+       if (result->clone == c)
+               return result;
+       /* Besides attach, walk is the other source of distinct chans for this
+        * device.  Every such chan is eventually closed, and close drops one ref
+        * on its aux.  All chans of one *instance* of eventfd share the same efd
+        * and own exactly one ref each, so the ref count equals the number of
+        * distinct chans in the instance.  Copies of a chan pointer (e.g. FD dup)
+        * are tracked by the *chan's* own refcnt, not by ours. */
+       kref_get(&shared_efd->refcnt, 1);
+       return result;
+}
+
+/* Stat handler: forwards to devstat() using this device's directory table.
+ * In the future, we could use stat / wstat to get and set O_NONBLOCK */
+static int efd_stat(struct chan *c, uint8_t * db, int n)
+{
+       return devstat(c, db, n, efd_dir, ARRAY_SIZE(efd_dir), devgen);
+}
+
+/* Open handler: forwards to devopen() using this device's directory table. */
+static struct chan *efd_open(struct chan *c, int omode)
+{
+       return devopen(c, omode, efd_dir, ARRAY_SIZE(efd_dir), devgen);
+}
+
+/* Close handler: drops the chan's reference on the shared eventfd.  The refcnt
+ * was set up with efd_release as its release method (see efd_attach). */
+static void efd_close(struct chan *c)
+{
+       struct eventfd *efd = c->aux;
+       /* Here's where we put the ref from attach and successful walks */
+       kref_put(&efd->refcnt);
+}
+
+/* Calls fire_tap() with the given filter (FDTAP_FILT_*) on every tap registered
+ * on this eventfd.  The emptiness check happens before tap_lock is taken, so
+ * the no-taps case never touches the lock. */
+static void efd_fire_taps(struct eventfd *efd, int filter)
+{
+       struct fd_tap *tap_i;
+       if (SLIST_EMPTY(&efd->fd_taps))
+               return;
+       /* We're not expecting many FD taps, so it's not worth splitting readers
+        * from writers or anything like that.
+        * TODO: (RCU) Locking to protect the list and the tap's existence. */
+       spin_lock(&efd->tap_lock);
+       SLIST_FOREACH(tap_i, &efd->fd_taps, link)
+               fire_tap(tap_i, filter);
+       spin_unlock(&efd->tap_lock);
+}
+
+/* Condition passed to rendez_sleep() by readers: true once the counter is
+ * non-zero.  arg is the struct eventfd. */
+static int has_counts(void *arg)
+{
+       struct eventfd *efd = arg;
+       return atomic_read(&efd->counter) != 0;
+}
+
+/* The heart of reading an eventfd.
+ *
+ * Loops until it manages to take a value out of the counter with atomic_cas.
+ * In semaphore mode (EFD_SEMAPHORE) it takes 1 and leaves the rest; otherwise
+ * it takes the whole count and resets the counter to 0.  While the counter is
+ * 0, a chan with O_NONBLOCK set gets EAGAIN through error(); any other chan
+ * sleeps on rv_readers.  After a successful take, sleeping writers are woken
+ * and the WRITABLE taps are fired.
+ *
+ * Returns the amount taken. */
+static unsigned long efd_read_efd(struct eventfd *efd, struct chan *c)
+{
+       unsigned long old_count, new_count, ret;
+       while (1) {
+               old_count = atomic_read(&efd->counter);
+               if (!old_count) {
+                       if (c->flag & O_NONBLOCK) {
+                               set_errno(EAGAIN);
+                               /* presumably error() does not return - TODO confirm */
+                               error("Would block on #%s read", devname());
+                       }
+                       rendez_sleep(&efd->rv_readers, has_counts, efd);
+               } else {
+                       if (efd->flags & EFD_SEMAPHORE) {
+                               new_count = old_count - 1;
+                               ret = 1;
+                       } else {
+                               new_count = 0;
+                               ret = old_count;
+                       }
+                       /* A failed CAS means the counter changed since we read it;
+                        * go around and try again with a fresh value. */
+                       if (atomic_cas(&efd->counter, old_count, new_count))
+                               goto success;
+               }
+       }
+success:
+       rendez_wakeup(&efd->rv_writers);
+       efd_fire_taps(efd, FDTAP_FILT_WRITABLE);
+       return ret;
+}
+
+/* Read handler for #eventfd files.
+ *
+ * Qdir: directory listing via devdirread().
+ * Qctl: the instance's flags, formatted by readnum().
+ * Qefd: a value taken from the counter by efd_read_efd(), formatted by
+ *       readnum().  The chan offset is ignored.
+ *
+ * NOTE(review): for Qefd the counter is consumed while the arguments are being
+ * evaluated, i.e. before readnum() gets to look at n. */
+static long efd_read(struct chan *c, void *ubuf, long n, int64_t offset)
+{
+       struct eventfd *efd = c->aux;
+
+       switch (c->qid.path) {
+               case Qdir:
+                       return devdirread(c, ubuf, n, efd_dir, ARRAY_SIZE(efd_dir),
+                                                         devgen);
+               case Qctl:
+                       return readnum(offset, ubuf, n, efd->flags, NUMSIZE32);
+               case Qefd:
+                       /* ignoring the chan offset for Qefd */
+                       return readnum(0, ubuf, n, efd_read_efd(efd, c),
+                                                  NUMSIZE64);
+               default:
+                       panic("Bad Qid %p!", c->qid.path);
+       }
+       return -1;
+}
+
+/* Condition passed to rendez_sleep() by writers: true whenever the counter is
+ * not at EFD_MAX_VAL.  This only says that *some* room exists, not that a
+ * particular add fits; the caller rechecks.  arg is the struct eventfd. */
+static int has_room(void *arg)
+{
+       struct eventfd *efd = arg;
+       return atomic_read(&efd->counter) != EFD_MAX_VAL;
+}
+
+/* The heart of writing an eventfd.
+ *
+ * Adds add_to to the counter with atomic_cas, retrying if the counter changed
+ * in the meantime.  The counter is never allowed to exceed EFD_MAX_VAL.  If
+ * the add does not fit, a chan with O_NONBLOCK set gets EAGAIN through
+ * error(); any other chan sleeps on rv_writers and then rechecks.  After a
+ * successful add, sleeping readers are woken and the READABLE taps are fired.
+ *
+ * efd:    the eventfd to add to
+ * add_to: amount to add
+ * c:      the chan doing the write; only its O_NONBLOCK flag is consulted */
+static void efd_write_efd(struct eventfd *efd, unsigned long add_to,
+                          struct chan *c)
+{
+       unsigned long old_count, new_count;
+       while (1) {
+               old_count = atomic_read(&efd->counter);
+               /* Overflow-safe form of "old_count + add_to > EFD_MAX_VAL".  The
+                * sum itself can wrap around for a large add_to and then look small
+                * enough to pass, which would silently shrink the counter.  The
+                * counter never exceeds EFD_MAX_VAL, so the subtraction is safe. */
+               if (add_to > (unsigned long)EFD_MAX_VAL - old_count) {
+                       if (c->flag & O_NONBLOCK) {
+                               set_errno(EAGAIN);
+                               error("Would block on #%s write", devname());
+                       }
+                       /* NOTE(review): has_room only checks counter != EFD_MAX_VAL,
+                        * so this may return while add_to still does not fit; the
+                        * loop then simply rechecks. */
+                       rendez_sleep(&efd->rv_writers, has_room, efd);
+               } else {
+                       new_count = old_count + add_to;
+                       if (atomic_cas(&efd->counter, old_count, new_count))
+                               goto success;
+               }
+       }
+success:
+       rendez_wakeup(&efd->rv_readers);
+       efd_fire_taps(efd, FDTAP_FILT_READABLE);
+}
+
+/* Write handler for #eventfd files.
+ *
+ * Qctl: no commands are supported; always raises an error.
+ * Qefd: ubuf holds a number as text (parsed by strtoul with base 0), which is
+ *       added to the counter through efd_write_efd().  The chan offset is
+ *       ignored.  The text must be shorter than NUMSIZE64 bytes so that a
+ *       terminating NUL still fits in the local buffer; longer writes fail
+ *       with EINVAL.  A value of (unsigned long)-1 is rejected.
+ *
+ * Returns n on success. */
+static long efd_write(struct chan *c, void *ubuf, long n, int64_t offset)
+{
+       struct eventfd *efd = c->aux;
+       unsigned long write_val;
+       char num64[NUMSIZE64];
+
+       switch (c->qid.path) {
+               case Qctl:
+                       /* If we want to allow runtime changing of settings, we can do it
+                        * here. */
+                       error("No #%s ctl commands supported", devname());
+                       break;
+               case Qefd:
+                       /* We want to give strtoul a null-terminated buf (can't handle
+                        * arbitrary user strings).  Ignoring the chan offset too.
+                        *
+                        * The last byte of num64 is reserved for the NUL: accepting
+                        * n == sizeof(num64) would store it one past the end of the
+                        * array.  A negative n converts to a huge unsigned value in
+                        * this comparison and is rejected as well. */
+                       if (n >= sizeof(num64)) {
+                               set_errno(EINVAL);
+                               error("attempted to write %d chars, max %d", n,
+                                     sizeof(num64) - 1);
+                       }
+                       memcpy(num64, ubuf, n);
+                       num64[n] = 0;   /* enforce trailing 0 */
+                       write_val = strtoul(num64, 0, 0);
+                       if (write_val == (unsigned long)(-1))
+                               error("Eventfd write must not be -1");
+                       efd_write_efd(efd, write_val, c);
+                       break;
+               default:
+                       panic("Bad Qid %p!", c->qid.path);
+       }
+       return n;
+}
+
+/* Formats a one-line description of the chan into ret (at most ret_l bytes):
+ * the name of the file it refers to, the instance's flags, and the current
+ * counter value.  The qid path is used directly as an index into efd_dir[].
+ * Returns ret. */
+static char *efd_chaninfo(struct chan *c, char *ret, size_t ret_l)
+{
+       struct eventfd *efd = c->aux;
+
+       snprintf(ret, ret_l, "QID type %s, flags %p, counter %p",
+                efd_dir[c->qid.path].name, efd->flags, atomic_read(&efd->counter));
+       return ret;
+}
+
+/* FD tap handler: adds a tap to, or removes it from, this eventfd's tap list.
+ *
+ * Only the efd file (Qefd) can be tapped, and only with filters contained in
+ * EFD_LEGAL_TAPS.  The list is modified under tap_lock.
+ *
+ * cmd: FDTAP_CMD_ADD or FDTAP_CMD_REM
+ *
+ * Returns 0 on success.  On failure returns -1 with errno set to ENOSYS and an
+ * errstr describing the problem. */
+static int efd_tapfd(struct chan *c, struct fd_tap *tap, int cmd)
+{
+       struct eventfd *efd = c->aux;
+       int ret;
+
+       /* We don't actually support HANGUP, but epoll implies it */
+       #define EFD_LEGAL_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE |        \
+                               FDTAP_FILT_HANGUP)
+
+       switch (c->qid.path) {
+               case Qefd:
+                       /* Reject the request if it asks for any filter bit outside
+                        * the legal set. */
+                       if (tap->filter & ~EFD_LEGAL_TAPS) {
+                               set_errno(ENOSYS);
+                               set_errstr("Unsupported #%s tap, must be %p", devname(),
+                                          EFD_LEGAL_TAPS);
+                               return -1;
+                       }
+                       spin_lock(&efd->tap_lock);
+                       switch (cmd) {
+                               case (FDTAP_CMD_ADD):
+                                       SLIST_INSERT_HEAD(&efd->fd_taps, tap, link);
+                                       ret = 0;
+                                       break;
+                               case (FDTAP_CMD_REM):
+                                       SLIST_REMOVE(&efd->fd_taps, tap, fd_tap, link);
+                                       ret = 0;
+                                       break;
+                               default:
+                                       set_errno(ENOSYS);
+                                       set_errstr("Unsupported #%s tap command %p",
+                                                  devname(), cmd);
+                                       ret = -1;
+                       }
+                       spin_unlock(&efd->tap_lock);
+                       return ret;
+               default:
+                       set_errno(ENOSYS);
+                       set_errstr("Can't tap #%s file type %d", devname(),
+                                  c->qid.path);
+                       return -1;
+       }
+}
+
+/* Device table entry for #eventfd.  Operations that have an efd_* function
+ * above use it; the rest point at the generic dev* implementations. */
+struct dev efd_devtab __devtab = {
+       .name = "eventfd",
+       .reset = devreset,
+       .init = devinit,
+       .shutdown = devshutdown,
+       .attach = efd_attach,
+       .walk = efd_walk,
+       .stat = efd_stat,
+       .open = efd_open,
+       .create = devcreate,
+       .close = efd_close,
+       .read = efd_read,
+       .bread = devbread,
+       .write = efd_write,
+       .bwrite = devbwrite,
+       .remove = devremove,
+       .wstat = devwstat,
+       .power = devpower,
+       .chaninfo = efd_chaninfo,
+       .tapfd = efd_tapfd,
+};