akaros/kern/drivers/dev/eventfd.c
<<
>>
Prefs
   1/* Copyright (c) 2015 Google Inc
   2 * Barret Rhoden <brho@cs.berkeley.edu>
   3 * See LICENSE for details.
   4 *
   5 * #eventfd device, the kernel-side implementation of man 2 eventfd.
   6 *
   7 * Unlike the Linux interface, which takes host-endian u64s, we read and write
   8 * strings.  It's a little slower, but it maintains the distributed-system
   9 * nature of Plan 9 devices. */
  10
  11#include <ns.h>
  12#include <kmalloc.h>
  13#include <kref.h>
  14#include <atomic.h>
  15#include <string.h>
  16#include <stdio.h>
  17#include <assert.h>
  18#include <error.h>
  19#include <sys/queue.h>
  20#include <fdtap.h>
  21#include <syscall.h>
  22
  23struct dev efd_devtab;
  24
  25static char *devname(void)
  26{
  27        return efd_devtab.name;
  28}
  29
  30enum {
  31        Qdir,
  32        Qctl,
  33        Qefd,
  34};
  35
  36static struct dirtab efd_dir[] = {
  37        {".", {Qdir, 0, QTDIR}, 0, DMDIR | 0555},
  38        {"ctl", {Qctl, 0, QTFILE}, 0, 0666},
  39        {"efd", {Qefd, 0, QTFILE}, 8, 0666},
  40};
  41
  42enum {
  43        EFD_SEMAPHORE =         1 << 0,
  44        EFD_MAX_VAL =           (unsigned long)(-2), // i.e. 0xfffffffffffffffe
  45};
  46
  47
  48struct eventfd {
  49        int                             flags;
  50        atomic_t                        counter;
  51        struct fdtap_slist              fd_taps;
  52        spinlock_t                      tap_lock;
  53        struct rendez                   rv_readers;
  54        struct rendez                   rv_writers;
  55        struct kref                     refcnt;
  56};
  57
  58
  59static void efd_release(struct kref *kref)
  60{
  61        struct eventfd *efd = container_of(kref, struct eventfd, refcnt);
  62
  63        /* All FDs with taps must be closed before we decreffed all the chans */
  64        assert(SLIST_EMPTY(&efd->fd_taps));
  65        kfree(efd);
  66}
  67
  68static struct chan *efd_attach(char *spec)
  69{
  70        struct chan *c;
  71        struct eventfd *efd;
  72
  73        c = devattach(devname(), spec);
  74        efd = kzmalloc(sizeof(struct eventfd), MEM_WAIT);
  75        SLIST_INIT(&efd->fd_taps);
  76        spinlock_init(&efd->tap_lock);
  77        rendez_init(&efd->rv_readers);
  78        rendez_init(&efd->rv_writers);
  79        /* Attach and walk are the two sources of chans.  Each returns a
  80         * refcnt'd object, for the most part. */
  81        kref_init(&efd->refcnt, efd_release, 1);
  82        /* nothing special in the qid to ID this eventfd.  the main thing is the
  83         * aux.  we could put a debugging ID in the path like pipe. */
  84        mkqid(&c->qid, Qdir, 0, QTDIR);
  85        c->aux = efd;
  86        /* just to be fancy and remove a syscall, if they pass spec == "sem",
  87         * then we'll treat them as being in semaphore mode. */
  88        if (!strcmp(spec, "sem"))
  89                efd->flags |= EFD_SEMAPHORE;
  90        return c;
  91}
  92
  93static struct walkqid *efd_walk(struct chan *c, struct chan *nc, char **name,
  94                                unsigned int nname)
  95{
  96        struct walkqid *wq;
  97        struct eventfd *efd = c->aux;
  98
  99        wq = devwalk(c, nc, name, nname, efd_dir, ARRAY_SIZE(efd_dir), devgen);
 100        /* Walk is a source of a distinct chan from this device.  The other
 101         * source is attach.  Once created, these chans will eventually be
 102         * closed, and when they close, they will decref their aux, efd.  All
 103         * chans within this *instance* of eventfd share the same efd.  Each one
 104         * will have one refcnt.  Each chan may also have several copies of its
 105         * pointer out there (e.g. FD dup), all of which have their own *chan*
 106         * refcnt.
 107         *
 108         * All of the above applies on successful walks that found all nname
 109         * parts of the path.  A mid-success is wq: we got something.  wq->clone
 110         * means we got to the end and the "big walk" considers this a success.
 111         *
 112         * There is a slight chance the new chan is the same as our original
 113         * chan (if nc == c when we're called).  In which case, there's only one
 114         * chan.  The number of refs on efd == the number of distinct chans
 115         * within this instance of #eventfd. */
 116        if (wq != NULL && wq->clone != NULL && wq->clone != c)
 117                kref_get(&efd->refcnt, 1);
 118        return wq;
 119}
 120
 121/* In the future, we could use stat / wstat to get and set O_NONBLOCK */
 122static size_t efd_stat(struct chan *c, uint8_t *db, size_t n)
 123{
 124        return devstat(c, db, n, efd_dir, ARRAY_SIZE(efd_dir), devgen);
 125}
 126
 127static struct chan *efd_open(struct chan *c, int omode)
 128{
 129        return devopen(c, omode, efd_dir, ARRAY_SIZE(efd_dir), devgen);
 130}
 131
 132static void efd_close(struct chan *c)
 133{
 134        struct eventfd *efd = c->aux;
 135
 136        /* Here's where we put the ref from attach and successful walks */
 137        kref_put(&efd->refcnt);
 138}
 139
 140static void efd_fire_taps(struct eventfd *efd, int filter)
 141{
 142        struct fd_tap *tap_i;
 143
 144        if (SLIST_EMPTY(&efd->fd_taps))
 145                return;
 146        /* We're not expecting many FD taps, so it's not worth splitting readers
 147         * from writers or anything like that.
 148         * TODO: (RCU) Locking to protect the list and the tap's existence. */
 149        spin_lock(&efd->tap_lock);
 150        SLIST_FOREACH(tap_i, &efd->fd_taps, link)
 151                fire_tap(tap_i, filter);
 152        spin_unlock(&efd->tap_lock);
 153}
 154
 155static int has_counts(void *arg)
 156{
 157        struct eventfd *efd = arg;
 158
 159        return atomic_read(&efd->counter) != 0;
 160}
 161
 162/* The heart of reading an eventfd */
 163static unsigned long efd_read_efd(struct eventfd *efd, struct chan *c)
 164{
 165        unsigned long old_count, new_count, ret;
 166
 167        while (1) {
 168                old_count = atomic_read(&efd->counter);
 169                if (!old_count) {
 170                        if (c->flag & O_NONBLOCK)
 171                                error(EAGAIN, "Would block on #%s read",
 172                                      devname());
 173                        rendez_sleep(&efd->rv_readers, has_counts, efd);
 174                } else {
 175                        if (efd->flags & EFD_SEMAPHORE) {
 176                                new_count = old_count - 1;
 177                                ret = 1;
 178                        } else {
 179                                new_count = 0;
 180                                ret = old_count;
 181                        }
 182                        if (atomic_cas(&efd->counter, old_count, new_count))
 183                                goto success;
 184                }
 185        }
 186success:
 187        rendez_wakeup(&efd->rv_writers);
 188        efd_fire_taps(efd, FDTAP_FILT_WRITABLE);
 189        return ret;
 190}
 191
 192static size_t efd_read(struct chan *c, void *ubuf, size_t n, off64_t offset)
 193{
 194        struct eventfd *efd = c->aux;
 195
 196        switch (c->qid.path) {
 197        case Qdir:
 198                return devdirread(c, ubuf, n, efd_dir, ARRAY_SIZE(efd_dir),
 199                                  devgen);
 200        case Qctl:
 201                return readnum(offset, ubuf, n, efd->flags, NUMSIZE32);
 202        case Qefd:
 203                /* ignoring the chan offset for Qefd */
 204                return readnum(0, ubuf, n, efd_read_efd(efd, c), NUMSIZE64);
 205        default:
 206                panic("Bad Qid %p!", c->qid.path);
 207        }
 208        return -1;
 209}
 210
 211static int has_room(void *arg)
 212{
 213        struct eventfd *efd = arg;
 214        return atomic_read(&efd->counter) != EFD_MAX_VAL;
 215}
 216
 217/* The heart of writing an eventfd */
 218static void efd_write_efd(struct eventfd *efd, unsigned long add_to,
 219                          struct chan *c)
 220{
 221        unsigned long old_count, new_count;
 222
 223        while (1) {
 224                old_count = atomic_read(&efd->counter);
 225                new_count = old_count + add_to;
 226                if (new_count > EFD_MAX_VAL) {
 227                        if (c->flag & O_NONBLOCK)
 228                                error(EAGAIN, "Would block on #%s write",
 229                                      devname());
 230                        rendez_sleep(&efd->rv_writers, has_room, efd);
 231                } else {
 232                        if (atomic_cas(&efd->counter, old_count, new_count))
 233                                goto success;
 234                }
 235        }
 236success:
 237        rendez_wakeup(&efd->rv_readers);
 238        efd_fire_taps(efd, FDTAP_FILT_READABLE);
 239}
 240
 241static size_t efd_write(struct chan *c, void *ubuf, size_t n, off64_t offset)
 242{
 243        struct eventfd *efd = c->aux;
 244        unsigned long write_val;
 245        char num64[NUMSIZE64];
 246
 247        switch (c->qid.path) {
 248        case Qctl:
 249                /* If we want to allow runtime changing of settings, we can do
 250                 * it here. */
 251                error(EFAIL, "No #%s ctl commands supported", devname());
 252                break;
 253        case Qefd:
 254                /* We want to give strtoul a null-terminated buf (can't handle
 255                 * arbitrary user strings).  Ignoring the chan offset too. */
 256                if (n > sizeof(num64))
 257                        error(EAGAIN, "attempted to write %d chars, max %d", n,
 258                                  sizeof(num64));
 259                memcpy(num64, ubuf, n);
 260                num64[n] = 0;   /* enforce trailing 0 */
 261                write_val = strtoul(num64, 0, 0);
 262                if (write_val == (unsigned long)(-1))
 263                        error(EFAIL, "Eventfd write must not be -1");
 264                efd_write_efd(efd, write_val, c);
 265                break;
 266        default:
 267                panic("Bad Qid %p!", c->qid.path);
 268        }
 269        return n;
 270}
 271
 272static char *efd_chaninfo(struct chan *c, char *ret, size_t ret_l)
 273{
 274        struct eventfd *efd = c->aux;
 275
 276        snprintf(ret, ret_l, "QID type %s, flags %p, counter %p",
 277                 efd_dir[c->qid.path].name, efd->flags,
 278                 atomic_read(&efd->counter));
 279        return ret;
 280}
 281
 282static int efd_tapfd(struct chan *c, struct fd_tap *tap, int cmd)
 283{
 284        struct eventfd *efd = c->aux;
 285        int ret;
 286
 287        /* HANGUP, ERROR, and PRIORITY will never fire, but people can ask for
 288         * them.  We don't actually support HANGUP, but epoll implies it.
 289         * Linux's eventfd cand have ERROR, so apps can ask for it.  Likewise,
 290         * priority is meaningless for us, but sometimes people ask for it. */
 291#define EFD_LEGAL_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE |        \
 292                        FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |          \
 293                        FDTAP_FILT_ERROR)
 294
 295        switch (c->qid.path) {
 296        case Qefd:
 297                if (tap->filter & ~EFD_LEGAL_TAPS) {
 298                        set_error(ENOSYS, "Unsupported #%s tap %p, must be %p",
 299                                  devname(), tap->filter, EFD_LEGAL_TAPS);
 300                        return -1;
 301                }
 302                spin_lock(&efd->tap_lock);
 303                switch (cmd) {
 304                case (FDTAP_CMD_ADD):
 305                        SLIST_INSERT_HEAD(&efd->fd_taps, tap, link);
 306                        ret = 0;
 307                        break;
 308                case (FDTAP_CMD_REM):
 309                        SLIST_REMOVE(&efd->fd_taps, tap, fd_tap, link);
 310                        ret = 0;
 311                        break;
 312                default:
 313                        set_error(ENOSYS, "Unsupported #%s tap command %p",
 314                                  devname(), cmd);
 315                        ret = -1;
 316                }
 317                spin_unlock(&efd->tap_lock);
 318                return ret;
 319        default:
 320                set_error(ENOSYS, "Can't tap #%s file type %d", devname(),
 321                          c->qid.path);
 322                return -1;
 323        }
 324}
 325
 326struct dev efd_devtab __devtab = {
 327        .name = "eventfd",
 328        .reset = devreset,
 329        .init = devinit,
 330        .shutdown = devshutdown,
 331        .attach = efd_attach,
 332        .walk = efd_walk,
 333        .stat = efd_stat,
 334        .open = efd_open,
 335        .create = devcreate,
 336        .close = efd_close,
 337        .read = efd_read,
 338        .bread = devbread,
 339        .write = efd_write,
 340        .bwrite = devbwrite,
 341        .remove = devremove,
 342        .wstat = devwstat,
 343        .power = devpower,
 344        .chaninfo = efd_chaninfo,
 345        .tapfd = efd_tapfd,
 346};
 347