akaros/kern/drivers/dev/alarm.c
<<
>>
Prefs
   1/* Copyright (c) 2013 The Regents of the University of California
   2 * Copyright (c) 2016 Google Inc.
   3 * Barret Rhoden <brho@cs.berkeley.edu>
   4 * See LICENSE for details.
   5 *
   6 * #alarm: a device for registering per-process alarms.
   7 *
   8 * Allows a process to set up alarms, which they can tap to get events at a
   9 * certain TSC time.
  10 *
  11 * Every process has their own alarm sets and view of #alarm; gen and friends
  12 * look at current's alarmset when it is time to gen or open a file.
  13 *
  14 * To use, first open #alarm/clone, and that gives you an alarm directory aN,
  15 * where N is the ID of the alarm.  The FD you get from clone points to 'ctl'.
  16 *
  17 * 'ctl' takes no commands.  You can read it to get the ID.  That's it.
  18 *
  19 * 'timer' takes the hex string value (in absolute tsc time) to fire the alarm.
  20 * Writing 0 disables the alarm.  You can read 'timer' to get the next time it
  21 * will fire, in TSC time.  0 means it is disabled.  To find out about the timer
  22 * firing, put an FD tap on 'timer' for FDTAP_FILT_WRITTEN.
  23 *
  24 * 'period' takes the hex string value (in TSC ticks) for the period of the
  25 * alarm.  If non-zero, the alarm will rearm when it fires.  You can read the
  26 * period.
  27 *
  28 * Reading the 'count' file will return the number of times the alarm has
  29 * expired since the last read or the last write to 'timer'.  If this is 0, then
  30 * read() will block or EAGAIN.  You cannot write 'count'.  You can tap it for
  31 * FDTAP_FILT_READABLE.
  32 *
  33 * While each process has a separate view of #alarm, it is possible to post a
  34 * chan to Qctl or Qtimer to #srv.  If another proc has your Qtimer, it can set
  35 * it in the past, thereby triggering an immediate event.  More clever than
  36 * useful.
  37 *
  38 * Notes on refcnting (the trickier parts here):
  39 * - the proc_alarms have counted references to their proc
  40 *      proc won't free til all alarms are closed, which is fine.  we close
  41 *      all files in destroy.  if a proc drops a chan in srv, the proc will stay
  42 *      alive because the alarm is alive - til that chan is closed (srvremove)
  43 *
  44 *      other shady ways to keep a chan alive: cd to it!  if it is ., we'd
  45 *      keep a ref around.  however, only alarmdir *files* grab refs, not
  46 *      directories.
  47 *
  48 * - proc_alarms are kref'd, since there can be multiple chans per alarm
  49 *      the only thing that keeps an alarm alive is a chan on a CTL or TIMER (or
  50 *      other file).  when you cloned, you got back an open CTL, which keeps the
  51 *      alarm (and the dir) alive.
  52 *
  53 *      we need to be careful generating krefs, in case alarms are concurrently
  54 *      released and removed from the lists.  just like with procs and pid2proc,
  55 *      we need to sync with the source of the kref. */
  56
  57#include <kmalloc.h>
  58#include <string.h>
  59#include <stdio.h>
  60#include <assert.h>
  61#include <error.h>
  62#include <pmap.h>
  63#include <sys/queue.h>
  64#include <smp.h>
  65#include <kref.h>
  66#include <atomic.h>
  67#include <alarm.h>
  68#include <umem.h>
  69#include <devalarm.h>
  70
  71struct dev alarmdevtab;
  72
  73static char *devname(void)
  74{
  75        return alarmdevtab.name;
  76}
  77
  78/* qid path types */
  79#define Qtopdir                 1
  80#define Qclone                  2
  81#define Qalarmdir               3
  82#define Qctl                    4
  83#define Qtimer                  5       /* Qctl + 1 */
  84#define Qperiod                 6
  85#define Qcount                  7
  86
  87/* This paddr/kaddr is a bit dangerous.  it'll work so long as we don't need all
  88 * 64 bits for a physical address (48 is the current norm on x86_64). */
  89#define ADDR_SHIFT 5
  90#define QID2A(q) ((struct proc_alarm*)KADDR(((q).path >> ADDR_SHIFT)))
  91#define TYPE(q) ((q).path & ((1 << ADDR_SHIFT) - 1))
  92#define QID(ptr, type) ((PADDR(ptr) << ADDR_SHIFT) | type)
  93extern struct username eve;
  94
  95static void alarm_release(struct kref *kref)
  96{
  97        struct proc_alarm *a = container_of(kref, struct proc_alarm, kref);
  98        struct proc *p = a->proc;
  99
 100        assert(p);
 101        spin_lock(&p->alarmset.lock);
 102        TAILQ_REMOVE(&p->alarmset.list, a, link);
 103        spin_unlock(&p->alarmset.lock);
 104        /* When this returns, the alarm has either fired or it never will */
 105        unset_alarm(p->alarmset.tchain, &a->a_waiter);
 106        proc_decref(p);
 107        kfree(a);
 108}
 109
 110static void alarm_fire_taps(struct proc_alarm *a, int filter)
 111{
 112        struct fd_tap *tap_i;
 113
 114        SLIST_FOREACH(tap_i, &a->fd_taps, link)
 115                fire_tap(tap_i, filter);
 116}
 117
 118static void proc_alarm_handler(struct alarm_waiter *a_waiter)
 119{
 120        struct proc_alarm *a = container_of(a_waiter, struct proc_alarm,
 121                                            a_waiter);
 122
 123        cv_lock(&a->cv);
 124        a->count++;
 125        if (!a->period) {
 126                a_waiter->wake_up_time = 0;
 127        } else {
 128                /* TODO: use an alarm helper, once we switch over to nsec */
 129                a_waiter->wake_up_time += a->period;
 130                set_alarm(a->proc->alarmset.tchain, a_waiter);
 131        }
 132        __cv_broadcast(&a->cv);
 133        /* Fires taps for both Qtimer and Qcount. */
 134        alarm_fire_taps(a, FDTAP_FILT_WRITTEN | FDTAP_FILT_READABLE);
 135        cv_unlock(&a->cv);
 136}
 137
 138void devalarm_init(struct proc *p)
 139{
 140        TAILQ_INIT(&p->alarmset.list);
 141        spinlock_init(&p->alarmset.lock);
 142        /* Just running all the proc alarms on core 0. */
 143        p->alarmset.tchain = &per_cpu_info[0].tchain;
 144        p->alarmset.id_counter = 0;
 145}
 146
 147static int alarmgen(struct chan *c, char *entry_name, struct dirtab *unused,
 148                    int unused_nr_dirtab, int s, struct dir *dp)
 149{
 150        struct qid q;
 151        struct proc_alarm *a_i;
 152        struct proc *p = current;
 153
 154        /* Whether we're in one dir or at the top, .. still takes us to the top.
 155         */
 156        if (s == DEVDOTDOT) {
 157                mkqid(&q, Qtopdir, 0, QTDIR);
 158                devdir(c, q, devname(), 0, eve.name, 0555, dp);
 159                return 1;
 160        }
 161        switch (TYPE(c->qid)) {
 162        case Qtopdir:
 163                /* Generate elements for the top level dir.  We support a clone
 164                 * and alarm dirs at the top level */
 165                if (s == 0) {
 166                        mkqid(&q, Qclone, 0, QTFILE);
 167                        devdir(c, q, "clone", 0, eve.name, 0666, dp);
 168                        return 1;
 169                }
 170                s--;    /* 1 -> 0th element, 2 -> 1st element, etc */
 171                /* Gets the s-th element (0 index)
 172                 *
 173                 * I would like to take advantage of the state machine and our
 174                 * previous answer to get the sth element of the list.  We can
 175                 * get at our previous run of gen from dp (struct dir), and use
 176                 * that to get the next item.  I'd like to do something like:
 177                 *
 178                 * if (dp->qid.path >> ADDR_SHIFT)
 179                 *      a_i = TAILQ_NEXT(QID2A(dp->qid), link);
 180                 *
 181                 * Dev would give us a 0'd dp path on the first run, so if we
 182                 * have a path, we know we're on an iterative run.  However, the
 183                 * problem is that we could have lost the element dp refers to
 184                 * (QID2A(dp->qid)) since our previous run, so we can't even
 185                 * access that memory to check for refcnts or anything.  We need
 186                 * a new model for how gen works (probably a gen_start and
 187                 * gen_stop devop, passed as parameters to devwalk), so that we
 188                 * can have some invariants between gen runs.
 189                 *
 190                 * Til then, we're stuck with arrays like in #ip (though we can
 191                 * use Linux style fdsets) or lousy O(n^2) linked lists (like
 192                 * #srv).
 193                 *
 194                 * Note that we won't always start a gen loop with s == 0
 195                 * (devdirread, for instance) */
 196                spin_lock(&p->alarmset.lock);
 197                TAILQ_FOREACH(a_i, &p->alarmset.list, link) {
 198                        if (s-- == 0)
 199                                break;
 200                }
 201                /* As soon as we unlock, someone could free a_i */
 202                if (!a_i) {
 203                        spin_unlock(&p->alarmset.lock);
 204                        return -1;
 205                }
 206                snprintf(get_cur_genbuf(), GENBUF_SZ, "a%d", a_i->id);
 207                mkqid(&q, QID(a_i, Qalarmdir), 0, QTDIR);
 208                devdir(c, q, get_cur_genbuf(), 0, eve.name, 0555, dp);
 209                spin_unlock(&p->alarmset.lock);
 210                return 1;
 211        case Qalarmdir:
 212                /* Gen the contents of the alarm dirs */
 213                s += Qctl;      /* first time through, start on Qctl */
 214                switch (s) {
 215                case Qctl:
 216                        mkqid(&q, QID(QID2A(c->qid), Qctl), 0, QTFILE);
 217                        devdir(c, q, "ctl", 0, eve.name, 0666, dp);
 218                        return 1;
 219                case Qtimer:
 220                        mkqid(&q, QID(QID2A(c->qid), Qtimer), 0, QTFILE);
 221                        devdir(c, q, "timer", 0, eve.name, 0666, dp);
 222                        return 1;
 223                case Qperiod:
 224                        mkqid(&q, QID(QID2A(c->qid), Qperiod), 0, QTFILE);
 225                        devdir(c, q, "period", 0, eve.name, 0666, dp);
 226                        return 1;
 227                case Qcount:
 228                        mkqid(&q, QID(QID2A(c->qid), Qcount), 0, QTFILE);
 229                        devdir(c, q, "count", 0, eve.name, 0666, dp);
 230                        return 1;
 231                }
 232                return -1;
 233                /* Need to also provide a direct hit for Qclone and all other
 234                 * files (at all levels of the hierarchy).  Every file is both
 235                 * generated (via the s increments in their respective
 236                 * directories) and directly gen-able.  devstat() will call gen
 237                 * with a specific path in the qid.  In these cases, we make a
 238                 * dir for whatever they are asking for.  Note the qid stays the
 239                 * same.  I think this is what the old plan9 comments above
 240                 * devgen were talking about for (ii).
 241                 *
 242                 * We don't need to do this for the directories - devstat will
 243                 * look for the a directory by path and fail.  Then it will
 244                 * manually build the stat output (check the -1 case in
 245                 * devstat). */
 246        case Qclone:
 247                devdir(c, c->qid, "clone", 0, eve.name, 0666, dp);
 248                return 1;
 249        case Qctl:
 250                devdir(c, c->qid, "ctl", 0, eve.name, 0666, dp);
 251                return 1;
 252        case Qtimer:
 253                devdir(c, c->qid, "timer", 0, eve.name, 0666, dp);
 254                return 1;
 255        case Qperiod:
 256                devdir(c, c->qid, "period", 0, eve.name, 0666, dp);
 257                return 1;
 258        case Qcount:
 259                devdir(c, c->qid, "count", 0, eve.name, 0666, dp);
 260                return 1;
 261        }
 262        return -1;
 263}
 264
 265static void alarminit(void)
 266{
 267}
 268
 269static struct chan *alarmattach(char *spec)
 270{
 271        struct chan *c = devattach(devname(), spec);
 272
 273        mkqid(&c->qid, Qtopdir, 0, QTDIR);
 274        return c;
 275}
 276
 277static struct walkqid *alarmwalk(struct chan *c, struct chan *nc, char **name,
 278                                 unsigned int nname)
 279{
 280        return devwalk(c, nc, name, nname, 0, 0, alarmgen);
 281}
 282
 283static size_t alarmstat(struct chan *c, uint8_t *db, size_t n)
 284{
 285        return devstat(c, db, n, 0, 0, alarmgen);
 286}
 287
 288/* It shouldn't matter if p = current is DYING.  We'll eventually fail to insert
 289 * the open chan into p's fd table, then decref the chan. */
 290static struct chan *alarmopen(struct chan *c, int omode)
 291{
 292        struct proc *p = current;
 293        struct proc_alarm *a, *a_i;
 294        switch (TYPE(c->qid)) {
 295        case Qtopdir:
 296        case Qalarmdir:
 297                if (omode & O_REMCLO)
 298                        error(EPERM, ERROR_FIXME);
 299                if (omode & O_WRITE)
 300                        error(EISDIR, ERROR_FIXME);
 301                break;
 302        case Qclone:
 303                a = kzmalloc(sizeof(struct proc_alarm), MEM_WAIT);
 304                kref_init(&a->kref, alarm_release, 1);
 305                SLIST_INIT(&a->fd_taps);
 306                cv_init(&a->cv);
 307                qlock_init(&a->qlock);
 308                init_awaiter(&a->a_waiter, proc_alarm_handler);
 309                spin_lock(&p->alarmset.lock);
 310                a->id = p->alarmset.id_counter++;
 311                proc_incref(p, 1);
 312                a->proc = p;
 313                TAILQ_INSERT_TAIL(&p->alarmset.list, a, link);
 314                spin_unlock(&p->alarmset.lock);
 315                mkqid(&c->qid, QID(a, Qctl), 0, QTFILE);
 316                break;
 317        case Qctl:
 318        case Qtimer:
 319        case Qperiod:
 320        case Qcount:
 321                /* the purpose of opening is to hold a kref on the proc_alarm */
 322                a = QID2A(c->qid);
 323                assert(a);
 324                /* this isn't a valid pointer yet, since our chan doesn't have a
 325                 * ref.  since the time that walk gave our chan the qid, the
 326                 * chan could have been closed, and the alarm decref'd and
 327                 * freed.  the qid is essentially an uncounted reference, and we
 328                 * need to go to the source to attempt to get a real ref.
 329                 * Unfortunately, this is another scan of the list, same as
 330                 * devsrv. */
 331                spin_lock(&p->alarmset.lock);
 332                TAILQ_FOREACH(a_i, &p->alarmset.list, link) {
 333                        if (a_i == a) {
 334                                assert(a->proc == current);
 335                                /* it's still possible we're not getting the
 336                                 * ref, racing with the release method */
 337                                if (!kref_get_not_zero(&a->kref, 1)) {
 338                                        /* lost the race; error out later */
 339                                        a_i = 0;
 340                                }
 341                                break;
 342                        }
 343                }
 344                spin_unlock(&p->alarmset.lock);
 345                if (!a_i)
 346                        error(EFAIL,
 347                              "Unable to open alarm, concurrent closing");
 348                break;
 349        }
 350        c->mode = openmode(omode);
 351        /* Assumes c is unique (can't be closed concurrently */
 352        c->flag |= COPEN;
 353        c->offset = 0;
 354        return c;
 355}
 356
 357static void alarmclose(struct chan *c)
 358{
 359        /* There are more closes than opens.  For instance, sysstat doesn't
 360         * open, but it will close the chan it got from namec.  We only want to
 361         * clean up/decref chans that were actually open. */
 362        if (!(c->flag & COPEN))
 363                return;
 364        switch (TYPE(c->qid)) {
 365        case Qctl:
 366        case Qtimer:
 367        case Qperiod:
 368        case Qcount:
 369                kref_put(&QID2A(c->qid)->kref);
 370                break;
 371        }
 372}
 373
 374/* Helper for Qcount to encapsulate timerfd. */
 375static long read_qcount(struct chan *c, void *ubuf, size_t n)
 376{
 377        ERRSTACK(1);
 378        struct proc_alarm *a = QID2A(c->qid);
 379        struct cv_lookup_elm cle;
 380        unsigned long old_count;
 381
 382        if (n > sizeof(old_count))
 383                error(EINVAL, "timerfd buffer is too small (%llu)", n);
 384        /* TODO: have easily abortable CVs that don't require this mechanism. */
 385        cv_lock(&a->cv);
 386        __reg_abortable_cv(&cle, &a->cv);
 387        if (waserror()) {
 388                cv_unlock(&a->cv);
 389                dereg_abortable_cv(&cle);
 390                nexterror();
 391        }
 392        while (!a->count) {
 393                if (c->flag & O_NONBLOCK)
 394                        error(EAGAIN, "#alarm count was 0");
 395                if (should_abort(&cle))
 396                        error(EINTR, "syscall aborted");
 397                cv_wait(&a->cv);
 398        }
 399        old_count = a->count;
 400        a->count = 0;
 401        cv_unlock(&a->cv);
 402        dereg_abortable_cv(&cle);
 403        poperror();
 404        if (copy_to_user(ubuf, &old_count, sizeof(old_count)))
 405                error(EFAULT, "timerfd copy_to_user failed");
 406        return sizeof(old_count);
 407}
 408
 409static size_t alarmread(struct chan *c, void *ubuf, size_t n, off64_t offset)
 410{
 411        struct proc_alarm *p_alarm;
 412
 413        switch (TYPE(c->qid)) {
 414        case Qtopdir:
 415        case Qalarmdir:
 416                return devdirread(c, ubuf, n, 0, 0, alarmgen);
 417        case Qctl:
 418                p_alarm = QID2A(c->qid);
 419                /* simple reads from p_alarm shouldn't need a lock */
 420                return readnum(offset, ubuf, n, p_alarm->id, NUMSIZE32);
 421        case Qtimer:
 422                p_alarm = QID2A(c->qid);
 423                return readnum(offset, ubuf, n, p_alarm->a_waiter.wake_up_time,
 424                                           NUMSIZE64);
 425        case Qperiod:
 426                p_alarm = QID2A(c->qid);
 427                return readnum(offset, ubuf, n, p_alarm->period, NUMSIZE64);
 428        case Qcount:
 429                return read_qcount(c, ubuf, n); /* ignore offset */
 430        default:
 431                panic("Bad QID %p in devalarm", c->qid.path);
 432        }
 433        return 0;
 434}
 435
 436/* Helper, sets the procalarm to hexval (abs TSC ticks).  0 disarms. */
 437static void set_proc_alarm(struct proc_alarm *a, uint64_t hexval)
 438{
 439        /* Due to how we have to maintain 'count', we need to strictly account
 440         * for the firings of the alarm.  Easiest thing is to disarm it, reset
 441         * everything, then rearm it.  Note that if someone is blocked on count
 442         * = 0, they may still be blocked until the next time the alarm fires.
 443         *
 444         * unset waits on the handler, which grabs the cv lock, so we don't grab
 445         * the cv lock.  However, we still need to protect ourselves from
 446         * multiple setters trying to run this at once.  Unset actually can
 447         * handle being called concurrently, but alarm setters can't, nor can it
 448         * handle the unsets and sets getting out of sync.  For instance, two
 449         * unsets followed by two sets would be a bug.  Likewise, setting the
 450         * awaiter value while it is on a tchain is a bug.  The qlock prevents
 451         * that. */
 452        qlock(&a->qlock);
 453        unset_alarm(a->proc->alarmset.tchain, &a->a_waiter);
 454        cv_lock(&a->cv);
 455        a->count = 0;
 456        if (hexval) {
 457                set_awaiter_abs(&a->a_waiter, hexval);
 458                set_alarm(a->proc->alarmset.tchain, &a->a_waiter);
 459        }
 460        cv_unlock(&a->cv);
 461        qunlock(&a->qlock);
 462}
 463
 464/* Note that in read and write we have an open chan, which means we have an
 465 * active kref on the p_alarm.  Also note that we make no assumptions about
 466 * current here - we find the proc (and the tchain) via the ref stored in the
 467 * proc_alarm. */
 468static size_t alarmwrite(struct chan *c, void *ubuf, size_t n, off64_t unused)
 469{
 470        struct proc_alarm *p_alarm;
 471
 472        switch (TYPE(c->qid)) {
 473        case Qtopdir:
 474        case Qalarmdir:
 475        case Qctl:
 476        case Qcount:
 477                error(EPERM, ERROR_FIXME);
 478        case Qtimer:
 479                set_proc_alarm(QID2A(c->qid), strtoul_from_ubuf(ubuf, n, 16));
 480                break;
 481        case Qperiod:
 482                p_alarm = QID2A(c->qid);
 483                /* racing with the handler which checks the val repeatedly */
 484                cv_lock(&p_alarm->cv);
 485                p_alarm->period = strtoul_from_ubuf(ubuf, n, 16);
 486                cv_unlock(&p_alarm->cv);
 487                break;
 488        default:
 489                panic("Bad QID %p in devalarm", c->qid.path);
 490        }
 491        return n;
 492}
 493
 494/* We use the same tap list, regardless of Qtimer or Qcount */
 495static int tap_alarm(struct proc_alarm *a, struct fd_tap *tap, int cmd,
 496                     int legal_filter)
 497{
 498        int ret;
 499
 500        if (tap->filter & ~legal_filter) {
 501                set_error(ENOSYS, "Unsupported #%s tap %p, must be %p",
 502                          devname(), tap->filter, legal_filter);
 503                return -1;
 504        }
 505        cv_lock(&a->cv);
 506        switch (cmd) {
 507        case (FDTAP_CMD_ADD):
 508                SLIST_INSERT_HEAD(&a->fd_taps, tap, link);
 509                ret = 0;
 510                break;
 511        case (FDTAP_CMD_REM):
 512                SLIST_REMOVE(&a->fd_taps, tap, fd_tap, link);
 513                ret = 0;
 514                break;
 515        default:
 516                set_error(ENOSYS, "Unsupported #%s tap command %p",
 517                                  devname(), cmd);
 518                ret = -1;
 519        }
 520        cv_unlock(&a->cv);
 521        return ret;
 522}
 523
 524static int alarm_tapfd(struct chan *c, struct fd_tap *tap, int cmd)
 525{
 526        struct proc_alarm *a = QID2A(c->qid);
 527
 528        /* We don't actually support HANGUP, but epoll implies it. */
 529        #define ALARM_LEGAL_TIMER_TAPS (FDTAP_FILT_WRITTEN | FDTAP_FILT_HANGUP)
 530        #define ALARM_LEGAL_COUNT_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_HANGUP)
 531
 532        switch (TYPE(c->qid)) {
 533        case Qtimer:
 534                return tap_alarm(a, tap, cmd, ALARM_LEGAL_TIMER_TAPS);
 535        case Qcount:
 536                return tap_alarm(a, tap, cmd, ALARM_LEGAL_COUNT_TAPS);
 537        default:
 538                set_error(ENOSYS, "Can't tap #%s file type %d", devname(),
 539                          c->qid.path);
 540                return -1;
 541        }
 542}
 543
 544static char *alarm_chaninfo(struct chan *ch, char *ret, size_t ret_l)
 545{
 546        struct proc_alarm *a;
 547        struct timespec ts;
 548
 549        switch (TYPE(ch->qid)) {
 550        case Qctl:
 551        case Qtimer:
 552        case Qperiod:
 553        case Qcount:
 554                a = QID2A(ch->qid);
 555                ts = tsc2timespec(a->a_waiter.wake_up_time);
 556                snprintf(ret, ret_l,
 557                         "Id %d, %s, expires [%7d.%09d] (%p), period %llu, count %llu",
 558                         a->id,
 559                         SLIST_EMPTY(&a->fd_taps) ? "untapped" : "tapped",
 560                         ts.tv_sec, ts.tv_nsec, a->a_waiter.wake_up_time,
 561                         a->period, a->count);
 562                break;
 563        default:
 564                return devchaninfo(ch, ret, ret_l);
 565        }
 566        return ret;
 567}
 568
 569struct dev alarmdevtab __devtab = {
 570        .name = "alarm",
 571
 572        .reset = devreset,
 573        .init = alarminit,
 574        .shutdown = devshutdown,
 575        .attach = alarmattach,
 576        .walk = alarmwalk,
 577        .stat = alarmstat,
 578        .open = alarmopen,
 579        .create = devcreate,
 580        .close = alarmclose,
 581        .read = alarmread,
 582        .bread = devbread,
 583        .write = alarmwrite,
 584        .bwrite = devbwrite,
 585        .remove = devremove,
 586        .wstat = devwstat,
 587        .power = devpower,
 588        .chaninfo = alarm_chaninfo,
 589        .tapfd = alarm_tapfd,
 590};
 591