Add #eventfd [1/2]
[akaros.git] / kern / drivers / dev / eventfd.c
1 /* Copyright (c) 2015 Google Inc
2  * Barret Rhoden <brho@cs.berkeley.edu>
3  * See LICENSE for details.
4  *
5  * #eventfd device, the kernel-side implementation of man 2 eventfd.
6  *
7  * Unlike the Linux interface, which takes host-endian u64s, we read and write
8  * strings.  It's a little slower, but it maintains the distributed-system
9  * nature of Plan 9 devices. */
10
11 #include <ns.h>
12 #include <kmalloc.h>
13 #include <kref.h>
14 #include <atomic.h>
15 #include <string.h>
16 #include <stdio.h>
17 #include <assert.h>
18 #include <error.h>
19 #include <sys/queue.h>
20 #include <fdtap.h>
21
22 struct dev efd_devtab;
23
24 static char *devname(void)
25 {
26         return efd_devtab.name;
27 }
28
/* Qid paths for the files served by this device.  The values double as indexes
 * into efd_dir[], so the order here must match the order of that table. */
enum {
	Qdir = 0,	/* the directory itself */
	Qctl = 1,	/* control file */
	Qefd = 2,	/* the eventfd counter file */
};
34
35 static struct dirtab efd_dir[] = {
36         {".", {Qdir, 0, QTDIR}, 0, DMDIR | 0555},
37         {"ctl", {Qctl, 0, QTFILE}, 0, 0666},
38         {"efd", {Qefd, 0, QTFILE}, 8, 0666},
39 };
40
/* Flag bits stored in struct eventfd's flags field. */
enum {
	EFD_SEMAPHORE =				1 << 0,
};

/* Largest value the counter may hold, i.e. 0xfffffffffffffffe when unsigned
 * long is 64 bits.  This is a macro rather than an enumerator because ISO C
 * requires enumeration constants to be representable as an int; relying on the
 * compiler to widen the enum is an extension.  The type stays unsigned long. */
#define EFD_MAX_VAL ((unsigned long)(-2))
45
46
47 struct eventfd {
48         int                                             flags;
49         atomic_t                                        counter;
50         struct fdtap_slist                      fd_taps;
51         spinlock_t                                      tap_lock;
52         struct rendez                           rv_readers;
53         struct rendez                           rv_writers;
54         struct kref                                     refcnt;
55 };
56
57
58 static void efd_release(struct kref *kref)
59 {
60         struct eventfd *efd = container_of(kref, struct eventfd, refcnt);
61         /* All FDs with taps should be closed before we decreffed all the chans */
62         assert(SLIST_EMPTY(&efd->fd_taps));
63         kfree(efd);
64 }
65
66 static struct chan *efd_attach(char *spec)
67 {
68         struct chan *c;
69         struct eventfd *efd;
70
71         c = devattach(devname(), spec);
72         efd = kzmalloc(sizeof(struct eventfd), KMALLOC_WAIT);
73         SLIST_INIT(&efd->fd_taps);
74         spinlock_init(&efd->tap_lock);
75         rendez_init(&efd->rv_readers);
76         rendez_init(&efd->rv_writers);
77         /* Attach and walk are the two sources of chans.  Each returns a refcnt'd
78          * object, for the most part. */
79         kref_init(&efd->refcnt, efd_release, 1);
80         /* nothing special in the qid to ID this eventfd.  the main thing is the
81          * aux.  we could put a debugging ID in the path like pipe. */
82         mkqid(&c->qid, Qdir, 0, QTDIR);
83         c->aux = efd;
84         /* just to be fancy and remove a syscall, if they pass spec == "sem", then
85          * we'll treat them as being in semaphore mode. */
86         if (!strcmp(spec, "sem"))
87                 efd->flags |= EFD_SEMAPHORE;
88         return c;
89 }
90
91 static struct walkqid *efd_walk(struct chan *c, struct chan *nc, char **name,
92                                                                 int nname)
93 {
94         struct walkqid *wq;
95         struct eventfd *efd = c->aux;
96
97         wq = devwalk(c, nc, name, nname, efd_dir, ARRAY_SIZE(efd_dir), devgen);
98         /* Walk is a source of a distinct chan from this device.  The other source
99          * is attach.  Once created, these chans will eventually be closed, and when
100          * they close, they will decref their aux, efd.  All chans within this
101          * *instance* of eventfd share the same efd.  Each one will have one refcnt.
102          * Each chan may also have several copies of its pointer out there (e.g. FD
103          * dup), all of which have their own *chan* refcnt.
104          *
105          * All of the above applies on successful walks that found all nname parts
106          * of the path.  A mid-success is wq: we got something.  wq->clone means we
107          * got to the end and the "big walk" considers this a success.
108          *
109          * There is a slight chance the new chan is the same as our original chan
110          * (if nc == c when we're called).  In which case, there's only one chan.
111          * The number of refs on efd == the number of distinct chans within this
112          * instance of #eventfd. */
113         if (wq != NULL && wq->clone != NULL && wq->clone != c)
114                 kref_get(&efd->refcnt, 1);
115         return wq;
116 }
117
118 /* In the future, we could use stat / wstat to get and set O_NONBLOCK */
119 static int efd_stat(struct chan *c, uint8_t * db, int n)
120 {
121         return devstat(c, db, n, efd_dir, ARRAY_SIZE(efd_dir), devgen);
122 }
123
124 static struct chan *efd_open(struct chan *c, int omode)
125 {
126         return devopen(c, omode, efd_dir, ARRAY_SIZE(efd_dir), devgen);
127 }
128
129 static void efd_close(struct chan *c)
130 {
131         struct eventfd *efd = c->aux;
132         /* Here's where we put the ref from attach and successful walks */
133         kref_put(&efd->refcnt);
134 }
135
136 static void efd_fire_taps(struct eventfd *efd, int filter)
137 {
138         struct fd_tap *tap_i;
139         if (SLIST_EMPTY(&efd->fd_taps))
140                 return;
141         /* We're not expecting many FD taps, so it's not worth splitting readers
142          * from writers or anything like that.
143          * TODO: (RCU) Locking to protect the list and the tap's existence. */
144         spin_lock(&efd->tap_lock);
145         SLIST_FOREACH(tap_i, &efd->fd_taps, link)
146                 fire_tap(tap_i, filter);
147         spin_unlock(&efd->tap_lock);
148 }
149
150 static int has_counts(void *arg)
151 {
152         struct eventfd *efd = arg;
153         return atomic_read(&efd->counter) != 0;
154 }
155
156 /* The heart of reading an eventfd */
157 static unsigned long efd_read_efd(struct eventfd *efd, struct chan *c)
158 {
159         unsigned long old_count, new_count, ret;
160         while (1) {
161                 old_count = atomic_read(&efd->counter);
162                 if (!old_count) {
163                         if (c->flag & O_NONBLOCK) {
164                                 set_errno(EAGAIN);
165                                 error("Would block on #%s read", devname());
166                         }
167                         rendez_sleep(&efd->rv_readers, has_counts, efd);
168                 } else {
169                         if (efd->flags & EFD_SEMAPHORE) {
170                                 new_count = old_count - 1;
171                                 ret = 1;
172                         } else {
173                                 new_count = 0;
174                                 ret = old_count;
175                         }
176                         if (atomic_cas(&efd->counter, old_count, new_count))
177                                 goto success;
178                 }
179         }
180 success:
181         rendez_wakeup(&efd->rv_writers);
182         efd_fire_taps(efd, FDTAP_FILT_WRITABLE);
183         return ret;
184 }
185
186 static long efd_read(struct chan *c, void *ubuf, long n, int64_t offset)
187 {
188         struct eventfd *efd = c->aux;
189
190         switch (c->qid.path) {
191                 case Qdir:
192                         return devdirread(c, ubuf, n, efd_dir, ARRAY_SIZE(efd_dir),
193                                                           devgen);
194                 case Qctl:
195                         return readnum(offset, ubuf, n, efd->flags, NUMSIZE32);
196                 case Qefd:
197                         /* ignoring the chan offset for Qefd */
198                         return readnum(0, ubuf, n, efd_read_efd(efd, c),
199                                                    NUMSIZE64);
200                 default:
201                         panic("Bad Qid %p!", c->qid.path);
202         }
203         return -1;
204 }
205
206 static int has_room(void *arg)
207 {
208         struct eventfd *efd = arg;
209         return atomic_read(&efd->counter) != EFD_MAX_VAL;
210 }
211
212 /* The heart of writing an eventfd */
213 static void efd_write_efd(struct eventfd *efd, unsigned long add_to,
214                           struct chan *c)
215 {
216         unsigned long old_count, new_count;
217         while (1) {
218                 old_count = atomic_read(&efd->counter);
219                 new_count = old_count + add_to;
220                 if (new_count > EFD_MAX_VAL) {
221                         if (c->flag & O_NONBLOCK) {
222                                 set_errno(EAGAIN);
223                                 error("Would block on #%s write", devname());
224                         }
225                         rendez_sleep(&efd->rv_writers, has_room, efd);
226                 } else {
227                         if (atomic_cas(&efd->counter, old_count, new_count))
228                                 goto success;
229                 }
230         }
231 success:
232         rendez_wakeup(&efd->rv_readers);
233         efd_fire_taps(efd, FDTAP_FILT_READABLE);
234 }
235
236 static long efd_write(struct chan *c, void *ubuf, long n, int64_t offset)
237 {
238         struct eventfd *efd = c->aux;
239         unsigned long write_val;
240         char num64[NUMSIZE64];
241
242         switch (c->qid.path) {
243                 case Qctl:
244                         /* If we want to allow runtime changing of settings, we can do it
245                          * here. */
246                         error("No #%s ctl commands supported", devname());
247                         break;
248                 case Qefd:
249                         /* We want to give strtoul a null-terminated buf (can't handle
250                          * arbitrary user strings).  Ignoring the chan offset too. */
251                         if (n > sizeof(num64)) {
252                                 set_errno(EINVAL);
253                                 error("attempted to write %d chars, max %d", n, sizeof(num64));
254                         }
255                         memcpy(num64, ubuf, n);
256                         num64[n] = 0;   /* enforce trailing 0 */
257                         write_val = strtoul(num64, 0, 0);
258                         if (write_val == (unsigned long)(-1))
259                                 error("Eventfd write must not be -1");
260                         efd_write_efd(efd, write_val, c);
261                         break;
262                 default:
263                         panic("Bad Qid %p!", c->qid.path);
264         }
265         return n;
266 }
267
268 static char *efd_chaninfo(struct chan *c, char *ret, size_t ret_l)
269 {
270         struct eventfd *efd = c->aux;
271
272         snprintf(ret, ret_l, "QID type %s, flags %p, counter %p",
273                  efd_dir[c->qid.path].name, efd->flags, atomic_read(&efd->counter));
274         return ret;
275 }
276
277 static int efd_tapfd(struct chan *c, struct fd_tap *tap, int cmd)
278 {
279         struct eventfd *efd = c->aux;
280         int ret;
281
282         /* We don't actually support HANGUP, but epoll implies it */
283         #define EFD_LEGAL_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE |        \
284                                 FDTAP_FILT_HANGUP)
285
286         switch (c->qid.path) {
287                 case Qefd:
288                         if (tap->filter & ~EFD_LEGAL_TAPS) {
289                                 set_errno(ENOSYS);
290                                 set_errstr("Unsupported #%s tap, must be %p", devname(),
291                                            EFD_LEGAL_TAPS);
292                                 return -1;
293                         }
294                         spin_lock(&efd->tap_lock);
295                         switch (cmd) {
296                                 case (FDTAP_CMD_ADD):
297                                         SLIST_INSERT_HEAD(&efd->fd_taps, tap, link);
298                                         ret = 0;
299                                         break;
300                                 case (FDTAP_CMD_REM):
301                                         SLIST_REMOVE(&efd->fd_taps, tap, fd_tap, link);
302                                         ret = 0;
303                                         break;
304                                 default:
305                                         set_errno(ENOSYS);
306                                         set_errstr("Unsupported #%s tap command %p",
307                                                    devname(), cmd);
308                                         ret = -1;
309                         }
310                         spin_unlock(&efd->tap_lock);
311                         return ret;
312                 default:
313                         set_errno(ENOSYS);
314                         set_errstr("Can't tap #%s file type %d", devname(),
315                                    c->qid.path);
316                         return -1;
317         }
318 }
319
320 struct dev efd_devtab __devtab = {
321         .name = "eventfd",
322         .reset = devreset,
323         .init = devinit,
324         .shutdown = devshutdown,
325         .attach = efd_attach,
326         .walk = efd_walk,
327         .stat = efd_stat,
328         .open = efd_open,
329         .create = devcreate,
330         .close = efd_close,
331         .read = efd_read,
332         .bread = devbread,
333         .write = efd_write,
334         .bwrite = devbwrite,
335         .remove = devremove,
336         .wstat = devwstat,
337         .power = devpower,
338         .chaninfo = efd_chaninfo,
339         .tapfd = efd_tapfd,
340 };