/* Copyright (c) 2015 Google Inc
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * #eventfd device, the kernel-side implementation of man 2 eventfd.
 *
 * Unlike the Linux interface, which takes host-endian u64s, we read and write
 * strings.  It's a little slower, but it maintains the distributed-system
 * nature of Plan 9 devices. */
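
/* A rough userspace sketch (the open() path is illustrative and depends on
 * where this device is bound; the string-based I/O matches efd_read() and
 * efd_write() below):
 *
 *	int fd = open("path/to/eventfd/efd", O_RDWR);
 *	char buf[32];
 *
 *	write(fd, "3", 1);		// add 3 to the counter
 *	read(fd, buf, sizeof(buf));	// counter as a decimal string: 3,
 *					// or 1 per read in semaphore mode
 */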

#include <ns.h>
#include <kmalloc.h>
#include <kref.h>
#include <atomic.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <error.h>
#include <sys/queue.h>
#include <fdtap.h>
#include <syscall.h>

struct dev efd_devtab;

static char *devname(void)
{
	return efd_devtab.name;
}

enum {
	Qdir,
	Qctl,
	Qefd,
};

static struct dirtab efd_dir[] = {
	{".", {Qdir, 0, QTDIR}, 0, DMDIR | 0555},
	{"ctl", {Qctl, 0, QTFILE}, 0, 0666},
	{"efd", {Qefd, 0, QTFILE}, 8, 0666},
};

enum {
	EFD_SEMAPHORE =			1 << 0,
	EFD_MAX_VAL =			(unsigned long)(-2), /* i.e. 0xfffffffffffffffe */
};
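
/* EFD_MAX_VAL matches Linux's eventfd limit: the counter tops out at
 * 0xfffffffffffffffe (the largest u64 minus one), and a value of -1 is never
 * legal; efd_write() rejects it below. */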

struct eventfd {
	int				flags;
	atomic_t			counter;
	struct fdtap_slist		fd_taps;
	spinlock_t			tap_lock;
	struct rendez			rv_readers;
	struct rendez			rv_writers;
	struct kref			refcnt;
};
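
/* Every chan spawned from a single attach (and its subsequent walks) shares
 * one eventfd; refcnt counts those chans.  See efd_walk() and efd_close() for
 * the matching get/put. */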

static void efd_release(struct kref *kref)
{
	struct eventfd *efd = container_of(kref, struct eventfd, refcnt);
	/* All FDs with taps should be closed before we decref all the chans */
	assert(SLIST_EMPTY(&efd->fd_taps));
	kfree(efd);
}

static struct chan *efd_attach(char *spec)
{
	struct chan *c;
	struct eventfd *efd;

	c = devattach(devname(), spec);
	efd = kzmalloc(sizeof(struct eventfd), KMALLOC_WAIT);
	SLIST_INIT(&efd->fd_taps);
	spinlock_init(&efd->tap_lock);
	rendez_init(&efd->rv_readers);
	rendez_init(&efd->rv_writers);
	/* Attach and walk are the two sources of chans.  Each returns a refcnt'd
	 * object, for the most part. */
	kref_init(&efd->refcnt, efd_release, 1);
	/* nothing special in the qid to ID this eventfd.  the main thing is the
	 * aux.  we could put a debugging ID in the path like pipe. */
	mkqid(&c->qid, Qdir, 0, QTDIR);
	c->aux = efd;
	/* just to be fancy and remove a syscall, if they pass spec == "sem", then
	 * we'll treat them as being in semaphore mode. */
	if (!strcmp(spec, "sem"))
		efd->flags |= EFD_SEMAPHORE;
	return c;
}

static struct walkqid *efd_walk(struct chan *c, struct chan *nc, char **name,
                                int nname)
{
	struct walkqid *wq;
	struct eventfd *efd = c->aux;

	wq = devwalk(c, nc, name, nname, efd_dir, ARRAY_SIZE(efd_dir), devgen);
	/* Walk is a source of a distinct chan from this device.  The other source
	 * is attach.  Once created, these chans will eventually be closed, and when
	 * they close, they will decref their aux, efd.  All chans within this
	 * *instance* of eventfd share the same efd.  Each one will have one refcnt.
	 * Each chan may also have several copies of its pointer out there (e.g. FD
	 * dup), all of which have their own *chan* refcnt.
	 *
	 * All of the above applies on successful walks that found all nname parts
	 * of the path.  A mid-success is wq: we got something.  wq->clone means we
	 * got to the end and the "big walk" considers this a success.
	 *
	 * There is a slight chance the new chan is the same as our original chan
	 * (if nc == c when we're called).  In which case, there's only one chan.
	 * The number of refs on efd == the number of distinct chans within this
	 * instance of #eventfd. */
	if (wq != NULL && wq->clone != NULL && wq->clone != c)
		kref_get(&efd->refcnt, 1);
	return wq;
}

/* In the future, we could use stat / wstat to get and set O_NONBLOCK */
static int efd_stat(struct chan *c, uint8_t *db, int n)
{
	return devstat(c, db, n, efd_dir, ARRAY_SIZE(efd_dir), devgen);
}

static struct chan *efd_open(struct chan *c, int omode)
{
	return devopen(c, omode, efd_dir, ARRAY_SIZE(efd_dir), devgen);
}

static void efd_close(struct chan *c)
{
	struct eventfd *efd = c->aux;
	/* Here's where we put the ref from attach and successful walks */
	kref_put(&efd->refcnt);
}

static void efd_fire_taps(struct eventfd *efd, int filter)
{
	struct fd_tap *tap_i;
	if (SLIST_EMPTY(&efd->fd_taps))
		return;
	/* We're not expecting many FD taps, so it's not worth splitting readers
	 * from writers or anything like that.
	 * TODO: (RCU) Locking to protect the list and the tap's existence. */
	spin_lock(&efd->tap_lock);
	SLIST_FOREACH(tap_i, &efd->fd_taps, link)
		fire_tap(tap_i, filter);
	spin_unlock(&efd->tap_lock);
}

static int has_counts(void *arg)
{
	struct eventfd *efd = arg;
	return atomic_read(&efd->counter) != 0;
}

/* The heart of reading an eventfd */
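/* In semaphore mode, a read consumes a single count and returns 1; otherwise
 * it drains the whole counter and returns the old value.  The CAS loop lets
 * readers race with writers without a lock: a lost CAS just re-reads the
 * counter and retries, and a zero counter means we sleep (or throw EAGAIN for
 * O_NONBLOCK chans). */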
static unsigned long efd_read_efd(struct eventfd *efd, struct chan *c)
{
	unsigned long old_count, new_count, ret;
	while (1) {
		old_count = atomic_read(&efd->counter);
		if (!old_count) {
			if (c->flag & O_NONBLOCK)
				error(EAGAIN, "Would block on #%s read", devname());
			rendez_sleep(&efd->rv_readers, has_counts, efd);
		} else {
			if (efd->flags & EFD_SEMAPHORE) {
				new_count = old_count - 1;
				ret = 1;
			} else {
				new_count = 0;
				ret = old_count;
			}
			if (atomic_cas(&efd->counter, old_count, new_count))
				goto success;
		}
	}
success:
	rendez_wakeup(&efd->rv_writers);
	efd_fire_taps(efd, FDTAP_FILT_WRITABLE);
	return ret;
}

static long efd_read(struct chan *c, void *ubuf, long n, int64_t offset)
{
	struct eventfd *efd = c->aux;

	switch (c->qid.path) {
		case Qdir:
			return devdirread(c, ubuf, n, efd_dir, ARRAY_SIZE(efd_dir),
			                  devgen);
		case Qctl:
			return readnum(offset, ubuf, n, efd->flags, NUMSIZE32);
		case Qefd:
			/* ignoring the chan offset for Qefd */
			return readnum(0, ubuf, n, efd_read_efd(efd, c),
			               NUMSIZE64);
		default:
			panic("Bad Qid %p!", c->qid.path);
	}
	return -1;
}

static int has_room(void *arg)
{
	struct eventfd *efd = arg;
	return atomic_read(&efd->counter) != EFD_MAX_VAL;
}

/* The heart of writing an eventfd */
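/* A write adds the parsed value to the counter.  If the sum would pass
 * EFD_MAX_VAL (or wrap around), we either fail with EAGAIN for O_NONBLOCK
 * chans or sleep until a reader makes room; otherwise we CAS in the new total
 * and wake any sleeping readers. */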
static void efd_write_efd(struct eventfd *efd, unsigned long add_to,
                          struct chan *c)
{
	unsigned long old_count, new_count;
	while (1) {
		old_count = atomic_read(&efd->counter);
		new_count = old_count + add_to;
		/* The second check catches u64 wraparound from a huge add_to */
		if (new_count > EFD_MAX_VAL || new_count < old_count) {
			if (c->flag & O_NONBLOCK)
				error(EAGAIN, "Would block on #%s write", devname());
			rendez_sleep(&efd->rv_writers, has_room, efd);
		} else {
			if (atomic_cas(&efd->counter, old_count, new_count))
				goto success;
		}
	}
success:
	rendez_wakeup(&efd->rv_readers);
	efd_fire_taps(efd, FDTAP_FILT_READABLE);
}

static long efd_write(struct chan *c, void *ubuf, long n, int64_t offset)
{
	struct eventfd *efd = c->aux;
	unsigned long write_val;
	char num64[NUMSIZE64];

	switch (c->qid.path) {
		case Qctl:
			/* If we want to allow runtime changing of settings, we can do it
			 * here. */
			error(EFAIL, "No #%s ctl commands supported", devname());
			break;
		case Qefd:
			/* We want to give strtoul a null-terminated buf (can't handle
			 * arbitrary user strings).  Ignoring the chan offset too.  Leave
			 * room in num64 for the trailing 0. */
			if (n >= sizeof(num64))
				error(EAGAIN, "attempted to write %ld chars, max %lu", n,
				      sizeof(num64) - 1);
			memcpy(num64, ubuf, n);
			num64[n] = 0;	/* enforce trailing 0 */
			write_val = strtoul(num64, 0, 0);
			if (write_val == (unsigned long)(-1))
				error(EFAIL, "Eventfd write must not be -1");
			efd_write_efd(efd, write_val, c);
			break;
		default:
			panic("Bad Qid %p!", c->qid.path);
	}
	return n;
}

static char *efd_chaninfo(struct chan *c, char *ret, size_t ret_l)
{
	struct eventfd *efd = c->aux;

	snprintf(ret, ret_l, "QID type %s, flags %p, counter %p",
	         efd_dir[c->qid.path].name, efd->flags, atomic_read(&efd->counter));
	return ret;
}

static int efd_tapfd(struct chan *c, struct fd_tap *tap, int cmd)
{
	struct eventfd *efd = c->aux;
	int ret;

	/* HANGUP, ERROR, and PRIORITY will never fire, but people can ask for them.
	 * We don't actually support HANGUP, but epoll implies it.  Linux's eventfd
	 * can have ERROR, so apps can ask for it.  Likewise, priority is
	 * meaningless for us, but sometimes people ask for it. */
	#define EFD_LEGAL_TAPS (FDTAP_FILT_READABLE | FDTAP_FILT_WRITABLE |        \
	                        FDTAP_FILT_HANGUP | FDTAP_FILT_PRIORITY |          \
	                        FDTAP_FILT_ERROR)

	switch (c->qid.path) {
		case Qefd:
			if (tap->filter & ~EFD_LEGAL_TAPS) {
				set_errno(ENOSYS);
				set_errstr("Unsupported #%s tap, must be %p, got %p", devname(),
				           EFD_LEGAL_TAPS, tap->filter);
				return -1;
			}
			spin_lock(&efd->tap_lock);
			switch (cmd) {
				case (FDTAP_CMD_ADD):
					SLIST_INSERT_HEAD(&efd->fd_taps, tap, link);
					ret = 0;
					break;
				case (FDTAP_CMD_REM):
					SLIST_REMOVE(&efd->fd_taps, tap, fd_tap, link);
					ret = 0;
					break;
				default:
					set_errno(ENOSYS);
					set_errstr("Unsupported #%s tap command %p",
					           devname(), cmd);
					ret = -1;
			}
			spin_unlock(&efd->tap_lock);
			return ret;
		default:
			set_errno(ENOSYS);
			set_errstr("Can't tap #%s file type %d", devname(),
			           c->qid.path);
			return -1;
	}
}

struct dev efd_devtab __devtab = {
	.name = "eventfd",
	.reset = devreset,
	.init = devinit,
	.shutdown = devshutdown,
	.attach = efd_attach,
	.walk = efd_walk,
	.stat = efd_stat,
	.open = efd_open,
	.create = devcreate,
	.close = efd_close,
	.read = efd_read,
	.bread = devbread,
	.write = efd_write,
	.bwrite = devbwrite,
	.remove = devremove,
	.wstat = devwstat,
	.power = devpower,
	.chaninfo = efd_chaninfo,
	.tapfd = efd_tapfd,
};