/* Copyright (c) 2015 Google Inc.
 * Barret Rhoden <brho@cs.berkeley.edu>
 * See LICENSE for details.
 *
 * Epoll, built on FD taps, CEQs, and blocking uthreads on event queues.
 *
 * TODO: There are a few incompatibilities with Linux's epoll, some of which
 * are artifacts of the implementation, and other issues:
 * - you can't epoll on an epoll fd (or any user fd).  you can only epoll on a
 *   kernel FD that accepts your FD taps.
 * - there's no EPOLLONESHOT or level-triggered support.
 * - you can only tap one FD at a time, so you can't add the same FD to
 *   multiple epoll sets.
 * - there is no support for growing the epoll set.
 * - closing the epoll is a little dangerous, if there are outstanding INDIR
 *   events.  this will only pop up if you're yielding cores, maybe getting
 *   preempted, and are unlucky.
 * - epoll_create1 does not support CLOEXEC.  That'd need some work in glibc's
 *   exec and flags in struct user_fd.
 * - EPOLL_CTL_MOD is just a DEL then an ADD.  There might be races associated
 *   with that.
 * - If you close a tracked FD without removing it from the epoll set, the
 *   kernel will turn off the FD tap.  You may still have an epoll event that
 *   was concurrently sent.  Likewise, that FD may be used again by your
 *   program, and if you add *that* one to another epoll set before removing
 *   it from the current one, weird things may happen (like having two epoll
 *   ctlrs turning on and off taps).
 * - epoll_pwait is probably racy.
 * - You can't dup an epoll fd (same as other user FDs).
 * - If you add a BSD socket FD to an epoll set before calling listen(),
 *   you'll only epoll on the data (which is inactive) instead of on the
 *   accept().
 * - If you add the same BSD socket listener to multiple epoll sets, you will
 *   likely fail.  This is in addition to being able to tap only one FD at a
 *   time. */
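
/* Usage sketch (illustrative only; guarded out of the build).  Because
 * level-triggered mode is not supported, every FD added to an epoll set must
 * request EPOLLET, and callers should drain ready FDs before waiting again.
 * sock_fd and handle_ready_fd() below are placeholders, not part of this
 * file. */
#if 0
static void example_epoll_usage(int sock_fd)
{
	struct epoll_event ev, got[8];
	int epfd = epoll_create(1);

	ev.events = EPOLLIN | EPOLLET;	/* ET is mandatory on Akaros */
	ev.data.fd = sock_fd;
	epoll_ctl(epfd, EPOLL_CTL_ADD, sock_fd, &ev);
	for (;;) {
		int nr = epoll_wait(epfd, got, 8, -1);

		for (int i = 0; i < nr; i++)
			handle_ready_fd(got[i].data.fd);	/* placeholder */
	}
}
#endif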
#include <sys/epoll.h>
#include <parlib/parlib.h>
#include <parlib/event.h>
#include <parlib/ceq.h>
#include <parlib/uthread.h>
#include <parlib/timing.h>
#include <sys/user_fd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include <signal.h>
/* Sanity check, so we can ID our own FDs */
#define EPOLL_UFD_MAGIC 0xe9011

struct epoll_ctlr {
	struct event_queue		*ceq_evq;
	struct ceq			*ceq;	/* convenience pointer */
	unsigned int			size;
	uth_mutex_t			mtx;
	struct user_fd			ufd;
};

/* There's some bookkeeping we need to maintain on every FD.  Right now, the FD
 * is the index into the CEQ event array, so we can just hook this into the
 * user data blob in the ceq_event.
 *
 * If we ever do not maintain a 1:1 mapping from FDs to CEQ IDs, we can use
 * this to track the CEQ ID and FD. */
struct ep_fd_data {
	struct epoll_event		ep_event;
	int				fd;
	int				filter;
	int				sock_listen_fd;
};
/* Converts epoll events to FD taps. */
static int ep_events_to_taps(uint32_t ep_ev)
{
	int taps = 0;

	if (ep_ev & EPOLLIN)
		taps |= FDTAP_FILT_READABLE;
	if (ep_ev & EPOLLOUT)
		taps |= FDTAP_FILT_WRITABLE;
	if (ep_ev & EPOLLRDHUP)
		taps |= FDTAP_FILT_RDHUP;
	if (ep_ev & EPOLLPRI)
		taps |= FDTAP_FILT_PRIORITY;
	if (ep_ev & EPOLLERR)
		taps |= FDTAP_FILT_ERROR;
	if (ep_ev & EPOLLHUP)
		taps |= FDTAP_FILT_HANGUP;
	return taps;
}
/* Converts corresponding FD Taps to epoll events.  There are other taps that
 * do not make sense for epoll. */
static uint32_t taps_to_ep_events(int taps)
{
	uint32_t ep_ev = 0;

	if (taps & FDTAP_FILT_READABLE)
		ep_ev |= EPOLLIN;
	if (taps & FDTAP_FILT_WRITABLE)
		ep_ev |= EPOLLOUT;
	if (taps & FDTAP_FILT_RDHUP)
		ep_ev |= EPOLLRDHUP;
	if (taps & FDTAP_FILT_PRIORITY)
		ep_ev |= EPOLLPRI;
	if (taps & FDTAP_FILT_ERROR)
		ep_ev |= EPOLLERR;
	if (taps & FDTAP_FILT_HANGUP)
		ep_ev |= EPOLLHUP;
	return ep_ev;
}
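
/* Illustrative sanity check (not compiled): the two converters round-trip for
 * the six supported event flags, while flags with no tap equivalent (such as
 * EPOLLET and EPOLLONESHOT) are simply dropped by ep_events_to_taps(). */
#if 0
static void ep_conversion_self_check(void)
{
	uint32_t evs = EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLPRI | EPOLLERR |
	               EPOLLHUP;

	assert(taps_to_ep_events(ep_events_to_taps(evs)) == evs);
	assert(ep_events_to_taps(EPOLLET) == 0);
}
#endif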
static struct ceq_event *ep_get_ceq_ev(struct epoll_ctlr *ep, size_t idx)
{
	if (ep->ceq_evq->ev_mbox->ceq.nr_events <= idx)
		return 0;
	return &ep->ceq_evq->ev_mbox->ceq.events[idx];
}
static struct epoll_ctlr *fd_to_cltr(int fd)
{
	struct user_fd *ufd = ufd_lookup(fd);

	if (!ufd)
		return 0;
	if (ufd->magic != EPOLL_UFD_MAGIC) {
		errno = EBADF;
		return 0;
	}
	return container_of(ufd, struct epoll_ctlr, ufd);
}
/* Event queue helpers: */
static struct event_queue *ep_get_ceq_evq(unsigned int ceq_size)
{
	struct event_queue *ceq_evq = get_eventq_raw();

	ceq_evq->ev_mbox->type = EV_MBOX_CEQ;
	ceq_init(&ceq_evq->ev_mbox->ceq, CEQ_OR, ceq_size, ceq_size);
	ceq_evq->ev_flags = EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP;
	evq_attach_wakeup_ctlr(ceq_evq);
	return ceq_evq;
}
static struct event_queue *ep_get_alarm_evq(void)
{
	/* Don't care about the actual message, just using it for a wakeup */
	struct event_queue *alarm_evq = get_eventq(EV_MBOX_BITMAP);

	alarm_evq->ev_flags = EVENT_INDIR | EVENT_SPAM_INDIR | EVENT_WAKEUP;
	evq_attach_wakeup_ctlr(alarm_evq);
	return alarm_evq;
}
/* Once we've closed our sources of events, we can try to clean up the event
 * queues.  These are actually dangerous, since there could be INDIRs floating
 * around for these evqs still, which are basically pointers.  We'll need to
 * run some sort of user deferred destruction (TODO). */
static void ep_put_ceq_evq(struct event_queue *ceq_evq)
{
#if 0 /* TODO: EVQ/INDIR Cleanup */
	ceq_cleanup(&ceq_evq->ev_mbox->ceq);
	evq_remove_wakeup_ctlr(ceq_evq);
	put_eventq_raw(ceq_evq);
#endif
}
static void ep_put_alarm_evq(struct event_queue *alarm_evq)
{
#if 0 /* TODO: EVQ/INDIR Cleanup */
	evq_remove_wakeup_ctlr(alarm_evq);
	put_eventq(alarm_evq);
#endif
}
static void epoll_close(struct user_fd *ufd)
{
	struct epoll_ctlr *ep = container_of(ufd, struct epoll_ctlr, ufd);
	struct fd_tap_req *tap_reqs, *tap_req_i;
	struct ceq_event *ceq_ev_i;
	struct ep_fd_data *ep_fd_i;
	int nr_tap_req = 0;
	int nr_done = 0;

	tap_reqs = malloc(sizeof(struct fd_tap_req) * ep->size);
	memset(tap_reqs, 0, sizeof(struct fd_tap_req) * ep->size);
	/* Slightly painful, O(n) with no escape hatch */
	for (int i = 0; i < ep->size; i++) {
		ceq_ev_i = ep_get_ceq_ev(ep, i);
		/* CEQ should have been big enough for our size */
		assert(ceq_ev_i);
		ep_fd_i = (struct ep_fd_data*)ceq_ev_i->user_data;
		if (!ep_fd_i)
			continue;
		if (ep_fd_i->sock_listen_fd >= 0) {
			/* This tap is using a listen_fd, opened by __epoll_ctl_add, so
			 * the user doesn't know about this FD.  We need to remove the tap
			 * and close the FD; the kernel will remove the tap when we close
			 * it. */
			close(ep_fd_i->sock_listen_fd);
			free(ep_fd_i);
			continue;
		}
		tap_req_i = &tap_reqs[nr_tap_req++];
		tap_req_i->fd = i;
		tap_req_i->cmd = FDTAP_CMD_REM;
		free(ep_fd_i);
	}
	/* Requests could fail if the tapped files are already closed.  We need to
	 * skip the failed one (the +1) and untap the rest. */
	do {
		nr_done += sys_tap_fds(tap_reqs + nr_done, nr_tap_req - nr_done);
		nr_done += 1;	/* nr_done could be more than nr_tap_req now */
	} while (nr_done < nr_tap_req);
	free(tap_reqs);
	ep_put_ceq_evq(ep->ceq_evq);
	uth_mutex_free(ep->mtx);
	free(ep);
}
static int init_ep_ctlr(struct epoll_ctlr *ep, int size)
{
	unsigned int ceq_size = ROUNDUPPWR2(size);

	/* TODO: we don't grow yet.  Until then, we help out a little. */
	if (size == 1)
		ceq_size = 128;
	ep->size = ceq_size;
	ep->mtx = uth_mutex_alloc();
	ep->ufd.magic = EPOLL_UFD_MAGIC;
	ep->ufd.close = epoll_close;
	ep->ceq_evq = ep_get_ceq_evq(ceq_size);
	return 0;
}
int epoll_create(int size)
{
	int fd;
	struct epoll_ctlr *ep;

	/* good thing the arg is a signed int... */
	if (size < 0) {
		errno = EINVAL;
		return -1;
	}
	ep = malloc(sizeof(struct epoll_ctlr));
	memset(ep, 0, sizeof(struct epoll_ctlr));
	if (init_ep_ctlr(ep, size)) {
		free(ep);
		return -1;
	}
	fd = ufd_get_fd(&ep->ufd);
	if (fd < 0)
		free(ep);
	return fd;
}
int epoll_create1(int flags)
{
	/* TODO: we're supposed to support CLOEXEC.  Our FD is a user_fd, so that'd
	 * require some support in glibc's exec to close our epoll ctlr. */
	return epoll_create(1);
}
static int __epoll_ctl_add(struct epoll_ctlr *ep, int fd,
                           struct epoll_event *event)
{
	struct ceq_event *ceq_ev;
	struct ep_fd_data *ep_fd;
	struct fd_tap_req tap_req = {0};
	int ret, filter, sock_listen_fd;

	/* Only support ET.  Also, we just ignore EPOLLONESHOT.  That might work,
	 * logically, just with spurious events firing. */
	if (!(event->events & EPOLLET)) {
		errno = EPERM;
		werrstr("Epoll level-triggered not supported");
		return -1;
	}
	/* The sockets-to-plan9 networking shims are a bit inconvenient.  The user
	 * asked us to epoll on an FD, but that FD is actually a Qdata FD.  We need
	 * to actually epoll on the listen_fd.  We'll store this in the ep_fd, so
	 * that later on we can close it.
	 *
	 * As far as tracking the FD goes for epoll_wait() reporting, if the app
	 * wants to track the FD they think we are using, then they already passed
	 * that in event->data.
	 *
	 * But before we get too far, we need to make sure we aren't already
	 * tapping this FD's listener (hence the lookup).
	 *
	 * This all assumes that this socket is only added to one epoll set at a
	 * time.  The _sock calls are racy, and once one epoller set up a listen_fd
	 * in the Rock, we'll think that it was us. */
	extern int _sock_lookup_listen_fd(int sock_fd);	/* in glibc */
	extern int _sock_get_listen_fd(int sock_fd);
	if (_sock_lookup_listen_fd(fd) >= 0) {
		errno = EEXIST;
		return -1;
	}
	sock_listen_fd = _sock_get_listen_fd(fd);
	if (sock_listen_fd >= 0)
		fd = sock_listen_fd;
	ceq_ev = ep_get_ceq_ev(ep, fd);
	if (!ceq_ev) {
		errno = ENOMEM;
		werrstr("Epoll set cannot grow yet!");
		return -1;
	}
	ep_fd = (struct ep_fd_data*)ceq_ev->user_data;
	if (ep_fd) {
		errno = EEXIST;
		return -1;
	}
	tap_req.fd = fd;
	tap_req.cmd = FDTAP_CMD_ADD;
	/* EPOLLHUP is implicitly set for all epolls. */
	filter = ep_events_to_taps(event->events | EPOLLHUP);
	tap_req.filter = filter;
	tap_req.ev_q = ep->ceq_evq;
	tap_req.ev_id = fd;	/* using FD as the CEQ ID */
	ret = sys_tap_fds(&tap_req, 1);
	if (ret != 1)
		return -1;
	ep_fd = malloc(sizeof(struct ep_fd_data));
	ep_fd->fd = fd;
	ep_fd->filter = filter;
	ep_fd->ep_event = *event;
	ep_fd->ep_event.events |= EPOLLHUP;
	ep_fd->sock_listen_fd = sock_listen_fd;
	ceq_ev->user_data = (uint64_t)ep_fd;
	return 0;
}
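
/* Illustrative sketch (not compiled): per the file header, a BSD socket must
 * be listen()ed on *before* it is added to an epoll set, so that the Rock has
 * a listen_fd for the code above to tap.  epfd and addr below are assumed to
 * be set up elsewhere. */
#if 0
static void example_epoll_listener(int epfd, struct sockaddr_in *addr)
{
	int srv_fd = socket(AF_INET, SOCK_STREAM, 0);
	struct epoll_event ev = { .events = EPOLLIN | EPOLLET };

	bind(srv_fd, (struct sockaddr*)addr, sizeof(*addr));
	listen(srv_fd, 5);
	ev.data.fd = srv_fd;
	epoll_ctl(epfd, EPOLL_CTL_ADD, srv_fd, &ev);
	/* An EPOLLIN event on srv_fd now means accept() will not block. */
}
#endif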
static int __epoll_ctl_del(struct epoll_ctlr *ep, int fd,
                           struct epoll_event *event)
{
	struct ceq_event *ceq_ev;
	struct ep_fd_data *ep_fd;
	struct fd_tap_req tap_req = {0};
	int sock_listen_fd;

	/* They could be asking to clear an epoll for a listener.  We need to
	 * remove the tap for the real FD we tapped */
	extern int _sock_lookup_listen_fd(int sock_fd);	/* in glibc */
	sock_listen_fd = _sock_lookup_listen_fd(fd);
	if (sock_listen_fd >= 0)
		fd = sock_listen_fd;
	ceq_ev = ep_get_ceq_ev(ep, fd);
	if (!ceq_ev) {
		errno = ENOENT;
		return -1;
	}
	ep_fd = (struct ep_fd_data*)ceq_ev->user_data;
	if (!ep_fd) {
		errno = ENOENT;
		return -1;
	}
	assert(ep_fd->fd == fd);
	tap_req.fd = fd;
	tap_req.cmd = FDTAP_CMD_REM;
	/* Ignoring the return value; we could have failed to remove it if the FD
	 * has already closed and the kernel removed the tap. */
	sys_tap_fds(&tap_req, 1);
	ceq_ev->user_data = 0;
	if (ep_fd->sock_listen_fd >= 0) {
		assert(ep_fd->sock_listen_fd == sock_listen_fd);
		close(ep_fd->sock_listen_fd);
	}
	free(ep_fd);
	return 0;
}
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event)
{
	int ret;
	struct epoll_ctlr *ep = fd_to_cltr(epfd);

	if (!ep) {
		errno = EBADF;	/* or EINVAL */
		return -1;
	}
	if (fd >= USER_FD_BASE) {
		errno = EINVAL;
		werrstr("Epoll can't track User FDs");
		return -1;
	}
	uth_mutex_lock(ep->mtx);
	switch (op) {
		case (EPOLL_CTL_MOD):
			/* In lieu of a proper MOD, just remove and readd.  The errors
			 * might not work out well, and there could be a missed event in
			 * the middle.  Not sure what the guarantees are, but we can fake
			 * a poke. (TODO) */
			ret = __epoll_ctl_del(ep, fd, 0);
			if (ret)
				break;
			ret = __epoll_ctl_add(ep, fd, event);
			break;
		case (EPOLL_CTL_ADD):
			ret = __epoll_ctl_add(ep, fd, event);
			break;
		case (EPOLL_CTL_DEL):
			ret = __epoll_ctl_del(ep, fd, event);
			break;
		default:
			errno = EINVAL;
			ret = -1;
	}
	uth_mutex_unlock(ep->mtx);
	return ret;
}
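
/* Illustrative sketch (not compiled): because EPOLL_CTL_MOD is DEL + ADD, an
 * edge that fires between the two calls is lost.  A cautious caller treats a
 * MOD like a fresh ADD and re-checks the FD's actual readiness instead of
 * waiting for the next edge.  epfd, fd, new_ev, buf, ret, and handle_data()
 * are placeholders. */
#if 0
	epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &new_ev);
	/* Don't wait for an edge we may have just missed; drain the FD now. */
	while ((ret = read(fd, buf, sizeof(buf))) > 0)
		handle_data(buf, ret);
#endif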
static bool get_ep_event_from_msg(struct epoll_ctlr *ep, struct event_msg *msg,
                                  struct epoll_event *ep_ev)
{
	struct ceq_event *ceq_ev;
	struct ep_fd_data *ep_fd;

	ceq_ev = ep_get_ceq_ev(ep, msg->ev_type);
	/* We should never get a tap FD > size of the epoll set */
	assert(ceq_ev);
	ep_fd = (struct ep_fd_data*)ceq_ev->user_data;
	if (!ep_fd) {
		/* It's possible the FD was unregistered and this was an old event
		 * sent to this epoll set. */
		return FALSE;
	}
	ep_ev->data = ep_fd->ep_event.data;
	ep_ev->events = taps_to_ep_events(msg->ev_arg2);
	return TRUE;
}
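
/* Worked example of the translation above (illustrative): a tap on FD 7
 * firing for readability arrives as msg->ev_type == 7 with
 * FDTAP_FILT_READABLE set in msg->ev_arg2, and is reported to the caller as
 * an epoll_event with events == EPOLLIN and whatever data the caller
 * registered for FD 7. */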
/* We should be able to have multiple waiters.  ep shouldn't be closed or
 * anything, since we have the FD (that'd be bad programming on the user's
 * behalf).  We could have concurrent ADD/MOD/DEL operations (which lock). */
static int __epoll_wait(struct epoll_ctlr *ep, struct epoll_event *events,
                        int maxevents, int timeout)
{
	struct event_msg msg = {0};
	struct event_msg dummy_msg;
	struct event_queue *which_evq;
	struct event_queue *alarm_evq;
	struct syscall sysc;
	int nr_ret = 0;
	int recurse_ret;

	/* Locking to protect get_ep_event_from_msg, specifically that the ep_fd
	 * stored at ceq_ev->user_data does not get concurrently removed and
	 * freed. */
	uth_mutex_lock(ep->mtx);
	for (int i = 0; i < maxevents; i++) {
		if (uth_check_evqs(&msg, &which_evq, 1, ep->ceq_evq)) {
			if (get_ep_event_from_msg(ep, &msg, &events[i]))
				nr_ret++;
		}
	}
	uth_mutex_unlock(ep->mtx);
	if (nr_ret)
		return nr_ret;
	if (timeout == 0)
		return 0;
	if (timeout != -1) {
		alarm_evq = ep_get_alarm_evq();
		syscall_async(&sysc, SYS_block, timeout * 1000);
		if (!register_evq(&sysc, alarm_evq)) {
			/* timeout occurred before we could even block! */
			ep_put_alarm_evq(alarm_evq);
			return 0;
		}
		uth_blockon_evqs(&msg, &which_evq, 2, ep->ceq_evq, alarm_evq);
		if (which_evq != alarm_evq) {
			/* sysc may or may not have finished yet.  This will force it to
			 * *start* to finish iff it is still a submitted syscall. */
			sys_abort_sysc(&sysc);
			/* But we still need to wait until the syscall completed.  Need a
			 * dummy msg, since we don't want to clobber the real msg. */
			uth_blockon_evqs(&dummy_msg, 0, 1, alarm_evq);
		}
		/* TODO: Slightly dangerous, due to spammed INDIRs */
		ep_put_alarm_evq(alarm_evq);
		if (which_evq == alarm_evq)
			return 0;
	} else {
		uth_blockon_evqs(&msg, &which_evq, 1, ep->ceq_evq);
	}
	uth_mutex_lock(ep->mtx);
	if (get_ep_event_from_msg(ep, &msg, &events[0]))
		nr_ret++;
	uth_mutex_unlock(ep->mtx);
	/* We might not have gotten one yet.  And regardless, there might be more
	 * available.  Let's try again, with timeout == 0 to ensure no blocking.
	 * We use nr_ret (0 or 1 now) to adjust maxevents and events accordingly. */
	recurse_ret = __epoll_wait(ep, events + nr_ret, maxevents - nr_ret, 0);
	if (recurse_ret > 0)
		nr_ret += recurse_ret;
	return nr_ret;
}
int epoll_wait(int epfd, struct epoll_event *events, int maxevents,
               int timeout)
{
	int ret;
	struct epoll_ctlr *ep = fd_to_cltr(epfd);

	if (!ep) {
		errno = EBADF;	/* or EINVAL */
		return -1;
	}
	if (maxevents <= 0) {
		errno = EINVAL;
		return -1;
	}
	ret = __epoll_wait(ep, events, maxevents, timeout);
	return ret;
}
int epoll_pwait(int epfd, struct epoll_event *events, int maxevents,
                int timeout, const sigset_t *sigmask)
{
	int ready;
	sigset_t origmask;

	/* TODO: this is probably racy */
	sigprocmask(SIG_SETMASK, sigmask, &origmask);
	ready = epoll_wait(epfd, events, maxevents, timeout);
	sigprocmask(SIG_SETMASK, &origmask, NULL);
	return ready;
}