Diffstat (limited to 'fs/eventpoll.c')
 fs/eventpoll.c | 188 ++++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 132 insertions(+), 56 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index f9898e60dd8b..b22d6f819f78 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -218,6 +218,7 @@ struct eventpoll {
 	/* used to optimize loop detection check */
 	u64 gen;
 	struct hlist_head refs;
+	u8 loop_check_depth;
 
 	/*
 	 * usage count, used together with epitem->dying to
@@ -318,7 +319,7 @@ static void unlist_file(struct epitems_head *head)
 static long long_zero;
 static long long_max = LONG_MAX;
 
-static struct ctl_table epoll_table[] = {
+static const struct ctl_table epoll_table[] = {
 	{
 		.procname	= "max_user_watches",
 		.data		= &max_user_watches,
@@ -438,7 +439,7 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
  *
  * we must do our busy polling with irqs enabled
  */
-static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
+static bool ep_busy_loop(struct eventpoll *ep)
 {
 	unsigned int napi_id = READ_ONCE(ep->napi_id);
 	u16 budget = READ_ONCE(ep->busy_poll_budget);
@@ -447,8 +448,8 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
 	if (!budget)
 		budget = BUSY_POLL_BUDGET;
 
-	if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) {
-		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end,
+	if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) {
+		napi_busy_loop(napi_id, ep_busy_loop_end,
 			       ep, prefer_busy_poll, budget);
 		if (ep_events_available(ep))
 			return true;
@@ -492,7 +493,7 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 	 *  or
 	 * Nothing to do if we already have this ID
 	 */
-	if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
+	if (!napi_id_valid(napi_id) || napi_id == ep->napi_id)
 		return;
 
 	/* record NAPI ID for use in next busy poll */
@@ -546,7 +547,7 @@ static void ep_suspend_napi_irqs(struct eventpoll *ep)
 {
 	unsigned int napi_id = READ_ONCE(ep->napi_id);
 
-	if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll))
+	if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
 		napi_suspend_irqs(napi_id);
 }
 
@@ -554,13 +555,13 @@ static void ep_resume_napi_irqs(struct eventpoll *ep)
 {
 	unsigned int napi_id = READ_ONCE(ep->napi_id);
 
-	if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll))
+	if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
 		napi_resume_irqs(napi_id);
 }
 
 #else
 
-static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
+static inline bool ep_busy_loop(struct eventpoll *ep)
 {
 	return false;
 }
@@ -883,7 +884,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
 	kfree_rcu(epi, rcu);
 
 	percpu_counter_dec(&ep->user->epoll_watches);
-	return ep_refcount_dec_and_test(ep);
+	return true;
 }
 
 /*
@@ -891,14 +892,14 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
  */
 static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
 {
-	WARN_ON_ONCE(__ep_remove(ep, epi, false));
+	if (__ep_remove(ep, epi, false))
+		WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
 }
 
 static void ep_clear_and_put(struct eventpoll *ep)
 {
 	struct rb_node *rbp, *next;
 	struct epitem *epi;
-	bool dispose;
 
 	/* We need to release all tasks waiting for these file */
 	if (waitqueue_active(&ep->poll_wait))
@@ -931,10 +932,8 @@ static void ep_clear_and_put(struct eventpoll *ep)
 		cond_resched();
 	}
 
-	dispose = ep_refcount_dec_and_test(ep);
 	mutex_unlock(&ep->mtx);
-
-	if (dispose)
+	if (ep_refcount_dec_and_test(ep))
 		ep_free(ep);
 }
 
@@ -1137,7 +1136,7 @@ again:
 	dispose = __ep_remove(ep, epi, true);
 	mutex_unlock(&ep->mtx);
 
-	if (dispose)
+	if (dispose && ep_refcount_dec_and_test(ep))
 		ep_free(ep);
 	goto again;
 }
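The hunks above narrow `__ep_remove()` to reporting whether an item was removed and move the final `ep_refcount_dec_and_test()` into the callers, so the last reference (and the `ep_free()` it triggers) is dropped only after `ep->mtx` has been released. A minimal userspace sketch of that ordering, with illustrative names (`object_put` and friends are not eventpoll functions):

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct object {
	pthread_mutex_t mtx;
	atomic_int refcount;
};

/* Returns nonzero when the caller has just dropped the final reference. */
static int object_put(struct object *o)
{
	return atomic_fetch_sub(&o->refcount, 1) == 1;
}

static void object_clear_and_put(struct object *o)
{
	pthread_mutex_lock(&o->mtx);
	/* ... tear down items under the lock, as ep_clear_and_put() does ... */
	pthread_mutex_unlock(&o->mtx);

	/*
	 * Drop the reference only after unlocking: freeing the object while
	 * o->mtx is still held would destroy a locked mutex, which is the
	 * ordering hazard the reshuffled eventpoll code avoids.
	 */
	if (object_put(o)) {
		pthread_mutex_destroy(&o->mtx);
		free(o);
	}
}

int main(void)
{
	struct object *o = malloc(sizeof(*o));

	pthread_mutex_init(&o->mtx, NULL);
	atomic_init(&o->refcount, 1);
	object_clear_and_put(o);
	return 0;
}
```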
@@ -1980,6 +1979,30 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
 	return ret;
 }
 
+static int ep_try_send_events(struct eventpoll *ep,
+			      struct epoll_event __user *events, int maxevents)
+{
+	int res;
+
+	/*
+	 * Try to transfer events to user space. In case we get 0 events and
+	 * there's still timeout left over, we go trying again in search of
+	 * more luck.
+	 */
+	res = ep_send_events(ep, events, maxevents);
+	if (res > 0)
+		ep_suspend_napi_irqs(ep);
+	return res;
+}
+
+static int ep_schedule_timeout(ktime_t *to)
+{
+	if (to)
+		return ktime_after(*to, ktime_get());
+	else
+		return 1;
+}
+
 /**
  * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
  *           event buffer.
@@ -2031,23 +2054,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 
 	while (1) {
 		if (eavail) {
-			/*
-			 * Try to transfer events to user space. In case we get
-			 * 0 events and there's still timeout left over, we go
-			 * trying again in search of more luck.
-			 */
-			res = ep_send_events(ep, events, maxevents);
-			if (res) {
-				if (res > 0)
-					ep_suspend_napi_irqs(ep);
+			res = ep_try_send_events(ep, events, maxevents);
+			if (res)
 				return res;
-			}
 		}
 
 		if (timed_out)
 			return 0;
 
-		eavail = ep_busy_loop(ep, timed_out);
+		eavail = ep_busy_loop(ep);
 		if (eavail)
 			continue;
 
@@ -2096,8 +2111,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		write_unlock_irq(&ep->lock);
 
 		if (!eavail)
-			timed_out = !schedule_hrtimeout_range(to, slack,
-							      HRTIMER_MODE_ABS);
+			timed_out = !ep_schedule_timeout(to) ||
+				!schedule_hrtimeout_range(to, slack,
+							  HRTIMER_MODE_ABS);
 		__set_current_state(TASK_RUNNING);
 
 		/*
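`ep_schedule_timeout()` lets `ep_poll()` confirm that an absolute deadline still lies in the future before calling `schedule_hrtimeout_range()`, so an already expired timeout is reported as timed out instead of being slept on. A userspace sketch of the same check-before-blocking pattern, assuming illustrative helper names and omitting the `to == NULL` wait-forever case:

```c
#define _POSIX_C_SOURCE 200112L
#include <stdbool.h>
#include <time.h>

/* True while an absolute CLOCK_MONOTONIC deadline lies in the future. */
bool deadline_in_future(const struct timespec *deadline)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	if (now.tv_sec != deadline->tv_sec)
		return now.tv_sec < deadline->tv_sec;
	return now.tv_nsec < deadline->tv_nsec;
}

/*
 * Sleep until the deadline, mirroring "!ep_schedule_timeout(to) ||
 * !schedule_hrtimeout_range(...)": if the deadline has already passed,
 * report the timeout without ever blocking.
 */
bool wait_deadline_timed_out(const struct timespec *deadline)
{
	if (!deadline_in_future(deadline))
		return true;	/* expired before we could block */
	while (clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
			       deadline, NULL) != 0)
		;		/* interrupted by a signal: retry */
	return true;
}
```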
@@ -2125,23 +2141,24 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 }
 
 /**
- * ep_loop_check_proc - verify that adding an epoll file inside another
- *                      epoll structure does not violate the constraints, in
- *                      terms of closed loops, or too deep chains (which can
- *                      result in excessive stack usage).
+ * ep_loop_check_proc - verify that adding an epoll file @ep inside another
+ *                      epoll file does not create closed loops, and
+ *                      determine the depth of the subtree starting at @ep
  *
  * @ep: the &struct eventpoll to be currently checked.
 * @depth: Current depth of the path being checked.
 *
- * Return: %zero if adding the epoll @file inside current epoll
- * structure @ep does not violate the constraints, or %-1 otherwise.
+ * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep.
  */
 static int ep_loop_check_proc(struct eventpoll *ep, int depth)
 {
-	int error = 0;
+	int result = 0;
 	struct rb_node *rbp;
 	struct epitem *epi;
 
+	if (ep->gen == loop_check_gen)
+		return ep->loop_check_depth;
+
 	mutex_lock_nested(&ep->mtx, depth + 1);
 	ep->gen = loop_check_gen;
 	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
@@ -2149,13 +2166,11 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
 		if (unlikely(is_file_epoll(epi->ffd.file))) {
 			struct eventpoll *ep_tovisit;
 			ep_tovisit = epi->ffd.file->private_data;
-			if (ep_tovisit->gen == loop_check_gen)
-				continue;
 			if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
-				error = -1;
+				result = INT_MAX;
 			else
-				error = ep_loop_check_proc(ep_tovisit, depth + 1);
-			if (error != 0)
+				result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
+			if (result > EP_MAX_NESTS)
 				break;
 		} else {
 			/*
@@ -2169,9 +2184,25 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
 				list_file(epi->ffd.file);
 		}
 	}
+	ep->loop_check_depth = result;
 	mutex_unlock(&ep->mtx);
 
-	return error;
+	return result;
+}
+
+/* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */
+static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth)
+{
+	int result = 0;
+	struct epitem *epi;
+
+	if (ep->gen == loop_check_gen)
+		return ep->loop_check_depth;
+	hlist_for_each_entry_rcu(epi, &ep->refs, fllink)
+		result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1);
+	ep->gen = loop_check_gen;
+	ep->loop_check_depth = result;
+	return result;
 }
 
 /**
@@ -2187,8 +2218,22 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
  */
 static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
 {
+	int depth, upwards_depth;
+
 	inserting_into = ep;
-	return ep_loop_check_proc(to, 0);
+	/*
+	 * Check how deep down we can get from @to, and whether it is possible
+	 * to loop up to @ep.
+	 */
+	depth = ep_loop_check_proc(to, 0);
+	if (depth > EP_MAX_NESTS)
+		return -1;
+	/* Check how far up we can go from @ep. */
+	rcu_read_lock();
+	upwards_depth = ep_get_upwards_depth_proc(ep, 0);
+	rcu_read_unlock();
+
+	return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0;
 }
 
 static void clear_tfile_check_list(void)
@@ -2445,6 +2490,47 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	return do_epoll_ctl(epfd, op, fd, &epds, false);
 }
 
+static int ep_check_params(struct file *file, struct epoll_event __user *evs,
+			   int maxevents)
+{
+	/* The maximum number of event must be greater than zero */
+	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
+		return -EINVAL;
+
+	/* Verify that the area passed by the user is writeable */
+	if (!access_ok(evs, maxevents * sizeof(struct epoll_event)))
+		return -EFAULT;
+
+	/*
+	 * We have to check that the file structure underneath the fd
+	 * the user passed to us _is_ an eventpoll file.
+	 */
+	if (!is_file_epoll(file))
+		return -EINVAL;
+
+	return 0;
+}
+
+int epoll_sendevents(struct file *file, struct epoll_event __user *events,
+		     int maxevents)
+{
+	struct eventpoll *ep;
+	int ret;
+
+	ret = ep_check_params(file, events, maxevents);
+	if (unlikely(ret))
+		return ret;
+
+	ep = file->private_data;
+	/*
+	 * Racy call, but that's ok - it should get retried based on
+	 * poll readiness anyway.
+	 */
+	if (ep_events_available(ep))
+		return ep_try_send_events(ep, events, maxevents);
+	return 0;
+}
+
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_wait(2).
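Two things change above: `ep_loop_check()` now measures the chain both downward from @to and upward from @ep (caching results per generation in `loop_check_depth`), still rejecting anything deeper than `EP_MAX_NESTS`, which `do_epoll_ctl()` surfaces as `ELOOP`; and the new `epoll_sendevents()` takes a `struct file` directly and skips the fd lookup, which suggests an in-kernel caller that already holds the file (an inference from the signature, not stated in this diff). A small userspace probe of the nesting limit; the exact depth at which `EPOLL_CTL_ADD` starts failing is an implementation detail, not ABI:

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	int inner = epoll_create1(0);
	int depth = 1;

	for (;;) {
		struct epoll_event ev = { .events = EPOLLIN };
		int outer = epoll_create1(0);

		/* Watch the current chain head from a new epoll instance. */
		ev.data.fd = inner;
		if (epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev) < 0) {
			printf("ADD failed at depth %d: %s\n",
			       depth, strerror(errno)); /* expect ELOOP */
			close(outer);
			return 0;
		}
		inner = outer;	/* the new instance becomes the chain head */
		depth++;
	}
}
```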
@@ -2453,26 +2539,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
 			 int maxevents, struct timespec64 *to)
 {
 	struct eventpoll *ep;
-
-	/* The maximum number of event must be greater than zero */
-	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
-		return -EINVAL;
-
-	/* Verify that the area passed by the user is writeable */
-	if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
-		return -EFAULT;
+	int ret;
 
 	/* Get the "struct file *" for the eventpoll file */
 	CLASS(fd, f)(epfd);
 	if (fd_empty(f))
 		return -EBADF;
 
-	/*
-	 * We have to check that the file structure underneath the fd
-	 * the user passed to us _is_ an eventpoll file.
-	 */
-	if (!is_file_epoll(fd_file(f)))
-		return -EINVAL;
+	ret = ep_check_params(fd_file(f), events, maxevents);
+	if (unlikely(ret))
+		return ret;
 
 	/*
 	 * At this point it is safe to assume that the "private_data" contains
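`ep_check_params()` now centralizes the validation that `do_epoll_wait()` used to do inline, so `epoll_sendevents()` gets identical checks. From user space the three cases are visible as the classic epoll_wait(2) error cases (errno values per the man page; the bad-buffer address is picked so the kernel's `access_ok()` range check fails):

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <unistd.h>

static void expect(const char *what, int ret)
{
	printf("%-12s -> ret=%d errno=%s\n", what, ret,
	       ret < 0 ? strerror(errno) : "none");
}

int main(void)
{
	struct epoll_event evs[1];
	int epfd = epoll_create1(0);

	/* maxevents must be greater than zero: EINVAL. */
	expect("maxevents=0", epoll_wait(epfd, evs, 0, 0));
	/* The fd must refer to an epoll instance: EINVAL. */
	expect("non-epoll fd", epoll_wait(STDIN_FILENO, evs, 1, 0));
	/* The event buffer must be writable user memory: EFAULT. */
	expect("bad buffer",
	       epoll_wait(epfd, (struct epoll_event *)-1L, 1, 0));

	close(epfd);
	return 0;
}
```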