diff options
Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r-- | fs/eventpoll.c | 251 |
1 files changed, 193 insertions, 58 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index cde5a15b129f..075fee4ba29b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -37,6 +37,7 @@ #include <linux/seq_file.h> #include <linux/compat.h> #include <linux/rculist.h> +#include <linux/capability.h> #include <net/busy_poll.h> /* @@ -206,7 +207,7 @@ struct eventpoll { */ struct epitem *ovflist; - /* wakeup_source used when ep_scan_ready_list is running */ + /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */ struct wakeup_source *ws; /* The user that created the eventpoll descriptor */ @@ -217,6 +218,7 @@ struct eventpoll { /* used to optimize loop detection check */ u64 gen; struct hlist_head refs; + u8 loop_check_depth; /* * usage count, used together with epitem->dying to @@ -227,6 +229,11 @@ struct eventpoll { #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; + /* busy poll timeout */ + u32 busy_poll_usecs; + /* busy poll packet budget */ + u16 busy_poll_budget; + bool prefer_busy_poll; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -256,10 +263,10 @@ static u64 loop_check_gen = 0; static struct eventpoll *inserting_into; /* Slab cache used to allocate "struct epitem" */ -static struct kmem_cache *epi_cache __read_mostly; +static struct kmem_cache *epi_cache __ro_after_init; /* Slab cache used to allocate "struct eppoll_entry" */ -static struct kmem_cache *pwq_cache __read_mostly; +static struct kmem_cache *pwq_cache __ro_after_init; /* * List of files with newly added links, where we may need to limit the number @@ -271,7 +278,7 @@ struct epitems_head { }; static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; -static struct kmem_cache *ephead_cache __read_mostly; +static struct kmem_cache *ephead_cache __ro_after_init; static inline void free_ephead(struct epitems_head *head) { @@ -322,7 +329,6 @@ static struct ctl_table epoll_table[] = { .extra1 = &long_zero, .extra2 = &long_max, }, - { } }; static void __init epoll_sysctls_init(void) @@ -388,11 +394,41 @@ static inline int ep_events_available(struct eventpoll *ep) } #ifdef CONFIG_NET_RX_BUSY_POLL +/** + * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value + * from the epoll instance ep is preferred, but if it is not set fallback to + * the system-wide global via busy_loop_timeout. + * + * @start_time: The start time used to compute the remaining time until timeout. + * @ep: Pointer to the eventpoll context. + * + * Return: true if the timeout has expired, false otherwise. + */ +static bool busy_loop_ep_timeout(unsigned long start_time, + struct eventpoll *ep) +{ + unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs); + + if (bp_usec) { + unsigned long end_time = start_time + bp_usec; + unsigned long now = busy_loop_current_time(); + + return time_after(now, end_time); + } else { + return busy_loop_timeout(start_time); + } +} + +static bool ep_busy_loop_on(struct eventpoll *ep) +{ + return !!READ_ONCE(ep->busy_poll_usecs) || net_busy_loop_on(); +} + static bool ep_busy_loop_end(void *p, unsigned long start_time) { struct eventpoll *ep = p; - return ep_events_available(ep) || busy_loop_timeout(start_time); + return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep); } /* @@ -404,10 +440,15 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time) static bool ep_busy_loop(struct eventpoll *ep, int nonblock) { unsigned int napi_id = READ_ONCE(ep->napi_id); + u16 budget = READ_ONCE(ep->busy_poll_budget); + bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); + + if (!budget) + budget = BUSY_POLL_BUDGET; - if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) { - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false, - BUSY_POLL_BUDGET); + if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) { + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, + ep, prefer_busy_poll, budget); if (ep_events_available(ep)) return true; /* @@ -426,12 +467,12 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) */ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { - struct eventpoll *ep; + struct eventpoll *ep = epi->ep; unsigned int napi_id; struct socket *sock; struct sock *sk; - if (!net_busy_loop_on()) + if (!ep_busy_loop_on(ep)) return; sock = sock_from_file(epi->ffd.file); @@ -443,7 +484,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) return; napi_id = READ_ONCE(sk->sk_napi_id); - ep = epi->ep; /* Non-NAPI IDs can be rejected * or @@ -456,6 +496,49 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) ep->napi_id = napi_id; } +static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct eventpoll *ep = file->private_data; + void __user *uarg = (void __user *)arg; + struct epoll_params epoll_params; + + switch (cmd) { + case EPIOCSPARAMS: + if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params))) + return -EFAULT; + + /* pad byte must be zero */ + if (epoll_params.__pad) + return -EINVAL; + + if (epoll_params.busy_poll_usecs > S32_MAX) + return -EINVAL; + + if (epoll_params.prefer_busy_poll > 1) + return -EINVAL; + + if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT && + !capable(CAP_NET_ADMIN)) + return -EPERM; + + WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs); + WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget); + WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll); + return 0; + case EPIOCGPARAMS: + memset(&epoll_params, 0, sizeof(epoll_params)); + epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs); + epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget); + epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); + if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params))) + return -EFAULT; + return 0; + default: + return -ENOIOCTLCMD; + } +} + #else static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock) @@ -467,6 +550,12 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { } +static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return -EOPNOTSUPP; +} + #endif /* CONFIG_NET_RX_BUSY_POLL */ /* @@ -679,12 +768,6 @@ static void ep_done_scan(struct eventpoll *ep, write_unlock_irq(&ep->lock); } -static void epi_rcu_free(struct rcu_head *head) -{ - struct epitem *epi = container_of(head, struct epitem, rcu); - kmem_cache_free(epi_cache, epi); -} - static void ep_get(struct eventpoll *ep) { refcount_inc(&ep->refcount); @@ -769,10 +852,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make * use of the rbn field. */ - call_rcu(&epi->rcu, epi_rcu_free); + kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); - return ep_refcount_dec_and_test(ep); + return true; } /* @@ -780,14 +863,14 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) */ static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) { - WARN_ON_ONCE(__ep_remove(ep, epi, false)); + if (__ep_remove(ep, epi, false)) + WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); } static void ep_clear_and_put(struct eventpoll *ep) { struct rb_node *rbp, *next; struct epitem *epi; - bool dispose; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) @@ -820,13 +903,32 @@ static void ep_clear_and_put(struct eventpoll *ep) cond_resched(); } - dispose = ep_refcount_dec_and_test(ep); mutex_unlock(&ep->mtx); - - if (dispose) + if (ep_refcount_dec_and_test(ep)) ep_free(ep); } +static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int ret; + + if (!is_file_epoll(file)) + return -EINVAL; + + switch (cmd) { + case EPIOCSPARAMS: + case EPIOCGPARAMS: + ret = ep_eventpoll_bp_ioctl(file, cmd, arg); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + static int ep_eventpoll_release(struct inode *inode, struct file *file) { struct eventpoll *ep = file->private_data; @@ -969,6 +1071,8 @@ static const struct file_operations eventpoll_fops = { .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, + .unlocked_ioctl = ep_eventpoll_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; /* @@ -1003,7 +1107,7 @@ again: dispose = __ep_remove(ep, epi, true); mutex_unlock(&ep->mtx); - if (dispose) + if (dispose && ep_refcount_dec_and_test(ep)) ep_free(ep); goto again; } @@ -1191,7 +1295,7 @@ static inline bool chain_epi_lockless(struct epitem *epi) * This callback takes a read lock in order not to contend with concurrent * events from another file descriptor, thus all modifications to ->rdllist * or ->ovflist are lockless. Read lock is paired with the write lock from - * ep_scan_ready_list(), which stops all list modifications and guarantees + * ep_start/done_scan(), which stops all list modifications and guarantees * that lists state is seen correctly. * * Another thing worth to mention is that ep_poll_callback() can be called @@ -1793,7 +1897,7 @@ static int ep_send_events(struct eventpoll *ep, * availability. At this point, no one can insert * into ep->rdllist besides us. The epoll_ctl() * callers are locked out by - * ep_scan_ready_list() holding "mtx" and the + * ep_send_events() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); @@ -1946,7 +2050,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, __set_current_state(TASK_INTERRUPTIBLE); /* - * Do the final check under the lock. ep_scan_ready_list() + * Do the final check under the lock. ep_start/done_scan() * plays with two lists (->rdllist and ->ovflist) and there * is always a race when both lists are empty for short * period of time although events are pending, so lock is @@ -1988,23 +2092,24 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, } /** - * ep_loop_check_proc - verify that adding an epoll file inside another - * epoll structure does not violate the constraints, in - * terms of closed loops, or too deep chains (which can - * result in excessive stack usage). + * ep_loop_check_proc - verify that adding an epoll file @ep inside another + * epoll file does not create closed loops, and + * determine the depth of the subtree starting at @ep * * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * - * Return: %zero if adding the epoll @file inside current epoll - * structure @ep does not violate the constraints, or %-1 otherwise. + * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { - int error = 0; + int result = 0; struct rb_node *rbp; struct epitem *epi; + if (ep->gen == loop_check_gen) + return ep->loop_check_depth; + mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { @@ -2012,13 +2117,11 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) if (unlikely(is_file_epoll(epi->ffd.file))) { struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; - if (ep_tovisit->gen == loop_check_gen) - continue; if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) - error = -1; + result = INT_MAX; else - error = ep_loop_check_proc(ep_tovisit, depth + 1); - if (error != 0) + result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1); + if (result > EP_MAX_NESTS) break; } else { /* @@ -2032,9 +2135,27 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) list_file(epi->ffd.file); } } + ep->loop_check_depth = result; mutex_unlock(&ep->mtx); - return error; + return result; +} + +/** + * ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards + */ +static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth) +{ + int result = 0; + struct epitem *epi; + + if (ep->gen == loop_check_gen) + return ep->loop_check_depth; + hlist_for_each_entry_rcu(epi, &ep->refs, fllink) + result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1); + ep->gen = loop_check_gen; + ep->loop_check_depth = result; + return result; } /** @@ -2050,8 +2171,22 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) */ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { + int depth, upwards_depth; + inserting_into = ep; - return ep_loop_check_proc(to, 0); + /* + * Check how deep down we can get from @to, and whether it is possible + * to loop up to @ep. + */ + depth = ep_loop_check_proc(to, 0); + if (depth > EP_MAX_NESTS) + return -1; + /* Check how far up we can go from @ep. */ + rcu_read_lock(); + upwards_depth = ep_get_upwards_depth_proc(ep, 0); + rcu_read_unlock(); + + return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0; } static void clear_tfile_check_list(void) @@ -2161,17 +2296,17 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, error = -EBADF; f = fdget(epfd); - if (!f.file) + if (!fd_file(f)) goto error_return; /* Get the "struct file *" for the target file */ tf = fdget(fd); - if (!tf.file) + if (!fd_file(tf)) goto error_fput; /* The target file descriptor must support poll */ error = -EPERM; - if (!file_can_poll(tf.file)) + if (!file_can_poll(fd_file(tf))) goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ @@ -2184,7 +2319,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, * adding an epoll file descriptor inside itself. */ error = -EINVAL; - if (f.file == tf.file || !is_file_epoll(f.file)) + if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f))) goto error_tgt_fput; /* @@ -2195,7 +2330,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; - if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || + if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) || (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } @@ -2204,7 +2339,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, * At this point it is safe to assume that the "private_data" contains * our own data structure. */ - ep = f.file->private_data; + ep = fd_file(f)->private_data; /* * When we insert an epoll file descriptor inside another epoll file @@ -2225,16 +2360,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { - if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen || - is_file_epoll(tf.file)) { + if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen || + is_file_epoll(fd_file(tf))) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epnested_mutex, 0, nonblock); if (error) goto error_tgt_fput; loop_check_gen++; full_check = 1; - if (is_file_epoll(tf.file)) { - tep = tf.file->private_data; + if (is_file_epoll(fd_file(tf))) { + tep = fd_file(tf)->private_data; error = -ELOOP; if (ep_loop_check(ep, tep) != 0) goto error_tgt_fput; @@ -2250,14 +2385,14 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ - epi = ep_find(ep, tf.file, fd); + epi = ep_find(ep, fd_file(tf), fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds->events |= EPOLLERR | EPOLLHUP; - error = ep_insert(ep, epds, tf.file, fd, full_check); + error = ep_insert(ep, epds, fd_file(tf), fd, full_check); } else error = -EEXIST; break; @@ -2338,7 +2473,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, /* Get the "struct file *" for the eventpoll file */ f = fdget(epfd); - if (!f.file) + if (!fd_file(f)) return -EBADF; /* @@ -2346,14 +2481,14 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, * the user passed to us _is_ an eventpoll file. */ error = -EINVAL; - if (!is_file_epoll(f.file)) + if (!is_file_epoll(fd_file(f))) goto error_fput; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ - ep = f.file->private_data; + ep = fd_file(f)->private_data; /* Time to fish for events ... */ error = ep_poll(ep, events, maxevents, to); |