summary | refs | log | tree | commit | diff
path: root/fs/eventpoll.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r-- fs/eventpoll.c 251
1 files changed, 193 insertions, 58 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cde5a15b129f..075fee4ba29b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -37,6 +37,7 @@
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
+#include <linux/capability.h>
#include <net/busy_poll.h>
/*
@@ -206,7 +207,7 @@ struct eventpoll {
*/
struct epitem *ovflist;
- /* wakeup_source used when ep_scan_ready_list is running */
+ /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */
struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */
@@ -217,6 +218,7 @@ struct eventpoll {
/* used to optimize loop detection check */
u64 gen;
struct hlist_head refs;
+ u8 loop_check_depth;
/*
* usage count, used together with epitem->dying to
@@ -227,6 +229,11 @@ struct eventpoll {
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
+ /* busy poll timeout */
+ u32 busy_poll_usecs;
+ /* busy poll packet budget */
+ u16 busy_poll_budget;
+ bool prefer_busy_poll;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -256,10 +263,10 @@ static u64 loop_check_gen = 0;
static struct eventpoll *inserting_into;
/* Slab cache used to allocate "struct epitem" */
-static struct kmem_cache *epi_cache __read_mostly;
+static struct kmem_cache *epi_cache __ro_after_init;
/* Slab cache used to allocate "struct eppoll_entry" */
-static struct kmem_cache *pwq_cache __read_mostly;
+static struct kmem_cache *pwq_cache __ro_after_init;
/*
* List of files with newly added links, where we may need to limit the number
@@ -271,7 +278,7 @@ struct epitems_head {
};
static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
-static struct kmem_cache *ephead_cache __read_mostly;
+static struct kmem_cache *ephead_cache __ro_after_init;
static inline void free_ephead(struct epitems_head *head)
{
@@ -322,7 +329,6 @@ static struct ctl_table epoll_table[] = {
.extra1 = &long_zero,
.extra2 = &long_max,
},
- { }
};
static void __init epoll_sysctls_init(void)
@@ -388,11 +394,41 @@ static inline int ep_events_available(struct eventpoll *ep)
}
#ifdef CONFIG_NET_RX_BUSY_POLL
+/**
+ * busy_loop_ep_timeout - has the busy-poll window of an epoll instance elapsed?
+ *
+ * @start_time: The start time used to compute the remaining time until timeout.
+ * @ep: Pointer to the eventpoll context whose busy_poll_usecs is consulted.
+ *
+ * A non-zero ep->busy_poll_usecs takes precedence: the deadline is then
+ * @start_time + busy_poll_usecs, compared against busy_loop_current_time().
+ * When it is zero the decision is delegated to the system-wide
+ * busy_loop_timeout().
+ *
+ * Return: true if the timeout has expired, false otherwise.
+ */
+static bool busy_loop_ep_timeout(unsigned long start_time,
+				 struct eventpoll *ep)
+{
+	unsigned long usecs = READ_ONCE(ep->busy_poll_usecs);
+	unsigned long deadline;
+
+	/* No per-epoll timeout configured: defer to the global setting. */
+	if (!usecs)
+		return busy_loop_timeout(start_time);
+
+	deadline = start_time + usecs;
+	return time_after(busy_loop_current_time(), deadline);
+}
+
+/*
+ * Busy polling is on for @ep when it carries a non-zero busy_poll_usecs of
+ * its own, or otherwise when net_busy_loop_on() reports it enabled.
+ */
+static bool ep_busy_loop_on(struct eventpoll *ep)
+{
+	if (READ_ONCE(ep->busy_poll_usecs))
+		return true;
+
+	return net_busy_loop_on();
+}
+
static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
struct eventpoll *ep = p;
- return ep_events_available(ep) || busy_loop_timeout(start_time);
+ return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep);
}
/*
@@ -404,10 +440,15 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
+ u16 budget = READ_ONCE(ep->busy_poll_budget);
+ bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
+
+ if (!budget)
+ budget = BUSY_POLL_BUDGET;
- if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
- napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
- BUSY_POLL_BUDGET);
+ if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) {
+ napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end,
+ ep, prefer_busy_poll, budget);
if (ep_events_available(ep))
return true;
/*
@@ -426,12 +467,12 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
*/
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
- struct eventpoll *ep;
+ struct eventpoll *ep = epi->ep;
unsigned int napi_id;
struct socket *sock;
struct sock *sk;
- if (!net_busy_loop_on())
+ if (!ep_busy_loop_on(ep))
return;
sock = sock_from_file(epi->ffd.file);
@@ -443,7 +484,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
return;
napi_id = READ_ONCE(sk->sk_napi_id);
- ep = epi->ep;
/* Non-NAPI IDs can be rejected
* or
@@ -456,6 +496,49 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
ep->napi_id = napi_id;
}
+/*
+ * Get or set the per-epoll busy-poll parameters (EPIOCGPARAMS / EPIOCSPARAMS).
+ *
+ * EPIOCSPARAMS copies a struct epoll_params from user space and rejects it
+ * with -EINVAL if __pad is non-zero, busy_poll_usecs exceeds S32_MAX or
+ * prefer_busy_poll is greater than 1. A busy_poll_budget above
+ * NAPI_POLL_WEIGHT additionally requires CAP_NET_ADMIN (-EPERM otherwise).
+ * EPIOCGPARAMS copies the values currently stored in the eventpoll back to
+ * user space.
+ *
+ * Returns 0 on success, -EFAULT if a user copy fails and -ENOIOCTLCMD for any
+ * other command.
+ */
+static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
+				  unsigned long arg)
+{
+	void __user *uarg = (void __user *)arg;
+	struct eventpoll *ep = file->private_data;
+	struct epoll_params params;
+
+	if (cmd == EPIOCGPARAMS) {
+		/* Clear everything first so __pad is copied out as zero. */
+		memset(&params, 0, sizeof(params));
+		params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs);
+		params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget);
+		params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
+
+		if (copy_to_user(uarg, &params, sizeof(params)))
+			return -EFAULT;
+		return 0;
+	}
+
+	if (cmd != EPIOCSPARAMS)
+		return -ENOIOCTLCMD;
+
+	if (copy_from_user(&params, uarg, sizeof(params)))
+		return -EFAULT;
+
+	/*
+	 * The pad byte must be zero, the timeout must fit in an s32 and
+	 * prefer_busy_poll is a boolean.
+	 */
+	if (params.__pad ||
+	    params.busy_poll_usecs > S32_MAX ||
+	    params.prefer_busy_poll > 1)
+		return -EINVAL;
+
+	/* Budgets above NAPI_POLL_WEIGHT are reserved for CAP_NET_ADMIN. */
+	if (params.busy_poll_budget > NAPI_POLL_WEIGHT &&
+	    !capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	WRITE_ONCE(ep->busy_poll_usecs, params.busy_poll_usecs);
+	WRITE_ONCE(ep->busy_poll_budget, params.busy_poll_budget);
+	WRITE_ONCE(ep->prefer_busy_poll, params.prefer_busy_poll);
+	return 0;
+}
+
#else
static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
@@ -467,6 +550,12 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
}
+static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return -EOPNOTSUPP; /* stub for builds without CONFIG_NET_RX_BUSY_POLL: nothing to get or set */
+}
+
#endif /* CONFIG_NET_RX_BUSY_POLL */
/*
@@ -679,12 +768,6 @@ static void ep_done_scan(struct eventpoll *ep,
write_unlock_irq(&ep->lock);
}
-static void epi_rcu_free(struct rcu_head *head)
-{
- struct epitem *epi = container_of(head, struct epitem, rcu);
- kmem_cache_free(epi_cache, epi);
-}
-
static void ep_get(struct eventpoll *ep)
{
refcount_inc(&ep->refcount);
@@ -769,10 +852,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
* ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
* use of the rbn field.
*/
- call_rcu(&epi->rcu, epi_rcu_free);
+ kfree_rcu(epi, rcu);
percpu_counter_dec(&ep->user->epoll_watches);
- return ep_refcount_dec_and_test(ep);
+ return true;
}
/*
@@ -780,14 +863,14 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
*/
static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
{
- WARN_ON_ONCE(__ep_remove(ep, epi, false));
+ if (__ep_remove(ep, epi, false))
+ WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
}
static void ep_clear_and_put(struct eventpoll *ep)
{
struct rb_node *rbp, *next;
struct epitem *epi;
- bool dispose;
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
@@ -820,13 +903,32 @@ static void ep_clear_and_put(struct eventpoll *ep)
cond_resched();
}
- dispose = ep_refcount_dec_and_test(ep);
mutex_unlock(&ep->mtx);
-
- if (dispose)
+ if (ep_refcount_dec_and_test(ep))
ep_free(ep);
}
+/*
+ * ioctl entry point for epoll files. Only the busy-poll parameter commands
+ * (EPIOCSPARAMS, EPIOCGPARAMS) are recognised; they are forwarded to
+ * ep_eventpoll_bp_ioctl(). Any other command, or a file that is not an epoll
+ * file, yields -EINVAL.
+ */
+static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long arg)
+{
+	int ret = -EINVAL;
+
+	if (is_file_epoll(file) &&
+	    (cmd == EPIOCSPARAMS || cmd == EPIOCGPARAMS))
+		ret = ep_eventpoll_bp_ioctl(file, cmd, arg);
+
+	return ret;
+}
+
static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
struct eventpoll *ep = file->private_data;
@@ -969,6 +1071,8 @@ static const struct file_operations eventpoll_fops = {
.release = ep_eventpoll_release,
.poll = ep_eventpoll_poll,
.llseek = noop_llseek,
+ .unlocked_ioctl = ep_eventpoll_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
/*
@@ -1003,7 +1107,7 @@ again:
dispose = __ep_remove(ep, epi, true);
mutex_unlock(&ep->mtx);
- if (dispose)
+ if (dispose && ep_refcount_dec_and_test(ep))
ep_free(ep);
goto again;
}
@@ -1191,7 +1295,7 @@ static inline bool chain_epi_lockless(struct epitem *epi)
* This callback takes a read lock in order not to contend with concurrent
* events from another file descriptor, thus all modifications to ->rdllist
* or ->ovflist are lockless. Read lock is paired with the write lock from
- * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * ep_start/done_scan(), which stops all list modifications and guarantees
* that lists state is seen correctly.
*
* Another thing worth to mention is that ep_poll_callback() can be called
@@ -1793,7 +1897,7 @@ static int ep_send_events(struct eventpoll *ep,
* availability. At this point, no one can insert
* into ep->rdllist besides us. The epoll_ctl()
* callers are locked out by
- * ep_scan_ready_list() holding "mtx" and the
+ * ep_send_events() holding "mtx" and the
* poll callback will queue them in ep->ovflist.
*/
list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1946,7 +2050,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
__set_current_state(TASK_INTERRUPTIBLE);
/*
- * Do the final check under the lock. ep_scan_ready_list()
+ * Do the final check under the lock. ep_start/done_scan()
* plays with two lists (->rdllist and ->ovflist) and there
* is always a race when both lists are empty for short
* period of time although events are pending, so lock is
@@ -1988,23 +2092,24 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
}
/**
- * ep_loop_check_proc - verify that adding an epoll file inside another
- * epoll structure does not violate the constraints, in
- * terms of closed loops, or too deep chains (which can
- * result in excessive stack usage).
+ * ep_loop_check_proc - verify that adding an epoll file @ep inside another
+ * epoll file does not create closed loops, and
+ * determine the depth of the subtree starting at @ep
*
* @ep: the &struct eventpoll to be currently checked.
* @depth: Current depth of the path being checked.
*
- * Return: %zero if adding the epoll @file inside current epoll
- * structure @ep does not violate the constraints, or %-1 otherwise.
+ * Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we
+ * found a loop or went too deep.
*/
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
- int error = 0;
+ int result = 0;
struct rb_node *rbp;
struct epitem *epi;
+ if (ep->gen == loop_check_gen)
+ return ep->loop_check_depth;
+
mutex_lock_nested(&ep->mtx, depth + 1);
ep->gen = loop_check_gen;
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
@@ -2012,13 +2117,11 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
if (unlikely(is_file_epoll(epi->ffd.file))) {
struct eventpoll *ep_tovisit;
ep_tovisit = epi->ffd.file->private_data;
- if (ep_tovisit->gen == loop_check_gen)
- continue;
+ /*
+ * The failure marker must stay small: the calling level adds 1
+ * to the value returned here and it is cached in the u8
+ * loop_check_depth, so INT_MAX would overflow in the caller
+ * and be truncated in the cache.
+ */
if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
- error = -1;
+ result = EP_MAX_NESTS + 1;
else
- error = ep_loop_check_proc(ep_tovisit, depth + 1);
- if (error != 0)
+ result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
+ if (result > EP_MAX_NESTS)
break;
} else {
/*
@@ -2032,9 +2135,27 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
list_file(epi->ffd.file);
}
}
+ ep->loop_check_depth = result;
mutex_unlock(&ep->mtx);
- return error;
+ return result;
+}
+
+/**
+ * ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards
+ *
+ * @ep: the &struct eventpoll to start from.
+ * @depth: current recursion depth; it is only handed on, incremented by one,
+ * to the recursive calls.
+ *
+ * Walks @ep->refs with hlist_for_each_entry_rcu() and recurses into the
+ * eventpoll of every entry found there. An eventpoll whose ->gen already
+ * equals loop_check_gen is not walked again; its stored ->loop_check_depth
+ * is returned instead. After the walk @ep is stamped with loop_check_gen and
+ * the computed depth is stored in ->loop_check_depth.
+ *
+ * Return: the stored depth for an already visited @ep; otherwise 0 if
+ * @ep->refs is empty, or one more than the largest value returned for any
+ * eventpoll reached through @ep->refs.
+ */
+static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth)
+{
+ int result = 0;
+ struct epitem *epi;
+
+ if (ep->gen == loop_check_gen)
+ return ep->loop_check_depth;
+ hlist_for_each_entry_rcu(epi, &ep->refs, fllink)
+ result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1);
+ ep->gen = loop_check_gen;
+ ep->loop_check_depth = result;
+ return result;
+}
/**
@@ -2050,8 +2171,22 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
*/
static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
+ int depth, upwards_depth;
+
inserting_into = ep;
- return ep_loop_check_proc(to, 0);
+ /*
+ * Check how deep down we can get from @to, and whether it is possible
+ * to loop up to @ep.
+ */
+ depth = ep_loop_check_proc(to, 0);
+ if (depth > EP_MAX_NESTS)
+ return -1;
+ /* Check how far up we can go from @ep. */
+ rcu_read_lock();
+ upwards_depth = ep_get_upwards_depth_proc(ep, 0);
+ rcu_read_unlock();
+
+ return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0;
}
static void clear_tfile_check_list(void)
@@ -2161,17 +2296,17 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
error = -EBADF;
f = fdget(epfd);
- if (!f.file)
+ if (!fd_file(f))
goto error_return;
/* Get the "struct file *" for the target file */
tf = fdget(fd);
- if (!tf.file)
+ if (!fd_file(tf))
goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
- if (!file_can_poll(tf.file))
+ if (!file_can_poll(fd_file(tf)))
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
@@ -2184,7 +2319,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
- if (f.file == tf.file || !is_file_epoll(f.file))
+ if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
goto error_tgt_fput;
/*
@@ -2195,7 +2330,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
- if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+ if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
@@ -2204,7 +2339,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
- ep = f.file->private_data;
+ ep = fd_file(f)->private_data;
/*
* When we insert an epoll file descriptor inside another epoll file
@@ -2225,16 +2360,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
if (error)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
- if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
- is_file_epoll(tf.file)) {
+ if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
+ is_file_epoll(fd_file(tf))) {
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
if (error)
goto error_tgt_fput;
loop_check_gen++;
full_check = 1;
- if (is_file_epoll(tf.file)) {
- tep = tf.file->private_data;
+ if (is_file_epoll(fd_file(tf))) {
+ tep = fd_file(tf)->private_data;
error = -ELOOP;
if (ep_loop_check(ep, tep) != 0)
goto error_tgt_fput;
@@ -2250,14 +2385,14 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
- epi = ep_find(ep, tf.file, fd);
+ epi = ep_find(ep, fd_file(tf), fd);
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds->events |= EPOLLERR | EPOLLHUP;
- error = ep_insert(ep, epds, tf.file, fd, full_check);
+ error = ep_insert(ep, epds, fd_file(tf), fd, full_check);
} else
error = -EEXIST;
break;
@@ -2338,7 +2473,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
/* Get the "struct file *" for the eventpoll file */
f = fdget(epfd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
/*
@@ -2346,14 +2481,14 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
* the user passed to us _is_ an eventpoll file.
*/
error = -EINVAL;
- if (!is_file_epoll(f.file))
+ if (!is_file_epoll(fd_file(f)))
goto error_fput;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
- ep = f.file->private_data;
+ ep = fd_file(f)->private_data;
/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, to);