Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r-- | fs/eventpoll.c | 118
1 file changed, 63 insertions, 55 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 67db22fe99c5..42bbe6824b4b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -50,10 +50,10 @@
  *
  * 1) epmutex (mutex)
  * 2) ep->mtx (mutex)
- * 3) ep->lock (spinlock)
+ * 3) ep->wq.lock (spinlock)
  *
  * The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->lock) because we manipulate objects
+ * We need a spinlock (ep->wq.lock) because we manipulate objects
  * from inside the poll callback, that might be triggered from
  * a wake_up() that in turn might be called from IRQ context.
  * So we can't sleep inside the poll callback and hence we need
@@ -85,7 +85,7 @@
  * of epoll file descriptors, we use the current recursion depth as
  * the lockdep subkey.
  * It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->lock") to have it working,
+ * mutex "epmutex" (together with "ep->wq.lock") to have it working,
  * but having "ep->mtx" will make the interface more scalable.
  * Events that require holding "epmutex" are very rare, while for
  * normal operations the epoll private "ep->mtx" will guarantee
@@ -182,11 +182,10 @@ struct epitem {
  * This structure is stored inside the "private_data" member of the file
  * structure and represents the main data structure for the eventpoll
  * interface.
+ *
+ * Access to it is protected by the lock inside wq.
  */
 struct eventpoll {
-	/* Protect the access to this structure */
-	spinlock_t lock;
-
 	/*
 	 * This mutex is used to ensure that files are not removed
 	 * while epoll is using them. This is held during the event
@@ -210,7 +209,7 @@ struct eventpoll {
 	/*
 	 * This is a single linked list that chains all the "struct epitem" that
 	 * happened while transferring ready events to userspace w/out
-	 * holding ->lock.
+	 * holding ->wq.lock.
 	 */
 	struct epitem *ovflist;
@@ -337,9 +336,9 @@ static inline int ep_cmp_ffd(struct epoll_filefd *p1,
 }
 
 /* Tells us if the item is currently linked */
-static inline int ep_is_linked(struct list_head *p)
+static inline int ep_is_linked(struct epitem *epi)
 {
-	return !list_empty(p);
+	return !list_empty(&epi->rdllink);
 }
 
 static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
@@ -392,7 +391,6 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
 
 	return ep_events_available(ep) || busy_loop_timeout(start_time);
 }
-#endif /* CONFIG_NET_RX_BUSY_POLL */
 
 /*
  * Busy poll if globally on and supporting sockets found && no events,
@@ -402,20 +400,16 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
  */
 static void ep_busy_loop(struct eventpoll *ep, int nonblock)
 {
-#ifdef CONFIG_NET_RX_BUSY_POLL
 	unsigned int napi_id = READ_ONCE(ep->napi_id);
 
 	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
 		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
-#endif
 }
 
 static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
 {
-#ifdef CONFIG_NET_RX_BUSY_POLL
 	if (ep->napi_id)
 		ep->napi_id = 0;
-#endif
 }
 
 /*
@@ -423,7 +417,6 @@ static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
  */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 {
-#ifdef CONFIG_NET_RX_BUSY_POLL
 	struct eventpoll *ep;
 	unsigned int napi_id;
 	struct socket *sock;
@@ -453,9 +446,24 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 
 	/* record NAPI ID for use in next busy poll */
 	ep->napi_id = napi_id;
-#endif
 }
 
+#else
+
+static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
+{
+}
+
+static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+{
+}
+
+static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
+{
+}
+
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
 /**
  * ep_call_nested - Perform a bound (possibly) nested call, by checking
  *                  that the recursion limit is not exceeded, and that
@@ -668,10 +676,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
 {
 	__poll_t res;
 	int pwake = 0;
-	unsigned long flags;
 	struct epitem *epi, *nepi;
 	LIST_HEAD(txlist);
 
+	lockdep_assert_irqs_enabled();
+
 	/*
 	 * We need to lock this because we could be hit by
 	 * eventpoll_release_file() and epoll_ctl().
@@ -688,17 +697,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
 	 * because we want the "sproc" callback to be able to do it
 	 * in a lockless way.
 	 */
-	spin_lock_irqsave(&ep->lock, flags);
+	spin_lock_irq(&ep->wq.lock);
 	list_splice_init(&ep->rdllist, &txlist);
 	ep->ovflist = NULL;
-	spin_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irq(&ep->wq.lock);
 
 	/*
 	 * Now call the callback function.
 	 */
 	res = (*sproc)(ep, &txlist, priv);
 
-	spin_lock_irqsave(&ep->lock, flags);
+	spin_lock_irq(&ep->wq.lock);
 	/*
 	 * During the time we spent inside the "sproc" callback, some
 	 * other events might have been queued by the poll callback.
@@ -712,7 +721,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
 		 * queued into ->ovflist but the "txlist" might already
 		 * contain them, and the list_splice() below takes care of them.
 		 */
-		if (!ep_is_linked(&epi->rdllink)) {
+		if (!ep_is_linked(epi)) {
 			list_add_tail(&epi->rdllink, &ep->rdllist);
 			ep_pm_stay_awake(epi);
 		}
@@ -740,7 +749,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
 	}
-	spin_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irq(&ep->wq.lock);
 
 	if (!ep_locked)
 		mutex_unlock(&ep->mtx);
@@ -764,16 +773,12 @@ static void epi_rcu_free(struct rcu_head *head)
  */
 static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 {
-	unsigned long flags;
 	struct file *file = epi->ffd.file;
 
+	lockdep_assert_irqs_enabled();
+
 	/*
-	 * Removes poll wait queue hooks. We _have_ to do this without holding
-	 * the "ep->lock" otherwise a deadlock might occur. This because of the
-	 * sequence of the lock acquisition. Here we do "ep->lock" then the wait
-	 * queue head lock when unregistering the wait queue. The wakeup callback
-	 * will run by holding the wait queue head lock and will call our callback
-	 * that will try to get "ep->lock".
+	 * Removes poll wait queue hooks.
 	 */
 	ep_unregister_pollwait(ep, epi);
 
@@ -784,10 +789,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	rb_erase_cached(&epi->rbn, &ep->rbr);
 
-	spin_lock_irqsave(&ep->lock, flags);
-	if (ep_is_linked(&epi->rdllink))
+	spin_lock_irq(&ep->wq.lock);
+	if (ep_is_linked(epi))
 		list_del_init(&epi->rdllink);
-	spin_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irq(&ep->wq.lock);
 
 	wakeup_source_unregister(ep_wakeup_source(epi));
 
 	/*
@@ -837,7 +842,7 @@ static void ep_free(struct eventpoll *ep)
 	 * Walks through the whole tree by freeing each "struct epitem". At this
 	 * point we are sure no poll callbacks will be lingering around, and also by
 	 * holding "epmutex" we can be sure that no file cleanup code will hit
-	 * us during this operation. So we can avoid the lock on "ep->lock".
+	 * us during this operation. So we can avoid the lock on "ep->wq.lock".
 	 * We do not need to lock ep->mtx, either, we only do it to prevent
 	 * a lockdep warning.
 	 */
@@ -1017,7 +1022,6 @@ static int ep_alloc(struct eventpoll **pep)
 	if (unlikely(!ep))
 		goto free_uid;
 
-	spin_lock_init(&ep->lock);
 	mutex_init(&ep->mtx);
 	init_waitqueue_head(&ep->wq);
 	init_waitqueue_head(&ep->poll_wait);
@@ -1122,7 +1126,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 	__poll_t pollflags = key_to_poll(key);
 	int ewake = 0;
 
-	spin_lock_irqsave(&ep->lock, flags);
+	spin_lock_irqsave(&ep->wq.lock, flags);
 
 	ep_set_busy_poll_napi_id(epi);
 
@@ -1167,7 +1171,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 	}
 
 	/* If this file is already in the ready list we exit soon */
-	if (!ep_is_linked(&epi->rdllink)) {
+	if (!ep_is_linked(epi)) {
 		list_add_tail(&epi->rdllink, &ep->rdllist);
 		ep_pm_stay_awake_rcu(epi);
 	}
@@ -1199,7 +1203,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 			pwake++;
 
 out_unlock:
-	spin_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irqrestore(&ep->wq.lock, flags);
 
 	/* We have to call this outside the lock */
 	if (pwake)
@@ -1417,11 +1421,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 {
 	int error, pwake = 0;
 	__poll_t revents;
-	unsigned long flags;
 	long user_watches;
 	struct epitem *epi;
 	struct ep_pqueue epq;
 
+	lockdep_assert_irqs_enabled();
+
 	user_watches = atomic_long_read(&ep->user->epoll_watches);
 	if (unlikely(user_watches >= max_user_watches))
 		return -ENOSPC;
@@ -1484,13 +1489,13 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 		goto error_remove_epi;
 
 	/* We have to drop the new item inside our item list to keep track of it */
-	spin_lock_irqsave(&ep->lock, flags);
+	spin_lock_irq(&ep->wq.lock);
 
 	/* record NAPI ID of new item if present */
 	ep_set_busy_poll_napi_id(epi);
 
 	/* If the file is already "ready" we drop it inside the ready list */
-	if (revents && !ep_is_linked(&epi->rdllink)) {
+	if (revents && !ep_is_linked(epi)) {
 		list_add_tail(&epi->rdllink, &ep->rdllist);
 		ep_pm_stay_awake(epi);
 
@@ -1501,7 +1506,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 			pwake++;
 	}
 
-	spin_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irq(&ep->wq.lock);
 
 	atomic_long_inc(&ep->user->epoll_watches);
 
@@ -1527,10 +1532,10 @@ error_unregister:
 	 * list, since that is used/cleaned only inside a section bound by "mtx".
 	 * And ep_insert() is called with "mtx" held.
 	 */
-	spin_lock_irqsave(&ep->lock, flags);
-	if (ep_is_linked(&epi->rdllink))
+	spin_lock_irq(&ep->wq.lock);
+	if (ep_is_linked(epi))
 		list_del_init(&epi->rdllink);
-	spin_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irq(&ep->wq.lock);
 
 	wakeup_source_unregister(ep_wakeup_source(epi));
 
@@ -1550,6 +1555,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 	int pwake = 0;
 	poll_table pt;
 
+	lockdep_assert_irqs_enabled();
+
 	init_poll_funcptr(&pt, NULL);
 
 	/*
@@ -1572,9 +1579,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 	 * 1) Flush epi changes above to other CPUs.  This ensures
 	 *    we do not miss events from ep_poll_callback if an
 	 *    event occurs immediately after we call f_op->poll().
-	 *    We need this because we did not take ep->lock while
+	 *    We need this because we did not take ep->wq.lock while
 	 *    changing epi above (but ep_poll_callback does take
-	 *    ep->lock).
+	 *    ep->wq.lock).
 	 *
 	 * 2) We also need to ensure we do not miss _past_ events
 	 *    when calling f_op->poll().  This barrier also
@@ -1593,8 +1600,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 	 * list, push it inside.
 	 */
 	if (ep_item_poll(epi, &pt, 1)) {
-		spin_lock_irq(&ep->lock);
-		if (!ep_is_linked(&epi->rdllink)) {
+		spin_lock_irq(&ep->wq.lock);
+		if (!ep_is_linked(epi)) {
 			list_add_tail(&epi->rdllink, &ep->rdllist);
 			ep_pm_stay_awake(epi);
 
@@ -1604,7 +1611,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 			if (waitqueue_active(&ep->poll_wait))
 				pwake++;
 		}
-		spin_unlock_irq(&ep->lock);
+		spin_unlock_irq(&ep->wq.lock);
 	}
 
 	/* We have to call this outside the lock */
@@ -1739,11 +1746,12 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout)
 {
 	int res = 0, eavail, timed_out = 0;
-	unsigned long flags;
 	u64 slack = 0;
 	wait_queue_entry_t wait;
 	ktime_t expires, *to = NULL;
 
+	lockdep_assert_irqs_enabled();
+
 	if (timeout > 0) {
 		struct timespec64 end_time = ep_set_mstimeout(timeout);
 
@@ -1756,7 +1764,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		 * caller specified a non blocking operation.
 		 */
 		timed_out = 1;
-		spin_lock_irqsave(&ep->lock, flags);
+		spin_lock_irq(&ep->wq.lock);
 		goto check_events;
 	}
 
@@ -1765,7 +1773,7 @@ fetch_events:
 	if (!ep_events_available(ep))
 		ep_busy_loop(ep, timed_out);
 
-	spin_lock_irqsave(&ep->lock, flags);
+	spin_lock_irq(&ep->wq.lock);
 
 	if (!ep_events_available(ep)) {
 		/*
@@ -1807,11 +1815,11 @@ fetch_events:
 			break;
 		}
 
-		spin_unlock_irqrestore(&ep->lock, flags);
+		spin_unlock_irq(&ep->wq.lock);
 		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
 			timed_out = 1;
 
-		spin_lock_irqsave(&ep->lock, flags);
+		spin_lock_irq(&ep->wq.lock);
 	}
 
 	__remove_wait_queue(&ep->wq, &wait);
@@ -1821,7 +1829,7 @@ check_events:
 	/* Is it worth to try to dig for events ? */
 	eavail = ep_events_available(ep);
 
-	spin_unlock_irqrestore(&ep->lock, flags);
+	spin_unlock_irq(&ep->wq.lock);
 
 	/*
	 * Try to transfer events to user space. In case we get 0 events and
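
Note on the CONFIG_NET_RX_BUSY_POLL rework above: rather than wrapping the body of each busy-poll helper in #ifdef/#endif, the real helpers are now compiled only when the option is set, with empty static inline stubs supplied under a single #else so the call sites stay unconditional. A minimal, self-contained sketch of that pattern follows; CONFIG_FAST_POLL, struct engine, fast_poll() and fast_poll_reset() are made-up names for illustration, not part of the kernel code.

#include <stdio.h>

struct engine {
	unsigned int poll_id;	/* stands in for eventpoll::napi_id */
};

#ifdef CONFIG_FAST_POLL	/* hypothetical option, standing in for CONFIG_NET_RX_BUSY_POLL */

static void fast_poll(struct engine *e)
{
	/* the real work is only built when the option is enabled */
	printf("busy polling id %u\n", e->poll_id);
}

static void fast_poll_reset(struct engine *e)
{
	e->poll_id = 0;
}

#else /* !CONFIG_FAST_POLL */

/* Empty inline stubs: callers need no #ifdef and the calls compile away. */
static inline void fast_poll(struct engine *e)
{
}

static inline void fast_poll_reset(struct engine *e)
{
}

#endif /* CONFIG_FAST_POLL */

int main(void)
{
	struct engine e = { .poll_id = 42 };

	/* Call sites stay unconditional in both configurations. */
	fast_poll(&e);
	fast_poll_reset(&e);
	return 0;
}

Built with -DCONFIG_FAST_POLL the real helpers run; without it the stubs do nothing, which is the effect the patch achieves for the epoll busy-poll helpers.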
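The ep_is_linked() change is purely an interface cleanup: the helper now takes the containing epitem and performs the list_empty() test on its embedded rdllink itself. Below is a standalone sketch of that intrusive-list idiom, using made-up list_node/item types rather than the kernel's list_head.

#include <stdbool.h>
#include <stdio.h>

/* Minimal circular doubly linked list, modeled loosely on the kernel's list_head. */
struct list_node {
	struct list_node *prev, *next;
};

static void node_init(struct list_node *n)
{
	n->prev = n;
	n->next = n;
}

/* True when the node is not on any list (it points back at itself). */
static bool node_unlinked(const struct list_node *n)
{
	return n->next == n;
}

static void node_add_tail(struct list_node *n, struct list_node *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

/* An object with an embedded link, the way struct epitem embeds rdllink. */
struct item {
	int value;
	struct list_node link;
};

/* Takes the whole object, mirroring the reworked ep_is_linked(epi). */
static bool item_is_linked(const struct item *it)
{
	return !node_unlinked(&it->link);
}

int main(void)
{
	struct list_node ready;
	struct item a = { .value = 1 };

	node_init(&ready);
	node_init(&a.link);

	printf("before add: %d\n", item_is_linked(&a));	/* prints 0 */
	node_add_tail(&a.link, &ready);
	printf("after add:  %d\n", item_is_linked(&a));	/* prints 1 */
	return 0;
}

An unlinked node points at itself, so membership can be tested from the object alone; this is why callers in the patch no longer need to name the rdllink field.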