summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNam Cao <namcao@linutronix.de>2026-06-02 20:51:46 +0300
committerChristian Brauner <brauner@kernel.org>2026-06-04 11:25:10 +0300
commit0c4aefe3c2d0f272a2ad73699a12d4446ffdbe7b (patch)
tree1e65bf935c10395b928dfccc132c604f5413ee91
parent827382c088e9ac18a9e05c0a238a2ff919bc4755 (diff)
downloadlinux-0c4aefe3c2d0f272a2ad73699a12d4446ffdbe7b.tar.xz
eventpoll: Fix epoll_wait() report false negative
ep_events_available() checks for available events by looking at ep->rdllist and ep_is_scanning(). However, this is done without a lock and can report false negative if ep_start_scan() or ep_done_scan() are executed by another task concurrently. For example: _________________________________________________________________________ |ep_start_scan() | list_splice_init(&ep->rdllist, ...) ep_events_available() | !list_empty_careful(&ep->rdllist)| || ep_is_scanning(ep) | | ep_enter_scan(ep) ___________________________________|_____________________________________ Another example: _________________________________________________________________________ ep_events_available() | |ep_start_scan() | list_splice_init(&ep->rdllist, ...) | ep_enter_scan(ep) !list_empty_careful(&ep->rdllist)| |ep_done_scan() | ep_exit_scan(ep) | list_splice(..., &ep->rdllist) || ep_is_scanning(ep) | ___________________________________|_____________________________________ In the above examples, ep_events_available() sees no event despite events being available. In case epoll_wait() is called with timeout=0, epoll_wait() will wrongly return "no event" to user. Introduce a sequence lock to resolve this issue. Measuring the time consumption of 10 million loop iterations doing epoll_wait(), the following performance drop is observed: timeout #event before after diff 0ms 0 3727ms 3974ms +6.6% 0ms 1 8099ms 9134ms +13% 1ms 1 13525ms 13586ms +0.45% Considering the use case of epoll_wait() (wait for events, do something with the events, repeat), it should only contribute to a small portion of user's CPU consumption. Therefore this performance drop is not alarming. Fixes: c5a282e9635e ("fs/epoll: reduce the scope of wq lock in epoll_wait()") Suggested-by: Mateusz Guzik <mjguzik@gmail.com> Signed-off-by: Nam Cao <namcao@linutronix.de> Link: https://patch.msgid.link/4363cd8e34a21d4f0d257be1b33e84dc25030fdf.1780422138.git.namcao@linutronix.de Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
-rw-r--r--fs/eventpoll.c20
1 files changed, 19 insertions, 1 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index baa97d0edade..df364a8783b5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -38,6 +38,7 @@
#include <linux/compat.h>
#include <linux/rculist.h>
#include <linux/capability.h>
+#include <linux/seqlock.h>
#include <net/busy_poll.h>
/*
@@ -312,6 +313,9 @@ struct eventpoll {
/* Lock which protects rdllist and ovflist */
spinlock_t lock;
+ /* Protect switching between rdllist and ovflist */
+ seqcount_spinlock_t seq;
+
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
@@ -590,7 +594,10 @@ static inline void epi_clear_ovflist(struct epitem *epi)
/* True iff @ep has ready events that epoll_wait() might harvest. */
static inline bool ep_events_available(struct eventpoll *ep)
{
- return !list_empty_careful(&ep->rdllist) || ep_is_scanning(ep);
+ unsigned int seq = read_seqcount_begin(&ep->seq);
+
+ return !list_empty_careful(&ep->rdllist) || ep_is_scanning(ep) ||
+ read_seqcount_retry(&ep->seq, seq);
}
#ifdef CONFIG_NET_RX_BUSY_POLL
@@ -947,8 +954,12 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *scan_batch)
*/
lockdep_assert_irqs_enabled();
spin_lock_irq(&ep->lock);
+ write_seqcount_begin(&ep->seq);
+
list_splice_init(&ep->rdllist, scan_batch);
ep_enter_scan(ep);
+
+ write_seqcount_end(&ep->seq);
spin_unlock_irq(&ep->lock);
}
@@ -979,6 +990,9 @@ static void ep_done_scan(struct eventpoll *ep,
ep_pm_stay_awake(epi);
}
}
+
+ write_seqcount_begin(&ep->seq);
+
/* Back out of scan mode; callbacks target ep->rdllist again. */
ep_exit_scan(ep);
@@ -986,6 +1000,9 @@ static void ep_done_scan(struct eventpoll *ep,
* Quickly re-inject items left on "scan_batch".
*/
list_splice(scan_batch, &ep->rdllist);
+
+ write_seqcount_end(&ep->seq);
+
__pm_relax(ep->ws);
if (!list_empty(&ep->rdllist)) {
@@ -1405,6 +1422,7 @@ static int ep_alloc(struct eventpoll **pep)
mutex_init(&ep->mtx);
spin_lock_init(&ep->lock);
+ seqcount_spinlock_init(&ep->seq, &ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);