From 827382c088e9ac18a9e05c0a238a2ff919bc4755 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Tue, 2 Jun 2026 19:51:45 +0200 Subject: selftests/eventpoll: Add test for multiple waiters Add a test whichs creates 64 threads who all epoll_wait() on the same eventpoll. The source eventfd is written but never read, therefore all the threads should always see an EPOLLIN event. This test fails because of a kernel bug, which will be fixed by a follow-up commit. Signed-off-by: Nam Cao Link: https://patch.msgid.link/b11947013563875c046c0b0959c29fd95eeebd34.1780422138.git.namcao@linutronix.de Signed-off-by: Christian Brauner (Amutable) --- .../filesystems/epoll/epoll_wakeup_test.c | 45 ++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c index 8bc57a2ef966..f6f1a7ff01b0 100644 --- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c +++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c @@ -3493,4 +3493,49 @@ TEST(epoll64) close(ctx.sfd[1]); } +static void *epoll65_wait(void *ctx_) +{ + struct epoll_mtcontext *ctx = ctx_; + struct epoll_event event; + + for (int i = 0; i < 100000; ++i) { + if (!epoll_wait(ctx->efd[0], &event, 1, 0)) + return (void *)ENODATA; + } + + return (void *)0; +} + +TEST(epoll65) +{ + struct epoll_mtcontext ctx; + struct epoll_event event; + int64_t dummy_data = 99; + pthread_t threads[64]; + uintptr_t ret; + int i, err; + + ctx.efd[0] = epoll_create(1); + ASSERT_GE(ctx.efd[0], 0); + ctx.efd[1] = eventfd(0, 0); + ASSERT_GE(ctx.efd[1], 0); + + event.events = EPOLLIN; + err = epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &event); + ASSERT_EQ(err, 0); + + write(ctx.efd[1], &dummy_data, sizeof(dummy_data)); + + for (i = 0; i < ARRAY_SIZE(threads); ++i) + ASSERT_EQ(pthread_create(&threads[i], NULL, epoll65_wait, &ctx), 0); + + for (i = 0; i < ARRAY_SIZE(threads); ++i) { + ASSERT_EQ(pthread_join(threads[i], (void **)&ret), 0); + ASSERT_EQ(ret, 0); + } + + close(ctx.efd[0]); + close(ctx.efd[1]); +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From 0c4aefe3c2d0f272a2ad73699a12d4446ffdbe7b Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Tue, 2 Jun 2026 19:51:46 +0200 Subject: eventpoll: Fix epoll_wait() report false negative ep_events_available() checks for available events by looking at ep->rdllist and ep_is_scanning(). However, this is done without a lock and can report false negative if ep_start_scan() or ep_done_scan() are executed by another task concurrently. For example: _________________________________________________________________________ |ep_start_scan() | list_splice_init(&ep->rdllist, ...) ep_events_available() | !list_empty_careful(&ep->rdllist)| || ep_is_scanning(ep) | | ep_enter_scan(ep) ___________________________________|_____________________________________ Another example: _________________________________________________________________________ ep_events_available() | |ep_start_scan() | list_splice_init(&ep->rdllist, ...) | ep_enter_scan(ep) !list_empty_careful(&ep->rdllist)| |ep_done_scan() | ep_exit_scan(ep) | list_splice(..., &ep->rdllist) || ep_is_scanning(ep) | ___________________________________|_____________________________________ In the above examples, ep_events_available() sees no event despite events being available. In case epoll_wait() is called with timeout=0, epoll_wait() will wrongly return "no event" to user. Introduce a sequence lock to resolve this issue. Measuring the time consumption of 10 million loop iterations doing epoll_wait(), the following performance drop is observed: timeout #event before after diff 0ms 0 3727ms 3974ms +6.6% 0ms 1 8099ms 9134ms +13% 1ms 1 13525ms 13586ms +0.45% Considering the use case of epoll_wait() (wait for events, do something with the events, repeat), it should only contribute to a small portion of user's CPU consumption. Therefore this performance drop is not alarming. Fixes: c5a282e9635e ("fs/epoll: reduce the scope of wq lock in epoll_wait()") Suggested-by: Mateusz Guzik Signed-off-by: Nam Cao Link: https://patch.msgid.link/4363cd8e34a21d4f0d257be1b33e84dc25030fdf.1780422138.git.namcao@linutronix.de Signed-off-by: Christian Brauner (Amutable) --- fs/eventpoll.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index baa97d0edade..df364a8783b5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -38,6 +38,7 @@ #include #include #include +#include #include /* @@ -312,6 +313,9 @@ struct eventpoll { /* Lock which protects rdllist and ovflist */ spinlock_t lock; + /* Protect switching between rdllist and ovflist */ + seqcount_spinlock_t seq; + /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; @@ -590,7 +594,10 @@ static inline void epi_clear_ovflist(struct epitem *epi) /* True iff @ep has ready events that epoll_wait() might harvest. */ static inline bool ep_events_available(struct eventpoll *ep) { - return !list_empty_careful(&ep->rdllist) || ep_is_scanning(ep); + unsigned int seq = read_seqcount_begin(&ep->seq); + + return !list_empty_careful(&ep->rdllist) || ep_is_scanning(ep) || + read_seqcount_retry(&ep->seq, seq); } #ifdef CONFIG_NET_RX_BUSY_POLL @@ -947,8 +954,12 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *scan_batch) */ lockdep_assert_irqs_enabled(); spin_lock_irq(&ep->lock); + write_seqcount_begin(&ep->seq); + list_splice_init(&ep->rdllist, scan_batch); ep_enter_scan(ep); + + write_seqcount_end(&ep->seq); spin_unlock_irq(&ep->lock); } @@ -979,6 +990,9 @@ static void ep_done_scan(struct eventpoll *ep, ep_pm_stay_awake(epi); } } + + write_seqcount_begin(&ep->seq); + /* Back out of scan mode; callbacks target ep->rdllist again. */ ep_exit_scan(ep); @@ -986,6 +1000,9 @@ static void ep_done_scan(struct eventpoll *ep, * Quickly re-inject items left on "scan_batch". */ list_splice(scan_batch, &ep->rdllist); + + write_seqcount_end(&ep->seq); + __pm_relax(ep->ws); if (!list_empty(&ep->rdllist)) { @@ -1405,6 +1422,7 @@ static int ep_alloc(struct eventpoll **pep) mutex_init(&ep->mtx); spin_lock_init(&ep->lock); + seqcount_spinlock_init(&ep->seq, &ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); -- cgit v1.2.3