summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristian Brauner <brauner@kernel.org>2026-06-04 10:25:14 +0200
committerChristian Brauner <brauner@kernel.org>2026-06-04 10:25:14 +0200
commitb466bfe0ec0ff80994b0fa3ffe7bc36f0ce6be4b (patch)
tree1e65bf935c10395b928dfccc132c604f5413ee91
parent70a03a385de2b8f0fa54dbc70bdc3ed176853d1c (diff)
parent0c4aefe3c2d0f272a2ad73699a12d4446ffdbe7b (diff)
Merge patch series "eventpoll: Fix epoll_wait() report false negative"
Nam Cao <namcao@linutronix.de> says: While staring at epoll, I noticed ep_events_available() looks wrong. I wrote a small program to confirm, and yes it is definitely wrong. This series adds a reproducer to kselftest, and fixes the bug. * patches from https://patch.msgid.link/cover.1780422137.git.namcao@linutronix.de: eventpoll: Fix epoll_wait() report false negative selftests/eventpoll: Add test for multiple waiters Link: https://patch.msgid.link/cover.1780422137.git.namcao@linutronix.de Signed-off-by: Christian Brauner <brauner@kernel.org>
-rw-r--r--fs/eventpoll.c20
-rw-r--r--tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c45
2 files changed, 64 insertions, 1 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index baa97d0edade..df364a8783b5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -38,6 +38,7 @@
#include <linux/compat.h>
#include <linux/rculist.h>
#include <linux/capability.h>
+#include <linux/seqlock.h>
#include <net/busy_poll.h>
/*
@@ -312,6 +313,9 @@ struct eventpoll {
/* Lock which protects rdllist and ovflist */
spinlock_t lock;
+ /* Protect switching between rdllist and ovflist */
+ seqcount_spinlock_t seq;
+
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
@@ -590,7 +594,10 @@ static inline void epi_clear_ovflist(struct epitem *epi)
/* True iff @ep has ready events that epoll_wait() might harvest. */
static inline bool ep_events_available(struct eventpoll *ep)
{
- return !list_empty_careful(&ep->rdllist) || ep_is_scanning(ep);
+ /* Snapshot the sequence count before the two lockless checks below. */
+ unsigned int seq = read_seqcount_begin(&ep->seq);
+
+ /*
+ * ep_start_scan() and ep_done_scan() wrap their rdllist/scan-mode
+ * switch in write_seqcount_begin()/write_seqcount_end() on @ep->seq.
+ * If the count changed while the two checks ran, they may not form a
+ * consistent snapshot, so report events as available rather than
+ * risk a false negative.
+ */
+ return !list_empty_careful(&ep->rdllist) || ep_is_scanning(ep) ||
+ read_seqcount_retry(&ep->seq, seq);
}
#ifdef CONFIG_NET_RX_BUSY_POLL
@@ -947,8 +954,12 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *scan_batch)
*/
lockdep_assert_irqs_enabled();
spin_lock_irq(&ep->lock);
+ write_seqcount_begin(&ep->seq);
+
list_splice_init(&ep->rdllist, scan_batch);
ep_enter_scan(ep);
+
+ write_seqcount_end(&ep->seq);
spin_unlock_irq(&ep->lock);
}
@@ -979,6 +990,9 @@ static void ep_done_scan(struct eventpoll *ep,
ep_pm_stay_awake(epi);
}
}
+
+ write_seqcount_begin(&ep->seq);
+
/* Back out of scan mode; callbacks target ep->rdllist again. */
ep_exit_scan(ep);
@@ -986,6 +1000,9 @@ static void ep_done_scan(struct eventpoll *ep,
* Quickly re-inject items left on "scan_batch".
*/
list_splice(scan_batch, &ep->rdllist);
+
+ write_seqcount_end(&ep->seq);
+
__pm_relax(ep->ws);
if (!list_empty(&ep->rdllist)) {
@@ -1405,6 +1422,7 @@ static int ep_alloc(struct eventpoll **pep)
mutex_init(&ep->mtx);
spin_lock_init(&ep->lock);
+ seqcount_spinlock_init(&ep->seq, &ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
index 8bc57a2ef966..f6f1a7ff01b0 100644
--- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
+++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
@@ -3493,4 +3493,49 @@ TEST(epoll64)
close(ctx.sfd[1]);
}
+/*
+ * Thread body for the epoll65 test.
+ *
+ * Polls ctx->efd[0] 100000 times with a zero timeout (non-blocking) and
+ * expects every call to report a ready event.
+ *
+ * @ctx_: pointer to the shared struct epoll_mtcontext.
+ *
+ * Returns 0 on success, ENODATA if any call reports no ready events, or
+ * the errno value if epoll_wait() itself fails.
+ */
+static void *epoll65_wait(void *ctx_)
+{
+	struct epoll_mtcontext *ctx = ctx_;
+	struct epoll_event event;
+
+	for (int i = 0; i < 100000; ++i) {
+		int nr = epoll_wait(ctx->efd[0], &event, 1, 0);
+
+		/* A failing call must not be mistaken for "event seen". */
+		if (nr < 0)
+			return (void *)(uintptr_t)errno;
+
+		/* No event reported: this is the false negative under test. */
+		if (nr == 0)
+			return (void *)ENODATA;
+	}
+
+	return (void *)0;
+}
+
+/*
+ * epoll65: multiple concurrent waiters on one epoll instance.
+ *
+ * An eventfd is registered for EPOLLIN and written once. Nothing in this
+ * test reads it back, so it is expected to remain readable throughout.
+ * 64 threads then run epoll65_wait() against the same epoll fd, and each
+ * must return 0, i.e. no epoll_wait() call may report "no events".
+ */
+TEST(epoll65)
+{
+	struct epoll_mtcontext ctx;
+	/* Zero-initialize so event.data is not passed to the kernel as garbage. */
+	struct epoll_event event = { .events = EPOLLIN };
+	int64_t dummy_data = 99;
+	pthread_t threads[64];
+	uintptr_t ret;
+	int i, err;
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+	ctx.efd[1] = eventfd(0, 0);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	err = epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &event);
+	ASSERT_EQ(err, 0);
+
+	/*
+	 * Arm the eventfd. Check the write: if it failed, the waiters would
+	 * report ENODATA for a reason unrelated to the bug under test.
+	 */
+	ASSERT_EQ(write(ctx.efd[1], &dummy_data, sizeof(dummy_data)),
+		  (ssize_t)sizeof(dummy_data));
+
+	for (i = 0; i < ARRAY_SIZE(threads); ++i)
+		ASSERT_EQ(pthread_create(&threads[i], NULL, epoll65_wait, &ctx), 0);
+
+	/* Each waiter returns 0 on success or an errno-style code on failure. */
+	for (i = 0; i < ARRAY_SIZE(threads); ++i) {
+		ASSERT_EQ(pthread_join(threads[i], (void **)&ret), 0);
+		ASSERT_EQ(ret, 0);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+}
+
TEST_HARNESS_MAIN