summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2026-05-15 12:34:02 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2026-05-15 12:34:02 -0700
commit: ee7226b2ae3beff5d8feffa94e5fd06af6965e52 (patch)
tree: 879de35d9f4e791daae5f8588775d553888601c3
parent: 78e8370033bfe08481212ceead113ccb668b83cb (diff)
parent: f44d38a31f1802b7222adaea9ee69f9d280f698a (diff)
Merge tag 'io_uring-7.1-20260515' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring fixes from Jens Axboe:

 - Small series sanitizing the locking done for either modifying or
   reading a chain of requests

 - If the application has a pid namespace, ensure that the sqthread pid
   is correctly printed in fdinfo

 - Fix for a hashing issue in the io-wq thread pool, which could lead to
   a use-after-free

 - Kill dead argument from io_prep_rw_pi()

 - Fix for a missed validation of the CQ ring head, affecting CQE refill

* tag 'io_uring-7.1-20260515' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  io_uring: validate user-controlled cq.head in io_cqe_cache_refill()
  io-wq: check that the predecessor is hashed in io_wq_remove_pending()
  io_uring/rw: drop unused attr_type_mask from io_prep_rw_pi()
  io_uring: hold uring_lock across io_kill_timeouts() in cancel path
  io_uring: defer linked-timeout chain splice out of hrtimer context
  io_uring: hold uring_lock when walking link chain in io_wq_free_work()
  io_uring/fdinfo: translate SqThread PID through caller's pid_ns
-rw-r--r--  io_uring/cancel.c     2
-rw-r--r--  io_uring/fdinfo.c     3
-rw-r--r--  io_uring/io-wq.c      3
-rw-r--r--  io_uring/io_uring.c  29
-rw-r--r--  io_uring/rw.c         4
-rw-r--r--  io_uring/timeout.c   16
6 files changed, 44 insertions, 13 deletions
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 5e5eb9cfc7cd..4aa3103ba9c3 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -561,8 +561,8 @@ __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
ret |= io_waitid_remove_all(ctx, tctx, cancel_all);
ret |= io_futex_remove_all(ctx, tctx, cancel_all);
ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all);
- mutex_unlock(&ctx->uring_lock);
ret |= io_kill_timeouts(ctx, tctx, cancel_all);
+ mutex_unlock(&ctx->uring_lock);
if (tctx)
ret |= io_run_task_work() > 0;
else
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index c2d3e45544bb..001fb542dc11 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -190,8 +190,9 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
get_task_struct(tsk);
rcu_read_unlock();
usec = io_sq_cpu_usec(tsk);
+ sq_pid = task_pid_nr_ns(tsk,
+ proc_pid_ns(file_inode(m->file)->i_sb));
put_task_struct(tsk);
- sq_pid = sq->task_pid;
sq_cpu = sq->sq_cpu;
sq_total_time = usec;
sq_work_time = sq->work_time;
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 7a9f94a0ce6f..8cc7b47d3089 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -1124,7 +1124,8 @@ static inline void io_wq_remove_pending(struct io_wq *wq,
if (io_wq_is_hashed(work) && work == wq->hash_tail[hash]) {
if (prev)
prev_work = container_of(prev, struct io_wq_work, list);
- if (prev_work && io_get_work_hash(prev_work) == hash)
+ if (prev_work && io_wq_is_hashed(prev_work) &&
+ io_get_work_hash(prev_work) == hash)
wq->hash_tail[hash] = prev_work;
else
wq->hash_tail[hash] = NULL;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4ed998d60c09..036145ee466c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -687,12 +687,26 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
}
/*
+ * Compute queued CQEs for free-space calculation, clamped to cq_entries.
+ */
+static unsigned int io_cqring_queued(struct io_ring_ctx *ctx)
+{
+ struct io_rings *rings = io_get_rings(ctx);
+ int diff;
+
+ diff = (int)(ctx->cached_cq_tail - READ_ONCE(rings->cq.head));
+ if (diff >= 0)
+ return min((unsigned int)diff, ctx->cq_entries);
+ return 0;
+}
+
+/*
* Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE
* because the ring is a single 16b entry away from wrapping.
*/
static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
{
- if (__io_cqring_events(ctx) < ctx->cq_entries) {
+ if (io_cqring_queued(ctx) < ctx->cq_entries) {
struct io_uring_cqe *cqe = &ctx->rings->cqes[off];
cqe->user_data = 0;
@@ -713,7 +727,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
{
struct io_rings *rings = ctx->rings;
unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
- unsigned int free, queued, len;
+ unsigned int free, len;
/*
* Posting into the CQ when there are pending overflowed CQEs may break
@@ -733,9 +747,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
off = 0;
}
- /* userspace may cheat modifying the tail, be safe and do min */
- queued = min(__io_cqring_events(ctx), ctx->cq_entries);
- free = ctx->cq_entries - queued;
+ free = ctx->cq_entries - io_cqring_queued(ctx);
/* we need a contiguous range, limit based on the current array offset */
len = min(free, ctx->cq_entries - off);
if (len < (cqe32 + 1))
@@ -1452,8 +1464,13 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
struct io_kiocb *nxt = NULL;
if (req_ref_put_and_test_atomic(req)) {
- if (req->flags & IO_REQ_LINK_FLAGS)
+ if (req->flags & IO_REQ_LINK_FLAGS) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ mutex_lock(&ctx->uring_lock);
nxt = io_req_find_next(req);
+ mutex_unlock(&ctx->uring_lock);
+ }
io_free_req(req);
}
return nxt ? &nxt->work : NULL;
diff --git a/io_uring/rw.c b/io_uring/rw.c
index e729e0e7657e..0c4834645279 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -230,7 +230,7 @@ static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb)
}
static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir,
- u64 attr_ptr, u64 attr_type_mask)
+ u64 attr_ptr)
{
struct io_uring_attr_pi pi_attr;
struct io_async_rw *io;
@@ -305,7 +305,7 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return -EINVAL;
attr_ptr = READ_ONCE(sqe->attr_ptr);
- return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask);
+ return io_prep_rw_pi(req, rw, ddir, attr_ptr);
}
return 0;
}
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index e2595cae2b07..6353a4d979dc 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -284,6 +284,10 @@ static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
struct io_timeout *timeout = io_kiocb_to_cmd(link, struct io_timeout);
io_remove_next_linked(req);
+
+ /* If this is NULL, then timer already claimed it and will complete it */
+ if (!timeout->head)
+ return NULL;
timeout->head = NULL;
if (hrtimer_try_to_cancel(&io->timer) != -1) {
list_del(&timeout->list);
@@ -367,6 +371,14 @@ static void io_req_task_link_timeout(struct io_tw_req tw_req, io_tw_token_t tw)
int ret;
if (prev) {
+ /*
+ * splice the linked timeout out of prev's chain if the regular
+ * completion path didn't already do it.
+ */
+ if (prev->link == req)
+ prev->link = req->link;
+ req->link = NULL;
+
if (!tw.cancel) {
struct io_cancel_data cd = {
.ctx = req->ctx,
@@ -401,10 +413,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
/*
* We don't expect the list to be empty, that will only happen if we
- * race with the completion of the linked work.
+ * race with the completion of the linked work. Splice of prev is
+ * done in io_req_task_link_timeout(), if needed.
*/
if (prev) {
- io_remove_next_linked(prev);
if (!req_ref_inc_not_zero(prev))
prev = NULL;
}