From f42f9091be9e5ff57567a3945cfcdd498f475348 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 5 Mar 2026 08:15:37 -0800 Subject: workqueue: Use POOL_BH instead of WQ_BH when checking pool flags pr_cont_worker_id() checks pool->flags against WQ_BH, which is a workqueue-level flag (defined in workqueue.h). Pool flags use a separate namespace with POOL_* constants (defined in workqueue.c). The correct constant is POOL_BH. Both WQ_BH and POOL_BH are defined as (1 << 0) so this has no behavioral impact, but it is semantically wrong and inconsistent with every other pool-level BH check in the file. Fixes: 4cb1ef64609f ("workqueue: Implement BH workqueues to eventually replace tasklets") Signed-off-by: Breno Leitao Acked-by: Song Liu Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index aeaec79bc09c..1e5b6cb0fbda 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6274,7 +6274,7 @@ static void pr_cont_worker_id(struct worker *worker) { struct worker_pool *pool = worker->pool; - if (pool->flags & WQ_BH) + if (pool->flags & POOL_BH) pr_cont("bh%s", pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : ""); else -- cgit v1.2.3 From 6037160e52d72028da68546fd270a7dcac130d85 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 5 Mar 2026 08:15:38 -0800 Subject: workqueue: Rename pool->watchdog_ts to pool->last_progress_ts The watchdog_ts name doesn't convey what the timestamp actually tracks. This field tracks the last time a workqueue got progress. Rename it to last_progress_ts to make it clear that it records when the pool last made forward progress (started processing new work items). No functional change. Signed-off-by: Breno Leitao Acked-by: Song Liu Signed-off-by: Tejun Heo --- kernel/workqueue.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1e5b6cb0fbda..687d5c55c617 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -190,7 +190,7 @@ struct worker_pool { int id; /* I: pool ID */ unsigned int flags; /* L: flags */ - unsigned long watchdog_ts; /* L: watchdog timestamp */ + unsigned long last_progress_ts; /* L: last forward progress timestamp */ bool cpu_stall; /* WD: stalled cpu bound pool */ /* @@ -1697,7 +1697,7 @@ static void __pwq_activate_work(struct pool_workqueue *pwq, WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE)); trace_workqueue_activate_work(work); if (list_empty(&pwq->pool->worklist)) - pwq->pool->watchdog_ts = jiffies; + pwq->pool->last_progress_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb); } @@ -2348,7 +2348,7 @@ retry: */ if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) { if (list_empty(&pool->worklist)) - pool->watchdog_ts = jiffies; + pool->last_progress_ts = jiffies; trace_workqueue_activate_work(work); insert_work(pwq, work, &pool->worklist, work_flags); @@ -3352,7 +3352,7 @@ static void process_scheduled_works(struct worker *worker) while ((work = list_first_entry_or_null(&worker->scheduled, struct work_struct, entry))) { if (first) { - worker->pool->watchdog_ts = jiffies; + worker->pool->last_progress_ts = jiffies; first = false; } process_one_work(worker, work); @@ -4850,7 +4850,7 @@ static int init_worker_pool(struct worker_pool *pool) pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; - pool->watchdog_ts = jiffies; + pool->last_progress_ts = jiffies; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); @@ -6462,7 +6462,7 @@ static void show_one_worker_pool(struct worker_pool *pool) /* How long the first pending work is waiting for a worker. */ if (!list_empty(&pool->worklist)) - hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000; + hung = jiffies_to_msecs(jiffies - pool->last_progress_ts) / 1000; /* * Defer printing to avoid deadlocks in console drivers that @@ -7691,7 +7691,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)); else touched = READ_ONCE(wq_watchdog_touched); - pool_ts = READ_ONCE(pool->watchdog_ts); + pool_ts = READ_ONCE(pool->last_progress_ts); if (time_after(pool_ts, touched)) ts = pool_ts; -- cgit v1.2.3 From e8e14ac7cfe437b896838e7f7d07c573965b4e4e Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 5 Mar 2026 08:15:39 -0800 Subject: workqueue: Show in-flight work item duration in stall diagnostics When diagnosing workqueue stalls, knowing how long each in-flight work item has been executing is valuable. Add a current_start timestamp (jiffies) to struct worker, set it when a work item begins execution in process_one_work(), and print the elapsed wall-clock time in show_pwq(). Unlike current_at (which tracks CPU runtime and resets on wakeup for CPU-intensive detection), current_start is never reset because the diagnostic cares about total wall-clock time including sleeps. Before: in-flight: 165:stall_work_fn [wq_stall] After: in-flight: 165:stall_work_fn [wq_stall] for 100s Signed-off-by: Breno Leitao Acked-by: Song Liu Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 687d5c55c617..56d8af13843f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3204,6 +3204,7 @@ __acquires(&pool->lock) worker->current_pwq = pwq; if (worker->task) worker->current_at = worker->task->se.sum_exec_runtime; + worker->current_start = jiffies; work_data = *work_data_bits(work); worker->current_color = get_work_color(work_data); @@ -6359,6 +6360,8 @@ static void show_pwq(struct pool_workqueue *pwq) pr_cont(" %s", comma ? "," : ""); pr_cont_worker_id(worker); pr_cont(":%ps", worker->current_func); + pr_cont(" for %us", + jiffies_to_msecs(jiffies - worker->current_start) / 1000); list_for_each_entry(work, &worker->scheduled, entry) pr_cont_work(false, work, &pcws); pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); -- cgit v1.2.3 From 8823eaef45da7f156a1396f40d53b985c511edef Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 5 Mar 2026 08:15:40 -0800 Subject: workqueue: Show all busy workers in stall diagnostics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit show_cpu_pool_hog() only prints workers whose task is currently running on the CPU (task_is_running()). This misses workers that are busy processing a work item but are sleeping or blocked — for example, a worker that clears PF_WQ_WORKER and enters wait_event_idle(). Such a worker still occupies a pool slot and prevents progress, yet produces an empty backtrace section in the watchdog output. This is happening on real arm64 systems, where toggle_allocation_gate() IPIs every single CPU in the machine (which lacks NMI), causing workqueue stalls that show empty backtraces because toggle_allocation_gate() is sleeping in wait_event_idle(). Remove the task_is_running() filter so every in-flight worker in the pool's busy_hash is dumped. The busy_hash is protected by pool->lock, which is already held. Signed-off-by: Breno Leitao Acked-by: Song Liu Signed-off-by: Tejun Heo --- kernel/workqueue.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 56d8af13843f..09b9ad78d566 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7583,9 +7583,9 @@ MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds /* * Show workers that might prevent the processing of pending work items. - * The only candidates are CPU-bound workers in the running state. - * Pending work items should be handled by another idle worker - * in all other situations. + * A busy worker that is not running on the CPU (e.g. sleeping in + * wait_event_idle() with PF_WQ_WORKER cleared) can stall the pool just as + * effectively as a CPU-bound one, so dump every in-flight worker. */ static void show_cpu_pool_hog(struct worker_pool *pool) { @@ -7596,19 +7596,17 @@ static void show_cpu_pool_hog(struct worker_pool *pool) raw_spin_lock_irqsave(&pool->lock, irq_flags); hash_for_each(pool->busy_hash, bkt, worker, hentry) { - if (task_is_running(worker->task)) { - /* - * Defer printing to avoid deadlocks in console - * drivers that queue work while holding locks - * also taken in their write paths. - */ - printk_deferred_enter(); + /* + * Defer printing to avoid deadlocks in console + * drivers that queue work while holding locks + * also taken in their write paths. + */ + printk_deferred_enter(); - pr_info("pool %d:\n", pool->id); - sched_show_task(worker->task); + pr_info("pool %d:\n", pool->id); + sched_show_task(worker->task); - printk_deferred_exit(); - } + printk_deferred_exit(); } raw_spin_unlock_irqrestore(&pool->lock, irq_flags); @@ -7619,7 +7617,7 @@ static void show_cpu_pools_hogs(void) struct worker_pool *pool; int pi; - pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n"); + pr_info("Showing backtraces of busy workers in stalled CPU-bound worker pools:\n"); rcu_read_lock(); -- cgit v1.2.3 From 98c790b100764102d877e9339471b8c4c9233f2c Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 6 Mar 2026 02:46:42 -0800 Subject: workqueue: Rename show_cpu_pool{s,}_hog{s,}() to reflect broadened scope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit show_cpu_pool_hog() and show_cpu_pools_hogs() no longer only dump CPU hogs — since commit 8823eaef45da ("workqueue: Show all busy workers in stall diagnostics"), they dump every in-flight worker in the pool's busy_hash. Rename them to show_cpu_pool_busy_workers() and show_cpu_pools_busy_workers() to accurately describe what they do. Also fix the pr_info() message to say "stalled worker pools" instead of "stalled CPU-bound worker pools", since sleeping/blocked workers are now included. No functional change. Suggested-by: Tejun Heo Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 09b9ad78d566..b77119d71641 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7587,7 +7587,7 @@ MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds * wait_event_idle() with PF_WQ_WORKER cleared) can stall the pool just as * effectively as a CPU-bound one, so dump every in-flight worker. */ -static void show_cpu_pool_hog(struct worker_pool *pool) +static void show_cpu_pool_busy_workers(struct worker_pool *pool) { struct worker *worker; unsigned long irq_flags; @@ -7612,18 +7612,18 @@ static void show_cpu_pool_hog(struct worker_pool *pool) raw_spin_unlock_irqrestore(&pool->lock, irq_flags); } -static void show_cpu_pools_hogs(void) +static void show_cpu_pools_busy_workers(void) { struct worker_pool *pool; int pi; - pr_info("Showing backtraces of busy workers in stalled CPU-bound worker pools:\n"); + pr_info("Showing backtraces of busy workers in stalled worker pools:\n"); rcu_read_lock(); for_each_pool(pool, pi) { if (pool->cpu_stall) - show_cpu_pool_hog(pool); + show_cpu_pool_busy_workers(pool); } @@ -7720,7 +7720,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) show_all_workqueues(); if (cpu_pool_stall) - show_cpu_pools_hogs(); + show_cpu_pools_busy_workers(); if (lockup_detected) panic_on_wq_watchdog(max_stall_time); -- cgit v1.2.3 From c7f27a8ab9f2f43570f0725256597a0d7abe2c5b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Sat, 21 Mar 2026 20:30:45 -0700 Subject: workqueue: Fix false positive stall reports On weakly ordered architectures (e.g., arm64), the lockless check in wq_watchdog_timer_fn() can observe a reordering between the worklist insertion and the last_progress_ts update. Specifically, the watchdog can see a non-empty worklist (from a list_add) while reading a stale last_progress_ts value, causing a false positive stall report. This was confirmed by reading pool->last_progress_ts again after holding pool->lock in wq_watchdog_timer_fn(): workqueue watchdog: pool 7 false positive detected! lockless_ts=4784580465 locked_ts=4785033728 diff=453263ms worklist_empty=0 To avoid slowing down the hot path (queue_work, etc.), recheck last_progress_ts with pool->lock held. This will eliminate the false positive with minimal overhead. Remove two extra empty lines in wq_watchdog_timer_fn() as we are on it. Fixes: 82607adcf9cd ("workqueue: implement lockup detector") Cc: stable@vger.kernel.org # v4.5+ Assisted-by: claude-code:claude-opus-4-6 Signed-off-by: Song Liu Signed-off-by: Tejun Heo --- kernel/workqueue.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b77119d71641..ff97b705f25e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7699,8 +7699,28 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) else ts = touched; - /* did we stall? */ + /* + * Did we stall? + * + * Do a lockless check first. On weakly ordered + * architectures, the lockless check can observe a + * reordering between worklist insert_work() and + * last_progress_ts update from __queue_work(). Since + * __queue_work() is a much hotter path than the timer + * function, we handle false positive here by reading + * last_progress_ts again with pool->lock held. + */ if (time_after(now, ts + thresh)) { + scoped_guard(raw_spinlock_irqsave, &pool->lock) { + pool_ts = pool->last_progress_ts; + if (time_after(pool_ts, touched)) + ts = pool_ts; + else + ts = touched; + } + if (!time_after(now, ts + thresh)) + continue; + lockup_detected = true; stall_time = jiffies_to_msecs(now - pool_ts) / 1000; max_stall_time = max(max_stall_time, stall_time); @@ -7712,8 +7732,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) pr_cont_pool_info(pool); pr_cont(" stuck for %us!\n", stall_time); } - - } if (lockup_detected) -- cgit v1.2.3 From e398978ddf18fe5a2fc8299c77e6fe50e6c306c4 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 25 Mar 2026 13:34:18 +0100 Subject: workqueue: Better describe stall check Try to be more explicit why the workqueue watchdog does not take pool->lock by default. Spin locks are full memory barriers which delay anything. Obviously, they would primary delay operations on the related worker pools. Explain why it is enough to prevent the false positive by re-checking the timestamp under the pool->lock. Finally, make it clear what would be the alternative solution in __queue_work() which is a hotter path. Signed-off-by: Petr Mladek Acked-by: Song Liu Signed-off-by: Tejun Heo --- kernel/workqueue.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ff97b705f25e..eda756556341 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7702,13 +7702,14 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) /* * Did we stall? * - * Do a lockless check first. On weakly ordered - * architectures, the lockless check can observe a - * reordering between worklist insert_work() and - * last_progress_ts update from __queue_work(). Since - * __queue_work() is a much hotter path than the timer - * function, we handle false positive here by reading - * last_progress_ts again with pool->lock held. + * Do a lockless check first to do not disturb the system. + * + * Prevent false positives by double checking the timestamp + * under pool->lock. The lock makes sure that the check reads + * an updated pool->last_progress_ts when this CPU saw + * an already updated pool->worklist above. It seems better + * than adding another barrier into __queue_work() which + * is a hotter path. */ if (time_after(now, ts + thresh)) { scoped_guard(raw_spinlock_irqsave, &pool->lock) { -- cgit v1.2.3 From 703ccb63ae9f7444d6ff876d024e17f628103c69 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Tue, 31 Mar 2026 18:07:39 -0700 Subject: workqueue: Add pool_workqueue to pending_pwqs list when unplugging multiple inactive works MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In unplug_oldest_pwq(), the first inactive work item on the pool_workqueue is activated correctly. However, if multiple inactive works exist on the same pool_workqueue, subsequent works fail to activate because wq_node_nr_active.pending_pwqs is empty — the list insertion is skipped when the pool_workqueue is plugged. Fix this by checking for additional inactive works in unplug_oldest_pwq() and updating wq_node_nr_active.pending_pwqs accordingly. Fixes: 4c065dbce1e8 ("workqueue: Enable unbound cpumask update on ordered workqueues") Cc: stable@vger.kernel.org Cc: Carlos Santa Cc: Ryan Neph Cc: Lai Jiangshan Cc: Waiman Long Cc: linux-kernel@vger.kernel.org Signed-off-by: Matthew Brost Signed-off-by: Tejun Heo Acked-by: Waiman Long --- kernel/workqueue.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index eda756556341..c6ea96d5b716 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1849,8 +1849,20 @@ static void unplug_oldest_pwq(struct workqueue_struct *wq) raw_spin_lock_irq(&pwq->pool->lock); if (pwq->plugged) { pwq->plugged = false; - if (pwq_activate_first_inactive(pwq, true)) + if (pwq_activate_first_inactive(pwq, true)) { + /* + * While plugged, queueing skips activation which + * includes bumping the nr_active count and adding the + * pwq to nna->pending_pwqs if the count can't be + * obtained. We need to restore both for the pwq being + * unplugged. The first call activates the first + * inactive work item and the second, if there are more + * inactive, puts the pwq on pending_pwqs. + */ + pwq_activate_first_inactive(pwq, false); + kick_pool(pwq->pool); + } } raw_spin_unlock_irq(&pwq->pool->lock); } -- cgit v1.2.3