From f6cfa602d2ba7e5ca9dc65ec4141521aca80bda2 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 11:13:23 +0200 Subject: workqueue: replace use of system_unbound_wq with system_dfl_wq Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. system_unbound_wq should be the default workqueue so as not to enforce locality constraints for random work whenever it's not required. Add system_dfl_wq to encourage its use when unbound work should be used. queue_work() / queue_delayed_work() / mod_delayed_work() will now use the new unbound wq: if the user still uses the old wq, a warning will be printed along with a wq redirect to the new one. The old system_unbound_wq will be kept for a few release cycles. 
Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/workqueue.h') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 45d5dd470ff6..af860e8f8481 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -783,8 +783,8 @@ extern void __warn_flushing_systemwide_wq(void) _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ _wq == system_long_wq) || \ - (__builtin_constant_p(_wq == system_unbound_wq) && \ - _wq == system_unbound_wq) || \ + (__builtin_constant_p(_wq == system_dfl_wq) && \ + _wq == system_dfl_wq) || \ (__builtin_constant_p(_wq == system_freezable_wq) && \ _wq == system_freezable_wq) || \ (__builtin_constant_p(_wq == system_power_efficient_wq) && \ -- cgit v1.2.3 From a2be943b46b4a7478ea8ddf9bb8e5251c59fceb7 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 11:13:24 +0200 Subject: workqueue: replace use of system_wq with system_percpu_wq Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. system_wq is a per-CPU workqueue, yet nothing in its name tells about that CPU affinity constraint, which is very often not required by users. Make it clear by adding a system_percpu_wq. queue_work() / queue_delayed_work() / mod_delayed_work() will now use the new per-cpu wq: if the user still sticks to the old name, a warning will be printed along with a wq redirect to the new one. This patch adds the new system_percpu_wq except for the mm, fs and net subsystems, which are handled in separate patches. 
The old wq will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux/workqueue.h') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index af860e8f8481..b6834b7aee4b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -434,10 +434,10 @@ enum wq_consts { * short queue flush time. Don't queue works which can run for too * long. * - * system_highpri_wq is similar to system_wq but for work items which + * system_highpri_wq is similar to system_percpu_wq but for work items which * require WQ_HIGHPRI. * - * system_long_wq is similar to system_wq but may host long running + * system_long_wq is similar to system_percpu_wq but may host long running * works. Queue flushing might take relatively long. * * system_dfl_wq is unbound workqueue. Workers are not bound to @@ -445,13 +445,13 @@ enum wq_consts { * executed immediately as long as max_active limit is not reached and * resources are available. * - * system_freezable_wq is equivalent to system_wq except that it's + * system_freezable_wq is equivalent to system_percpu_wq except that it's * freezable. * * *_power_efficient_wq are inclined towards saving power and converted * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, * they are same as their non-power-efficient counterparts - e.g. - * system_power_efficient_wq is identical to system_wq if + * system_power_efficient_wq is identical to system_percpu_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. * * system_bh[_highpri]_wq are convenience interface to softirq. 
BH work items @@ -708,7 +708,7 @@ static inline bool mod_delayed_work(struct workqueue_struct *wq, */ static inline bool schedule_work_on(int cpu, struct work_struct *work) { - return queue_work_on(cpu, system_wq, work); + return queue_work_on(cpu, system_percpu_wq, work); } /** @@ -727,7 +727,7 @@ static inline bool schedule_work_on(int cpu, struct work_struct *work) */ static inline bool schedule_work(struct work_struct *work) { - return queue_work(system_wq, work); + return queue_work(system_percpu_wq, work); } /** @@ -770,15 +770,15 @@ extern void __warn_flushing_systemwide_wq(void) #define flush_scheduled_work() \ ({ \ __warn_flushing_systemwide_wq(); \ - __flush_workqueue(system_wq); \ + __flush_workqueue(system_percpu_wq); \ }) #define flush_workqueue(wq) \ ({ \ struct workqueue_struct *_wq = (wq); \ \ - if ((__builtin_constant_p(_wq == system_wq) && \ - _wq == system_wq) || \ + if ((__builtin_constant_p(_wq == system_percpu_wq) && \ + _wq == system_percpu_wq) || \ (__builtin_constant_p(_wq == system_highpri_wq) && \ _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ @@ -807,7 +807,7 @@ extern void __warn_flushing_systemwide_wq(void) static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work_on(cpu, system_wq, dwork, delay); + return queue_delayed_work_on(cpu, system_percpu_wq, dwork, delay); } /** @@ -821,7 +821,7 @@ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, static inline bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work(system_wq, dwork, delay); + return queue_delayed_work(system_percpu_wq, dwork, delay); } #ifndef CONFIG_SMP -- cgit v1.2.3 From dadb3ebcf395ebee3626d88ac7e5e234f15bae2c Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Sun, 14 Sep 2025 15:44:26 +0200 Subject: workqueue: WQ_PERCPU added to alloc_workqueue users MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This patch adds a new WQ_PERCPU flag to explicitly request the use of the per-CPU behavior. Both flags coexist for one release cycle to allow callers to transition their calls. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. All existing users have been updated accordingly. 
Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/workqueue.h') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b6834b7aee4b..71a9900c03c7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -410,7 +410,7 @@ enum wq_flags { __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ /* BH wq only allows the following flags */ - __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI, + __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI | WQ_PERCPU, }; enum wq_consts { @@ -570,7 +570,7 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ - alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) + alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_PERCPU, 1, (name)) #define create_freezable_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ WQ_MEM_RECLAIM, 1, (name)) -- cgit v1.2.3 From 0950c64ae38661bd97127e9aa0522f1624f82006 Mon Sep 17 00:00:00 2001 From: Kriish Sharma Date: Mon, 22 Sep 2025 12:26:06 +0000 Subject: workqueue: fix texinfodocs warning for WQ_* flags reference Sphinx emitted a warning during make texinfodocs: WARNING: Inline literal start-string without end-string. This was caused by the trailing '*' in "%WQ_*" being parsed as reStructuredText markup in the kernel-doc comment. Escape the '*' in the comment so that Sphinx treats it as a literal character, resolving the warning. 
Signed-off-by: Kriish Sharma Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/workqueue.h') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 71a9900c03c7..dabc351cc127 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -502,7 +502,7 @@ void workqueue_softirq_dead(unsigned int cpu); * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means * that the sum of per-node max_active's may be larger than @max_active. * - * For detailed information on %WQ_* flags, please refer to + * For detailed information on %WQ_\* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: -- cgit v1.2.3 From 23f09dcc0a0fa3b4e48516bdea1c90223dfb3d6c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 28 May 2025 18:19:23 +0200 Subject: cpuset: Propagate cpuset isolation update to workqueue through housekeeping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Until now, cpuset would propagate isolated partition changes to workqueues so that unbound workers get properly reaffined. Since housekeeping now centralizes, synchronizes and propagates isolation cpumask changes, perform the work from that subsystem for consolidation and consistency purposes. For simplification purposes, the target function is adapted to take the new housekeeping mask instead of the isolated mask. 
Suggested-by: Tejun Heo Signed-off-by: Frederic Weisbecker Reviewed-by: Waiman Long Acked-by: Tejun Heo Cc: "Michal Koutný" Cc: Ingo Molnar Cc: Johannes Weiner Cc: Lai Jiangshan Cc: Marco Crivellari Cc: Michal Hocko Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Waiman Long Cc: cgroups@vger.kernel.org --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/workqueue.h') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index dabc351cc127..a4749f56398f 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -588,7 +588,7 @@ struct workqueue_attrs *alloc_workqueue_attrs_noprof(void); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); -extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask); +extern int workqueue_unbound_housekeeping_update(const struct cpumask *hk); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); -- cgit v1.2.3 From 4a91a33f15c634fb3477d122bdf1eef098d77ee3 Mon Sep 17 00:00:00 2001 From: Mallesh Koujalagi Date: Fri, 27 Feb 2026 14:24:01 +0530 Subject: workqueue: Update documentation as per system_percpu_wq naming Update documentation to use "per-CPU workqueue" instead of "global workqueue" to match the system_wq to system_percpu_wq rename. The workqueue behavior remains unchanged; this just aligns terminology with the clearer naming. 
Fixes: a2be943b46b4 ("workqueue: replace use of system_wq with system_percpu_wq") Signed-off-by: Mallesh Koujalagi Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux/workqueue.h') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a4749f56398f..fc5744402a66 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -712,14 +712,14 @@ static inline bool schedule_work_on(int cpu, struct work_struct *work) } /** - * schedule_work - put work task in global workqueue + * schedule_work - put work task in per-CPU workqueue * @work: job to be done * - * Returns %false if @work was already on the kernel-global workqueue and + * Returns %false if @work was already on the system per-CPU workqueue and * %true otherwise. * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global + * This puts a job in the system per-CPU workqueue if it was not already + * queued and leaves it in the same position on the system per-CPU * workqueue otherwise. * * Shares the same memory-ordering properties of queue_work(), cf. the @@ -796,12 +796,12 @@ extern void __warn_flushing_systemwide_wq(void) }) /** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * schedule_delayed_work_on - queue work in per-CPU workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done * @delay: number of jiffies to wait * - * After waiting for a given time this puts a job in the kernel-global + * After waiting for a given time this puts a job in the system per-CPU * workqueue on the specified CPU. 
*/ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, @@ -811,11 +811,11 @@ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, } /** - * schedule_delayed_work - put work task in global workqueue after delay + * schedule_delayed_work - put work task in per-CPU workqueue after delay * @dwork: job to be done * @delay: number of jiffies to wait or 0 for immediate execution * - * After waiting for a given time this puts a job in the kernel-global + * After waiting for a given time this puts a job in the system per-CPU * workqueue. */ static inline bool schedule_delayed_work(struct delayed_work *dwork, -- cgit v1.2.3 From c116737e972ea74f4468a1bd0703d623a3c0ee4a Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Mon, 9 Mar 2026 14:15:28 +0100 Subject: workqueue: Add system_dfl_long_wq for long unbound works Currently there are users of queue_delayed_work() who specify system_long_wq, the per-cpu workqueue. This workqueue should be used for long per-cpu works, but queue_delayed_work() queues the work using: queue_delayed_work_on(WORK_CPU_UNBOUND, ...); This would end up calling __queue_delayed_work(), which does: if (housekeeping_enabled(HK_TYPE_TIMER)) { // [....] } else { if (likely(cpu == WORK_CPU_UNBOUND)) add_timer_global(timer); else add_timer_on(timer, cpu); } So when cpu == WORK_CPU_UNBOUND the timer is global and is not using a specific CPU. Later, when __queue_work() is called: if (req_cpu == WORK_CPU_UNBOUND) { if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); else cpu = raw_smp_processor_id(); } Because the wq is not unbound, it takes the CPU where the timer fired and enqueues the work on that CPU. The consequence of all of this is that the work can run anywhere, depending on where the timer fired. 
Introduce system_dfl_long_wq in order to change, in a future step, users that are still calling: queue_delayed_work(system_long_wq, ...); with the new system_dfl_long_wq instead, so that the work may benefit from scheduler task placement. Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/workqueue.h') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index fc5744402a66..8e0855d56e74 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -440,6 +440,9 @@ enum wq_consts { * system_long_wq is similar to system_percpu_wq but may host long running * works. Queue flushing might take relatively long. * + * system_dfl_long_wq is similar to system_dfl_wq but it may host long running + * works. + * * system_dfl_wq is unbound workqueue. Workers are not bound to * any specific CPU, not concurrency managed, and all queued works are * executed immediately as long as max_active limit is not reached and @@ -468,6 +471,7 @@ extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; extern struct workqueue_struct *system_bh_wq; extern struct workqueue_struct *system_bh_highpri_wq; +extern struct workqueue_struct *system_dfl_long_wq; void workqueue_softirq_action(bool highpri); void workqueue_softirq_dead(unsigned int cpu); @@ -783,6 +787,8 @@ extern void __warn_flushing_systemwide_wq(void) _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ _wq == system_long_wq) || \ + (__builtin_constant_p(_wq == system_dfl_long_wq) && \ + _wq == system_dfl_long_wq) || \ (__builtin_constant_p(_wq == system_dfl_wq) && \ _wq == system_dfl_wq) || \ (__builtin_constant_p(_wq == system_freezable_wq) && \ -- cgit v1.2.3 From 1dfc9d60a69ec148e1cb709256617d86e5f0e8f8 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 5 Mar 2026 22:45:40 +0100 Subject: 
workqueue: devres: Add device-managed allocate workqueue Add a Resource-managed version of alloc_workqueue() to fix common problem of drivers mixing devm() calls with destroy_workqueue. Such naive and discouraged driver approach leads to difficult to debug bugs when the driver: 1. Allocates workqueue in standard way and destroys it in driver remove() callback, 2. Sets work struct with devm_work_autocancel(), 3. Registers interrupt handler with devm_request_threaded_irq(). Which leads to following unbind/removal path: 1. destroy_workqueue() via driver remove(), Any interrupt coming now would still execute the interrupt handler, which queues work on destroyed workqueue. 2. devm_irq_release(), 3. devm_work_drop() -> cancel_work_sync() on destroyed workqueue. devm_alloc_workqueue() has two benefits: 1. Solves above problem of mix-and-match devres and non-devres code in driver, 2. Simplify any sane drivers which were correctly using alloc_workqueue() + devm_add_action_or_reset(). Signed-off-by: Krzysztof Kozlowski Acked-by: Tejun Heo Reviewed-by: Andy Shevchenko Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux/workqueue.h') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a4749f56398f..f8d235aef10d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -512,6 +512,26 @@ __printf(1, 4) struct workqueue_struct * alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...); #define alloc_workqueue(...) alloc_hooks(alloc_workqueue_noprof(__VA_ARGS__)) +/** + * devm_alloc_workqueue - Resource-managed allocate a workqueue + * @dev: Device to allocate workqueue for + * @fmt: printf format for the name of the workqueue + * @flags: WQ_* flags + * @max_active: max in-flight work items, 0 for default + * @...: args for @fmt + * + * Resource managed workqueue, see alloc_workqueue() for details. 
+ * + * The workqueue will be automatically destroyed on driver detach. Typically + * this should be used in drivers already relying on devm interfaces. + * + * RETURNS: + * Pointer to the allocated workqueue on success, %NULL on failure. + */ +__printf(2, 5) struct workqueue_struct * +devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, + int max_active, ...); + #ifdef CONFIG_LOCKDEP /** * alloc_workqueue_lockdep_map - allocate a workqueue with user-defined lockdep_map @@ -568,6 +588,8 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, */ #define alloc_ordered_workqueue(fmt, flags, args...) \ alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) +#define devm_alloc_ordered_workqueue(dev, fmt, flags, args...) \ + devm_alloc_workqueue(dev, fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_PERCPU, 1, (name)) -- cgit v1.2.3 From 9dc42c9070282c81058a875fea5acae057610980 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 1 Apr 2026 06:03:52 -0700 Subject: workqueue: fix typo in WQ_AFFN_SMT comment Fix "poer" -> "per" in the WQ_AFFN_SMT enum comment. 
Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/workqueue.h') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 9f971912c6be..75634a09576a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -131,7 +131,7 @@ struct rcu_work { enum wq_affn_scope { WQ_AFFN_DFL, /* use system default */ WQ_AFFN_CPU, /* one pod per CPU */ - WQ_AFFN_SMT, /* one pod poer SMT */ + WQ_AFFN_SMT, /* one pod per SMT */ WQ_AFFN_CACHE, /* one pod per LLC */ WQ_AFFN_NUMA, /* one pod per NUMA node */ WQ_AFFN_SYSTEM, /* one pod across the whole system */ -- cgit v1.2.3 From 5920d046f7ae3bf9cf51b9d915c1fff13d299d84 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 1 Apr 2026 06:03:53 -0700 Subject: workqueue: add WQ_AFFN_CACHE_SHARD affinity scope On systems where many CPUs share one LLC, unbound workqueues using WQ_AFFN_CACHE collapse to a single worker pool, causing heavy spinlock contention on pool->lock. For example, Chuck Lever measured 39% of cycles lost to native_queued_spin_lock_slowpath on a 12-core shared-L3 NFS-over-RDMA system. The existing affinity hierarchy (cpu, smt, cache, numa, system) offers no intermediate option between per-LLC and per-SMT-core granularity. Add WQ_AFFN_CACHE_SHARD, which subdivides each LLC into groups of at most wq_cache_shard_size cores (default 8, tunable via boot parameter). Shards are always split on core (SMT group) boundaries so that Hyper-Threading siblings are never placed in different pods. Cores are distributed across shards as evenly as possible -- for example, 36 cores in a single LLC with max shard size 8 produces 5 shards of 8+7+7+7+7 cores. 
The implementation follows the same comparator pattern as other affinity scopes: precompute_cache_shard_ids() pre-fills the cpu_shard_id[] array from the already-initialized WQ_AFFN_CACHE and WQ_AFFN_SMT topology, and cpus_share_cache_shard() is passed to init_pod_type(). Benchmarks on NVIDIA Grace (72 CPUs, single LLC, 50k items/thread) show cache_shard delivers ~5x the throughput and ~6.5x lower p50 latency compared to cache scope on this 72-core single-LLC system. Suggested-by: Tejun Heo Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/workqueue.h') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 75634a09576a..ab6cb70ca1a5 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -133,6 +133,7 @@ enum wq_affn_scope { WQ_AFFN_CPU, /* one pod per CPU */ WQ_AFFN_SMT, /* one pod per SMT */ WQ_AFFN_CACHE, /* one pod per LLC */ + WQ_AFFN_CACHE_SHARD, /* synthetic sub-LLC shards */ WQ_AFFN_NUMA, /* one pod per NUMA node */ WQ_AFFN_SYSTEM, /* one pod across the whole system */ -- cgit v1.2.3