From a4aa8d94f24317338cf6f62eb3267ad99a2ff7f7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 27 Feb 2026 18:01:02 +0100 Subject: workqueue: Allow to expose ordered workqueues via sysfs Ordered workqueues are not exposed via sysfs because the 'max_active' attribute changes the number of active workers. More than one active worker can break ordering guarantees. This can be avoided by forbidding writes to the file for ordered workqueues. Exposing it via sysfs allows altering other attributes such as the cpumask on which CPU the worker can run. The 'max_active' value shouldn't be changed for BH workers because the core never spawns additional workers and the worker itself cannot be preempted. So this makes no sense. Allow to expose ordered workqueues via sysfs if requested and forbid changing the 'max_active' value for ordered and BH workers. Signed-off-by: Sebastian Andrzej Siewior Acked-by: Tejun Heo Acked-by: Ard Biesheuvel Signed-off-by: Tejun Heo --- kernel/workqueue.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index aeaec79bc09c..2f95cb0d2f1b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7176,7 +7176,26 @@ static struct attribute *wq_sysfs_attrs[] = { &dev_attr_max_active.attr, NULL, }; -ATTRIBUTE_GROUPS(wq_sysfs); + +static umode_t wq_sysfs_is_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct workqueue_struct *wq = dev_to_wq(dev); + + /* + * Adjusting max_active breaks ordering guarantee. Changing it has no + * effect on BH worker. Limit max_active to RO in such case. 
+ */ + if (wq->flags & (WQ_BH | __WQ_ORDERED)) + return 0444; + return a->mode; +} + +static const struct attribute_group wq_sysfs_group = { + .is_visible = wq_sysfs_is_visible, + .attrs = wq_sysfs_attrs, +}; +__ATTRIBUTE_GROUPS(wq_sysfs); static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -7479,13 +7498,6 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) struct wq_device *wq_dev; int ret; - /* - * Adjusting max_active breaks ordering guarantee. Disallow exposing - * ordered workqueues. - */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) - return -EINVAL; - wq->wq_dev = wq_dev = kzalloc_obj(*wq_dev); if (!wq_dev) return -ENOMEM; -- cgit v1.2.3 From c116737e972ea74f4468a1bd0703d623a3c0ee4a Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Mon, 9 Mar 2026 14:15:28 +0100 Subject: workqueue: Add system_dfl_long_wq for long unbound works Currently there are users of queue_delayed_work() who specify system_long_wq, the per-cpu workqueue. This workqueue should be used for long per-cpu works, but queue_delayed_work() queue the work using: queue_delayed_work_on(WORK_CPU_UNBOUND, ...); This would end up calling __queue_delayed_work() that does: if (housekeeping_enabled(HK_TYPE_TIMER)) { // [....] } else { if (likely(cpu == WORK_CPU_UNBOUND)) add_timer_global(timer); else add_timer_on(timer, cpu); } So when cpu == WORK_CPU_UNBOUND the timer is global and is not using a specific CPU. Later, when __queue_work() is called: if (req_cpu == WORK_CPU_UNBOUND) { if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); else cpu = raw_smp_processor_id(); } Because the wq is not unbound, it takes the CPU where the timer fired and enqueue the work on that CPU. The consequence of all of this is that the work can run anywhere, depending on where the timer fired. 
Introduce system_dfl_long_wq in order to change, in a future step, users that are still calling: queue_delayed_work(system_long_wq, ...); with the new system_dfl_long_wq instead, so that the work may benefit from scheduler task placement. Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- kernel/workqueue.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2f95cb0d2f1b..2d8ff903f113 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -530,6 +530,8 @@ struct workqueue_struct *system_bh_wq; EXPORT_SYMBOL_GPL(system_bh_wq); struct workqueue_struct *system_bh_highpri_wq; EXPORT_SYMBOL_GPL(system_bh_highpri_wq); +struct workqueue_struct *system_dfl_long_wq __ro_after_init; +EXPORT_SYMBOL_GPL(system_dfl_long_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); @@ -7954,11 +7956,12 @@ void __init workqueue_init_early(void) system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0); system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0); + system_dfl_long_wq = alloc_workqueue("events_dfl_long", WQ_UNBOUND, WQ_MAX_ACTIVE); BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq || - !system_bh_wq || !system_bh_highpri_wq); + !system_bh_wq || !system_bh_highpri_wq || !system_dfl_long_wq); } static void __init wq_cpu_intensive_thresh_init(void) -- cgit v1.2.3 From 1dfc9d60a69ec148e1cb709256617d86e5f0e8f8 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 5 Mar 2026 22:45:40 +0100 Subject: workqueue: devres: Add device-managed allocate workqueue Add a Resource-managed version of alloc_workqueue() to fix common problem of drivers mixing devm() calls with destroy_workqueue. 
Such naive and discouraged driver approach leads to difficult to debug bugs when the driver: 1. Allocates workqueue in standard way and destroys it in driver remove() callback, 2. Sets work struct with devm_work_autocancel(), 3. Registers interrupt handler with devm_request_threaded_irq(). Which leads to following unbind/removal path: 1. destroy_workqueue() via driver remove(), Any interrupt coming now would still execute the interrupt handler, which queues work on destroyed workqueue. 2. devm_irq_release(), 3. devm_work_drop() -> cancel_work_sync() on destroyed workqueue. devm_alloc_workqueue() has two benefits: 1. Solves above problem of mix-and-match devres and non-devres code in driver, 2. Simplify any sane drivers which were correctly using alloc_workqueue() + devm_add_action_or_reset(). Signed-off-by: Krzysztof Kozlowski Acked-by: Tejun Heo Reviewed-by: Andy Shevchenko Signed-off-by: Tejun Heo --- kernel/workqueue.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index aeaec79bc09c..19d20f3039d9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -5891,6 +5892,33 @@ struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, } EXPORT_SYMBOL_GPL(alloc_workqueue_noprof); +static void devm_workqueue_release(void *res) +{ + destroy_workqueue(res); +} + +__printf(2, 5) struct workqueue_struct * +devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, + int max_active, ...) 
+{ + struct workqueue_struct *wq; + va_list args; + int ret; + + va_start(args, max_active); + wq = alloc_workqueue(fmt, flags, max_active, args); + va_end(args); + if (!wq) + return NULL; + + ret = devm_add_action_or_reset(dev, devm_workqueue_release, wq); + if (ret) + return NULL; + + return wq; +} +EXPORT_SYMBOL_GPL(devm_alloc_workqueue); + #ifdef CONFIG_LOCKDEP __printf(1, 5) struct workqueue_struct * -- cgit v1.2.3 From 1abaae9b38a85c9dabff67a22d8c99f7254c423a Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 12 Mar 2026 09:12:02 -0700 Subject: workqueue: fix parse_affn_scope() prefix matching bug parse_affn_scope() uses strncasecmp() with the length of the candidate name, which means it only checks if the input *starts with* a known scope name. Given that the upcoming diff will create "cache_shard" affinity scope, writing "cache_shard" to a workqueue's affinity_scope sysfs attribute always matches "cache" first, making it impossible to select "cache_shard" via sysfs, so, this fix enable it to distinguish "cache" and "cache_shard" Fix by replacing the hand-rolled prefix matching loop with sysfs_match_string(), which uses sysfs_streq() for exact matching (modulo trailing newlines). Also add the missing const qualifier to the wq_affn_names[] array declaration. Note that sysfs_streq() is case-sensitive, unlike the previous strncasecmp() approach. This is intentional and consistent with how other sysfs attributes handle string matching in the kernel. 
Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 715a23d5348f..c1743b20a524 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -405,7 +405,7 @@ struct work_offq_data { u32 flags; }; -static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { +static const char * const wq_affn_names[WQ_AFFN_NR_TYPES] = { [WQ_AFFN_DFL] = "default", [WQ_AFFN_CPU] = "cpu", [WQ_AFFN_SMT] = "smt", @@ -7093,13 +7093,7 @@ int workqueue_unbound_housekeeping_update(const struct cpumask *hk) static int parse_affn_scope(const char *val) { - int i; - - for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) { - if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i]))) - return i; - } - return -EINVAL; + return sysfs_match_string(wq_affn_names, val); } static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) -- cgit v1.2.3 From 48718378ab1f80da847930224360f8a1b690a538 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Mar 2026 07:51:08 -1000 Subject: workqueue: Remove NULL wq WARN in __queue_delayed_work() Remove the WARN_ON_ONCE(!wq) which doesn't serve any useful purpose. 
Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c1743b20a524..63acaa3e1d6a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2510,7 +2510,6 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - WARN_ON_ONCE(!wq); WARN_ON_ONCE(timer->function != delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); -- cgit v1.2.3 From afeaa9f2532d1d8d04803d09ac2d4f7107854f29 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 23 Mar 2026 03:18:36 -0700 Subject: workqueue: unlink pwqs from wq->pwqs list in alloc_and_link_pwqs() error path When alloc_and_link_pwqs() fails partway through the per-cpu allocation loop, some pool_workqueues may have already been linked into wq->pwqs via link_pwq(). The error path frees these pwqs with kmem_cache_free() but never removes them from the wq->pwqs list, leaving dangling pointers in the list. Currently this is not exploitable because the workqueue was never added to the global workqueues list and the caller frees the wq immediately after. However, this makes sure that alloc_and_link_pwqs() doesn't leave any half-baked structure, which may have side effects if not properly cleaned up. Fix this by unlinking each pwq from wq->pwqs before freeing it. No locking is needed as the workqueue has not been published yet, thus no concurrency is possible. 
Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 63acaa3e1d6a..4f543da2e7c0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5624,8 +5624,16 @@ enomem: for_each_possible_cpu(cpu) { struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); - if (pwq) + if (pwq) { + /* + * Unlink pwq from wq->pwqs since link_pwq() + * may have already added it. wq->mutex is not + * needed as the wq has not been published yet. + */ + if (!list_empty(&pwq->pwqs_node)) + list_del_rcu(&pwq->pwqs_node); kmem_cache_free(pwq_cache, pwq); + } } free_percpu(wq->cpu_pwq); wq->cpu_pwq = NULL; -- cgit v1.2.3 From 2ab739383113107a1335ce7bcbc93110afb69267 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 31 Mar 2026 14:35:22 -0400 Subject: workqueue: Remove HK_TYPE_WQ from affecting wq_unbound_cpumask For historical reason, wq_unbound_cpumask is initially set as intersection of HK_TYPE_DOMAIN, HK_TYPE_WQ and workqueue.unbound_cpus boot command line option. At run time, users can update the unbound cpumask via the /sys/devices/virtual/workqueue/cpumask sysfs file. Creation and modification of cpuset isolated partitions will also update wq_unbound_cpumask based on the latest HK_TYPE_DOMAIN cpumask. The HK_TYPE_WQ cpumask is out of the picture with these runtime updates. Complete the transition by taking HK_TYPE_WQ out from the workqueue code and make it depends on HK_TYPE_DOMAIN only from the housekeeping side. The final goal is to eliminate HK_TYPE_WQ as a housekeeping cpumask type. 
Signed-off-by: Waiman Long Acked-by: Frederic Weisbecker Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4f543da2e7c0..18c3fe90daca 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7081,7 +7081,7 @@ int workqueue_unbound_housekeeping_update(const struct cpumask *hk) /* * If the operation fails, it will fall back to * wq_requested_unbound_cpumask which is initially set to - * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten + * HK_TYPE_DOMAIN house keeping mask and rewritten * by any subsequent write to workqueue/cpumask sysfs file. */ if (!cpumask_and(cpumask, wq_requested_unbound_cpumask, hk)) @@ -7901,7 +7901,6 @@ void __init workqueue_init_early(void) cpumask_copy(wq_online_cpumask, cpu_online_mask); cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); - restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ)); restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN)); if (!cpumask_empty(&wq_cmdline_cpumask)) restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); -- cgit v1.2.3 From 5920d046f7ae3bf9cf51b9d915c1fff13d299d84 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 1 Apr 2026 06:03:53 -0700 Subject: workqueue: add WQ_AFFN_CACHE_SHARD affinity scope On systems where many CPUs share one LLC, unbound workqueues using WQ_AFFN_CACHE collapse to a single worker pool, causing heavy spinlock contention on pool->lock. For example, Chuck Lever measured 39% of cycles lost to native_queued_spin_lock_slowpath on a 12-core shared-L3 NFS-over-RDMA system. The existing affinity hierarchy (cpu, smt, cache, numa, system) offers no intermediate option between per-LLC and per-SMT-core granularity. Add WQ_AFFN_CACHE_SHARD, which subdivides each LLC into groups of at most wq_cache_shard_size cores (default 8, tunable via boot parameter). 
Shards are always split on core (SMT group) boundaries so that Hyper-Threading siblings are never placed in different pods. Cores are distributed across shards as evenly as possible -- for example, 36 cores in a single LLC with max shard size 8 produces 5 shards of 8+7+7+7+7 cores. The implementation follows the same comparator pattern as other affinity scopes: precompute_cache_shard_ids() pre-fills the cpu_shard_id[] array from the already-initialized WQ_AFFN_CACHE and WQ_AFFN_SMT topology, and cpus_share_cache_shard() is passed to init_pod_type(). Benchmark on NVIDIA Grace (72 CPUs, single LLC, 50k items/thread), show cache_shard delivers ~5x the throughput and ~6.5x lower p50 latency compared to cache scope on this 72-core single-LLC system. Suggested-by: Tejun Heo Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- kernel/workqueue.c | 183 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 18c3fe90daca..f4a23e1418a7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -131,6 +131,14 @@ enum wq_internal_consts { WORKER_ID_LEN = 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */ }; +/* Layout of shards within one LLC pod */ +struct llc_shard_layout { + int nr_large_shards; /* number of large shards (cores_per_shard + 1) */ + int cores_per_shard; /* base number of cores per default shard */ + int nr_shards; /* total number of shards */ + /* nr_default shards = (nr_shards - nr_large_shards) */ +}; + /* * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and * MAX_SOFTIRQ_RESTART in kernel/softirq.c. 
These are macros because @@ -410,6 +418,7 @@ static const char * const wq_affn_names[WQ_AFFN_NR_TYPES] = { [WQ_AFFN_CPU] = "cpu", [WQ_AFFN_SMT] = "smt", [WQ_AFFN_CACHE] = "cache", + [WQ_AFFN_CACHE_SHARD] = "cache_shard", [WQ_AFFN_NUMA] = "numa", [WQ_AFFN_SYSTEM] = "system", }; @@ -432,6 +441,9 @@ module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); +static unsigned int wq_cache_shard_size = 8; +module_param_named(cache_shard_size, wq_cache_shard_size, uint, 0444); + static bool wq_online; /* can kworkers be created yet? */ static bool wq_topo_initialized __read_mostly = false; @@ -8155,6 +8167,175 @@ static bool __init cpus_share_numa(int cpu0, int cpu1) return cpu_to_node(cpu0) == cpu_to_node(cpu1); } +/* Maps each CPU to its shard index within the LLC pod it belongs to */ +static int cpu_shard_id[NR_CPUS] __initdata; + +/** + * llc_count_cores - count distinct cores (SMT groups) within an LLC pod + * @pod_cpus: the cpumask of CPUs in the LLC pod + * @smt_pods: the SMT pod type, used to identify sibling groups + * + * A core is represented by the lowest-numbered CPU in its SMT group. Returns + * the number of distinct cores found in @pod_cpus. + */ +static int __init llc_count_cores(const struct cpumask *pod_cpus, + struct wq_pod_type *smt_pods) +{ + const struct cpumask *sibling_cpus; + int nr_cores = 0, c; + + /* + * Count distinct cores by only counting the first CPU in each + * SMT sibling group. + */ + for_each_cpu(c, pod_cpus) { + sibling_cpus = smt_pods->pod_cpus[smt_pods->cpu_pod[c]]; + if (cpumask_first(sibling_cpus) == c) + nr_cores++; + } + + return nr_cores; +} + +/* + * llc_shard_size - number of cores in a given shard + * + * Cores are spread as evenly as possible. 
The first @nr_large_shards shards are + * "large shards" with (cores_per_shard + 1) cores; the rest are "default + * shards" with cores_per_shard cores. + */ +static int __init llc_shard_size(int shard_id, int cores_per_shard, int nr_large_shards) +{ + /* The first @nr_large_shards shards are large shards */ + if (shard_id < nr_large_shards) + return cores_per_shard + 1; + + /* The remaining shards are default shards */ + return cores_per_shard; +} + +/* + * llc_calc_shard_layout - compute the shard layout for an LLC pod + * @nr_cores: number of distinct cores in the LLC pod + * + * Chooses the number of shards that keeps average shard size closest to + * wq_cache_shard_size. Returns a struct describing the total number of shards, + * the base size of each, and how many are large shards. + */ +static struct llc_shard_layout __init llc_calc_shard_layout(int nr_cores) +{ + struct llc_shard_layout layout; + + /* Ensure at least one shard; pick the count closest to the target size */ + layout.nr_shards = max(1, DIV_ROUND_CLOSEST(nr_cores, wq_cache_shard_size)); + layout.cores_per_shard = nr_cores / layout.nr_shards; + layout.nr_large_shards = nr_cores % layout.nr_shards; + + return layout; +} + +/* + * llc_shard_is_full - check whether a shard has reached its core capacity + * @cores_in_shard: number of cores already assigned to this shard + * @shard_id: index of the shard being checked + * @layout: the shard layout computed by llc_calc_shard_layout() + * + * Returns true if @cores_in_shard equals the expected size for @shard_id. 
+ */ +static bool __init llc_shard_is_full(int cores_in_shard, int shard_id, + const struct llc_shard_layout *layout) +{ + return cores_in_shard == llc_shard_size(shard_id, layout->cores_per_shard, + layout->nr_large_shards); +} + +/** + * llc_populate_cpu_shard_id - populate cpu_shard_id[] for each CPU in an LLC pod + * @pod_cpus: the cpumask of CPUs in the LLC pod + * @smt_pods: the SMT pod type, used to identify sibling groups + * @nr_cores: number of distinct cores in @pod_cpus (from llc_count_cores()) + * + * Walks @pod_cpus in order. At each SMT group leader, advances to the next + * shard once the current shard is full. Results are written to cpu_shard_id[]. + */ +static void __init llc_populate_cpu_shard_id(const struct cpumask *pod_cpus, + struct wq_pod_type *smt_pods, + int nr_cores) +{ + struct llc_shard_layout layout = llc_calc_shard_layout(nr_cores); + const struct cpumask *sibling_cpus; + /* Count the number of cores in the current shard_id */ + int cores_in_shard = 0; + /* This is a cursor for the shards. Go from zero to nr_shards - 1*/ + int shard_id = 0; + int c; + + /* Iterate at every CPU for a given LLC pod, and assign it a shard */ + for_each_cpu(c, pod_cpus) { + sibling_cpus = smt_pods->pod_cpus[smt_pods->cpu_pod[c]]; + if (cpumask_first(sibling_cpus) == c) { + /* This is the CPU leader for the siblings */ + if (llc_shard_is_full(cores_in_shard, shard_id, &layout)) { + shard_id++; + cores_in_shard = 0; + } + cores_in_shard++; + cpu_shard_id[c] = shard_id; + } else { + /* + * The siblings' shard MUST be the same as the leader. + * never split threads in the same core. + */ + cpu_shard_id[c] = cpu_shard_id[cpumask_first(sibling_cpus)]; + } + } + + WARN_ON_ONCE(shard_id != (layout.nr_shards - 1)); +} + +/** + * precompute_cache_shard_ids - assign each CPU its shard index within its LLC + * + * Iterates over all LLC pods. For each pod, counts distinct cores then assigns + * shard indices to all CPUs in the pod. 
Must be called after WQ_AFFN_CACHE and + * WQ_AFFN_SMT have been initialized. + */ +static void __init precompute_cache_shard_ids(void) +{ + struct wq_pod_type *llc_pods = &wq_pod_types[WQ_AFFN_CACHE]; + struct wq_pod_type *smt_pods = &wq_pod_types[WQ_AFFN_SMT]; + const struct cpumask *cpus_sharing_llc; + int nr_cores; + int pod; + + if (!wq_cache_shard_size) { + pr_warn("workqueue: cache_shard_size must be > 0, setting to 1\n"); + wq_cache_shard_size = 1; + } + + for (pod = 0; pod < llc_pods->nr_pods; pod++) { + cpus_sharing_llc = llc_pods->pod_cpus[pod]; + + /* Number of cores in this given LLC */ + nr_cores = llc_count_cores(cpus_sharing_llc, smt_pods); + llc_populate_cpu_shard_id(cpus_sharing_llc, smt_pods, nr_cores); + } +} + +/* + * cpus_share_cache_shard - test whether two CPUs belong to the same cache shard + * + * Two CPUs share a cache shard if they are in the same LLC and have the same + * shard index. Used as the pod affinity callback for WQ_AFFN_CACHE_SHARD. + */ +static bool __init cpus_share_cache_shard(int cpu0, int cpu1) +{ + if (!cpus_share_cache(cpu0, cpu1)) + return false; + + return cpu_shard_id[cpu0] == cpu_shard_id[cpu1]; +} + /** * workqueue_init_topology - initialize CPU pods for unbound workqueues * @@ -8170,6 +8351,8 @@ void __init workqueue_init_topology(void) init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share); init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt); init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache); + precompute_cache_shard_ids(); + init_pod_type(&wq_pod_types[WQ_AFFN_CACHE_SHARD], cpus_share_cache_shard); init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa); wq_topo_initialized = true; -- cgit v1.2.3 From 4cdc8a7389d5025051f6c4a60fb5b7cb9b7960bb Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 1 Apr 2026 06:03:54 -0700 Subject: workqueue: set WQ_AFFN_CACHE_SHARD as the default affinity scope Set WQ_AFFN_CACHE_SHARD as the default affinity scope for unbound workqueues. 
On systems where many CPUs share one LLC, the previous default (WQ_AFFN_CACHE) collapses all CPUs to a single worker pool, causing heavy spinlock contention on pool->lock. WQ_AFFN_CACHE_SHARD subdivides each LLC into smaller groups, providing a better balance between locality and contention. Users can revert to the previous behavior with workqueue.default_affinity_scope=cache. On systems with 8 or fewer cores per LLC, CACHE_SHARD produces a single shard covering the entire LLC, making it functionally identical to the previous CACHE default. The sharding only activates when an LLC has more than 8 cores. Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f4a23e1418a7..b1a28c51d4f2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -450,7 +450,7 @@ static bool wq_topo_initialized __read_mostly = false; static struct kmem_cache *pwq_cache; static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; -static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; +static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE_SHARD; /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf; -- cgit v1.2.3 From 034db4dd4449c556705e6b32bc07bd31df3889ba Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Tue, 7 Apr 2026 09:12:15 +0530 Subject: workqueue: use NR_STD_WORKER_POOLS instead of hardcoded value use NR_STD_WORKER_POOLS for irq_work_fns[] array definition. NR_STD_WORKER_POOLS is also 2, but better to use MACRO. Initialization loop for_each_bh_worker_pool() also uses same MACRO. 
Signed-off-by: Maninder Singh Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b1a28c51d4f2..900b864a30b0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7900,8 +7900,8 @@ void __init workqueue_init_early(void) { struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM]; int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; - void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal, - bh_pool_kick_highpri }; + void (*irq_work_fns[NR_STD_WORKER_POOLS])(struct irq_work *) = + { bh_pool_kick_normal, bh_pool_kick_highpri }; int i, cpu; BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); -- cgit v1.2.3 From 76af54648899abbd6b449c035583e47fd407078a Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 13 Apr 2026 07:26:47 -0700 Subject: workqueue: validate cpumask_first() result in llc_populate_cpu_shard_id() On uniprocessor (UP) configs such as nios2, NR_CPUS is 1, so cpu_shard_id[] is a single-element array (int[1]). In llc_populate_cpu_shard_id(), cpumask_first(sibling_cpus) returns an unsigned int that the compiler cannot prove is always 0, triggering a -Warray-bounds warning when the result is used to index cpu_shard_id[]: kernel/workqueue.c:8321:55: warning: array subscript 1 is above array bounds of 'int[1]' [-Warray-bounds] 8321 | cpu_shard_id[c] = cpu_shard_id[cpumask_first(sibling_cpus)]; | ~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This is a false positive: sibling_cpus can never be empty here because 'c' itself is always set in it, so cpumask_first() will always return a valid CPU. However, the compiler cannot prove this statically, and the warning only manifests on UP configs where the array size is 1. 
Add a bounds check with WARN_ON_ONCE to silence the warning, and store the result in a local variable to make the code clearer and avoid calling cpumask_first() twice. Fixes: 5920d046f7ae ("workqueue: add WQ_AFFN_CACHE_SHARD affinity scope") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202604022343.GQtkF2vO-lkp@intel.com/ Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- kernel/workqueue.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 900b864a30b0..ed7330b9ddf9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -8266,6 +8266,7 @@ static void __init llc_populate_cpu_shard_id(const struct cpumask *pod_cpus, const struct cpumask *sibling_cpus; /* Count the number of cores in the current shard_id */ int cores_in_shard = 0; + unsigned int leader; /* This is a cursor for the shards. Go from zero to nr_shards - 1*/ int shard_id = 0; int c; @@ -8286,7 +8287,17 @@ static void __init llc_populate_cpu_shard_id(const struct cpumask *pod_cpus, * The siblings' shard MUST be the same as the leader. * never split threads in the same core. */ - cpu_shard_id[c] = cpu_shard_id[cpumask_first(sibling_cpus)]; + leader = cpumask_first(sibling_cpus); + + /* + * This check silences a Warray-bounds warning on UP + * configs where NR_CPUS=1 makes cpu_shard_id[] + * a single-element array, and the compiler can't + * prove the index is always 0. + */ + if (WARN_ON_ONCE(leader >= nr_cpu_ids)) + continue; + cpu_shard_id[c] = cpu_shard_id[leader]; } } -- cgit v1.2.3