From df0d98475954d655571979aa061ecb07d7e00392 Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Wed, 1 Apr 2026 14:52:13 -0700
Subject: sched/cache: Introduce infrastructure for cache-aware load balancing

Adds infrastructure to enable cache-aware load balancing,
which improves cache locality by grouping tasks that share resources
within the same cache domain. This reduces cache misses and improves
overall data access efficiency.

In this initial implementation, threads belonging to the same process
are treated as entities that likely share working sets. The mechanism
tracks per-process CPU occupancy across cache domains and attempts to
migrate threads toward cache-hot domains where their process already
has active threads, thereby enhancing locality.

This provides a basic model for cache affinity. While the current code
targets the last-level cache (LLC), the approach could be extended to
other domain types such as clusters (L2) or node-internal groupings.

At present, the mechanism selects the CPU within an LLC that has the
highest recent runtime. Subsequent patches in this series will use this
information in the load-balancing path to guide task placement toward
preferred LLCs.

In the future, more advanced policies could be integrated through NUMA
balancing-for example, migrating a task to its preferred LLC when spare
capacity exists, or swapping tasks across LLCs to improve cache affinity.
Grouping of tasks could also be generalized from that of a process
to be that of a NUMA group, or be user configurable.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/6269a53221b9439b9ca00d18a9d1946fb64d8cff.1775065312.git.tim.c.chen@linux.intel.com
---
 include/linux/mm_types.h | 32 ++++++++++++++++++++++++++++++++
 include/linux/sched.h    | 24 ++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3cc8ae722886..67b2dfcc71ea 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1173,6 +1173,8 @@ struct mm_struct {
 		/* MM CID related storage */
 		struct mm_mm_cid mm_cid;
 
+		/* sched_cache related statistics */
+		struct sched_cache_stat sc_stat;
 #ifdef CONFIG_MMU
 		atomic_long_t pgtables_bytes;	/* size of all page tables */
 #endif
@@ -1575,6 +1577,36 @@ static inline unsigned int mm_cid_size(void)
 # define MM_CID_STATIC_SIZE	0
 #endif /* CONFIG_SCHED_MM_CID */
 
+#ifdef CONFIG_SCHED_CACHE
+void mm_init_sched(struct mm_struct *mm,
+		   struct sched_cache_time __percpu *pcpu_sched);
+
+static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
+{
+	struct sched_cache_time __percpu *pcpu_sched =
+		alloc_percpu_noprof(struct sched_cache_time);
+
+	if (!pcpu_sched)
+		return -ENOMEM;
+
+	mm_init_sched(mm, pcpu_sched);
+	return 0;
+}
+
+#define mm_alloc_sched(...)	alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__))
+
+static inline void mm_destroy_sched(struct mm_struct *mm)
+{
+	free_percpu(mm->sc_stat.pcpu_sched);
+	mm->sc_stat.pcpu_sched = NULL;
+}
+#else /* !CONFIG_SCHED_CACHE */
+
+static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; }
+static inline void mm_destroy_sched(struct mm_struct *mm) { }
+
+#endif /* CONFIG_SCHED_CACHE */
+
 struct mmu_gather;
 extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8ec3b6d7d718..2bf261bcd7b6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1407,6 +1407,10 @@ struct task_struct {
 	unsigned long			numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_SCHED_CACHE
+	struct callback_head		cache_work;
+#endif
+
 	struct rseq_data		rseq;
 	struct sched_mm_cid		mm_cid;
 
@@ -2407,6 +2411,26 @@ static __always_inline int task_mm_cid(struct task_struct *t)
 }
 #endif
 
+#ifdef CONFIG_SCHED_CACHE
+
+struct sched_cache_time {
+	u64 runtime;
+	unsigned long epoch;
+};
+
+struct sched_cache_stat {
+	struct sched_cache_time __percpu *pcpu_sched;
+	raw_spinlock_t lock;
+	unsigned long epoch;
+	int cpu;
+} ____cacheline_aligned_in_smp;
+
+#else
+
+struct sched_cache_stat { };
+
+#endif
+
 #ifndef MODULE
 #ifndef COMPILE_OFFSETS
 
-- 
cgit v1.2.3


From f025ef275388742643a2c33f00a0d9c0af3112ee Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Wed, 1 Apr 2026 14:52:15 -0700
Subject: sched/cache: Record per LLC utilization to guide cache aware
 scheduling decisions

When a system becomes busy and a process's preferred LLC is
saturated with too many threads, tasks within that LLC migrate
frequently. These in LLC migrations introduce latency and degrade
performance. To avoid this, task aggregation should be suppressed
when the preferred LLC is overloaded, which requires a metric to
indicate LLC utilization.

Record per LLC utilization/cpu capacity during periodic load
balancing. These statistics will be used in later patches to decide
whether tasks should be aggregated into their preferred LLC.

Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/a48151b3d57f2a42a5971aaead1b7f81e69229f4.1775065312.git.tim.c.chen@linux.intel.com
---
 include/linux/sched/topology.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 36553e14866d..159716fa0d3a 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -68,6 +68,10 @@ struct sched_domain_shared {
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
 	int		nr_idle_scan;
+#ifdef CONFIG_SCHED_CACHE
+	unsigned long	util_avg;
+	unsigned long	capacity;
+#endif
 };
 
 struct sched_domain {
-- 
cgit v1.2.3


From 47d8696b95f7397fe7cad2d194d550ffe82efc15 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Wed, 1 Apr 2026 14:52:18 -0700
Subject: sched/cache: Assign preferred LLC ID to processes

With cache-aware scheduling enabled, each task is assigned a
preferred LLC ID. This allows quick identification of the LLC domain
where the task prefers to run, similar to numa_preferred_nid in
NUMA balancing.

Co-developed-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/f2ceecba5858680349ad4ce9303a2121f0bb7272.1775065312.git.tim.c.chen@linux.intel.com
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2bf261bcd7b6..d2010483cd77 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1409,6 +1409,7 @@ struct task_struct {
 
 #ifdef CONFIG_SCHED_CACHE
 	struct callback_head		cache_work;
+	int				preferred_llc;
 #endif
 
 	struct rseq_data		rseq;
-- 
cgit v1.2.3


From a8d0ca0b7f2f7b53565d1e30e509d3d74d1f5460 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Wed, 1 Apr 2026 14:52:20 -0700
Subject: sched/cache: Introduce per CPU's tasks LLC preference counter

The lowest level of sched domain for each CPU is assigned an
array where each element tracks the number of tasks preferring
a given LLC, indexed from 0 to max_lid. Since each CPU
has its dedicated sd, this implies that each CPU will have
a dedicated task LLC preference counter.

For example, sd->llc_counts[3] = 2 signifies that there
are 2 tasks on this runqueue which prefer to run within LLC3.

The load balancer can use this information to identify busy
runqueues and migrate tasks to their preferred LLC domains.
This array will be reallocated at runtime during sched domain
rebuild.

Introduce the buffer allocation mechanism, and the statistics
will be calculated in the subsequent patch.

Note: the LLC preference statistics of each CPU are reset on
sched domain rebuild and may under count temporarily, until the
CPU becomes idle and the count is cleared. This is a trade off
to avoid complex data synchronization across sched domain builds.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Co-developed-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/42e79eceb8cd6be8a032401d481d101913bc5703.1775065312.git.tim.c.chen@linux.intel.com
---
 include/linux/sched/topology.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 159716fa0d3a..0036d6b4bd67 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -103,6 +103,11 @@ struct sched_domain {
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
+#ifdef CONFIG_SCHED_CACHE
+	unsigned int llc_max;
+	unsigned int *llc_counts __counted_by_ptr(llc_max);
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 	/* sched_balance_rq() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
-- 
cgit v1.2.3


From a2b4cf39d9d333bfeb9262dbaafe3d24d405a5c0 Mon Sep 17 00:00:00 2001
From: Jianyong Wu <wujianyong@hygon.cn>
Date: Wed, 13 May 2026 13:39:12 -0700
Subject: sched/cache: Allow only 1 thread of the process to calculate the LLC
 occupancy

Scanning online CPUs to calculate the occupancy might be
time-consuming. Only allow 1 thread of the process to scan
the CPUs at the same time, which is similar to what
NUMA balance does in task_numa_work().

Signed-off-by: Jianyong Wu <wujianyong@hygon.cn>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/5672b52e588b855b01e5a1a17822f7c6c7237a3d.1778703694.git.tim.c.chen@linux.intel.com
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2010483cd77..6d883f109ba3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2423,6 +2423,7 @@ struct sched_cache_stat {
 	struct sched_cache_time __percpu *pcpu_sched;
 	raw_spinlock_t lock;
 	unsigned long epoch;
+	unsigned long next_scan;
 	int cpu;
 } ____cacheline_aligned_in_smp;
 
-- 
cgit v1.2.3


From deee5e27d5b608323c04dc99979e55f944016a13 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Wed, 13 May 2026 13:39:13 -0700
Subject: sched/cache: Disable cache aware scheduling for processes with high
 thread counts

A performance regression was observed by Prateek when running hackbench
with many threads per process (high fd count). To avoid this, processes
with a large number of active threads are excluded from cache-aware
scheduling.

With sched_cache enabled, record the number of active threads in each
process during the periodic task_cache_work(). While iterating over
CPUs, if the currently running task belongs to the same process as the
task that launched task_cache_work(), increment the active thread count.

If the number of active threads within the process exceeds the number
of Cores (divided by the SMT number) in the LLC, do not enable
cache-aware scheduling. However, on systems with a smaller number of
CPUs within 1 LLC, like Power10/Power11 with SMT4 and an LLC size of 4,
this check effectively disables cache-aware scheduling for any process.
One possible solution suggested by Peter is to use an LLC-mask instead
of a single LLC value for preference. Once there are a 'few' LLCs as
preference, this constraint becomes a little easier. It could be an
enhancement in the future.

For users who wish to perform task aggregation regardless, a debugfs knob
is provided for tuning in a subsequent change.

Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Aaron Lu <ziqianlu@bytedance.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Tingyin Duan <tingyin.duan@gmail.com>
Link: https://patch.msgid.link/d076cd21a8e6c6341d1e2d927e118db770ebb650.1778703694.git.tim.c.chen@linux.intel.com
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6d883f109ba3..6701911eaaf7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2423,6 +2423,7 @@ struct sched_cache_stat {
 	struct sched_cache_time __percpu *pcpu_sched;
 	raw_spinlock_t lock;
 	unsigned long epoch;
+	u64 nr_running_avg;
 	unsigned long next_scan;
 	int cpu;
 } ____cacheline_aligned_in_smp;
-- 
cgit v1.2.3


From 7030513a08776b2ca70fccd5dfddf7bb5c5c88ba Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Wed, 13 May 2026 13:39:15 -0700
Subject: sched/cache: Calculate the LLC size and store it in sched_domain

Cache aware scheduling needs to know the LLC size that a process
can use, so as to avoid memory-intensive tasks from being
over-aggregated on a single LLC.

Introduce a preparation patch to add get_effective_llc_bytes() to
get the LLC size that a CPU can use. The function can be further
enhanced by subtracting the LLC cache ways reserved by resctrl
(CAT in Intel RDT, etc).

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Tingyin Duan <tingyin.duan@gmail.com>
Link: https://patch.msgid.link/37afee09ff608034da0ce149e72d33b6f4698edf.1778703694.git.tim.c.chen@linux.intel.com
---
 include/linux/cacheinfo.h      | 1 +
 include/linux/sched/topology.h | 7 +++++++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index c8f4f0a0b874..fc879ac4cc4f 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -89,6 +89,7 @@ int populate_cache_leaves(unsigned int cpu);
 int cache_setup_acpi(unsigned int cpu);
 bool last_level_cache_is_valid(unsigned int cpu);
 bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y);
+struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu);
 int fetch_cache_info(unsigned int cpu);
 int detect_cache_attributes(unsigned int cpu);
 #ifndef CONFIG_ACPI_PPTT
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 0036d6b4bd67..fe09d3268bc9 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -106,6 +106,7 @@ struct sched_domain {
 #ifdef CONFIG_SCHED_CACHE
 	unsigned int llc_max;
 	unsigned int *llc_counts __counted_by_ptr(llc_max);
+	unsigned long llc_bytes;
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
@@ -265,4 +266,10 @@ static inline int task_node(const struct task_struct *p)
 	return cpu_to_node(task_cpu(p));
 }
 
+#ifdef CONFIG_SCHED_CACHE
+extern void sched_update_llc_bytes(unsigned int cpu);
+#else
+static inline void sched_update_llc_bytes(unsigned int cpu) { }
+#endif
+
 #endif /* _LINUX_SCHED_TOPOLOGY_H */
-- 
cgit v1.2.3


From 808915f982c2a52f5d148510ecfab52284de67cf Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Wed, 13 May 2026 13:39:16 -0700
Subject: sched/cache: Avoid cache-aware scheduling for memory-heavy processes

Prateek and Tingyin reported that memory-intensive workloads (such as
stream) can saturate memory bandwidth and caches on the preferred LLC
when sched_cache aggregates too many threads.

To mitigate this, estimate a process's memory footprint by comparing
its NUMA balancing fault statistics to the size of the LLC. If the
footprint exceeds the LLC size, skip cache-aware scheduling.

Note that footprint is only an approximation of the memory footprint,
since the kernel lacks suitable metrics to estimate the real working
set. If a user-provided hint is available in the future, it would be
more accurate. A later patch will allow users to provide a hint to
adjust this threshold.

Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Vern Hao <vernhao@tencent.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Tingyin Duan <tingyin.duan@gmail.com>
Link: https://patch.msgid.link/95cf64a385bcc12f18dcebe9d59e8d3ba8bb318f.1778703694.git.tim.c.chen@linux.intel.com
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6701911eaaf7..95729670929c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2425,6 +2425,7 @@ struct sched_cache_stat {
 	unsigned long epoch;
 	u64 nr_running_avg;
 	unsigned long next_scan;
+	unsigned long footprint;
 	int cpu;
 } ____cacheline_aligned_in_smp;
 
-- 
cgit v1.2.3


From 03755348b8e74421f92ffed9da159175a698290b Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Wed, 13 May 2026 13:39:21 -0700
Subject: sched/cache: Fix unpaired account_llc_enqueue/dequeue

There is a race condition that, after a task is enqueued
on a runqueue, task_llc(p) may change due to CPU hotplug,
because the llc_id is dynamically allocated and adjusted
at runtime.
Therefore, checking task_llc(p) to determine whether the
task is being dequeued from its preferred LLC is unreliable
and can cause inconsistent values.

To fix this problem, record whether p is enqueued on its
preferred LLC, in order to pair with account_llc_dequeue()
to maintain a consistent nr_pref_llc_running per runqueue.

This bug was reported by sashiko, and the solution was once
suggested by Prateek.

Fixes: 46afe3af7ead ("sched/cache: Track LLC-preferred tasks per runqueue")
Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Co-developed-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/0c8c6a1571d66792a4d2ff0103ba3cc13e059046.1778703694.git.tim.c.chen@linux.intel.com
---
 include/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 95729670929c..2c9e8e2edde1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1410,6 +1410,8 @@ struct task_struct {
 #ifdef CONFIG_SCHED_CACHE
 	struct callback_head		cache_work;
 	int				preferred_llc;
+	/* 1: task was enqueued to its preferred LLC, 0 otherwise */
+	int				pref_llc_queued;
 #endif
 
 	struct rseq_data		rseq;
-- 
cgit v1.2.3


From ea19506013ad13685573e4674fbeddb790e27906 Mon Sep 17 00:00:00 2001
From: Yiyang Chen <cyyzero16@gmail.com>
Date: Fri, 15 May 2026 00:05:05 +0800
Subject: sched/clock: Provide !HAVE_UNSTABLE_SCHED_CLOCK stub for
 sched_clock_stable()

When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is disabled, sched_clock() is
already assumed to provide stable semantics, but the public header
doesn't provide a sched_clock_stable() stub for that case.

Add a header stub that always returns true and clean up the duplicate
local stub in ring_buffer.c, so callers can use sched_clock_stable()
unconditionally.

Signed-off-by: Yiyang Chen <cyyzero16@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Link: https://patch.msgid.link/56e45338858946cd9581b75c8bd45dd37dba52c5.1778773587.git.cyyzero16@gmail.com
---
 include/linux/sched/clock.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 196f0ca351a2..39f0a7f94bfc 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -33,6 +33,11 @@ extern u64 sched_clock_cpu(int cpu);
 extern void sched_clock_init(void);
 
 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline int sched_clock_stable(void)
+{
+	return 1;
+}
+
 static inline void sched_clock_tick(void)
 {
 }
-- 
cgit v1.2.3


From 815c5cb76a3e5dad4fc3911b9073591dc3a29282 Mon Sep 17 00:00:00 2001
From: Shrikanth Hegde <sshegde@linux.ibm.com>
Date: Fri, 15 May 2026 22:54:53 +0530
Subject: topology: Introduce cpu_smt_mask for CONFIG_SCHED_SMT=n

Define cpu_smt_mask in case of CONFIG_SCHED_SMT=n as cpumask_of that
CPU. With that config, it is expected that kernel treats each CPU
as individual core. Using cpumask_of(cpu) reflects that.

This would help to get rid of the ifdeffery that is spread across
the codebase since cpu_smt_mask is defined only in case of
CONFIG_SCHED_SMT=y.

Note: There is no arch today which defines cpu_smt_mask unconditionally.
So likely defining the cpu_smt_mask shouldn't lead redefinition errors.

Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Phil Auld <pauld@redhat.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20260515172456.542799-2-sshegde@linux.ibm.com
---
 include/linux/topology.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 6575af39fd10..709a2dcf4c73 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -230,11 +230,24 @@ static inline int cpu_to_mem(int cpu)
 #define topology_drawer_cpumask(cpu)		cpumask_of(cpu)
 #endif
 
-#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
+/*
+ * Defining cpu_smt_mask as cpumask_of that CPU helps to get
+ * rid of lot of ifdeffery all around the codebase in case of
+ * CONFIG_SCHED_SMT=n. It just means there are no other siblings, which
+ * is what is expected.
+ */
+#if defined(CONFIG_SCHED_SMT)
+# if !defined(cpu_smt_mask)
 static inline const struct cpumask *cpu_smt_mask(int cpu)
 {
 	return topology_sibling_cpumask(cpu);
 }
+# endif
+#else	/* !CONFIG_SCHED_SMT */
+static inline const struct cpumask *cpu_smt_mask(int cpu)
+{
+	return cpumask_of(cpu);
+}
 #endif
 
 #ifndef topology_is_primary_thread
-- 
cgit v1.2.3


From 5bc6ab2d42e545f816def21cfcdb4ba35cc74bf6 Mon Sep 17 00:00:00 2001
From: Shrikanth Hegde <sshegde@linux.ibm.com>
Date: Fri, 15 May 2026 22:54:54 +0530
Subject: sched: Simplify ifdeffery around cpu_smt_mask

Now, that cpu_smt_mask is defined as cpumask_of(cpu) for
CONFIG_SCHED_SMT=n, it is possible to get rid of the ifdeffery.

Effectively,
 - This makes sched_smt_present is defined always

 - cpumask_weight(cpumask_of(cpu)) == 1. So sched_smt_present_inc/dec
   will never enable the sched_smt_present. Which is expected.

 - Paths that were compile-time eliminated become runtime guarded
   using static keys.

 - Defines set_idle_cores, test_idle_cores, etc which could likely benefit
   the CONFIG_SCHED_SMT=n systems to use the same optimizations within the
   LLC at wakeups.

 - This will expose sched_smt_present symbol for CONFIG_SCHED_SMT=n.
   Likely not a concern.

 - There is a bloat of code CONFIG_SCHED_SMT=n. (NR_CPUS=2048)
   add/remove: 24/18 grow/shrink: 26/28 up/down: 6396/-3188 (3208)
   Total: Before=30629880, After=30633088, chg +0.01%

 - No code bloat for CONFIG_SCHED_SMT=y, which is expected.

 - Add comments around stop_core_cpuslocked on why ifdefs are not
   removed.

 - This leaves the remaining uses of CONFIG_SCHED_SMT mainly for
   topology building bits which has a policy based decision.

Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Phil Auld <pauld@redhat.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20260515172456.542799-3-sshegde@linux.ibm.com
---
 include/linux/sched/smt.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h
index 166b19af956f..cde6679c0278 100644
--- a/include/linux/sched/smt.h
+++ b/include/linux/sched/smt.h
@@ -4,16 +4,12 @@
 
 #include <linux/static_key.h>
 
-#ifdef CONFIG_SCHED_SMT
 extern struct static_key_false sched_smt_present;
 
 static __always_inline bool sched_smt_active(void)
 {
 	return static_branch_likely(&sched_smt_present);
 }
-#else
-static __always_inline bool sched_smt_active(void) { return false; }
-#endif
 
 void arch_smt_update(void);
 
-- 
cgit v1.2.3


From 9e005ed21152d4a4bb0ceea71045ff8a642a6feb Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Tue, 19 May 2026 05:14:23 +0000
Subject: sched/topology: Allow multiple domains to claim sched_domain_shared

Recent optimizations of sd->shared assignment moved to allocating a
single instance of per-CPU sched_domain_shared objects per s_data.

Recent optimizations to select_idle_capacity() moved the sd->shared
assignments to "sd_asym" domain when ASYM_CPUCAPACITY is detected but
cache-aware scheduling mandates the presence of "sd_llc_shared" to
compute and cache per-LLC statistics.

Use an "alloc_flags" union in sched_domain_shared to claim a
sched_domain_shared object per sched_domain. Allocation starts searching
for an available / matching sched_domain_shared instance from the first
CPU of sched_domain_span(sd) (sd can be sd_llc, or sd_asym). If the
shared object is claimed by another domain, the instance corresponding
to next CPU in the domain span is explored until a matching / available
instance is found.

In case of a single CPU in sched_domain_span(), the domain will be
degenerated and a temporary overlap of ->shared objects across different
domains is acceptable.

"alloc_flags" forms a union with "nr_idle_scan" and the stale flags are
left as is when the sd->shared is published. The expectation is for the
first load balancing instance to correct the value just like the current
behavior, except the initial value is no longer 0.

Originally-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/sched/topology.h | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index fe09d3268bc9..b5d9d7c2b8ad 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -67,7 +67,21 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
-	int		nr_idle_scan;
+	union {
+		int	nr_idle_scan;
+		/*
+		 * Used during allocation to claim the sched_domain_shared
+		 * object at multiple levels.
+		 *
+		 * Note: between build and the first periodic LB tick, which
+		 * rewrites the union via update_idle_cpu_scan(), readers of
+		 * nr_idle_scan may observe the transient SD_* flag value as
+		 * the scan bound. The flag bits are small positive integers,
+		 * so the effect is just a slightly relaxed scan bound for one
+		 * window and self-heals on the first tick.
+		 */
+		int	alloc_flags;
+	};
 #ifdef CONFIG_SCHED_CACHE
 	unsigned long	util_avg;
 	unsigned long	capacity;
-- 
cgit v1.2.3


From e7b63427fdb4977621d69085a97272c8856644fe Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Tue, 26 May 2026 18:42:48 +0200
Subject: sched_ext: Auto-register/unregister dl_server reservations

Commit cd959a3562050d ("sched_ext: Add a DL server for sched_ext tasks")
introduced an ext_server deadline server to protect sched_ext tasks from
fair/RT starvation, mirroring the existing fair_server.

Currently, both servers reserve their 50ms/1000ms bandwidth at boot,
regardless of whether a BPF scheduler is loaded. Unused bandwidth is
still reclaimed at runtime by other classes, but the static reservation
prevents the RT class from implicitly using that headroom when one of
the two classes is guaranteed to be empty.

A sysadmin can work around this by writing
/sys/kernel/debug/sched/{fair,ext}_server/cpu*/runtime, but that
requires manual action and not all systems expose debugfs.

A better approach is to make server bandwidth reservations dynamic: only
the scheduling policy that is currently active should register its
reservation, while the inactive one should not artificially hold
capacity (keeping both reservations only when the BPF scheduler is
running in partial mode):

 +---------------------------------------------+-------------+------------+
 | BPF scheduler state                         | fair server | ext server |
 +---------------------------------------------+-------------+------------+
 | not loaded (default boot)                   | reserved    | none       |
 | loaded full mode (!SCX_OPS_SWITCH_PARTIAL)  | none        | reserved   |
 | loaded partial mode (SCX_OPS_SWITCH_PARTIAL)| reserved    | reserved   |
 +---------------------------------------------+-------------+------------+

To achieve this, introduce an "attached/detached" state for each
deadline server, so the kernel can decide whether a server's bandwidth
should be accounted in global bandwidth tracking.

At boot, the system starts with only the fair server contributing to
bandwidth accounting. When a BPF scheduler is enabled, the ext server is
attached and may replace or complement the fair server depending on
whether full or partial mode is used. When sched_ext is disabled, the
system restores the previous deadline bandwidth values and behavior.

The transition logic ensures that switching between scheduling modes is
consistent and reversible, without losing runtime configuration or
requiring manual intervention.

Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://patch.msgid.link/20260526164420.638711-2-arighi@nvidia.com
---
 include/linux/sched.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index da6a0907a78c..8130d13850fc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -702,6 +702,11 @@ struct sched_dl_entity {
 	 * running, skipping the defer phase.
 	 *
 	 * @dl_defer_idle tracks idle state
+	 *
+	 * @dl_bw_attached tells if this server's bandwidth currently
+	 * contributes to the root domain's total_bw. Only meaningful for server
+	 * entities (@dl_server == 1). Allows toggling the reservation on/off
+	 * without losing the configured @dl_runtime/@dl_period.
 	 */
 	unsigned int			dl_throttled      : 1;
 	unsigned int			dl_yielded        : 1;
@@ -713,6 +718,7 @@ struct sched_dl_entity {
 	unsigned int			dl_defer_armed	  : 1;
 	unsigned int			dl_defer_running  : 1;
 	unsigned int			dl_defer_idle     : 1;
+	unsigned int			dl_bw_attached    : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
-- 
cgit v1.2.3


From f13beb010e4ab0735c9e46802cbcc820a8bd6467 Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Tue, 12 May 2026 02:56:15 +0000
Subject: sched: Have try_to_wake_up() handle return-migration for PROXY_WAKING
 case

This patch adds logic so try_to_wake_up() will notice if we are
waking a task where blocked_on == PROXY_WAKING, and if necessary
dequeue the task so the wakeup will naturally return-migrate the
donor task back to a cpu it can run on.

This helps performance as we do the dequeue and wakeup under the
locks normally taken in the try_to_wake_up() and avoids having
to do proxy_force_return() from __schedule(), which has to
re-take similar locks and then force a pick again loop.

This was split out from the larger proxy patch, and
significantly reworked.

Credits for the original patch go to:
  Peter Zijlstra (Intel) <peterz@infradead.org>
  Juri Lelli <juri.lelli@redhat.com>
  Valentin Schneider <valentin.schneider@arm.com>
  Connor O'Brien <connoro@google.com>

Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260512025635.2840817-6-jstultz@google.com
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8130d13850fc..5dea5b10ac99 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -161,7 +161,7 @@ struct user_event_mm;
  */
 #define is_special_task_state(state)					\
 	((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED |	\
-		    TASK_DEAD | TASK_FROZEN))
+		    TASK_DEAD | TASK_WAKING | TASK_FROZEN))
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 # define debug_normal_state_change(state_value)				\
-- 
cgit v1.2.3


From 4c2a20413d7fb3fc3dd7adf233a4f82bb203fb58 Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Tue, 12 May 2026 02:56:16 +0000
Subject: sched: Add is_blocked task flag

Add a new is_blocked flag to the task struct. This flag is set
by try_to_block_task() and cleared by ttwu_do_wakeup() and
tracks if the task is blocked.

Traditionally this would mirror !p->on_rq, however due things
like DELAY_DEQUEUE and PROXY_EXEC, this can diverge, so its
useful to manage separately.

Additionally with this, we might be able to get rid of the
p->se.sched_delayed (ab)use in the core code (eventually).

Taken whole cloth from Peter's email:
  https://lore.kernel.org/lkml/20260501132143.GC1026330@noisy.programming.kicks-ass.net/

With a few additional p->is_blocked = 0 in a few cases where
we return current if blocked_on gets zeroed or there is
no owner. This may hint that these current special cases
might be dropped eventually.

This change also helps resolve wait-queue stalls seen with
proxy-execution. See previous patch attempts for details:
  https://lore.kernel.org/lkml/20260430215103.2978955-2-jstultz@google.com/

Reported-by: Vineeth Pillai <vineethrp@google.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260512025635.2840817-7-jstultz@google.com
---
 include/linux/sched.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5dea5b10ac99..ec170663f99b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -852,7 +852,11 @@ struct task_struct {
 	struct alloc_tag		*alloc_tag;
 #endif
 
-	int				on_cpu;
+	u8				on_cpu;
+	u8				on_rq;
+	u8				is_blocked;
+	u8				__pad;
+
 	struct __call_single_node	wake_entry;
 	unsigned int			wakee_flips;
 	unsigned long			wakee_flip_decay_ts;
@@ -867,7 +871,6 @@ struct task_struct {
 	 */
 	int				recent_used_cpu;
 	int				wake_cpu;
-	int				on_rq;
 
 	int				prio;
 	int				static_prio;
-- 
cgit v1.2.3


From 1628b25248d0742b2ce9c7cfa59cd183e35f37e1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 12 May 2026 02:56:17 +0000
Subject: sched: Add blocked_donor link to task for smarter mutex handoffs

Add link to the task this task is proxying for, and use it so
the mutex owner can do an intelligent hand-off of the mutex to
the task that the owner is running on behalf.

[jstultz: This patch was split out from larger proxy patch]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Connor O'Brien <connoro@google.com>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260512025635.2840817-8-jstultz@google.com
---
 include/linux/sched.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ec170663f99b..e2f127a7ca0d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1250,6 +1250,13 @@ struct task_struct {
 	struct mutex			*blocked_on;	/* lock we're blocked on */
 	raw_spinlock_t			blocked_lock;
 
+	/*
+	 * The task that is boosting this task; a back link for the current
+	 * donor stack. Set in schedule() -> find_proxy_task() and only stable
+	 * under preempt_disable().
+	 */
+	struct task_struct		*blocked_donor;
+
 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
 	/*
 	 * Encoded lock address causing task block (lower 2 bits = type from
-- 
cgit v1.2.3


From ec9d4f1c424134bbf30965075df78d02a5d021dc Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Tue, 26 May 2026 11:43:02 +0200
Subject: sched/proxy: Remove PROXY_WAKING

Now that the proxy path uses ->is_blocked, use the '->is_blocked &&
!->blocked_on' state instead of PROXY_WAKING. Notably, this is where a
blocked_on relation is broken but the donor task might still need a return
migration.

Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260526113322.596522894%40infradead.org
---
 include/linux/sched.h | 50 ++------------------------------------------------
 1 file changed, 2 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e2f127a7ca0d..35e6183ef615 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2205,19 +2205,10 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock);
 
 #ifndef CONFIG_PREEMPT_RT
 
-/*
- * With proxy exec, if a task has been proxy-migrated, it may be a donor
- * on a cpu that it can't actually run on. Thus we need a special state
- * to denote that the task is being woken, but that it needs to be
- * evaluated for return-migration before it is run. So if the task is
- * blocked_on PROXY_WAKING, return migrate it before running it.
- */
-#define PROXY_WAKING ((struct mutex *)(-1L))
-
 static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
 {
 	lockdep_assert_held_once(&p->blocked_lock);
-	return p->blocked_on == PROXY_WAKING ? NULL : p->blocked_on;
+	return p->blocked_on;
 }
 
 static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
@@ -2245,7 +2236,7 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *
 	 * blocked_on relationships, but make sure we are not
 	 * clearing the relationship with a different lock.
 	 */
-	WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m && p->blocked_on != PROXY_WAKING);
+	WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m);
 	p->blocked_on = NULL;
 }
 
@@ -2254,35 +2245,6 @@ static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
 	guard(raw_spinlock_irqsave)(&p->blocked_lock);
 	__clear_task_blocked_on(p, m);
 }
-
-static inline void __set_task_blocked_on_waking(struct task_struct *p, struct mutex *m)
-{
-	/* Currently we serialize blocked_on under the task::blocked_lock */
-	lockdep_assert_held_once(&p->blocked_lock);
-
-	if (!sched_proxy_exec()) {
-		__clear_task_blocked_on(p, m);
-		return;
-	}
-
-	/* Don't set PROXY_WAKING if blocked_on was already cleared */
-	if (!p->blocked_on)
-		return;
-	/*
-	 * There may be cases where we set PROXY_WAKING on tasks that were
-	 * already set to waking, but make sure we are not changing
-	 * the relationship with a different lock.
-	 */
-	WARN_ON_ONCE(m && p->blocked_on != m && p->blocked_on != PROXY_WAKING);
-	p->blocked_on = PROXY_WAKING;
-}
-
-static inline void set_task_blocked_on_waking(struct task_struct *p, struct mutex *m)
-{
-	guard(raw_spinlock_irqsave)(&p->blocked_lock);
-	__set_task_blocked_on_waking(p, m);
-}
-
 #else
 static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
 {
@@ -2291,14 +2253,6 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mute
 static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
 {
 }
-
-static inline void __set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m)
-{
-}
-
-static inline void set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m)
-{
-}
 #endif /* !CONFIG_PREEMPT_RT */
 
 static __always_inline bool need_resched(void)
-- 
cgit v1.2.3