From df0d98475954d655571979aa061ecb07d7e00392 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Wed, 1 Apr 2026 14:52:13 -0700 Subject: sched/cache: Introduce infrastructure for cache-aware load balancing Adds infrastructure to enable cache-aware load balancing, which improves cache locality by grouping tasks that share resources within the same cache domain. This reduces cache misses and improves overall data access efficiency. In this initial implementation, threads belonging to the same process are treated as entities that likely share working sets. The mechanism tracks per-process CPU occupancy across cache domains and attempts to migrate threads toward cache-hot domains where their process already has active threads, thereby enhancing locality. This provides a basic model for cache affinity. While the current code targets the last-level cache (LLC), the approach could be extended to other domain types such as clusters (L2) or node-internal groupings. At present, the mechanism selects the CPU within an LLC that has the highest recent runtime. Subsequent patches in this series will use this information in the load-balancing path to guide task placement toward preferred LLCs. In the future, more advanced policies could be integrated through NUMA balancing — for example, migrating a task to its preferred LLC when spare capacity exists, or swapping tasks across LLCs to improve cache affinity. Grouping of tasks could also be generalized from that of a process to be that of a NUMA group, or be user-configurable. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/6269a53221b9439b9ca00d18a9d1946fb64d8cff.1775065312.git.tim.c.chen@linux.intel.com --- include/linux/mm_types.h | 32 ++++++++++++++++++++++++++++++++ include/linux/sched.h | 24 ++++++++++++++++++++++++ 2 files changed, 56 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3cc8ae722886..67b2dfcc71ea 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1173,6 +1173,8 @@ struct mm_struct { /* MM CID related storage */ struct mm_mm_cid mm_cid; + /* sched_cache related statistics */ + struct sched_cache_stat sc_stat; #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ #endif @@ -1575,6 +1577,36 @@ static inline unsigned int mm_cid_size(void) # define MM_CID_STATIC_SIZE 0 #endif /* CONFIG_SCHED_MM_CID */ +#ifdef CONFIG_SCHED_CACHE +void mm_init_sched(struct mm_struct *mm, + struct sched_cache_time __percpu *pcpu_sched); + +static inline int mm_alloc_sched_noprof(struct mm_struct *mm) +{ + struct sched_cache_time __percpu *pcpu_sched = + alloc_percpu_noprof(struct sched_cache_time); + + if (!pcpu_sched) + return -ENOMEM; + + mm_init_sched(mm, pcpu_sched); + return 0; +} + +#define mm_alloc_sched(...) 
alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__)) + +static inline void mm_destroy_sched(struct mm_struct *mm) +{ + free_percpu(mm->sc_stat.pcpu_sched); + mm->sc_stat.pcpu_sched = NULL; +} +#else /* !CONFIG_SCHED_CACHE */ + +static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; } +static inline void mm_destroy_sched(struct mm_struct *mm) { } + +#endif /* CONFIG_SCHED_CACHE */ + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/include/linux/sched.h b/include/linux/sched.h index 8ec3b6d7d718..2bf261bcd7b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1407,6 +1407,10 @@ struct task_struct { unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; +#endif + struct rseq_data rseq; struct sched_mm_cid mm_cid; @@ -2407,6 +2411,26 @@ static __always_inline int task_mm_cid(struct task_struct *t) } #endif +#ifdef CONFIG_SCHED_CACHE + +struct sched_cache_time { + u64 runtime; + unsigned long epoch; +}; + +struct sched_cache_stat { + struct sched_cache_time __percpu *pcpu_sched; + raw_spinlock_t lock; + unsigned long epoch; + int cpu; +} ____cacheline_aligned_in_smp; + +#else + +struct sched_cache_stat { }; + +#endif + #ifndef MODULE #ifndef COMPILE_OFFSETS -- cgit v1.2.3 From f025ef275388742643a2c33f00a0d9c0af3112ee Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 1 Apr 2026 14:52:15 -0700 Subject: sched/cache: Record per LLC utilization to guide cache aware scheduling decisions When a system becomes busy and a process's preferred LLC is saturated with too many threads, tasks within that LLC migrate frequently. These intra-LLC migrations introduce latency and degrade performance. To avoid this, task aggregation should be suppressed when the preferred LLC is overloaded, which requires a metric to indicate LLC utilization. 
Record per LLC utilization/cpu capacity during periodic load balancing. These statistics will be used in later patches to decide whether tasks should be aggregated into their preferred LLC. Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/a48151b3d57f2a42a5971aaead1b7f81e69229f4.1775065312.git.tim.c.chen@linux.intel.com --- include/linux/sched/topology.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 36553e14866d..159716fa0d3a 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -68,6 +68,10 @@ struct sched_domain_shared { atomic_t nr_busy_cpus; int has_idle_cores; int nr_idle_scan; +#ifdef CONFIG_SCHED_CACHE + unsigned long util_avg; + unsigned long capacity; +#endif }; struct sched_domain { -- cgit v1.2.3 From 47d8696b95f7397fe7cad2d194d550ffe82efc15 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:18 -0700 Subject: sched/cache: Assign preferred LLC ID to processes With cache-aware scheduling enabled, each task is assigned a preferred LLC ID. This allows quick identification of the LLC domain where the task prefers to run, similar to numa_preferred_nid in NUMA balancing. 
Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/f2ceecba5858680349ad4ce9303a2121f0bb7272.1775065312.git.tim.c.chen@linux.intel.com --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2bf261bcd7b6..d2010483cd77 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1409,6 +1409,7 @@ struct task_struct { #ifdef CONFIG_SCHED_CACHE struct callback_head cache_work; + int preferred_llc; #endif struct rseq_data rseq; -- cgit v1.2.3 From a8d0ca0b7f2f7b53565d1e30e509d3d74d1f5460 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:20 -0700 Subject: sched/cache: Introduce per CPU's tasks LLC preference counter The lowest level of sched domain for each CPU is assigned an array where each element tracks the number of tasks preferring a given LLC, indexed from 0 to max_lid. Since each CPU has its dedicated sd, this implies that each CPU will have a dedicated task LLC preference counter. For example, sd->llc_counts[3] = 2 signifies that there are 2 tasks on this runqueue which prefer to run within LLC3. The load balancer can use this information to identify busy runqueues and migrate tasks to their preferred LLC domains. This array will be reallocated at runtime during sched domain rebuild. Introduce the buffer allocation mechanism, and the statistics will be calculated in the subsequent patch. Note: the LLC preference statistics of each CPU are reset on sched domain rebuild and may under count temporarily, until the CPU becomes idle and the count is cleared. This is a trade off to avoid complex data synchronization across sched domain builds. 
Suggested-by: Peter Zijlstra (Intel) Suggested-by: K Prateek Nayak Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/42e79eceb8cd6be8a032401d481d101913bc5703.1775065312.git.tim.c.chen@linux.intel.com --- include/linux/sched/topology.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 159716fa0d3a..0036d6b4bd67 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -103,6 +103,11 @@ struct sched_domain { u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; +#ifdef CONFIG_SCHED_CACHE + unsigned int llc_max; + unsigned int *llc_counts __counted_by_ptr(llc_max); +#endif + #ifdef CONFIG_SCHEDSTATS /* sched_balance_rq() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; -- cgit v1.2.3 From a2b4cf39d9d333bfeb9262dbaafe3d24d405a5c0 Mon Sep 17 00:00:00 2001 From: Jianyong Wu Date: Wed, 13 May 2026 13:39:12 -0700 Subject: sched/cache: Allow only 1 thread of the process to calculate the LLC occupancy Scanning online CPUs to calculate the occupancy might be time-consuming. Only allow 1 thread of the process to scan the CPUs at the same time, which is similar to what NUMA balance does in task_numa_work(). 
Signed-off-by: Jianyong Wu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/5672b52e588b855b01e5a1a17822f7c6c7237a3d.1778703694.git.tim.c.chen@linux.intel.com --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d2010483cd77..6d883f109ba3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2423,6 +2423,7 @@ struct sched_cache_stat { struct sched_cache_time __percpu *pcpu_sched; raw_spinlock_t lock; unsigned long epoch; + unsigned long next_scan; int cpu; } ____cacheline_aligned_in_smp; -- cgit v1.2.3 From deee5e27d5b608323c04dc99979e55f944016a13 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:13 -0700 Subject: sched/cache: Disable cache aware scheduling for processes with high thread counts A performance regression was observed by Prateek when running hackbench with many threads per process (high fd count). To avoid this, processes with a large number of active threads are excluded from cache-aware scheduling. With sched_cache enabled, record the number of active threads in each process during the periodic task_cache_work(). While iterating over CPUs, if the currently running task belongs to the same process as the task that launched task_cache_work(), increment the active thread count. If the number of active threads within the process exceeds the number of Cores (divided by the SMT number) in the LLC, do not enable cache-aware scheduling. However, on systems with a smaller number of CPUs within 1 LLC, like Power10/Power11 with SMT4 and an LLC size of 4, this check effectively disables cache-aware scheduling for any process. One possible solution suggested by Peter is to use an LLC-mask instead of a single LLC value for preference. Once there are a 'few' LLCs as preference, this constraint becomes a little easier. It could be an enhancement in the future. 
For users who wish to perform task aggregation regardless, a debugfs knob is provided for tuning in a subsequent change. Suggested-by: K Prateek Nayak Suggested-by: Aaron Lu Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tingyin Duan Link: https://patch.msgid.link/d076cd21a8e6c6341d1e2d927e118db770ebb650.1778703694.git.tim.c.chen@linux.intel.com --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d883f109ba3..6701911eaaf7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2423,6 +2423,7 @@ struct sched_cache_stat { struct sched_cache_time __percpu *pcpu_sched; raw_spinlock_t lock; unsigned long epoch; + u64 nr_running_avg; unsigned long next_scan; int cpu; } ____cacheline_aligned_in_smp; -- cgit v1.2.3 From 7030513a08776b2ca70fccd5dfddf7bb5c5c88ba Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:15 -0700 Subject: sched/cache: Calculate the LLC size and store it in sched_domain Cache aware scheduling needs to know the LLC size that a process can use, so as to avoid memory-intensive tasks from being over-aggregated on a single LLC. Introduce a preparation patch to add get_effective_llc_bytes() to get the LLC size that a CPU can use. The function can be further enhanced by subtracting the LLC cache ways reserved by resctrl (CAT in Intel RDT, etc). 
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tingyin Duan Link: https://patch.msgid.link/37afee09ff608034da0ce149e72d33b6f4698edf.1778703694.git.tim.c.chen@linux.intel.com --- include/linux/cacheinfo.h | 1 + include/linux/sched/topology.h | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index c8f4f0a0b874..fc879ac4cc4f 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -89,6 +89,7 @@ int populate_cache_leaves(unsigned int cpu); int cache_setup_acpi(unsigned int cpu); bool last_level_cache_is_valid(unsigned int cpu); bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y); +struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu); int fetch_cache_info(unsigned int cpu); int detect_cache_attributes(unsigned int cpu); #ifndef CONFIG_ACPI_PPTT diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 0036d6b4bd67..fe09d3268bc9 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -106,6 +106,7 @@ struct sched_domain { #ifdef CONFIG_SCHED_CACHE unsigned int llc_max; unsigned int *llc_counts __counted_by_ptr(llc_max); + unsigned long llc_bytes; #endif #ifdef CONFIG_SCHEDSTATS @@ -265,4 +266,10 @@ static inline int task_node(const struct task_struct *p) return cpu_to_node(task_cpu(p)); } +#ifdef CONFIG_SCHED_CACHE +extern void sched_update_llc_bytes(unsigned int cpu); +#else +static inline void sched_update_llc_bytes(unsigned int cpu) { } +#endif + #endif /* _LINUX_SCHED_TOPOLOGY_H */ -- cgit v1.2.3 From 808915f982c2a52f5d148510ecfab52284de67cf Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:16 -0700 Subject: sched/cache: Avoid cache-aware scheduling for memory-heavy processes Prateek and Tingyin reported that memory-intensive workloads (such as stream) can 
saturate memory bandwidth and caches on the preferred LLC when sched_cache aggregates too many threads. To mitigate this, estimate a process's memory footprint by comparing its NUMA balancing fault statistics to the size of the LLC. If the footprint exceeds the LLC size, skip cache-aware scheduling. Note that footprint is only an approximation of the memory footprint, since the kernel lacks suitable metrics to estimate the real working set. If a user-provided hint is available in the future, it would be more accurate. A later patch will allow users to provide a hint to adjust this threshold. Suggested-by: K Prateek Nayak Suggested-by: Vern Hao Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tingyin Duan Link: https://patch.msgid.link/95cf64a385bcc12f18dcebe9d59e8d3ba8bb318f.1778703694.git.tim.c.chen@linux.intel.com --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6701911eaaf7..95729670929c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2425,6 +2425,7 @@ struct sched_cache_stat { unsigned long epoch; u64 nr_running_avg; unsigned long next_scan; + unsigned long footprint; int cpu; } ____cacheline_aligned_in_smp; -- cgit v1.2.3 From 03755348b8e74421f92ffed9da159175a698290b Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:21 -0700 Subject: sched/cache: Fix unpaired account_llc_enqueue/dequeue There is a race condition that, after a task is enqueued on a runqueue, task_llc(p) may change due to CPU hotplug, because the llc_id is dynamically allocated and adjusted at runtime. Therefore, checking task_llc(p) to determine whether the task is being dequeued from its preferred LLC is unreliable and can cause inconsistent values. 
To fix this problem, record whether p is enqueued on its preferred LLC, in order to pair with account_llc_dequeue() to maintain a consistent nr_pref_llc_running per runqueue. This bug was reported by sashiko, and the solution was once suggested by Prateek. Fixes: 46afe3af7ead ("sched/cache: Track LLC-preferred tasks per runqueue") Suggested-by: K Prateek Nayak Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/0c8c6a1571d66792a4d2ff0103ba3cc13e059046.1778703694.git.tim.c.chen@linux.intel.com --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 95729670929c..2c9e8e2edde1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1410,6 +1410,8 @@ struct task_struct { #ifdef CONFIG_SCHED_CACHE struct callback_head cache_work; int preferred_llc; + /* 1: task was enqueued to its preferred LLC, 0 otherwise */ + int pref_llc_queued; #endif struct rseq_data rseq; -- cgit v1.2.3 From ea19506013ad13685573e4674fbeddb790e27906 Mon Sep 17 00:00:00 2001 From: Yiyang Chen Date: Fri, 15 May 2026 00:05:05 +0800 Subject: sched/clock: Provide !HAVE_UNSTABLE_SCHED_CLOCK stub for sched_clock_stable() When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is disabled, sched_clock() is already assumed to provide stable semantics, but the public header doesn't provide a sched_clock_stable() stub for that case. Add a header stub that always returns true and clean up the duplicate local stub in ring_buffer.c, so callers can use sched_clock_stable() unconditionally. 
Signed-off-by: Yiyang Chen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt Link: https://patch.msgid.link/56e45338858946cd9581b75c8bd45dd37dba52c5.1778773587.git.cyyzero16@gmail.com --- include/linux/sched/clock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 196f0ca351a2..39f0a7f94bfc 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -33,6 +33,11 @@ extern u64 sched_clock_cpu(int cpu); extern void sched_clock_init(void); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline int sched_clock_stable(void) +{ + return 1; +} + static inline void sched_clock_tick(void) { } -- cgit v1.2.3 From 815c5cb76a3e5dad4fc3911b9073591dc3a29282 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 15 May 2026 22:54:53 +0530 Subject: topology: Introduce cpu_smt_mask for CONFIG_SCHED_SMT=n Define cpu_smt_mask in case of CONFIG_SCHED_SMT=n as cpumask_of that CPU. With that config, it is expected that the kernel treats each CPU as an individual core. Using cpumask_of(cpu) reflects that. This would help to get rid of the ifdeffery that is spread across the codebase since cpu_smt_mask is defined only in case of CONFIG_SCHED_SMT=y. Note: There is no arch today which defines cpu_smt_mask unconditionally. So likely defining the cpu_smt_mask shouldn't lead to redefinition errors. 
Signed-off-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Tested-by: K Prateek Nayak Link: https://patch.msgid.link/20260515172456.542799-2-sshegde@linux.ibm.com --- include/linux/topology.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index 6575af39fd10..709a2dcf4c73 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -230,11 +230,24 @@ static inline int cpu_to_mem(int cpu) #define topology_drawer_cpumask(cpu) cpumask_of(cpu) #endif -#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask) +/* + * Defining cpu_smt_mask as cpumask_of that CPU helps to get + * rid of lot of ifdeffery all around the codebase in case of + * CONFIG_SCHED_SMT=n. It just means there are no other siblings, which + * is what is expected. + */ +#if defined(CONFIG_SCHED_SMT) +# if !defined(cpu_smt_mask) static inline const struct cpumask *cpu_smt_mask(int cpu) { return topology_sibling_cpumask(cpu); } +# endif +#else /* !CONFIG_SCHED_SMT */ +static inline const struct cpumask *cpu_smt_mask(int cpu) +{ + return cpumask_of(cpu); +} #endif #ifndef topology_is_primary_thread -- cgit v1.2.3 From 5bc6ab2d42e545f816def21cfcdb4ba35cc74bf6 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 15 May 2026 22:54:54 +0530 Subject: sched: Simplify ifdeffery around cpu_smt_mask Now, that cpu_smt_mask is defined as cpumask_of(cpu) for CONFIG_SCHED_SMT=n, it is possible to get rid of the ifdeffery. Effectively, - This makes sched_smt_present is defined always - cpumask_weight(cpumask_of(cpu)) == 1. So sched_smt_present_inc/dec will never enable the sched_smt_present. Which is expected. - Paths that were compile-time eliminated become runtime guarded using static keys. 
- Defines set_idle_cores, test_idle_cores, etc which could likely benefit the CONFIG_SCHED_SMT=n systems to use the same optimizations within the LLC at wakeups. - This will expose sched_smt_present symbol for CONFIG_SCHED_SMT=n. Likely not a concern. - There is a bloat of code CONFIG_SCHED_SMT=n. (NR_CPUS=2048) add/remove: 24/18 grow/shrink: 26/28 up/down: 6396/-3188 (3208) Total: Before=30629880, After=30633088, chg +0.01% - No code bloat for CONFIG_SCHED_SMT=y, which is expected. - Add comments around stop_core_cpuslocked on why ifdefs are not removed. - This leaves the remaining uses of CONFIG_SCHED_SMT mainly for topology building bits which has a policy based decision. Signed-off-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Acked-by: Tejun Heo Tested-by: K Prateek Nayak Link: https://patch.msgid.link/20260515172456.542799-3-sshegde@linux.ibm.com --- include/linux/sched/smt.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index 166b19af956f..cde6679c0278 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -4,16 +4,12 @@ #include -#ifdef CONFIG_SCHED_SMT extern struct static_key_false sched_smt_present; static __always_inline bool sched_smt_active(void) { return static_branch_likely(&sched_smt_present); } -#else -static __always_inline bool sched_smt_active(void) { return false; } -#endif void arch_smt_update(void); -- cgit v1.2.3 From 9e005ed21152d4a4bb0ceea71045ff8a642a6feb Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 19 May 2026 05:14:23 +0000 Subject: sched/topology: Allow multiple domains to claim sched_domain_shared Recent optimizations of sd->shared assignment moved to allocating a single instance of per-CPU sched_domain_shared objects per s_data. 
Recent optimizations to select_idle_capacity() moved the sd->shared assignments to "sd_asym" domain when ASYM_CPUCAPACITY is detected but cache-aware scheduling mandates the presence of "sd_llc_shared" to compute and cache per-LLC statistics. Use an "alloc_flags" union in sched_domain_shared to claim a sched_domain_shared object per sched_domain. Allocation starts searching for an available / matching sched_domain_shared instance from the first CPU of sched_domain_span(sd) (sd can be sd_llc, or sd_asym). If the shared object is claimed by another domain, the instance corresponding to next CPU in the domain span is explored until a matching / available instance is found. In case of a single CPU in sched_domain_span(), the domain will be degenerated and a temporary overlap of ->shared objects across different domains is acceptable. "alloc_flags" forms a union with "nr_idle_scan" and the stale flags are left as is when the sd->shared is published. The expectation is for the first load balancing instance to correct the value just like the current behavior, except the initial value is no longer 0. Originally-by: Peter Zijlstra Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Tested-by: Andrea Righi --- include/linux/sched/topology.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index fe09d3268bc9..b5d9d7c2b8ad 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -67,7 +67,21 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; - int nr_idle_scan; + union { + int nr_idle_scan; + /* + * Used during allocation to claim the sched_domain_shared + * object at multiple levels. 
+ * + * Note: between build and the first periodic LB tick, which + * rewrites the union via update_idle_cpu_scan(), readers of + * nr_idle_scan may observe the transient SD_* flag value as + * the scan bound. The flag bits are small positive integers, + * so the effect is just a slightly relaxed scan bound for one + * window and self-heals on the first tick. + */ + int alloc_flags; + }; #ifdef CONFIG_SCHED_CACHE unsigned long util_avg; unsigned long capacity; -- cgit v1.2.3 From e7b63427fdb4977621d69085a97272c8856644fe Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 26 May 2026 18:42:48 +0200 Subject: sched_ext: Auto-register/unregister dl_server reservations Commit cd959a3562050d ("sched_ext: Add a DL server for sched_ext tasks") introduced an ext_server deadline server to protect sched_ext tasks from fair/RT starvation, mirroring the existing fair_server. Currently, both servers reserve their 50ms/1000ms bandwidth at boot, regardless of whether a BPF scheduler is loaded. Unused bandwidth is still reclaimed at runtime by other classes, but the static reservation prevents the RT class from implicitly using that headroom when one of the two classes is guaranteed to be empty. A sysadmin can work around this by writing /sys/kernel/debug/sched/{fair,ext}_server/cpu*/runtime, but that requires manual action and not all systems expose debugfs. 
A better approach is to make server bandwidth reservations dynamic: only the scheduling policy that is currently active should register its reservation, while the inactive one should not artificially hold capacity (keeping both reservations only when the BPF scheduler is running in partial mode): +---------------------------------------------+-------------+------------+ | BPF scheduler state | fair server | ext server | +---------------------------------------------+-------------+------------+ | not loaded (default boot) | reserved | none | | loaded full mode (!SCX_OPS_SWITCH_PARTIAL) | none | reserved | | loaded partial mode (SCX_OPS_SWITCH_PARTIAL)| reserved | reserved | +---------------------------------------------+-------------+------------+ To achieve this, introduce an "attached/detached" state for each deadline server, so the kernel can decide whether a server's bandwidth should be accounted in global bandwidth tracking. At boot, the system starts with only the fair server contributing to bandwidth accounting. When a BPF scheduler is enabled, the ext server is attached and may replace or complement the fair server depending on whether full or partial mode is used. When sched_ext is disabled, the system restores the previous deadline bandwidth values and behavior. The transition logic ensures that switching between scheduling modes is consistent and reversible, without losing runtime configuration or requiring manual intervention. Signed-off-by: Andrea Righi Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://patch.msgid.link/20260526164420.638711-2-arighi@nvidia.com --- include/linux/sched.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index da6a0907a78c..8130d13850fc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -702,6 +702,11 @@ struct sched_dl_entity { * running, skipping the defer phase. 
* * @dl_defer_idle tracks idle state + * + * @dl_bw_attached tells if this server's bandwidth currently + * contributes to the root domain's total_bw. Only meaningful for server + * entities (@dl_server == 1). Allows toggling the reservation on/off + * without losing the configured @dl_runtime/@dl_period. */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; @@ -713,6 +718,7 @@ struct sched_dl_entity { unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; unsigned int dl_defer_idle : 1; + unsigned int dl_bw_attached : 1; /* * Bandwidth enforcement timer. Each -deadline task has its -- cgit v1.2.3 From f13beb010e4ab0735c9e46802cbcc820a8bd6467 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 12 May 2026 02:56:15 +0000 Subject: sched: Have try_to_wake_up() handle return-migration for PROXY_WAKING case This patch adds logic so try_to_wake_up() will notice if we are waking a task where blocked_on == PROXY_WAKING, and if necessary dequeue the task so the wakeup will naturally return-migrate the donor task back to a cpu it can run on. This helps performance as we do the dequeue and wakeup under the locks normally taken in the try_to_wake_up() and avoids having to do proxy_force_return() from __schedule(), which has to re-take similar locks and then force a pick again loop. This was split out from the larger proxy patch, and significantly reworked. 
Credits for the original patch go to: Peter Zijlstra (Intel) Juri Lelli Valentin Schneider Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260512025635.2840817-6-jstultz@google.com --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8130d13850fc..5dea5b10ac99 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -161,7 +161,7 @@ struct user_event_mm; */ #define is_special_task_state(state) \ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ - TASK_DEAD | TASK_FROZEN)) + TASK_DEAD | TASK_WAKING | TASK_FROZEN)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ -- cgit v1.2.3 From 4c2a20413d7fb3fc3dd7adf233a4f82bb203fb58 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 12 May 2026 02:56:16 +0000 Subject: sched: Add is_blocked task flag Add a new is_blocked flag to the task struct. This flag is set by try_to_block_task() and cleared by ttwu_do_wakeup() and tracks if the task is blocked. Traditionally this would mirror !p->on_rq, however, due to things like DELAY_DEQUEUE and PROXY_EXEC, this can diverge, so it's useful to manage separately. Additionally with this, we might be able to get rid of the p->se.sched_delayed (ab)use in the core code (eventually). Taken whole cloth from Peter's email: https://lore.kernel.org/lkml/20260501132143.GC1026330@noisy.programming.kicks-ass.net/ With a few additional p->is_blocked = 0 in a few cases where we return current if blocked_on gets zeroed or there is no owner. This may hint that these current special cases might be dropped eventually. This change also helps resolve wait-queue stalls seen with proxy-execution. 
See previous patch attempts for details: https://lore.kernel.org/lkml/20260430215103.2978955-2-jstultz@google.com/ Reported-by: Vineeth Pillai Suggested-by: Peter Zijlstra Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260512025635.2840817-7-jstultz@google.com --- include/linux/sched.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5dea5b10ac99..ec170663f99b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -852,7 +852,11 @@ struct task_struct { struct alloc_tag *alloc_tag; #endif - int on_cpu; + u8 on_cpu; + u8 on_rq; + u8 is_blocked; + u8 __pad; + struct __call_single_node wake_entry; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; @@ -867,7 +871,6 @@ struct task_struct { */ int recent_used_cpu; int wake_cpu; - int on_rq; int prio; int static_prio; -- cgit v1.2.3 From 1628b25248d0742b2ce9c7cfa59cd183e35f37e1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 May 2026 02:56:17 +0000 Subject: sched: Add blocked_donor link to task for smarter mutex handoffs Add a link to the task this task is proxying for, and use it so the mutex owner can do an intelligent hand-off of the mutex to the task that the owner is running on behalf of. 
[jstultz: This patch was split out from larger proxy patch] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Juri Lelli Signed-off-by: Valentin Schneider Signed-off-by: Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260512025635.2840817-8-jstultz@google.com --- include/linux/sched.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ec170663f99b..e2f127a7ca0d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1250,6 +1250,13 @@ struct task_struct { struct mutex *blocked_on; /* lock we're blocked on */ raw_spinlock_t blocked_lock; + /* + * The task that is boosting this task; a back link for the current + * donor stack. Set in schedule() -> find_proxy_task() and only stable + * under preempt_disable(). + */ + struct task_struct *blocked_donor; + #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* * Encoded lock address causing task block (lower 2 bits = type from -- cgit v1.2.3 From ec9d4f1c424134bbf30965075df78d02a5d021dc Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 26 May 2026 11:43:02 +0200 Subject: sched/proxy: Remove PROXY_WAKING Now that the proxy path uses ->is_blocked, use the '->is_blocked && !->blocked_on' state instead of PROXY_WAKING. Notably, this is where a blocked_on relation is broken but the donor task might still need a return migration. 
Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260526113322.596522894%40infradead.org --- include/linux/sched.h | 50 ++------------------------------------------------ 1 file changed, 2 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index e2f127a7ca0d..35e6183ef615 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2205,19 +2205,10 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock); #ifndef CONFIG_PREEMPT_RT -/* - * With proxy exec, if a task has been proxy-migrated, it may be a donor - * on a cpu that it can't actually run on. Thus we need a special state - * to denote that the task is being woken, but that it needs to be - * evaluated for return-migration before it is run. So if the task is - * blocked_on PROXY_WAKING, return migrate it before running it. - */ -#define PROXY_WAKING ((struct mutex *)(-1L)) - static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { lockdep_assert_held_once(&p->blocked_lock); - return p->blocked_on == PROXY_WAKING ? NULL : p->blocked_on; + return p->blocked_on; } static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) @@ -2245,7 +2236,7 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex * * blocked_on relationships, but make sure we are not * clearing the relationship with a different lock. 
*/ - WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m && p->blocked_on != PROXY_WAKING); + WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m); p->blocked_on = NULL; } @@ -2254,35 +2245,6 @@ static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) guard(raw_spinlock_irqsave)(&p->blocked_lock); __clear_task_blocked_on(p, m); } - -static inline void __set_task_blocked_on_waking(struct task_struct *p, struct mutex *m) -{ - /* Currently we serialize blocked_on under the task::blocked_lock */ - lockdep_assert_held_once(&p->blocked_lock); - - if (!sched_proxy_exec()) { - __clear_task_blocked_on(p, m); - return; - } - - /* Don't set PROXY_WAKING if blocked_on was already cleared */ - if (!p->blocked_on) - return; - /* - * There may be cases where we set PROXY_WAKING on tasks that were - * already set to waking, but make sure we are not changing - * the relationship with a different lock. - */ - WARN_ON_ONCE(m && p->blocked_on != m && p->blocked_on != PROXY_WAKING); - p->blocked_on = PROXY_WAKING; -} - -static inline void set_task_blocked_on_waking(struct task_struct *p, struct mutex *m) -{ - guard(raw_spinlock_irqsave)(&p->blocked_lock); - __set_task_blocked_on_waking(p, m); -} - #else static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { @@ -2291,14 +2253,6 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mute static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } - -static inline void __set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m) -{ -} - -static inline void set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m) -{ -} #endif /* !CONFIG_PREEMPT_RT */ static __always_inline bool need_resched(void) -- cgit v1.2.3