author     Peter Zijlstra <peterz@infradead.org>             2025-11-07 17:01:31 +0100
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>   2026-01-11 15:26:19 +0100
commit     98a26893fad4180d8ea210d8749392790dfddc81
tree       0c91d9db0c8444bd25856ef9dcdd85369299fe30
parent     d4ffb9ce8e6501bebbf833cd7ae3f34eab1f76ba
sched/fair: Proportional newidle balance
commit 33cf66d88306663d16e4759e9d24766b0aaa2e17 upstream.
Add a randomized algorithm that runs newidle balancing proportional to
its success rate.
This improves schbench significantly:
  6.18-rc4:                2.22 Mrps/s
  6.18-rc4+revert:         2.04 Mrps/s
  6.18-rc4+revert+random:  2.18 Mrps/s
Conversely, per Adam Li, this affects SpecJBB slightly, reducing it by 1%:
  6.17:                -6%
  6.17+revert:          0%
  6.17+revert+random:  -1%
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Chris Mason <clm@meta.com>
Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com
Link: https://patch.msgid.link/20251107161739.770122091@infradead.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--   include/linux/sched/topology.h |  3
-rw-r--r--   kernel/sched/core.c            |  3
-rw-r--r--   kernel/sched/fair.c            | 44
-rw-r--r--   kernel/sched/features.h        |  5
-rw-r--r--   kernel/sched/sched.h           |  7
-rw-r--r--   kernel/sched/topology.c        |  6
6 files changed, 64 insertions, 4 deletions
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index bbcfdf12aa6e..45c0022b91ce 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -92,6 +92,9 @@ struct sched_domain {
 	unsigned int		nr_balance_failed; /* initialise to 0 */
 
 	/* idle_balance() stats */
+	unsigned int		newidle_call;
+	unsigned int		newidle_success;
+	unsigned int		newidle_ratio;
 	u64			max_newidle_lb_cost;
 	unsigned long		last_decay_max_lb_cost;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f754a60de848..eb47d294e2c5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
 
 #ifdef CONFIG_SCHED_PROXY_EXEC
 DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -8591,6 +8592,8 @@ void __init sched_init_smp(void)
 {
 	sched_init_numa(NUMA_NO_NODE);
 
+	prandom_init_once(&sched_rnd_state);
+
 	/*
 	 * There's no userspace yet to cause hotplug operations; hence all the
 	 * CPU masks are stable and all blatant races in the below code cannot
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8369dadf1958..d1206f81f8b2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12122,11 +12122,27 @@ void update_max_interval(void)
 	max_load_balance_interval = HZ*num_online_cpus()/10;
 }
 
-static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
+{
+	sd->newidle_call++;
+	sd->newidle_success += success;
+
+	if (sd->newidle_call >= 1024) {
+		sd->newidle_ratio = sd->newidle_success;
+		sd->newidle_call /= 2;
+		sd->newidle_success /= 2;
+	}
+}
+
+static inline bool
+update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
 {
 	unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
 	unsigned long now = jiffies;
 
+	if (cost)
+		update_newidle_stats(sd, success);
+
 	if (cost > sd->max_newidle_lb_cost) {
 		/*
 		 * Track max cost of a domain to make sure to not delay the
@@ -12174,7 +12190,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
 		 * Decay the newidle max times here because this is a regular
 		 * visit to all the domains.
 		 */
-		need_decay = update_newidle_cost(sd, 0);
+		need_decay = update_newidle_cost(sd, 0, 0);
 		max_cost += sd->max_newidle_lb_cost;
 
 		/*
@@ -12819,6 +12835,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 			break;
 
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
+			unsigned int weight = 1;
+
+			if (sched_feat(NI_RANDOM)) {
+				/*
+				 * Throw a 1k sided dice; and only run
+				 * newidle_balance according to the success
+				 * rate.
+				 */
+				u32 d1k = sched_rng() % 1024;
+				weight = 1 + sd->newidle_ratio;
+				if (d1k > weight) {
+					update_newidle_stats(sd, 0);
+					continue;
+				}
+				weight = (1024 + weight/2) / weight;
+			}
 
 			pulled_task = sched_balance_rq(this_cpu, this_rq,
 						       sd, CPU_NEWLY_IDLE,
@@ -12826,10 +12858,14 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 
 			t1 = sched_clock_cpu(this_cpu);
 			domain_cost = t1 - t0;
-			update_newidle_cost(sd, domain_cost);
-
 			curr_cost += domain_cost;
 			t0 = t1;
+
+			/*
+			 * Track max cost of a domain to make sure to not delay the
+			 * next wakeup on the CPU.
+			 */
+			update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
 		}
 
 		/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 3c12d9f93331..136a6584be79 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
 SCHED_FEAT(UTIL_EST, true)
 
 SCHED_FEAT(LATENCY_WARN, false)
+
+/*
+ * Do newidle balancing proportional to its success rate using randomization.
+ */
+SCHED_FEAT(NI_RANDOM, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 92ec751799f5..2f8b06b12a98 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,6 +5,7 @@
 #ifndef _KERNEL_SCHED_SCHED_H
 #define _KERNEL_SCHED_SCHED_H
 
+#include <linux/prandom.h>
 #include <linux/sched/affinity.h>
 #include <linux/sched/autogroup.h>
 #include <linux/sched/cpufreq.h>
@@ -1349,6 +1350,12 @@ static inline bool is_migration_disabled(struct task_struct *p)
 }
 
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
+
+static inline u32 sched_rng(void)
+{
+	return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
+}
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
 #define this_rq()		this_cpu_ptr(&runqueues)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 444bdfdab731..c7a4d2fff571 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1662,6 +1662,12 @@ sd_init(struct sched_domain_topology_level *tl,
 
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
+
+		/* 50% success rate */
+		.newidle_call		= 512,
+		.newidle_success	= 256,
+		.newidle_ratio		= 512,
+
 		.max_newidle_lb_cost	= 0,
 		.last_decay_max_lb_cost	= jiffies,
 		.child			= child,
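
The mechanism the commit message describes, running newidle balancing proportional to its success rate, comes down to the three per-domain counters added above: newidle_call and newidle_success feed a success ratio scaled to 0..1024, the balance only runs when a random draw out of 1024 falls within that ratio, and a successful pull is credited with a weight of roughly 1024/ratio so the estimate is not biased by the skipped attempts. The stand-alone user-space sketch below only illustrates that bookkeeping; the names domain_stats and true_success_rate are made up for the simulation, and rand() plus a fixed 10% pull rate stand in for the kernel's per-CPU prandom state and real load balancing.

#include <stdio.h>
#include <stdlib.h>

/*
 * User-space sketch of the proportional newidle bookkeeping.  The struct
 * mirrors the three counters added to struct sched_domain; everything
 * else (rand(), true_success_rate) is simulation scaffolding, not kernel
 * code.
 */
struct domain_stats {
	unsigned int call;	/* decayed count of newidle attempts */
	unsigned int success;	/* decayed, weighted count of successes */
	unsigned int ratio;	/* published success rate, scaled to 0..1024 */
};

static void update_stats(struct domain_stats *d, unsigned int success)
{
	d->call++;
	d->success += success;

	/*
	 * Once the decayed call count reaches 1024, publish the current
	 * success count as the ratio and halve both counters, giving a
	 * sliding estimate over roughly the last 1024 attempts.
	 */
	if (d->call >= 1024) {
		d->ratio = d->success;
		d->call /= 2;
		d->success /= 2;
	}
}

int main(void)
{
	/* Start from an assumed 50% success rate, as sd_init() does. */
	struct domain_stats d = { .call = 512, .success = 256, .ratio = 512 };
	int true_success_rate = 10;	/* pretend 10% of balances pull a task */
	int balances = 0, skips = 0;

	srand(1);
	for (int i = 0; i < 100000; i++) {
		unsigned int weight = 1 + d.ratio;
		unsigned int d1k = rand() % 1024;

		if (d1k > weight) {
			/* Skipped attempt: a call with no success. */
			update_stats(&d, 0);
			skips++;
			continue;
		}

		/*
		 * We ran with probability ~ratio/1024, so a successful pull
		 * is credited with ~1024/ratio to keep the estimate unbiased
		 * with respect to the skipped attempts.
		 */
		weight = (1024 + weight / 2) / weight;
		balances++;

		int pulled = (rand() % 100) < true_success_rate;
		update_stats(&d, weight * !!pulled);
	}

	printf("ran %d of %d attempts, converged ratio %u/1024 (~%.0f%%)\n",
	       balances, balances + skips, d.ratio, d.ratio / 10.24);
	return 0;
}

In the patch itself this behaviour is guarded by the new NI_RANDOM scheduler feature, so on kernels that expose the scheduler features file in debugfs (typically /sys/kernel/debug/sched/features) it can be disabled at runtime by writing NO_NI_RANDOM there.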
