diff options
| author | K Prateek Nayak <kprateek.nayak@amd.com> | 2026-05-19 05:14:23 +0000 |
|---|---|---|
| committer | Peter Zijlstra <peterz@infradead.org> | 2026-05-19 13:35:36 +0200 |
| commit | 9e005ed21152d4a4bb0ceea71045ff8a642a6feb (patch) | |
| tree | aa9b6e6e843e2d321b16ef2ac811a110a27a592f | |
| parent | a26d9208c1376ac3877d9f12e697f83368e2af1c (diff) | |
sched/topology: Allow multiple domains to claim sched_domain_shared
Recent optimizations of the sd->shared assignment moved to allocating a
single instance of per-CPU sched_domain_shared objects per s_data.
Recent optimizations to select_idle_capacity() moved the sd->shared
assignment to the "sd_asym" domain when ASYM_CPUCAPACITY is detected, but
cache-aware scheduling mandates the presence of "sd_llc_shared" to
compute and cache per-LLC statistics, so both domains may need a
sched_domain_shared object at the same time.
Use an "alloc_flags" union member in sched_domain_shared to claim a
sched_domain_shared object per sched_domain. Allocation starts searching
for an available / matching sched_domain_shared instance from the first
CPU of sched_domain_span(sd) (sd can be sd_llc or sd_asym). If the
shared object is claimed by another domain, the instance corresponding
to the next CPU in the domain span is explored, and so on, until a
matching / available instance is found.
In case of a single CPU in sched_domain_span(), the domain will be
degenerated and a temporary overlap of ->shared objects across different
domains is acceptable.
"alloc_flags" forms a union with "nr_idle_scan", and the stale flags are
left as-is when sd->shared is published. The expectation is for the
first load balancing instance to correct the value, just like the current
behavior, except that the initial value is no longer 0.
Originally-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Andrea Righi <arighi@nvidia.com>
| -rw-r--r-- | include/linux/sched/topology.h | 16 | ||||
| -rw-r--r-- | kernel/sched/topology.c | 63 |
2 files changed, 69 insertions, 10 deletions
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index fe09d3268bc9..b5d9d7c2b8ad 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -67,7 +67,21 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; - int nr_idle_scan; + union { + int nr_idle_scan; + /* + * Used during allocation to claim the sched_domain_shared + * object at multiple levels. + * + * Note: between build and the first periodic LB tick, which + * rewrites the union via update_idle_cpu_scan(), readers of + * nr_idle_scan may observe the transient SD_* flag value as + * the scan bound. The flag bits are small positive integers, + * so the effect is just a slightly relaxed scan bound for one + * window and self-heals on the first tick. + */ + int alloc_flags; + }; #ifdef CONFIG_SCHED_CACHE unsigned long util_avg; unsigned long capacity; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dbfd9657f897..df2ceb54c970 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -623,6 +623,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) } while (sg != first); } +static void free_sched_domain_shared(struct sched_domain_shared *sds) +{ + if (sds && atomic_dec_and_test(&sds->ref)) + kfree(sds); +} + static void destroy_sched_domain(struct sched_domain *sd) { /* @@ -631,9 +637,7 @@ static void destroy_sched_domain(struct sched_domain *sd) * dropping group/capacity references, freeing where none remain. */ free_sched_groups(sd->groups, 1); - - if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) - kfree(sd->shared); + free_sched_domain_shared(sd->shared); #ifdef CONFIG_SCHED_CACHE /* only the bottom sd has llc_counts array */ @@ -755,7 +759,14 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* Pick reference to parent->shared. 
*/ if (parent->shared) { - WARN_ON_ONCE(tmp->shared); + /* + * It is safe to free a sd->shared that + * has not been published yet. If a + * sd->shared was published, the refcount + * will end up being non-zero and it will + * not be freed here. + */ + free_sched_domain_shared(tmp->shared); tmp->shared = parent->shared; parent->shared = NULL; } @@ -2916,11 +2927,45 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc) } } -static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd) +static void +init_sched_domain_shared(struct s_data *d, struct sched_domain *sd, int flags) { - int sd_id = cpumask_first(sched_domain_span(sd)); + struct sched_domain_shared *sds = NULL; + int cpu; + + /* + * Multiple domains can try to claim a shared object like + * SD_ASYM_CPUCAPACITY and SD_SHARE_LLC which can alias to + * same cpumask_first(sched_domain_span(sd)) CPU and can + * cause "nr_idle_scan" to be populated incorrectly during + * load balancing. + * + * Find the first CPU in sched_domain_span(sd) with an + * unclaimed domain (!alloc_flags) or where the alloc_flag + * matches the requested flag (SD_* flag) + * + * If the domain only has single CPU, allow temporary overlap + * in allocation since the domains will be degenerated later. + */ + for_each_cpu(cpu, sched_domain_span(sd)) { + sds = *per_cpu_ptr(d->sds, cpu); + + if (!sds->alloc_flags || + sd->span_weight == 1 || + sds->alloc_flags == flags) { + sds->alloc_flags = flags; + sd->shared = sds; + break; + } + } + + /* + * Use the sd_shared corresponding to the last + * CPU in the span if none are available. 
+ */ + if (WARN_ON_ONCE(!sd->shared)) + sd->shared = sds; - sd->shared = *per_cpu_ptr(d->sds, sd_id); /* * nr_busy_cpus is consumed only by the NOHZ kick path via * sd_balance_shared; on the asym-capacity path it is initialized but @@ -2960,7 +3005,7 @@ static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu) if (!sd_asym || (sd_asym->flags & SD_NUMA)) return false; - init_sched_domain_shared(d, sd_asym); + init_sched_domain_shared(d, sd_asym, SD_ASYM_CPUCAPACITY); return true; } @@ -3115,7 +3160,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = sd->parent; if (sd->flags & SD_SHARE_LLC) { - init_sched_domain_shared(&d, sd); + init_sched_domain_shared(&d, sd, SD_SHARE_LLC); /* * In presence of higher domains, adjust the |
