diff options
| author | Zecheng Li <zecheng@google.com> | 2026-05-22 10:15:50 -0400 |
|---|---|---|
| committer | Peter Zijlstra <peterz@infradead.org> | 2026-06-02 12:26:11 +0200 |
| commit | b8fea7af0e40feb6d9cbbd60b66ff0ec265e868f (patch) | |
| tree | a78769ec4fdaea135d1a78d05199f175f51fd3b5 | |
| parent | 89e1f67186baca353b68115bb98bd0bfed9f80c8 (diff) | |
sched/fair: Allocate cfs_tg_state with percpu allocator
To remove the cfs_rq pointer array in task_group, allocate the combined
cfs_rq and sched_entity using the per-cpu allocator.
This patch implements the following:
- Changes task_group->cfs_rq from 'struct cfs_rq **' to
'struct cfs_rq __percpu *'.
- Updates memory allocation in alloc_fair_sched_group() and
free_fair_sched_group() to use alloc_percpu_gfp() and free_percpu()
respectively.
- Adds the inline accessor tg_cfs_rq(tg, cpu), which uses per_cpu_ptr() to
retrieve the pointer to cfs_rq for the given task group and CPU.
- Replaces direct tg->cfs_rq[cpu] accesses with calls to the new
tg_cfs_rq(tg, cpu) helper.
- Handles the root_task_group: since struct rq is already a per-cpu variable
(runqueues), its embedded cfs_rq (rq->cfs) is also per-cpu. Therefore, we
assign root_task_group.cfs_rq = &runqueues.cfs.
- Cleans up the code that initializes the root task group.
This change places each CPU's cfs_rq and sched_entity in its local per-cpu
memory area to remove the per-task_group pointer arrays.
Signed-off-by: Zecheng Li <zecheng@google.com>
Signed-off-by: Zecheng Li <zli94@ncsu.edu>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Reviewed-by: Josh Don <joshdon@google.com>
Link: https://patch.msgid.link/20260522141623.600235-4-zli94@ncsu.edu
| -rw-r--r-- | kernel/sched/core.c | 35 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 54 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 14 |
3 files changed, 45 insertions, 58 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 39cea012c230..dd031410ab1a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8907,7 +8907,7 @@ static struct kmem_cache *task_group_cache __ro_after_init; void __init sched_init(void) { - unsigned long ptr = 0; + unsigned long __maybe_unused ptr = 0; int i; /* Make sure the linker didn't screw up */ @@ -8923,33 +8923,24 @@ void __init sched_init(void) wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED - ptr += nr_cpu_ids * sizeof(void **); -#endif -#ifdef CONFIG_RT_GROUP_SCHED - ptr += 2 * nr_cpu_ids * sizeof(void **); -#endif - if (ptr) { - ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); + root_task_group.cfs_rq = &runqueues.cfs; -#ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.cfs_rq = (struct cfs_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.shares = ROOT_TASK_GROUP_LOAD; - init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_EXT_GROUP_SCHED - scx_tg_init(&root_task_group); + scx_tg_init(&root_task_group); #endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); + ptr += 2 * nr_cpu_ids * sizeof(void **); + ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); - root_task_group.rt_rq = (struct rt_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); + root_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_RT_GROUP_SCHED */ - } init_defrootdomain(); @@ -9864,7 +9855,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, } for_each_online_cpu(i) { - struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, i); struct rq *rq = cfs_rq->rq; 
guard(rq_lock_irq)(rq); @@ -10032,7 +10023,7 @@ static u64 throttled_time_self(struct task_group *tg) u64 total = 0; for_each_possible_cpu(i) { - total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time); + total += READ_ONCE(tg_cfs_rq(tg, i)->throttled_clock_self_time); } return total; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 447b0ac426d1..1d4ed883e630 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -334,7 +334,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) * to a tree or when we reach the top of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + tg_cfs_rq(cfs_rq->tg->parent, cpu)->on_list) { /* * If parent is already on the list, we add the child * just before. Thanks to circular linked property of @@ -342,7 +342,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) * of the list that starts by parent. */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + &(tg_cfs_rq(cfs_rq->tg->parent, cpu)->leaf_cfs_rq_list)); /* * The branch is now connected to its tree so we can * reset tmp_alone_branch to the beginning of the @@ -5037,7 +5037,7 @@ static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq) rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); clear_tg_load_avg(cfs_rq); } @@ -6594,7 +6594,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) { - return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]); + return throttled_hierarchy(tg_cfs_rq(task_group(p), dst_cpu)); } static inline bool task_is_throttled(struct task_struct *p) @@ -6740,7 +6740,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags); static int tg_unthrottle_up(struct task_group *tg, void *data) { 
struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); struct task_struct *p, *tmp; if (--cfs_rq->throttle_count) @@ -6811,7 +6811,7 @@ static void record_throttle_clock(struct cfs_rq *cfs_rq) static int tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); if (cfs_rq->throttle_count++) return 0; @@ -7285,8 +7285,8 @@ static void sync_throttle(struct task_group *tg, int cpu) if (!tg->parent) return; - cfs_rq = tg->cfs_rq[cpu]; - pcfs_rq = tg->parent->cfs_rq[cpu]; + cfs_rq = tg_cfs_rq(tg, cpu); + pcfs_rq = tg_cfs_rq(tg->parent, cpu); cfs_rq->throttle_count = pcfs_rq->throttle_count; cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); @@ -7478,7 +7478,7 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq) rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); raw_spin_lock(&cfs_b->lock); cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; @@ -7507,7 +7507,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); if (!cfs_rq->runtime_enabled) continue; @@ -10382,7 +10382,7 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ struct cfs_rq *dst_cfs_rq; #ifdef CONFIG_FAIR_GROUP_SCHED - dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu]; + dst_cfs_rq = tg_cfs_rq(task_group(p), dest_cpu); #else dst_cfs_rq = &cpu_rq(dest_cpu)->cfs; #endif @@ -14812,7 +14812,7 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) struct cfs_rq *cfs_rq; #ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq 
= task_group(p)->cfs_rq[cpu]; + cfs_rq = tg_cfs_rq(task_group(p), cpu); #else cfs_rq = &cpu_rq(cpu)->cfs; #endif @@ -15076,39 +15076,31 @@ static void task_change_group_fair(struct task_struct *p) void free_fair_sched_group(struct task_group *tg) { - int i; - - for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - } - - kfree(tg->cfs_rq); + free_percpu(tg->cfs_rq); } int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_tg_state *state; + struct cfs_tg_state __percpu *state; struct sched_entity *se; struct cfs_rq *cfs_rq; int i; - tg->cfs_rq = kzalloc_objs(cfs_rq, nr_cpu_ids); - if (!tg->cfs_rq) + state = alloc_percpu_gfp(struct cfs_tg_state, GFP_KERNEL); + if (!state) goto err; + tg->cfs_rq = &state->cfs_rq; tg->shares = NICE_0_LOAD; init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent)); for_each_possible_cpu(i) { - state = kzalloc_node(sizeof(*state), - GFP_KERNEL, cpu_to_node(i)); - if (!state) + cfs_rq = tg_cfs_rq(tg, i); + if (!cfs_rq) goto err; - cfs_rq = &state->cfs_rq; - se = &state->se; + se = tg_se(tg, i); init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, tg_se(parent, i)); init_entity_runnable_average(se); @@ -15145,7 +15137,7 @@ void unregister_fair_sched_group(struct task_group *tg) destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(cpu) { - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu); struct sched_entity *se = tg_se(tg, cpu); struct rq *rq = cpu_rq(cpu); @@ -15182,8 +15174,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, cfs_rq->rq = rq; init_cfs_rq_runtime(cfs_rq); - tg->cfs_rq[cpu] = cfs_rq; - /* se could be NULL for root_task_group */ if (!se) return; @@ -15276,7 +15266,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); struct sched_entity *se = tg_se(tg, i); - struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; + struct cfs_rq 
*grp_cfs_rq = tg_cfs_rq(tg, i); bool was_idle = cfs_rq_is_idle(grp_cfs_rq); long idle_task_delta; struct rq_flags rf; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 823ba40cf098..c7c2dea65edd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -485,7 +485,7 @@ struct task_group { #ifdef CONFIG_FAIR_GROUP_SCHED /* runqueue "owned" by this group on each CPU */ - struct cfs_rq **cfs_rq; + struct cfs_rq __percpu *cfs_rq; unsigned long shares; /* * load_avg can be heavily contended at clock tick time, so put @@ -2304,6 +2304,12 @@ struct cfs_tg_state { struct sched_statistics stats; } __no_randomize_layout; +/* Access a specific CPU's cfs_rq from a task group */ +static inline struct cfs_rq *tg_cfs_rq(struct task_group *tg, int cpu) +{ + return per_cpu_ptr(tg->cfs_rq, cpu); +} + static inline struct sched_entity *tg_se(struct task_group *tg, int cpu) { struct cfs_tg_state *state; @@ -2311,7 +2317,7 @@ static inline struct sched_entity *tg_se(struct task_group *tg, int cpu) if (is_root_task_group(tg)) return NULL; - state = container_of(tg->cfs_rq[cpu], struct cfs_tg_state, cfs_rq); + state = container_of(tg_cfs_rq(tg, cpu), struct cfs_tg_state, cfs_rq); return &state->se; } @@ -2335,8 +2341,8 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_FAIR_GROUP_SCHED - set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); - p->se.cfs_rq = tg->cfs_rq[cpu]; + set_task_rq_fair(&p->se, p->se.cfs_rq, tg_cfs_rq(tg, cpu)); + p->se.cfs_rq = tg_cfs_rq(tg, cpu); p->se.parent = tg_se(tg, cpu); p->se.depth = p->se.parent ? p->se.parent->depth + 1 : 0; #endif |
