diff options
| author | K Prateek Nayak <kprateek.nayak@amd.com> | 2026-06-02 05:00:01 +0000 |
|---|---|---|
| committer | Peter Zijlstra <peterz@infradead.org> | 2026-06-02 12:26:11 +0200 |
| commit | 1abbecd1d2d2fdd96e52f541f07ee2b163631bee (patch) | |
| tree | b7503f863a5751d08cc555cbc243f19a266d9b84 | |
| parent | b8fea7af0e40feb6d9cbbd60b66ff0ec265e868f (diff) | |
sched/fair: Convert cfs bandwidth throttling to use guards
Routine conversion of rcu_read_lock(), spin_lock*, and rq_lock usage
within the cfs bandwidth controller to use class guards.
Only notable changes are:
- Checking for "cfs_rq->runtime_remaining <= 0" instead of the inverse
to spot a throttle and break early. This also saves the need
for extra indentation in the unthrottle case.
- Reordering of list_del_rcu() against throttled_clock indicator update
in unthrottle_cfs_rq(). Both are done with "cfs_b->lock" held after
the "cfs_rq->throttled" is cleared which make the reordering safe
against concurrent list modifications.
No functional changes intended.
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ben Segall <bsegall@google.com>
Tested-by: Aaron Lu <ziqianlu@bytedance.com>
Link: https://patch.msgid.link/20260602050005.11160-2-kprateek.nayak@amd.com
| -rw-r--r-- | kernel/sched/fair.c | 193 |
1 files changed, 90 insertions, 103 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1d4ed883e630..261e5cedc717 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5035,13 +5035,13 @@ static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq) */ rq_clock_start_loop_update(rq); - rcu_read_lock(); + guard(rcu)(); + list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); clear_tg_load_avg(cfs_rq); } - rcu_read_unlock(); rq_clock_stop_loop_update(rq); } @@ -6540,13 +6540,10 @@ static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - int ret; - raw_spin_lock(&cfs_b->lock); - ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); - raw_spin_unlock(&cfs_b->lock); + guard(raw_spinlock)(&cfs_b->lock); - return ret; + return __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); } static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) @@ -6835,33 +6832,32 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - int dequeue = 1; - raw_spin_lock(&cfs_b->lock); - /* This will start the period timer if necessary */ - if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) { + scoped_guard(raw_spinlock, &cfs_b->lock) { /* - * We have raced with bandwidth becoming available, and if we - * actually throttled the timer might not unthrottle us for an - * entire period. We additionally needed to make sure that any - * subsequent check_cfs_rq_runtime calls agree not to throttle - * us, as we may commit to do cfs put_prev+pick_next, so we ask - * for 1ns of runtime rather than just check cfs_b. + * Check if We have raced with bandwidth becoming available. If + * we actually throttled the timer might not unthrottle us for + * an entire period. We additionally needed to make sure that + * any subsequent check_cfs_rq_runtime calls agree not to + * throttle us, as we may commit to do cfs put_prev+pick_next, + * so we ask for 1ns of runtime rather than just check cfs_b. + * + * This will start the period timer if necessary. + */ + if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) + return false; + + /* + * No bandwidth available; Add ourselves on the list to be + * unthrottled later. */ - dequeue = 0; - } else { list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); } - raw_spin_unlock(&cfs_b->lock); - - if (!dequeue) - return false; /* Throttle no longer required. */ /* freeze hierarchy runnable averages while throttled */ - rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); - rcu_read_unlock(); + scoped_guard(rcu) + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); /* * Note: distribution will already see us throttled via the @@ -6894,13 +6890,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq); - raw_spin_lock(&cfs_b->lock); - if (cfs_rq->throttled_clock) { + scoped_guard(raw_spinlock, &cfs_b->lock) { + list_del_rcu(&cfs_rq->throttled_list); + + if (!cfs_rq->throttled_clock) + break; + cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; cfs_rq->throttled_clock = 0; } - list_del_rcu(&cfs_rq->throttled_list); - raw_spin_unlock(&cfs_b->lock); /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); @@ -6929,9 +6927,8 @@ static void __cfsb_csd_unthrottle(void *arg) { struct cfs_rq *cursor, *tmp; struct rq *rq = arg; - struct rq_flags rf; - rq_lock(rq, &rf); + guard(rq_lock)(rq); /* * Iterating over the list can trigger several call to @@ -6948,7 +6945,7 @@ static void __cfsb_csd_unthrottle(void *arg) * race with group being freed in the window between removing it * from the list and advancing to the next entry in the list. */ - rcu_read_lock(); + guard(rcu)(); list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, throttled_csd_list) { @@ -6958,10 +6955,7 @@ static void __cfsb_csd_unthrottle(void *arg) unthrottle_cfs_rq(cursor); } - rcu_read_unlock(); - rq_clock_stop_loop_update(rq); - rq_unlock(rq, &rf); } static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) @@ -7001,11 +6995,11 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) u64 runtime, remaining = 1; bool throttled = false; struct cfs_rq *cfs_rq, *tmp; - struct rq_flags rf; struct rq *rq; LIST_HEAD(local_unthrottle); - rcu_read_lock(); + guard(rcu)(); + list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { rq = rq_of(cfs_rq); @@ -7015,65 +7009,63 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) break; } - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); + if (!cfs_rq_throttled(cfs_rq)) - goto next; + continue; /* Already queued for async unthrottle */ if (!list_empty(&cfs_rq->throttled_csd_list)) - goto next; + continue; /* By the above checks, this should never be true */ WARN_ON_ONCE(cfs_rq->runtime_remaining > 0); - raw_spin_lock(&cfs_b->lock); - runtime = -cfs_rq->runtime_remaining + 1; - if (runtime > cfs_b->runtime) - runtime = cfs_b->runtime; - cfs_b->runtime -= runtime; - remaining = cfs_b->runtime; - raw_spin_unlock(&cfs_b->lock); + scoped_guard(raw_spinlock, &cfs_b->lock) { + runtime = -cfs_rq->runtime_remaining + 1; + if (runtime > cfs_b->runtime) + runtime = cfs_b->runtime; + cfs_b->runtime -= runtime; + remaining = cfs_b->runtime; + } cfs_rq->runtime_remaining += runtime; - /* we check whether we're throttled above */ - if (cfs_rq->runtime_remaining > 0) { - if (cpu_of(rq) != this_cpu) { - unthrottle_cfs_rq_async(cfs_rq); - } else { - /* - * We currently only expect to be unthrottling - * a single cfs_rq locally. - */ - WARN_ON_ONCE(!list_empty(&local_unthrottle)); - list_add_tail(&cfs_rq->throttled_csd_list, - &local_unthrottle); - } - } else { + /* + * Ran out of bandwidth during distribution! + * Indicate throttled entities and break early. + */ + if (cfs_rq->runtime_remaining <= 0) { throttled = true; + break; } -next: - rq_unlock_irqrestore(rq, &rf); + /* we check whether we're throttled above */ + if (cpu_of(rq) != this_cpu) { + unthrottle_cfs_rq_async(cfs_rq); + continue; + } + + /* + * We currently only expect to be unthrottling + * a single cfs_rq locally. + */ + WARN_ON_ONCE(!list_empty(&local_unthrottle)); + list_add_tail(&cfs_rq->throttled_csd_list, &local_unthrottle); } list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle, throttled_csd_list) { struct rq *rq = rq_of(cfs_rq); - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); list_del_init(&cfs_rq->throttled_csd_list); - if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); - - rq_unlock_irqrestore(rq, &rf); } WARN_ON_ONCE(!list_empty(&local_unthrottle)); - rcu_read_unlock(); - return throttled; } @@ -7196,7 +7188,8 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (slack_runtime <= 0) return; - raw_spin_lock(&cfs_b->lock); + guard(raw_spinlock)(&cfs_b->lock); + if (cfs_b->quota != RUNTIME_INF) { cfs_b->runtime += slack_runtime; @@ -7205,7 +7198,6 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) !list_empty(&cfs_b->throttled_cfs_rq)) start_cfs_slack_bandwidth(cfs_b); } - raw_spin_unlock(&cfs_b->lock); /* even if it's not valid for return we don't want to try again */ cfs_rq->runtime_remaining -= slack_runtime; @@ -7228,25 +7220,21 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) */ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { - u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); - unsigned long flags; - /* confirm we're still not at a refresh boundary */ - raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->slack_started = false; + scoped_guard(raw_spinlock_irqsave, &cfs_b->lock) { + u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); - if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); - return; - } + cfs_b->slack_started = false; - if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) - runtime = cfs_b->runtime; + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) + return; - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) + runtime = cfs_b->runtime; - if (!runtime) - return; + if (!runtime) + return; + } distribute_cfs_runtime(cfs_b); } @@ -7335,18 +7323,18 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); - unsigned long flags; int overrun; int idle = 0; int count = 0; - raw_spin_lock_irqsave(&cfs_b->lock, flags); + CLASS(raw_spinlock_irqsave, cfsb_guard)(&cfs_b->lock); + for (;;) { overrun = hrtimer_forward_now(timer, cfs_b->period); if (!overrun) break; - idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); + idle = do_sched_cfs_period_timer(cfs_b, overrun, cfsb_guard.flags); if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); @@ -7379,11 +7367,13 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) count = 0; } } - if (idle) + + if (idle) { cfs_b->period_active = 0; - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); + return HRTIMER_NORESTART; + } - return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; + return HRTIMER_RESTART; } void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) @@ -7450,14 +7440,12 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) */ for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); - unsigned long flags; if (list_empty(&rq->cfsb_csd_list)) continue; - local_irq_save(flags); - __cfsb_csd_unthrottle(rq); - local_irq_restore(flags); + scoped_guard(irqsave) + __cfsb_csd_unthrottle(rq); } } @@ -7475,16 +7463,15 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq) lockdep_assert_rq_held(rq); - rcu_read_lock(); + guard(rcu)(); + list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); - raw_spin_lock(&cfs_b->lock); - cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; - raw_spin_unlock(&cfs_b->lock); + scoped_guard(raw_spinlock, &cfs_b->lock) + cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; } - rcu_read_unlock(); } /* cpu offline callback */ @@ -7505,7 +7492,8 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) */ rq_clock_start_loop_update(rq); - rcu_read_lock(); + guard(rcu)(); + list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); @@ -7528,7 +7516,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) cfs_rq->runtime_remaining = 1; unthrottle_cfs_rq(cfs_rq); } - rcu_read_unlock(); rq_clock_stop_loop_update(rq); } |
