| author | Tejun Heo <tj@kernel.org> | 2026-03-09 09:59:36 -1000 |
|---|---|---|
| committer | Tejun Heo <tj@kernel.org> | 2026-03-09 09:59:36 -1000 |
| commit | 0e7cd9cef61fde36ebfb653fe9e7a9722185cb57 (patch) | |
| tree | 0eef127fa5f0bda6d0729a47ce5d3038c849b695 /kernel | |
| parent | bec10581e92289bdc3eed17e90200900ddb00594 (diff) | |
| parent | 54a66e431eeacf23e1dc47cb3507f2d0c068aaf0 (diff) | |
Merge branch 'sched/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into for-7.1
Pull sched/core to resolve conflicts between:
c2a57380df9dd ("sched: Replace use of system_unbound_wq with system_dfl_wq")
from the tip tree and commit:
cde94c032b32b ("sched_ext: Make watchdog sub-sched aware")
The latter moves around code modified by the former. Apply the changes in
the new locations.
Signed-off-by: Tejun Heo <tj@kernel.org>
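For readers not following both trees, the collision is purely mechanical: one side renames the workqueue used at a call site, the other side moves that call site, and the resolution re-applies the rename at the moved location. Below is a toy, self-contained model of that pattern; every type and name in it is an illustrative stand-in (the real code uses the kernel's struct workqueue_struct and queue_delayed_work()), not the kernel's API.

```c
#include <stdio.h>

/* Stand-in for the kernel's workqueue type. */
struct toy_wq { const char *name; };

/* tip side: the default unbound workqueue was renamed to system_dfl_wq. */
static struct toy_wq system_dfl_wq = { "system_dfl_wq" };

static void toy_queue_delayed_work(struct toy_wq *wq, const char *work,
				   unsigned long delay)
{
	printf("queued %s on %s (delay=%lu)\n", work, wq->name, delay);
}

/*
 * sched_ext side: the requeue moved into a new helper; the merge applies
 * the rename at this new location, which is the whole conflict resolution.
 */
static void scx_watchdog_requeue(unsigned long intv)
{
	toy_queue_delayed_work(&system_dfl_wq, "scx_watchdog_work", intv);
}

int main(void)
{
	scx_watchdog_requeue(30);
	return 0;
}
```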
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/sched/core.c | 9 |
| -rw-r--r-- | kernel/sched/deadline.c | 23 |
| -rw-r--r-- | kernel/sched/debug.c | 14 |
| -rw-r--r-- | kernel/sched/ext.c | 5 |
| -rw-r--r-- | kernel/sched/fair.c | 327 |
| -rw-r--r-- | kernel/sched/features.h | 3 |
| -rw-r--r-- | kernel/sched/rt.c | 7 |
| -rw-r--r-- | kernel/sched/sched.h | 14 |
| -rw-r--r-- | kernel/sched/syscalls.c | 16 |
| -rw-r--r-- | kernel/sched/topology.c | 3 |
10 files changed, 333 insertions, 88 deletions
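The bulk of the fair.c changes below replace scale_load_down() in the weighted-average-vruntime bookkeeping with a per-cfs_rq down-shift (sum_shift) that only kicks in when \Sum key_i * w_i would overflow s64. The standalone C sketch below (userspace, illustrative names, not the kernel code) shows the shape of the technique: try to extend the running sum, and on overflow bump the shared shift and rebuild the sum from all queued entities, precisely because \Sum k_i*(w_i >> 1) != (\Sum k_i*w_i) >> 1.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Running weighted sum with a shared down-shift, rebuilt on overflow. */
struct avg_sum {
	int64_t  sum_w_key;	/* \Sum key_i * (w_i >> shift) */
	uint64_t sum_weight;	/* \Sum (w_i >> shift) */
	unsigned shift;		/* shared weight down-shift */
};

struct entity { int64_t key; uint64_t weight; };

/* Mirror of max(2UL, w >> sum_shift): never let a weight shift to zero. */
static uint64_t eff_weight(const struct avg_sum *s, uint64_t w)
{
	if (s->shift)
		w = (w >> s->shift) > 2 ? (w >> s->shift) : 2;
	return w;
}

static bool try_add(struct avg_sum *s, const struct entity *e)
{
	uint64_t w = eff_weight(s, e->weight);
	int64_t term;

	if (__builtin_mul_overflow(e->key, (int64_t)w, &term) ||
	    __builtin_add_overflow(s->sum_w_key, term, &s->sum_w_key))
		return false;	/* caller must rebuild; sum may be stale */
	s->sum_weight += w;
	return true;
}

/*
 * On overflow, increase the shift and recompute from scratch: the
 * halved-weight sum cannot be derived from the old sum.
 */
static void recompute(struct avg_sum *s, const struct entity *e, int n)
{
	for (;;) {
		bool ok = true;

		s->sum_w_key = 0;
		s->sum_weight = 0;
		for (int i = 0; i < n && ok; i++)
			ok = try_add(s, &e[i]);
		if (ok)
			return;
		s->shift++;
	}
}

int main(void)
{
	struct entity e[] = {
		{ .key = (int64_t)1 << 55, .weight = 1024 },
		{ .key = (int64_t)1 << 55, .weight = 1024 },
	};
	struct avg_sum s = { 0 };

	recompute(&s, e, 2);	/* products overflow; shift kicks in */
	printf("shift=%u avg=%lld\n", s.shift,
	       (long long)(s.sum_w_key / (int64_t)s.sum_weight));
	return 0;
}
```

The kernel variant additionally caps the shift (BUG_ON at 10) and only takes the checked path behind the new PARANOID_AVG scheduler feature.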
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dfe680e78be3..ec2a77f8879d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -687,11 +687,6 @@ bool raw_spin_rq_trylock(struct rq *rq)
 	}
 }
 
-void raw_spin_rq_unlock(struct rq *rq)
-{
-	raw_spin_unlock(rq_lockp(rq));
-}
-
 /*
  * double_rq_lock - safely lock two runqueues
  */
@@ -5678,7 +5673,7 @@ static void sched_tick_remote(struct work_struct *work)
 	os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
 	WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
 	if (os == TICK_SCHED_REMOTE_RUNNING)
-		queue_delayed_work(system_unbound_wq, dwork, HZ);
+		queue_delayed_work(system_dfl_wq, dwork, HZ);
 }
 
 static void sched_tick_start(int cpu)
@@ -5697,7 +5692,7 @@ static void sched_tick_start(int cpu)
 	if (os == TICK_SCHED_REMOTE_OFFLINE) {
 		twork->cpu = cpu;
 		INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
-		queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+		queue_delayed_work(system_dfl_wq, &twork->work, HZ);
 	}
 }
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d08b00429323..9e253a825f39 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2142,10 +2142,14 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
 			int flags)
 {
 	struct task_struct *p = dl_task_of(dl_se);
+	struct rq *rq = rq_of_dl_rq(dl_rq);
 
 	if (!schedstat_enabled())
 		return;
 
+	if (p != rq->curr)
+		update_stats_wait_end_dl(dl_rq, dl_se);
+
 	if ((flags & DEQUEUE_SLEEP)) {
 		unsigned int state;
@@ -3613,13 +3617,26 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 	dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
 }
 
-void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
+void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags)
 {
 	struct sched_dl_entity *dl_se = &p->dl;
+	struct rq *rq = task_rq(p);
+	u64 adj_deadline;
 
 	attr->sched_priority = p->rt_priority;
-	attr->sched_runtime = dl_se->dl_runtime;
-	attr->sched_deadline = dl_se->dl_deadline;
+	if (flags & SCHED_GETATTR_FLAG_DL_DYNAMIC) {
+		guard(raw_spinlock_irq)(&rq->__lock);
+		update_rq_clock(rq);
+		if (task_current(rq, p))
+			update_curr_dl(rq);
+
+		attr->sched_runtime = dl_se->runtime;
+		adj_deadline = dl_se->deadline - rq_clock(rq) + ktime_get_ns();
+		attr->sched_deadline = adj_deadline;
+	} else {
+		attr->sched_runtime = dl_se->dl_runtime;
+		attr->sched_deadline = dl_se->dl_deadline;
+	}
 	attr->sched_period = dl_se->dl_period;
 	attr->sched_flags &= ~SCHED_DL_FLAGS;
 	attr->sched_flags |= dl_se->flags;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index b24f40f05019..6246008c431e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,6 +8,7 @@
  */
 #include <linux/debugfs.h>
 #include <linux/nmi.h>
+#include <linux/log2.h>
 #include "sched.h"
 
 /*
@@ -901,10 +902,13 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
-	s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+	s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread;
+	s64 zero_vruntime = -1, sum_w_vruntime = -1;
 	struct sched_entity *last, *first, *root;
 	struct rq *rq = cpu_rq(cpu);
+	unsigned int sum_shift;
 	unsigned long flags;
+	u64 sum_weight;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	SEQ_printf(m, "\n");
@@ -925,6 +929,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	if (last)
 		right_vruntime = last->vruntime;
 	zero_vruntime = cfs_rq->zero_vruntime;
+	sum_w_vruntime = cfs_rq->sum_w_vruntime;
+	sum_weight = cfs_rq->sum_weight;
+	sum_shift = cfs_rq->sum_shift;
 	raw_spin_rq_unlock_irqrestore(rq, flags);
 
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "left_deadline",
@@ -933,6 +940,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			SPLIT_NS(left_vruntime));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "zero_vruntime",
 			SPLIT_NS(zero_vruntime));
+	SEQ_printf(m, "  .%-30s: %Ld (%d bits)\n", "sum_w_vruntime",
+			sum_w_vruntime, ilog2(abs(sum_w_vruntime)));
+	SEQ_printf(m, "  .%-30s: %Lu\n", "sum_weight",
+			sum_weight);
+	SEQ_printf(m, "  .%-30s: %u\n", "sum_shift", sum_shift);
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "avg_vruntime",
 			SPLIT_NS(avg_vruntime(cfs_rq)));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "right_vruntime",
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 43fda1258903..b35b98020f3b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3208,8 +3208,7 @@ static void scx_watchdog_workfn(struct work_struct *work)
 
 	intv = READ_ONCE(scx_watchdog_interval);
 	if (intv < ULONG_MAX)
-		queue_delayed_work(system_unbound_wq, to_delayed_work(work),
-				   intv);
+		queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv);
 }
 
 void scx_tick(struct rq *rq)
@@ -5233,7 +5232,7 @@ static void refresh_watchdog(void)
 
 	WRITE_ONCE(scx_watchdog_interval, intv);
 	if (intv < ULONG_MAX)
-		mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
+		mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv);
 	else
 		cancel_delayed_work_sync(&scx_watchdog_work);
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bf948db905ed..d57c02e82f3a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -225,6 +225,7 @@ void __init sched_init_granularity(void)
 	update_sysctl();
 }
 
+#ifndef CONFIG_64BIT
 #define WMULT_CONST	(~0U)
 #define WMULT_SHIFT	32
 
@@ -283,6 +284,12 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
 
 	return mul_u64_u32_shr(delta_exec, fact, shift);
 }
+#else
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
+{
+	return (delta_exec * weight) / lw->weight;
+}
+#endif
 
 /*
  * delta /= w
@@ -665,25 +672,83 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Since zero_vruntime closely tracks the per-task service, these
  * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag
  * induced in the system due to quantisation.
- *
- * Also, we use scale_load_down() to reduce the size.
- *
- * As measured, the max (key * weight) value was ~44 bits for a kernel build.
  */
+static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w)
+{
+#ifdef CONFIG_64BIT
+	if (cfs_rq->sum_shift)
+		w = max(2UL, w >> cfs_rq->sum_shift);
+#endif
+	return w;
+}
+
+static inline void
+__sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+	s64 w_vruntime, key = entity_key(cfs_rq, se);
+
+	w_vruntime = key * weight;
+	WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62));
+
+	cfs_rq->sum_w_vruntime += w_vruntime;
+	cfs_rq->sum_weight += weight;
+}
+
 static void
-sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	unsigned long weight = scale_load_down(se->load.weight);
-	s64 key = entity_key(cfs_rq, se);
+	unsigned long weight;
+	s64 key, tmp;
+
+again:
+	weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+	key = entity_key(cfs_rq, se);
+
+	if (check_mul_overflow(key, weight, &key))
+		goto overflow;
 
-	cfs_rq->sum_w_vruntime += key * weight;
+	if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp))
+		goto overflow;
+
+	cfs_rq->sum_w_vruntime = tmp;
 	cfs_rq->sum_weight += weight;
+	return;
+
+overflow:
+	/*
+	 * There's gotta be a limit -- if we're still failing at this point
+	 * there's really nothing much to be done about things.
+	 */
+	BUG_ON(cfs_rq->sum_shift >= 10);
+	cfs_rq->sum_shift++;
+
+	/*
+	 * Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1
+	 */
+	cfs_rq->sum_w_vruntime = 0;
+	cfs_rq->sum_weight = 0;
+
+	for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost;
+	     node; node = rb_next(node))
+		__sum_w_vruntime_add(cfs_rq, __node_2_se(node));
+
+	goto again;
+}
+
+static void
+sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (sched_feat(PARANOID_AVG))
+		return sum_w_vruntime_add_paranoid(cfs_rq, se);
+
+	__sum_w_vruntime_add(cfs_rq, se);
 }
 
 static void
 sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	unsigned long weight = scale_load_down(se->load.weight);
+	unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
 	s64 key = entity_key(cfs_rq, se);
 
 	cfs_rq->sum_w_vruntime -= key * weight;
@@ -725,7 +790,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 	s64 runtime = cfs_rq->sum_w_vruntime;
 
 	if (curr) {
-		unsigned long w = scale_load_down(curr->load.weight);
+		unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight);
 
 		runtime += entity_key(cfs_rq, curr) * w;
 		weight += w;
@@ -735,7 +800,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 		if (runtime < 0)
 			runtime -= (weight - 1);
 
-		delta = div_s64(runtime, weight);
+		delta = div64_long(runtime, weight);
 	} else if (curr) {
 		/*
 		 * When there is but one element, it is the average.
@@ -764,17 +829,22 @@ static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq);
  *
  * -r_max < lag < max(r_max, q)
  */
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avruntime)
 {
 	u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC;
 	s64 vlag, limit;
 
-	WARN_ON_ONCE(!se->on_rq);
-
-	vlag = avg_vruntime(cfs_rq) - se->vruntime;
+	vlag = avruntime - se->vruntime;
 	limit = calc_delta_fair(max_slice, se);
 
-	se->vlag = clamp(vlag, -limit, limit);
+	return clamp(vlag, -limit, limit);
+}
+
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	WARN_ON_ONCE(!se->on_rq);
+
+	se->vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq));
 }
 
 /*
@@ -801,7 +871,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
 	long load = cfs_rq->sum_weight;
 
 	if (curr && curr->on_rq) {
-		unsigned long weight = scale_load_down(curr->load.weight);
+		unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight);
 
 		avg += entity_key(cfs_rq, curr) * weight;
 		load += weight;
@@ -3840,23 +3910,125 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			se_weight(se) * -se->avg.load_sum);
 }
 
-static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
+static void
+rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot)
+{
+	unsigned long old_weight = se->load.weight;
+
+	/*
+	 * VRUNTIME
+	 * --------
+	 *
+	 * COROLLARY #1: The virtual runtime of the entity needs to be
+	 * adjusted if re-weight at !0-lag point.
+	 *
+	 * Proof: For contradiction assume this is not true, so we can
+	 * re-weight without changing vruntime at !0-lag point.
+	 *
+	 *          Weight    VRuntime    Avg-VRuntime
+	 *   before    w         v            V
+	 *    after    w'        v'           V'
+	 *
+	 * Since lag needs to be preserved through re-weight:
+	 *
+	 *	lag = (V - v)*w = (V'- v')*w', where v = v'
+	 *	==>	V' = (V - v)*w/w' + v		(1)
+	 *
+	 * Let W be the total weight of the entities before reweight,
+	 * since V' is the new weighted average of entities:
+	 *
+	 *	V' = (WV + w'v - wv) / (W + w' - w)	(2)
+	 *
+	 * by using (1) & (2) we obtain:
+	 *
+	 *	(WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+	 *	==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+	 *	==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+	 *	==> (V - v)*W/(W + w' - w) = (V - v)*w/w'	(3)
+	 *
+	 * Since we are doing at !0-lag point which means V != v, we
+	 * can simplify (3):
+	 *
+	 *	==> W / (W + w' - w) = w / w'
+	 *	==> Ww' = Ww + ww' - ww
+	 *	==> W * (w' - w) = w * (w' - w)
+	 *	==> W = w	(re-weight indicates w' != w)
+	 *
+	 * So the cfs_rq contains only one entity, hence vruntime of
+	 * the entity @v should always equal to the cfs_rq's weighted
+	 * average vruntime @V, which means we will always re-weight
+	 * at 0-lag point, thus breach assumption. Proof completed.
+	 *
+	 *
+	 * COROLLARY #2: Re-weight does NOT affect weighted average
+	 * vruntime of all the entities.
+	 *
+	 * Proof: According to corollary #1, Eq. (1) should be:
+	 *
+	 *	(V - v)*w = (V' - v')*w'
+	 *	==>	v' = V' - (V - v)*w/w'		(4)
+	 *
+	 * According to the weighted average formula, we have:
+	 *
+	 *	V' = (WV - wv + w'v') / (W - w + w')
+	 *	   = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+	 *	   = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+	 *	   = (WV + w'V' - Vw) / (W - w + w')
+	 *
+	 *	==>	V'*(W - w + w') = WV + w'V' - Vw
+	 *	==>	V' * (W - w) = (W - w) * V	(5)
+	 *
+	 * If the entity is the only one in the cfs_rq, then reweight
+	 * always occurs at 0-lag point, so V won't change. Or else
+	 * there are other entities, hence W != w, then Eq. (5) turns
+	 * into V' = V. So V won't change in either case, proof done.
+	 *
+	 *
+	 * So according to corollary #1 & #2, the effect of re-weight
+	 * on vruntime should be:
+	 *
+	 *	v' = V' - (V - v) * w / w'		(4)
+	 *	   = V  - (V - v) * w / w'
+	 *	   = V  - vl * w / w'
+	 *	   = V  - vl'
+	 */
+	se->vlag = div64_long(se->vlag * old_weight, weight);
+
+	/*
+	 * DEADLINE
+	 * --------
+	 *
+	 * When the weight changes, the virtual time slope changes and
+	 * we should adjust the relative virtual deadline accordingly.
+	 *
+	 *	d' = v' + (d - v)*w/w'
+	 *	   = V' - (V - v)*w/w' + (d - v)*w/w'
+	 *	   = V  - (V - v)*w/w' + (d - v)*w/w'
+	 *	   = V  + (d - V)*w/w'
+	 */
+	if (se->rel_deadline)
+		se->deadline = div64_long(se->deadline * old_weight, weight);
+
+	if (rel_vprot)
+		se->vprot = div64_long(se->vprot * old_weight, weight);
+}
 
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
 	bool curr = cfs_rq->curr == se;
 	bool rel_vprot = false;
-	u64 vprot;
+	u64 avruntime = 0;
 
 	if (se->on_rq) {
 		/* commit outstanding execution time */
 		update_curr(cfs_rq);
-		update_entity_lag(cfs_rq, se);
-		se->deadline -= se->vruntime;
+		avruntime = avg_vruntime(cfs_rq);
+		se->vlag = entity_lag(cfs_rq, se, avruntime);
+		se->deadline -= avruntime;
 		se->rel_deadline = 1;
 
 		if (curr && protect_slice(se)) {
-			vprot = se->vprot - se->vruntime;
+			se->vprot -= avruntime;
 			rel_vprot = true;
 		}
 
@@ -3867,30 +4039,23 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	}
 
 	dequeue_load_avg(cfs_rq, se);
-	/*
-	 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
-	 * we need to scale se->vlag when w_i changes.
-	 */
-	se->vlag = div_s64(se->vlag * se->load.weight, weight);
-	if (se->rel_deadline)
-		se->deadline = div_s64(se->deadline * se->load.weight, weight);
-
-	if (rel_vprot)
-		vprot = div_s64(vprot * se->load.weight, weight);
+
+	rescale_entity(se, weight, rel_vprot);
 
 	update_load_set(&se->load, weight);
 
 	do {
 		u32 divider = get_pelt_divider(&se->avg);
-
 		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
 	} while (0);
 
 	enqueue_load_avg(cfs_rq, se);
 
 	if (se->on_rq) {
-		place_entity(cfs_rq, se, 0);
 		if (rel_vprot)
-			se->vprot = se->vruntime + vprot;
+			se->vprot += avruntime;
+		se->deadline += avruntime;
+		se->rel_deadline = 0;
+		se->vruntime = avruntime - se->vlag;
+
 		update_load_add(&cfs_rq->load, se->load.weight);
 		if (!curr)
 			__enqueue_entity(cfs_rq, se);
@@ -5180,7 +5345,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
 		struct sched_entity *curr = cfs_rq->curr;
-		unsigned long load;
+		long load;
 
 		lag = se->vlag;
 
@@ -5238,17 +5403,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		 */
 		load = cfs_rq->sum_weight;
 		if (curr && curr->on_rq)
-			load += scale_load_down(curr->load.weight);
+			load += avg_vruntime_weight(cfs_rq, curr->load.weight);
 
-		lag *= load + scale_load_down(se->load.weight);
+		lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight);
 		if (WARN_ON_ONCE(!load))
 			load = 1;
-		lag = div_s64(lag, load);
+		lag = div64_long(lag, load);
 	}
 
 	se->vruntime = vruntime - lag;
 
-	if (se->rel_deadline) {
+	if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
 		se->deadline += se->vruntime;
 		se->rel_deadline = 0;
 		return;
@@ -6853,16 +7018,15 @@ static inline void hrtick_update(struct rq *rq)
 
 static inline bool cpu_overutilized(int cpu)
 {
-	unsigned long rq_util_min, rq_util_max;
+	unsigned long rq_util_max;
 
 	if (!sched_energy_enabled())
 		return false;
 
-	rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
 	rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
 
 	/* Return true only if the utilization doesn't fit CPU's capacity */
-	return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
+	return !util_fits_cpu(cpu_util_cfs(cpu), 0, rq_util_max, cpu);
 }
 
 /*
@@ -6900,9 +7064,15 @@ static int sched_idle_rq(struct rq *rq)
 			rq->nr_running);
 }
 
-static int sched_idle_cpu(int cpu)
+static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
+{
+	return sched_idle_rq(rq) && !task_has_idle_policy(p);
+}
+
+static int choose_idle_cpu(int cpu, struct task_struct *p)
 {
-	return sched_idle_rq(cpu_rq(cpu));
+	return available_idle_cpu(cpu) ||
+	       choose_sched_idle_rq(cpu_rq(cpu), p);
 }
 
 static void
@@ -7467,7 +7637,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
 			if (!sched_core_cookie_match(rq, p))
 				continue;
 
-			if (sched_idle_cpu(i))
+			if (choose_sched_idle_rq(rq, p))
 				return i;
 
 			if (available_idle_cpu(i)) {
@@ -7558,8 +7728,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
 
 static inline int __select_idle_cpu(int cpu, struct task_struct *p)
 {
-	if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
-	    sched_cpu_cookie_match(cpu_rq(cpu), p))
+	if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
 		return cpu;
 
 	return -1;
@@ -7632,7 +7801,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
 		if (!available_idle_cpu(cpu)) {
 			idle = false;
 			if (*idle_cpu == -1) {
-				if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
+				if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
+				    cpumask_test_cpu(cpu, cpus)) {
 					*idle_cpu = cpu;
 					break;
 				}
@@ -7667,7 +7837,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 		 */
 		if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
 			continue;
-		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+		if (choose_idle_cpu(cpu, p))
 			return cpu;
 	}
 
@@ -7789,7 +7959,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 	for_each_cpu_wrap(cpu, cpus, target) {
 		unsigned long cpu_cap = capacity_of(cpu);
 
-		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+		if (!choose_idle_cpu(cpu, p))
 			continue;
 
 		fits = util_fits_cpu(task_util, util_min, util_max, cpu);
@@ -7860,7 +8030,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 */
 	lockdep_assert_irqs_disabled();
 
-	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+	if (choose_idle_cpu(target, p) &&
 	    asym_fits_cpu(task_util, util_min, util_max, target))
 		return target;
 
@@ -7868,7 +8038,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
 	if (prev != target && cpus_share_cache(prev, target) &&
-	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+	    choose_idle_cpu(prev, p) &&
 	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
 
 		if (!static_branch_unlikely(&sched_cluster_active) ||
@@ -7900,7 +8070,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
-	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+	    choose_idle_cpu(recent_used_cpu, p) &&
 	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
 	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
 
@@ -10047,6 +10217,7 @@ struct sg_lb_stats {
 	unsigned int group_asym_packing;	/* Tasks should be moved to preferred CPU */
 	unsigned int group_smt_balance;		/* Task on busy SMT be moved */
 	unsigned long group_misfit_task_load;	/* A CPU has a task too big for its capacity */
+	unsigned int group_overutilized;	/* At least one CPU is overutilized in the group */
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
@@ -10279,6 +10450,13 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 static inline bool
 group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
+	/*
+	 * With EAS and uclamp, 1 CPU in the group must be overutilized to
+	 * consider the group overloaded.
+	 */
+	if (sched_energy_enabled() && !sgs->group_overutilized)
+		return false;
+
 	if (sgs->sum_nr_running <= sgs->group_weight)
 		return false;
 
@@ -10462,14 +10640,12 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
 * @group: sched_group whose statistics are to be updated.
 * @sgs: variable to hold the statistics for this group.
 * @sg_overloaded: sched_group is overloaded
- * @sg_overutilized: sched_group is overutilized
 */
 static inline void update_sg_lb_stats(struct lb_env *env,
 				      struct sd_lb_stats *sds,
 				      struct sched_group *group,
 				      struct sg_lb_stats *sgs,
-				      bool *sg_overloaded,
-				      bool *sg_overutilized)
+				      bool *sg_overloaded)
 {
 	int i, nr_running, local_group, sd_flags = env->sd->flags;
 	bool balancing_at_rd = !env->sd->parent;
@@ -10491,7 +10667,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->sum_nr_running += nr_running;
 
 		if (cpu_overutilized(i))
-			*sg_overutilized = 1;
+			sgs->group_overutilized = 1;
 
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
@@ -11162,13 +11338,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 			update_group_capacity(env->sd, env->dst_cpu);
 		}
 
-		update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
+		update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded);
 
 		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
 			sds->busiest = sg;
 			sds->busiest_stat = *sgs;
 		}
 
+		sg_overutilized |= sgs->group_overutilized;
+
 		/* Now, start updating sd_lb_stats */
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
@@ -12289,7 +12467,30 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su
 	sd->newidle_success += success;
 
 	if (sd->newidle_call >= 1024) {
-		sd->newidle_ratio = sd->newidle_success;
+		u64 now = sched_clock();
+		s64 delta = now - sd->newidle_stamp;
+		sd->newidle_stamp = now;
+		int ratio = 0;
+
+		if (delta < 0)
+			delta = 0;
+
+		if (sched_feat(NI_RATE)) {
+			/*
+			 * ratio    delta     freq
+			 *
+			 *  1024 -    4 s -  128 Hz
+			 *   512 -    2 s -  256 Hz
+			 *   256 -    1 s -  512 Hz
+			 *   128 -   .5 s - 1024 Hz
+			 *    64 -  .25 s - 2048 Hz
			 */
+			ratio = delta >> 22;
+		}
+
+		ratio += sd->newidle_success;
+
+		sd->newidle_ratio = min(1024, ratio);
 		sd->newidle_call /= 2;
 		sd->newidle_success /= 2;
 	}
@@ -12336,7 +12537,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
 {
 	int continue_balancing = 1;
 	int cpu = rq->cpu;
-	int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+	int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
 	unsigned long interval;
 	struct sched_domain *sd;
 	/* Earliest time when we have to do rebalance again */
@@ -12374,7 +12575,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
 			 * state even if we migrated tasks. Update it.
 			 */
 			idle = idle_cpu(cpu);
-			busy = !idle && !sched_idle_cpu(cpu);
+			busy = !idle && !sched_idle_rq(rq);
 		}
 		sd->last_balance = jiffies;
 		interval = get_sd_balance_interval(sd, busy);
@@ -12996,7 +13197,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 			unsigned int weight = 1;
 
-			if (sched_feat(NI_RANDOM)) {
+			if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) {
 				/*
 				 * Throw a 1k sided dice; and only run
 				 * newidle_balance according to the success
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 136a6584be79..a25f97201ab9 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -58,6 +58,8 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
 SCHED_FEAT(DELAY_DEQUEUE, true)
 SCHED_FEAT(DELAY_ZERO, true)
 
+SCHED_FEAT(PARANOID_AVG, false)
+
 /*
  * Allow wakeup-time preemption of the current task:
  */
@@ -126,3 +128,4 @@ SCHED_FEAT(LATENCY_WARN, false)
  * Do newidle balancing proportional to its success rate using randomization.
  */
 SCHED_FEAT(NI_RANDOM, true)
+SCHED_FEAT(NI_RATE, true)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f69e1f16d923..3d823f5ffe2c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1302,13 +1302,18 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
 			int flags)
 {
 	struct task_struct *p = NULL;
+	struct rq *rq = rq_of_rt_rq(rt_rq);
 
 	if (!schedstat_enabled())
 		return;
 
-	if (rt_entity_is_task(rt_se))
+	if (rt_entity_is_task(rt_se)) {
 		p = rt_task_of(rt_se);
 
+		if (p != rq->curr)
+			update_stats_wait_end_rt(rt_rq, rt_se);
+	}
+
 	if ((flags & DEQUEUE_SLEEP) && p) {
 		unsigned int state;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 893f89ce2a77..60627119d0ab 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -356,7 +356,7 @@ extern int sched_dl_global_validate(void);
 extern void sched_dl_do_global(void);
 extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
 extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
-extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
+extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags);
 extern bool __checkparam_dl(const struct sched_attr *attr);
 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
 extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
@@ -684,8 +684,9 @@ struct cfs_rq {
 
 	s64			sum_w_vruntime;
 	u64			sum_weight;
-
 	u64			zero_vruntime;
+	unsigned int		sum_shift;
+
 #ifdef CONFIG_SCHED_CORE
 	unsigned int		forceidle_seq;
 	u64			zero_vruntime_fi;
@@ -1609,15 +1610,18 @@ extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
 extern bool raw_spin_rq_trylock(struct rq *rq)
 	__cond_acquires(true, __rq_lockp(rq));
 
-extern void raw_spin_rq_unlock(struct rq *rq)
-	__releases(__rq_lockp(rq));
-
 static inline void raw_spin_rq_lock(struct rq *rq)
 	__acquires(__rq_lockp(rq))
 {
 	raw_spin_rq_lock_nested(rq, 0);
 }
 
+static inline void raw_spin_rq_unlock(struct rq *rq)
+	__releases(__rq_lockp(rq))
+{
+	raw_spin_unlock(rq_lockp(rq));
+}
+
 static inline void raw_spin_rq_lock_irq(struct rq *rq)
 	__acquires(__rq_lockp(rq))
 {
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 6f10db3646e7..a288ac0a633d 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -881,10 +881,10 @@ err_size:
 	return -E2BIG;
 }
 
-static void get_params(struct task_struct *p, struct sched_attr *attr)
+static void get_params(struct task_struct *p, struct sched_attr *attr, unsigned int flags)
 {
 	if (task_has_dl_policy(p)) {
-		__getparam_dl(p, attr);
+		__getparam_dl(p, attr, flags);
 	} else if (task_has_rt_policy(p)) {
 		attr->sched_priority = p->rt_priority;
 	} else {
@@ -950,7 +950,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
 		return -ESRCH;
 
 	if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
-		get_params(p, &attr);
+		get_params(p, &attr, 0);
 
 	return sched_setattr(p, &attr);
 }
@@ -1035,7 +1035,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	int retval;
 
 	if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
-		     usize < SCHED_ATTR_SIZE_VER0 || flags))
+		     usize < SCHED_ATTR_SIZE_VER0))
 		return -EINVAL;
 
 	scoped_guard (rcu) {
@@ -1043,6 +1043,12 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 		if (!p)
 			return -ESRCH;
 
+		if (flags) {
+			if (!task_has_dl_policy(p) ||
+			    flags != SCHED_GETATTR_FLAG_DL_DYNAMIC)
+				return -EINVAL;
+		}
+
 		retval = security_task_getscheduler(p);
 		if (retval)
 			return retval;
@@ -1050,7 +1056,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	kattr.sched_policy = p->policy;
 	if (p->sched_reset_on_fork)
 		kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
-	get_params(p, &kattr);
+	get_params(p, &kattr, flags);
 	kattr.sched_flags &= SCHED_FLAG_ALL;
 
 #ifdef CONFIG_UCLAMP_TASK
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 32dcddaead82..061f8c85f555 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -4,6 +4,7 @@
 */
 
 #include <linux/sched/isolation.h>
+#include <linux/sched/clock.h>
 #include <linux/bsearch.h>
 
 #include "sched.h"
@@ -1642,6 +1643,7 @@ sd_init(struct sched_domain_topology_level *tl,
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
 	int sd_id, sd_weight, sd_flags = 0;
 	struct cpumask *sd_span;
+	u64 now = sched_clock();
 
 	sd_weight = cpumask_weight(tl->mask(tl, cpu));
 
@@ -1679,6 +1681,7 @@ sd_init(struct sched_domain_topology_level *tl,
 		.newidle_call		= 512,
 		.newidle_success	= 256,
 		.newidle_ratio		= 512,
+		.newidle_stamp		= now,
 		.max_newidle_lb_cost	= 0,
 		.last_decay_max_lb_cost	= jiffies,
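The deadline.c and syscalls.c hunks together add a read-side interface: sched_getattr() now accepts a flags argument, and SCHED_GETATTR_FLAG_DL_DYNAMIC (valid only for SCHED_DEADLINE tasks) reports the task's current remaining runtime and absolute deadline rather than the static reservation parameters. Below is a hedged userspace sketch of how this could be called; it assumes the flag's value is exported by the UAPI headers of the same series (a placeholder fallback is used here) and that the kernel headers provide struct sched_attr via <linux/sched/types.h>.

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sched/types.h>	/* struct sched_attr (UAPI) */

#ifndef SCHED_GETATTR_FLAG_DL_DYNAMIC
#define SCHED_GETATTR_FLAG_DL_DYNAMIC 0x01	/* placeholder value */
#endif

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));

	/*
	 * With the flag set, a SCHED_DEADLINE task reads back its *current*
	 * remaining runtime and its absolute deadline (adjusted to monotonic
	 * clock time in the kernel hunk above) instead of the static
	 * dl_runtime/dl_deadline reservation.
	 */
	if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr),
		    SCHED_GETATTR_FLAG_DL_DYNAMIC) == 0)
		printf("runtime=%llu deadline=%llu\n",
		       (unsigned long long)attr.sched_runtime,
		       (unsigned long long)attr.sched_deadline);
	else
		perror("sched_getattr");
	return 0;
}
```

Note that the calling task must already be running under a SCHED_DEADLINE reservation (set up via sched_setattr()); per the syscalls.c hunk, any other policy combined with a non-zero flags argument returns -EINVAL.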
