author	Tejun Heo <tj@kernel.org>	2026-03-09 09:59:36 -1000
committer	Tejun Heo <tj@kernel.org>	2026-03-09 09:59:36 -1000
commit	0e7cd9cef61fde36ebfb653fe9e7a9722185cb57 (patch)
tree	0eef127fa5f0bda6d0729a47ce5d3038c849b695 /kernel
parent	bec10581e92289bdc3eed17e90200900ddb00594 (diff)
parent	54a66e431eeacf23e1dc47cb3507f2d0c068aaf0 (diff)
Merge branch 'sched/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into for-7.1
Pull sched/core to resolve conflicts between commit:

  c2a57380df9dd ("sched: Replace use of system_unbound_wq with system_dfl_wq")

from the tip tree, and commit:

  cde94c032b32b ("sched_ext: Make watchdog sub-sched aware")

The latter moves around code modified by the former. Apply the changes in the new locations.

Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched/core.c9
-rw-r--r--kernel/sched/deadline.c23
-rw-r--r--kernel/sched/debug.c14
-rw-r--r--kernel/sched/ext.c5
-rw-r--r--kernel/sched/fair.c327
-rw-r--r--kernel/sched/features.h3
-rw-r--r--kernel/sched/rt.c7
-rw-r--r--kernel/sched/sched.h14
-rw-r--r--kernel/sched/syscalls.c16
-rw-r--r--kernel/sched/topology.c3
10 files changed, 333 insertions, 88 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dfe680e78be3..ec2a77f8879d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -687,11 +687,6 @@ bool raw_spin_rq_trylock(struct rq *rq)
}
}
-void raw_spin_rq_unlock(struct rq *rq)
-{
- raw_spin_unlock(rq_lockp(rq));
-}
-
/*
* double_rq_lock - safely lock two runqueues
*/
@@ -5678,7 +5673,7 @@ static void sched_tick_remote(struct work_struct *work)
os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
if (os == TICK_SCHED_REMOTE_RUNNING)
- queue_delayed_work(system_unbound_wq, dwork, HZ);
+ queue_delayed_work(system_dfl_wq, dwork, HZ);
}
static void sched_tick_start(int cpu)
@@ -5697,7 +5692,7 @@ static void sched_tick_start(int cpu)
if (os == TICK_SCHED_REMOTE_OFFLINE) {
twork->cpu = cpu;
INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
- queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ queue_delayed_work(system_dfl_wq, &twork->work, HZ);
}
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d08b00429323..9e253a825f39 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2142,10 +2142,14 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
int flags)
{
struct task_struct *p = dl_task_of(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
if (!schedstat_enabled())
return;
+ if (p != rq->curr)
+ update_stats_wait_end_dl(dl_rq, dl_se);
+
if ((flags & DEQUEUE_SLEEP)) {
unsigned int state;
@@ -3613,13 +3617,26 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
}
-void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
+void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags)
{
struct sched_dl_entity *dl_se = &p->dl;
+ struct rq *rq = task_rq(p);
+ u64 adj_deadline;
attr->sched_priority = p->rt_priority;
- attr->sched_runtime = dl_se->dl_runtime;
- attr->sched_deadline = dl_se->dl_deadline;
+ if (flags & SCHED_GETATTR_FLAG_DL_DYNAMIC) {
+ guard(raw_spinlock_irq)(&rq->__lock);
+ update_rq_clock(rq);
+ if (task_current(rq, p))
+ update_curr_dl(rq);
+
+ attr->sched_runtime = dl_se->runtime;
+ adj_deadline = dl_se->deadline - rq_clock(rq) + ktime_get_ns();
+ attr->sched_deadline = adj_deadline;
+ } else {
+ attr->sched_runtime = dl_se->dl_runtime;
+ attr->sched_deadline = dl_se->dl_deadline;
+ }
attr->sched_period = dl_se->dl_period;
attr->sched_flags &= ~SCHED_DL_FLAGS;
attr->sched_flags |= dl_se->flags;
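[Editorial aside, not part of the merge: a minimal userspace sketch of how the SCHED_GETATTR_FLAG_DL_DYNAMIC flag added above might be exercised. It assumes the flag is exported by the updated UAPI headers and that the calling thread runs under SCHED_DEADLINE; sched_getattr() has no glibc wrapper, so the raw syscall is used.]

#include <linux/sched/types.h>	/* struct sched_attr; SCHED_GETATTR_FLAG_DL_DYNAMIC assumed to be here */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	struct sched_attr attr = { 0 };

	/*
	 * pid 0 == calling thread. The flag selects the dynamic DL view:
	 * remaining runtime and the absolute deadline rather than the
	 * static reservation parameters. Per the syscalls.c validation
	 * below, it is rejected with -EINVAL for non-deadline tasks.
	 */
	if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr),
		    SCHED_GETATTR_FLAG_DL_DYNAMIC))
		return 1;

	printf("remaining runtime: %llu ns, absolute deadline: %llu ns\n",
	       (unsigned long long)attr.sched_runtime,
	       (unsigned long long)attr.sched_deadline);
	return 0;
}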
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index b24f40f05019..6246008c431e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,6 +8,7 @@
*/
#include <linux/debugfs.h>
#include <linux/nmi.h>
+#include <linux/log2.h>
#include "sched.h"
/*
@@ -901,10 +902,13 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+ s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread;
+ s64 zero_vruntime = -1, sum_w_vruntime = -1;
struct sched_entity *last, *first, *root;
struct rq *rq = cpu_rq(cpu);
+ unsigned int sum_shift;
unsigned long flags;
+ u64 sum_weight;
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, "\n");
@@ -925,6 +929,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
if (last)
right_vruntime = last->vruntime;
zero_vruntime = cfs_rq->zero_vruntime;
+ sum_w_vruntime = cfs_rq->sum_w_vruntime;
+ sum_weight = cfs_rq->sum_weight;
+ sum_shift = cfs_rq->sum_shift;
raw_spin_rq_unlock_irqrestore(rq, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
@@ -933,6 +940,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(left_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime",
SPLIT_NS(zero_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld (%d bits)\n", "sum_w_vruntime",
+ sum_w_vruntime, ilog2(abs(sum_w_vruntime)));
+ SEQ_printf(m, " .%-30s: %Lu\n", "sum_weight",
+ sum_weight);
+ SEQ_printf(m, " .%-30s: %u\n", "sum_shift", sum_shift);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
SPLIT_NS(avg_vruntime(cfs_rq)));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 43fda1258903..b35b98020f3b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3208,8 +3208,7 @@ static void scx_watchdog_workfn(struct work_struct *work)
intv = READ_ONCE(scx_watchdog_interval);
if (intv < ULONG_MAX)
- queue_delayed_work(system_unbound_wq, to_delayed_work(work),
- intv);
+ queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv);
}
void scx_tick(struct rq *rq)
@@ -5233,7 +5232,7 @@ static void refresh_watchdog(void)
WRITE_ONCE(scx_watchdog_interval, intv);
if (intv < ULONG_MAX)
- mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
+ mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv);
else
cancel_delayed_work_sync(&scx_watchdog_work);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bf948db905ed..d57c02e82f3a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -225,6 +225,7 @@ void __init sched_init_granularity(void)
update_sysctl();
}
+#ifndef CONFIG_64BIT
#define WMULT_CONST (~0U)
#define WMULT_SHIFT 32
@@ -283,6 +284,12 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+#else
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
+{
+ return (delta_exec * weight) / lw->weight;
+}
+#endif
/*
* delta /= w
@@ -665,25 +672,83 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Since zero_vruntime closely tracks the per-task service, these
* deltas: (v_i - v0), will be in the order of the maximal (virtual) lag
* induced in the system due to quantisation.
- *
- * Also, we use scale_load_down() to reduce the size.
- *
- * As measured, the max (key * weight) value was ~44 bits for a kernel build.
*/
+static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w)
+{
+#ifdef CONFIG_64BIT
+ if (cfs_rq->sum_shift)
+ w = max(2UL, w >> cfs_rq->sum_shift);
+#endif
+ return w;
+}
+
+static inline void
+__sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+ s64 w_vruntime, key = entity_key(cfs_rq, se);
+
+ w_vruntime = key * weight;
+ WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62));
+
+ cfs_rq->sum_w_vruntime += w_vruntime;
+ cfs_rq->sum_weight += weight;
+}
+
static void
-sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- unsigned long weight = scale_load_down(se->load.weight);
- s64 key = entity_key(cfs_rq, se);
+ unsigned long weight;
+ s64 key, tmp;
+
+again:
+ weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+ key = entity_key(cfs_rq, se);
+
+ if (check_mul_overflow(key, weight, &key))
+ goto overflow;
- cfs_rq->sum_w_vruntime += key * weight;
+ if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp))
+ goto overflow;
+
+ cfs_rq->sum_w_vruntime = tmp;
cfs_rq->sum_weight += weight;
+ return;
+
+overflow:
+ /*
+ * There's gotta be a limit -- if we're still failing at this point
+ * there's really nothing much to be done about things.
+ */
+ BUG_ON(cfs_rq->sum_shift >= 10);
+ cfs_rq->sum_shift++;
+
+ /*
+ * Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1
+ */
+ cfs_rq->sum_w_vruntime = 0;
+ cfs_rq->sum_weight = 0;
+
+ for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost;
+ node; node = rb_next(node))
+ __sum_w_vruntime_add(cfs_rq, __node_2_se(node));
+
+ goto again;
+}
+
+static void
+sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (sched_feat(PARANOID_AVG))
+ return sum_w_vruntime_add_paranoid(cfs_rq, se);
+
+ __sum_w_vruntime_add(cfs_rq, se);
}
static void
sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- unsigned long weight = scale_load_down(se->load.weight);
+ unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
s64 key = entity_key(cfs_rq, se);
cfs_rq->sum_w_vruntime -= key * weight;
@@ -725,7 +790,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
s64 runtime = cfs_rq->sum_w_vruntime;
if (curr) {
- unsigned long w = scale_load_down(curr->load.weight);
+ unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight);
runtime += entity_key(cfs_rq, curr) * w;
weight += w;
@@ -735,7 +800,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
if (runtime < 0)
runtime -= (weight - 1);
- delta = div_s64(runtime, weight);
+ delta = div64_long(runtime, weight);
} else if (curr) {
/*
* When there is but one element, it is the average.
@@ -764,17 +829,22 @@ static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq);
*
* -r_max < lag < max(r_max, q)
*/
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avruntime)
{
u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC;
s64 vlag, limit;
- WARN_ON_ONCE(!se->on_rq);
-
- vlag = avg_vruntime(cfs_rq) - se->vruntime;
+ vlag = avruntime - se->vruntime;
limit = calc_delta_fair(max_slice, se);
- se->vlag = clamp(vlag, -limit, limit);
+ return clamp(vlag, -limit, limit);
+}
+
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ WARN_ON_ONCE(!se->on_rq);
+
+ se->vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq));
}
/*
@@ -801,7 +871,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
- unsigned long weight = scale_load_down(curr->load.weight);
+ unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight);
avg += entity_key(cfs_rq, curr) * weight;
load += weight;
@@ -3840,23 +3910,125 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
se_weight(se) * -se->avg.load_sum);
}
-static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
+static void
+rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot)
+{
+ unsigned long old_weight = se->load.weight;
+
+ /*
+ * VRUNTIME
+ * --------
+ *
+ * COROLLARY #1: The virtual runtime of the entity needs to be
+ * adjusted if re-weight at !0-lag point.
+ *
+ * Proof: For contradiction assume this is not true, so we can
+ * re-weight without changing vruntime at !0-lag point.
+ *
+ * Weight VRuntime Avg-VRuntime
+ * before w v V
+ * after w' v' V'
+ *
+ * Since lag needs to be preserved through re-weight:
+ *
+ * lag = (V - v)*w = (V'- v')*w', where v = v'
+ * ==> V' = (V - v)*w/w' + v (1)
+ *
+ * Let W be the total weight of the entities before reweight,
+ * since V' is the new weighted average of entities:
+ *
+ * V' = (WV + w'v - wv) / (W + w' - w) (2)
+ *
+ * by using (1) & (2) we obtain:
+ *
+ * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+ * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+ * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+ * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+ *
+ * Since we are doing at !0-lag point which means V != v, we
+ * can simplify (3):
+ *
+ * ==> W / (W + w' - w) = w / w'
+ * ==> Ww' = Ww + ww' - ww
+ * ==> W * (w' - w) = w * (w' - w)
+ * ==> W = w (re-weight indicates w' != w)
+ *
+ * So the cfs_rq contains only one entity, hence vruntime of
+ * the entity @v should always equal to the cfs_rq's weighted
+ * average vruntime @V, which means we will always re-weight
+ * at 0-lag point, thus breach assumption. Proof completed.
+ *
+ *
+ * COROLLARY #2: Re-weight does NOT affect weighted average
+ * vruntime of all the entities.
+ *
+ * Proof: According to corollary #1, Eq. (1) should be:
+ *
+ * (V - v)*w = (V' - v')*w'
+ * ==> v' = V' - (V - v)*w/w' (4)
+ *
+ * According to the weighted average formula, we have:
+ *
+ * V' = (WV - wv + w'v') / (W - w + w')
+ * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+ * = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+ * = (WV + w'V' - Vw) / (W - w + w')
+ *
+ * ==> V'*(W - w + w') = WV + w'V' - Vw
+ * ==> V' * (W - w) = (W - w) * V (5)
+ *
+ * If the entity is the only one in the cfs_rq, then reweight
+ * always occurs at 0-lag point, so V won't change. Or else
+ * there are other entities, hence W != w, then Eq. (5) turns
+ * into V' = V. So V won't change in either case, proof done.
+ *
+ *
+ * So according to corollary #1 & #2, the effect of re-weight
+ * on vruntime should be:
+ *
+ * v' = V' - (V - v) * w / w' (4)
+ * = V - (V - v) * w / w'
+ * = V - vl * w / w'
+ * = V - vl'
+ */
+ se->vlag = div64_long(se->vlag * old_weight, weight);
+
+ /*
+ * DEADLINE
+ * --------
+ *
+ * When the weight changes, the virtual time slope changes and
+ * we should adjust the relative virtual deadline accordingly.
+ *
+ * d' = v' + (d - v)*w/w'
+ * = V' - (V - v)*w/w' + (d - v)*w/w'
+ * = V - (V - v)*w/w' + (d - v)*w/w'
+ * = V + (d - V)*w/w'
+ */
+ if (se->rel_deadline)
+ se->deadline = div64_long(se->deadline * old_weight, weight);
+
+ if (rel_vprot)
+ se->vprot = div64_long(se->vprot * old_weight, weight);
+}
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
bool curr = cfs_rq->curr == se;
bool rel_vprot = false;
- u64 vprot;
+ u64 avruntime = 0;
if (se->on_rq) {
/* commit outstanding execution time */
update_curr(cfs_rq);
- update_entity_lag(cfs_rq, se);
- se->deadline -= se->vruntime;
+ avruntime = avg_vruntime(cfs_rq);
+ se->vlag = entity_lag(cfs_rq, se, avruntime);
+ se->deadline -= avruntime;
se->rel_deadline = 1;
if (curr && protect_slice(se)) {
- vprot = se->vprot - se->vruntime;
+ se->vprot -= avruntime;
rel_vprot = true;
}
@@ -3867,30 +4039,23 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
}
dequeue_load_avg(cfs_rq, se);
- /*
- * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
- * we need to scale se->vlag when w_i changes.
- */
- se->vlag = div_s64(se->vlag * se->load.weight, weight);
- if (se->rel_deadline)
- se->deadline = div_s64(se->deadline * se->load.weight, weight);
-
- if (rel_vprot)
- vprot = div_s64(vprot * se->load.weight, weight);
+ rescale_entity(se, weight, rel_vprot);
update_load_set(&se->load, weight);
do {
u32 divider = get_pelt_divider(&se->avg);
-
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
- place_entity(cfs_rq, se, 0);
if (rel_vprot)
- se->vprot = se->vruntime + vprot;
+ se->vprot += avruntime;
+ se->deadline += avruntime;
+ se->rel_deadline = 0;
+ se->vruntime = avruntime - se->vlag;
+
update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
@@ -5180,7 +5345,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
struct sched_entity *curr = cfs_rq->curr;
- unsigned long load;
+ long load;
lag = se->vlag;
@@ -5238,17 +5403,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
load = cfs_rq->sum_weight;
if (curr && curr->on_rq)
- load += scale_load_down(curr->load.weight);
+ load += avg_vruntime_weight(cfs_rq, curr->load.weight);
- lag *= load + scale_load_down(se->load.weight);
+ lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight);
if (WARN_ON_ONCE(!load))
load = 1;
- lag = div_s64(lag, load);
+ lag = div64_long(lag, load);
}
se->vruntime = vruntime - lag;
- if (se->rel_deadline) {
+ if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
se->deadline += se->vruntime;
se->rel_deadline = 0;
return;
@@ -6853,16 +7018,15 @@ static inline void hrtick_update(struct rq *rq)
static inline bool cpu_overutilized(int cpu)
{
- unsigned long rq_util_min, rq_util_max;
+ unsigned long rq_util_max;
if (!sched_energy_enabled())
return false;
- rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
/* Return true only if the utilization doesn't fit CPU's capacity */
- return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
+ return !util_fits_cpu(cpu_util_cfs(cpu), 0, rq_util_max, cpu);
}
/*
@@ -6900,9 +7064,15 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
-static int sched_idle_cpu(int cpu)
+static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
+{
+ return sched_idle_rq(rq) && !task_has_idle_policy(p);
+}
+
+static int choose_idle_cpu(int cpu, struct task_struct *p)
{
- return sched_idle_rq(cpu_rq(cpu));
+ return available_idle_cpu(cpu) ||
+ choose_sched_idle_rq(cpu_rq(cpu), p);
}
static void
@@ -7467,7 +7637,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
if (!sched_core_cookie_match(rq, p))
continue;
- if (sched_idle_cpu(i))
+ if (choose_sched_idle_rq(rq, p))
return i;
if (available_idle_cpu(i)) {
@@ -7558,8 +7728,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
static inline int __select_idle_cpu(int cpu, struct task_struct *p)
{
- if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
- sched_cpu_cookie_match(cpu_rq(cpu), p))
+ if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
return cpu;
return -1;
@@ -7632,7 +7801,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
if (!available_idle_cpu(cpu)) {
idle = false;
if (*idle_cpu == -1) {
- if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
+ if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
+ cpumask_test_cpu(cpu, cpus)) {
*idle_cpu = cpu;
break;
}
@@ -7667,7 +7837,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
*/
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ if (choose_idle_cpu(cpu, p))
return cpu;
}
@@ -7789,7 +7959,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
- if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+ if (!choose_idle_cpu(cpu, p))
continue;
fits = util_fits_cpu(task_util, util_min, util_max, cpu);
@@ -7860,7 +8030,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
lockdep_assert_irqs_disabled();
- if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+ if (choose_idle_cpu(target, p) &&
asym_fits_cpu(task_util, util_min, util_max, target))
return target;
@@ -7868,7 +8038,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
- (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+ choose_idle_cpu(prev, p) &&
asym_fits_cpu(task_util, util_min, util_max, prev)) {
if (!static_branch_unlikely(&sched_cluster_active) ||
@@ -7900,7 +8070,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
- (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+ choose_idle_cpu(recent_used_cpu, p) &&
cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
@@ -10047,6 +10217,7 @@ struct sg_lb_stats {
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned int group_smt_balance; /* Task on busy SMT be moved */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
+ unsigned int group_overutilized; /* At least one CPU is overutilized in the group */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -10279,6 +10450,13 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
static inline bool
group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
+ /*
+ * With EAS and uclamp, 1 CPU in the group must be overutilized to
+ * consider the group overloaded.
+ */
+ if (sched_energy_enabled() && !sgs->group_overutilized)
+ return false;
+
if (sgs->sum_nr_running <= sgs->group_weight)
return false;
@@ -10462,14 +10640,12 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
* @sg_overloaded: sched_group is overloaded
- * @sg_overutilized: sched_group is overutilized
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *group,
struct sg_lb_stats *sgs,
- bool *sg_overloaded,
- bool *sg_overutilized)
+ bool *sg_overloaded)
{
int i, nr_running, local_group, sd_flags = env->sd->flags;
bool balancing_at_rd = !env->sd->parent;
@@ -10491,7 +10667,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->sum_nr_running += nr_running;
if (cpu_overutilized(i))
- *sg_overutilized = 1;
+ sgs->group_overutilized = 1;
/*
* No need to call idle_cpu() if nr_running is not 0
@@ -11162,13 +11338,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
+ update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded);
if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
}
+ sg_overutilized |= sgs->group_overutilized;
+
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
@@ -12289,7 +12467,30 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su
sd->newidle_success += success;
if (sd->newidle_call >= 1024) {
- sd->newidle_ratio = sd->newidle_success;
+ u64 now = sched_clock();
+ s64 delta = now - sd->newidle_stamp;
+ sd->newidle_stamp = now;
+ int ratio = 0;
+
+ if (delta < 0)
+ delta = 0;
+
+ if (sched_feat(NI_RATE)) {
+ /*
+ * ratio delta freq
+ *
+ * 1024 - 4 s - 128 Hz
+ * 512 - 2 s - 256 Hz
+ * 256 - 1 s - 512 Hz
+ * 128 - .5 s - 1024 Hz
+ * 64 - .25 s - 2048 Hz
+ */
+ ratio = delta >> 22;
+ }
+
+ ratio += sd->newidle_success;
+
+ sd->newidle_ratio = min(1024, ratio);
sd->newidle_call /= 2;
sd->newidle_success /= 2;
}
@@ -12336,7 +12537,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
- int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+ int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
@@ -12374,7 +12575,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu);
- busy = !idle && !sched_idle_cpu(cpu);
+ busy = !idle && !sched_idle_rq(rq);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
@@ -12996,7 +13197,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (sd->flags & SD_BALANCE_NEWIDLE) {
unsigned int weight = 1;
- if (sched_feat(NI_RANDOM)) {
+ if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) {
/*
* Throw a 1k sided dice; and only run
* newidle_balance according to the success
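[Editorial aside, not part of the merge: the overflow path in sum_w_vruntime_add_paranoid() above re-walks the whole rbtree rather than shifting the accumulated sum because per-entity truncation means \Sum k_i * (w_i >> s) != (\Sum k_i * w_i) >> s in general. A toy standalone check with made-up keys and weights:]

#include <stdio.h>

int main(void)
{
	/* hypothetical entity keys and weights, purely for illustration */
	long key[2]    = { 7, -3 };
	long weight[2] = { 3, 5 };
	long shifted_terms = 0, whole_sum = 0;

	for (int i = 0; i < 2; i++) {
		shifted_terms += key[i] * (weight[i] >> 1);	/* shift each weight first   */
		whole_sum     += key[i] * weight[i];		/* shift the finished sum    */
	}

	/*
	 * Prints "1 vs 3": downscaling the weights does not commute with
	 * the summation, hence the full rebuild whenever sum_shift is
	 * bumped instead of shifting cfs_rq->sum_w_vruntime in place.
	 */
	printf("%ld vs %ld\n", shifted_terms, whole_sum >> 1);
	return 0;
}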
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 136a6584be79..a25f97201ab9 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -58,6 +58,8 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
SCHED_FEAT(DELAY_DEQUEUE, true)
SCHED_FEAT(DELAY_ZERO, true)
+SCHED_FEAT(PARANOID_AVG, false)
+
/*
* Allow wakeup-time preemption of the current task:
*/
@@ -126,3 +128,4 @@ SCHED_FEAT(LATENCY_WARN, false)
* Do newidle balancing proportional to its success rate using randomization.
*/
SCHED_FEAT(NI_RANDOM, true)
+SCHED_FEAT(NI_RATE, true)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f69e1f16d923..3d823f5ffe2c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1302,13 +1302,18 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
int flags)
{
struct task_struct *p = NULL;
+ struct rq *rq = rq_of_rt_rq(rt_rq);
if (!schedstat_enabled())
return;
- if (rt_entity_is_task(rt_se))
+ if (rt_entity_is_task(rt_se)) {
p = rt_task_of(rt_se);
+ if (p != rq->curr)
+ update_stats_wait_end_rt(rt_rq, rt_se);
+ }
+
if ((flags & DEQUEUE_SLEEP) && p) {
unsigned int state;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 893f89ce2a77..60627119d0ab 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -356,7 +356,7 @@ extern int sched_dl_global_validate(void);
extern void sched_dl_do_global(void);
extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
-extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
+extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
@@ -684,8 +684,9 @@ struct cfs_rq {
s64 sum_w_vruntime;
u64 sum_weight;
-
u64 zero_vruntime;
+ unsigned int sum_shift;
+
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
u64 zero_vruntime_fi;
@@ -1609,15 +1610,18 @@ extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
extern bool raw_spin_rq_trylock(struct rq *rq)
__cond_acquires(true, __rq_lockp(rq));
-extern void raw_spin_rq_unlock(struct rq *rq)
- __releases(__rq_lockp(rq));
-
static inline void raw_spin_rq_lock(struct rq *rq)
__acquires(__rq_lockp(rq))
{
raw_spin_rq_lock_nested(rq, 0);
}
+static inline void raw_spin_rq_unlock(struct rq *rq)
+ __releases(__rq_lockp(rq))
+{
+ raw_spin_unlock(rq_lockp(rq));
+}
+
static inline void raw_spin_rq_lock_irq(struct rq *rq)
__acquires(__rq_lockp(rq))
{
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 6f10db3646e7..a288ac0a633d 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -881,10 +881,10 @@ err_size:
return -E2BIG;
}
-static void get_params(struct task_struct *p, struct sched_attr *attr)
+static void get_params(struct task_struct *p, struct sched_attr *attr, unsigned int flags)
{
if (task_has_dl_policy(p)) {
- __getparam_dl(p, attr);
+ __getparam_dl(p, attr, flags);
} else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
} else {
@@ -950,7 +950,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
return -ESRCH;
if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
- get_params(p, &attr);
+ get_params(p, &attr, 0);
return sched_setattr(p, &attr);
}
@@ -1035,7 +1035,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
int retval;
if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
- usize < SCHED_ATTR_SIZE_VER0 || flags))
+ usize < SCHED_ATTR_SIZE_VER0))
return -EINVAL;
scoped_guard (rcu) {
@@ -1043,6 +1043,12 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
if (!p)
return -ESRCH;
+ if (flags) {
+ if (!task_has_dl_policy(p) ||
+ flags != SCHED_GETATTR_FLAG_DL_DYNAMIC)
+ return -EINVAL;
+ }
+
retval = security_task_getscheduler(p);
if (retval)
return retval;
@@ -1050,7 +1056,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- get_params(p, &kattr);
+ get_params(p, &kattr, flags);
kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 32dcddaead82..061f8c85f555 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched/isolation.h>
+#include <linux/sched/clock.h>
#include <linux/bsearch.h>
#include "sched.h"
@@ -1642,6 +1643,7 @@ sd_init(struct sched_domain_topology_level *tl,
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
int sd_id, sd_weight, sd_flags = 0;
struct cpumask *sd_span;
+ u64 now = sched_clock();
sd_weight = cpumask_weight(tl->mask(tl, cpu));
@@ -1679,6 +1681,7 @@ sd_init(struct sched_domain_topology_level *tl,
.newidle_call = 512,
.newidle_success = 256,
.newidle_ratio = 512,
+ .newidle_stamp = now,
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,