author    Linus Torvalds <torvalds@linux-foundation.org>    2026-01-18 10:17:40 -0800
committer Linus Torvalds <torvalds@linux-foundation.org>    2026-01-18 10:17:40 -0800
commit    837c8180e34f30ddf99a473efe9105ccb62e66e4 (patch)
tree      51e48779134a9ae6e54336b31404958dd66adbec
parent    cee4757965003738533dbd408d61968d374409e2 (diff)
parent    627cc25f84466d557d86e5dc67b43a4eea604c80 (diff)
Merge tag 'sched-urgent-2026-01-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
 "Misc deadline scheduler fixes, mainly for a new category of bugs that
  were discovered and fixed recently:

   - Fix a race condition in the DL server

   - Fix a DL server bug which can result in incorrectly going idle
     when there's work available

   - Fix DL server bug which triggers a WARN() due to broken
     get_prio_dl() logic and subsequent misbehavior

   - Fix double update_rq_clock() calls

   - Fix setscheduler() assumption about static priorities

   - Make sure balancing callbacks are always called

   - Plus a handful of preparatory commits for the fixes"

* tag 'sched-urgent-2026-01-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/deadline: Use ENQUEUE_MOVE to allow priority change
  sched: Deadline has dynamic priority
  sched: Audit MOVE vs balance_callbacks
  sched: Fold rq-pin swizzle into __balance_callbacks()
  sched/deadline: Avoid double update_rq_clock()
  sched/deadline: Ensure get_prio_dl() is up-to-date
  sched/deadline: Fix server stopping with runnable tasks
  sched: Provide idle_rq() helper
  sched/deadline: Fix potential race in dl_add_task_root_domain()
  sched/deadline: Remove unnecessary comment in dl_add_task_root_domain()
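Editor's note on the "Fold rq-pin swizzle into __balance_callbacks()" and "Make sure
balancing callbacks are always called" items: the core.c hunks below teach
__balance_callbacks() to take the caller's rq_flags, so the lockdep unpin/repin
bracketing that call sites such as rt_mutex_setprio() used to open-code now lives in
the helper itself. The stand-alone C sketch below only illustrates that calling
pattern; every identifier in it (rq_flags_sketch, balance_callbacks_sketch, unpin,
repin, run_balance_callbacks) is invented for the sketch and is not kernel code.

    /*
     * Minimal, self-contained sketch of the pattern: the helper takes an
     * optional flags pointer and does the unpin/repin bracketing itself,
     * instead of every caller open-coding it.  Illustrative names only.
     */
    #include <stdio.h>
    #include <stddef.h>

    struct rq_flags_sketch { int pinned; };

    static void unpin(struct rq_flags_sketch *rf)  { rf->pinned = 0; puts("unpin rq"); }
    static void repin(struct rq_flags_sketch *rf)  { rf->pinned = 1; puts("repin rq"); }
    static void run_balance_callbacks(void)        { puts("run queued balance callbacks"); }

    /* Callers pass their flags, or NULL when the rq is not lockdep-pinned. */
    static void balance_callbacks_sketch(struct rq_flags_sketch *rf)
    {
            if (rf)
                    unpin(rf);
            run_balance_callbacks();
            if (rf)
                    repin(rf);
    }

    int main(void)
    {
            struct rq_flags_sketch rf = { .pinned = 1 };

            balance_callbacks_sketch(&rf);   /* pinned call site, cf. rt_mutex_setprio() */
            balance_callbacks_sketch(NULL);  /* unpinned call site, cf. finish_lock_switch() */
            return 0;
    }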
-rw-r--r--  include/linux/sched.h     1
-rw-r--r--  kernel/sched/core.c      18
-rw-r--r--  kernel/sched/deadline.c  36
-rw-r--r--  kernel/sched/ext.c        1
-rw-r--r--  kernel/sched/sched.h     27
-rw-r--r--  kernel/sched/syscalls.c  32
6 files changed, 59 insertions, 56 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d395f2810fac..da0133524d08 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1874,7 +1874,6 @@ static inline int task_nice(const struct task_struct *p)
extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
-extern int available_idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern void sched_set_fifo(struct task_struct *p);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 60afadb6eede..045f83ad261e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4950,9 +4950,13 @@ struct balance_callback *splice_balance_callbacks(struct rq *rq)
return __splice_balance_callbacks(rq, true);
}
-static void __balance_callbacks(struct rq *rq)
+void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
{
+ if (rf)
+ rq_unpin_lock(rq, rf);
do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
+ if (rf)
+ rq_repin_lock(rq, rf);
}
void balance_callbacks(struct rq *rq, struct balance_callback *head)
@@ -4991,7 +4995,7 @@ static inline void finish_lock_switch(struct rq *rq)
* prev into current:
*/
spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
- __balance_callbacks(rq);
+ __balance_callbacks(rq, NULL);
raw_spin_rq_unlock_irq(rq);
}
@@ -6867,7 +6871,7 @@ keep_resched:
proxy_tag_curr(rq, next);
rq_unpin_lock(rq, &rf);
- __balance_callbacks(rq);
+ __balance_callbacks(rq, NULL);
raw_spin_rq_unlock_irq(rq);
}
trace_sched_exit_tp(is_switch);
@@ -7316,7 +7320,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
trace_sched_pi_setprio(p, pi_task);
oldprio = p->prio;
- if (oldprio == prio)
+ if (oldprio == prio && !dl_prio(prio))
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
@@ -7362,9 +7366,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
out_unlock:
/* Caller holds task_struct::pi_lock, IRQs are still disabled */
- rq_unpin_lock(rq, &rf);
- __balance_callbacks(rq);
- rq_repin_lock(rq, &rf);
+ __balance_callbacks(rq, &rf);
__task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_RT_MUTEXES */
@@ -9124,6 +9126,8 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
if (resched)
resched_curr(rq);
+
+ __balance_callbacks(rq, &rq_guard.rf);
}
static struct cgroup_subsys_state *
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 319439fe1870..c509f2e7d69d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -752,8 +752,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- update_rq_clock(rq);
-
WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
@@ -1420,7 +1418,7 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int
static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
{
- bool idle = rq->curr == rq->idle;
+ bool idle = idle_rq(rq);
s64 scaled_delta_exec;
if (unlikely(delta_exec <= 0)) {
@@ -1603,8 +1601,8 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
* | 8 | B:zero_laxity-wait | | |
* | | | <---+ |
* | +--------------------------------+ |
- * | | ^ ^ 2 |
- * | | 7 | 2 +--------------------+
+ * | | ^ ^ 2 |
+ * | | 7 | 2, 1 +----------------+
* | v |
* | +-------------+ |
* +-- | C:idle-wait | -+
@@ -1649,8 +1647,11 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
* dl_defer_idle = 0
*
*
- * [1] A->B, A->D
+ * [1] A->B, A->D, C->B
* dl_server_start()
+ * dl_defer_idle = 0;
+ * if (dl_server_active)
+ * return; // [B]
* dl_server_active = 1;
* enqueue_dl_entity()
* update_dl_entity(WAKEUP)
@@ -1759,6 +1760,7 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
* "B:zero_laxity-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"]
* "B:zero_laxity-wait" -> "D:running" [label="3:dl_server_timer"]
* "C:idle-wait" -> "A:init" [label="8:dl_server_timer"]
+ * "C:idle-wait" -> "B:zero_laxity-wait" [label="1:dl_server_start"]
* "C:idle-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"]
* "C:idle-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"]
* "D:running" -> "A:init" [label="4:pick_task_dl"]
@@ -1784,6 +1786,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
{
struct rq *rq = dl_se->rq;
+ dl_se->dl_defer_idle = 0;
if (!dl_server(dl_se) || dl_se->dl_server_active)
return;
@@ -1834,6 +1837,7 @@ void sched_init_dl_servers(void)
rq = cpu_rq(cpu);
guard(rq_lock_irq)(rq);
+ update_rq_clock(rq);
dl_se = &rq->fair_server;
@@ -2210,7 +2214,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
update_dl_entity(dl_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se);
- } else if ((flags & ENQUEUE_RESTORE) &&
+ } else if ((flags & ENQUEUE_MOVE) &&
!is_dl_boosted(dl_se) &&
dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
setup_new_dl_entity(dl_se);
@@ -3154,7 +3158,7 @@ void dl_add_task_root_domain(struct task_struct *p)
struct rq *rq;
struct dl_bw *dl_b;
unsigned int cpu;
- struct cpumask *msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
+ struct cpumask *msk;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
if (!dl_task(p) || dl_entity_is_special(&p->dl)) {
@@ -3162,20 +3166,12 @@ void dl_add_task_root_domain(struct task_struct *p)
return;
}
- /*
- * Get an active rq, whose rq->rd traces the correct root
- * domain.
- * Ideally this would be under cpuset reader lock until rq->rd is
- * fetched. However, sleepable locks cannot nest inside pi_lock, so we
- * rely on the caller of dl_add_task_root_domain() holds 'cpuset_mutex'
- * to guarantee the CPU stays in the cpuset.
- */
+ msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
dl_get_task_effective_cpus(p, msk);
cpu = cpumask_first_and(cpu_active_mask, msk);
BUG_ON(cpu >= nr_cpu_ids);
rq = cpu_rq(cpu);
dl_b = &rq->rd->dl_bw;
- /* End of fetching rd */
raw_spin_lock(&dl_b->lock);
__dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
@@ -3299,6 +3295,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static u64 get_prio_dl(struct rq *rq, struct task_struct *p)
{
+ /*
+ * Make sure to update current so we don't return a stale value.
+ */
+ if (task_current_donor(rq, p))
+ update_curr_dl(rq);
+
return p->dl.deadline;
}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8f6d8d7f895c..afe28c04d5aa 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -545,6 +545,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
if (iter->locked_task) {
+ __balance_callbacks(iter->rq, &iter->rf);
task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
iter->locked_task = NULL;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d30cca6870f5..93fce4bbff5e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1364,6 +1364,28 @@ static inline u32 sched_rng(void)
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
+static inline bool idle_rq(struct rq *rq)
+{
+ return rq->curr == rq->idle && !rq->nr_running && !rq->ttwu_pending;
+}
+
+/**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+static inline bool available_idle_cpu(int cpu)
+{
+ if (!idle_rq(cpu_rq(cpu)))
+ return 0;
+
+ if (vcpu_is_preempted(cpu))
+ return 0;
+
+ return 1;
+}
+
#ifdef CONFIG_SCHED_PROXY_EXEC
static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
{
@@ -2366,7 +2388,8 @@ extern const u32 sched_prio_to_wmult[40];
* should preserve as much state as possible.
*
* MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
- * in the runqueue.
+ * in the runqueue. IOW the priority is allowed to change. Callers
+ * must expect to deal with balance callbacks.
*
* NOCLOCK - skip the update_rq_clock() (avoids double updates)
*
@@ -3947,6 +3970,8 @@ extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
+
+extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
/*
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 0496dc29ed0f..6f10db3646e7 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -180,35 +180,7 @@ int task_prio(const struct task_struct *p)
*/
int idle_cpu(int cpu)
{
- struct rq *rq = cpu_rq(cpu);
-
- if (rq->curr != rq->idle)
- return 0;
-
- if (rq->nr_running)
- return 0;
-
- if (rq->ttwu_pending)
- return 0;
-
- return 1;
-}
-
-/**
- * available_idle_cpu - is a given CPU idle for enqueuing work.
- * @cpu: the CPU in question.
- *
- * Return: 1 if the CPU is currently idle. 0 otherwise.
- */
-int available_idle_cpu(int cpu)
-{
- if (!idle_cpu(cpu))
- return 0;
-
- if (vcpu_is_preempted(cpu))
- return 0;
-
- return 1;
+ return idle_rq(cpu_rq(cpu));
}
/**
@@ -667,7 +639,7 @@ change:
* itself.
*/
newprio = rt_effective_prio(p, newprio);
- if (newprio == oldprio)
+ if (newprio == oldprio && !dl_prio(newprio))
queue_flags &= ~DEQUEUE_MOVE;
}