author    Tejun Heo <tj@kernel.org>  2026-03-30 08:52:33 -1000
committer Tejun Heo <tj@kernel.org>  2026-03-30 09:02:05 -1000
commit    94555ca6d0bb41a7cb3c12ad2a7cacb6fdbdca0b (patch)
tree      6c614260e1eb81b2f9fefb4a1a76ddcb0200b300 /kernel
parent    238aa43f0b77ea7db7f306e010f688c77439ee62 (diff)
parent    090d34f0f0285124452373225bcc520a31e305e4 (diff)
Merge branch 'for-7.0-fixes' into for-7.1
Conflict in kernel/sched/ext.c init_sched_ext_class() between:

  415cb193bb97 ("sched_ext: Fix SCX_KICK_WAIT deadlock by deferring wait to balance callback")

which adds cpus_to_sync cpumask allocation, and:

  84b1a0ea0b7c ("sched_ext: Implement scx_bpf_dsq_reenq() for user DSQs")
  8c1b9453fde6 ("sched_ext: Convert deferred_reenq_locals from llist to regular list")

which add deferred_reenq init code at the same location. Both are independent
additions. Include both.

Signed-off-by: Tejun Heo <tj@kernel.org>
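For reference, the resolved region of init_sched_ext_class() simply carries both sides' additions back to back, as excerpted from the kernel/sched/ext.c hunk below:

	BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
	BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
	raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
	INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
	INIT_LIST_HEAD(&rq->scx.deferred_reenq_users);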
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/ext.c       | 95
-rw-r--r--  kernel/sched/ext_idle.c  |  2
-rw-r--r--  kernel/sched/sched.h     |  3
3 files changed, 74 insertions(+), 26 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index bfe923b7ffe0..9628c64e5592 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3018,7 +3018,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
{
struct scx_sched *sch = scx_task_sched(p);
- /* see kick_cpus_irq_workfn() */
+ /* see kick_sync_wait_bal_cb() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
update_curr_scx(rq);
@@ -3067,6 +3067,48 @@ switch_class:
switch_class(rq, next);
}
+static void kick_sync_wait_bal_cb(struct rq *rq)
+{
+ struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs);
+ unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
+ bool waited;
+ s32 cpu;
+
+ /*
+ * Drop the rq lock and enable IRQs while waiting. IRQs must be
+ * enabled because a target CPU may be waiting for us to process an
+ * IPI (e.g. a TLB flush) while we wait for its kick_sync to advance.
+ *
+ * Also, keep advancing our own kick_sync so that new kick_sync waits
+ * targeting us, which can start after we drop the lock, cannot form
+ * cyclic dependencies.
+ */
+retry:
+ waited = false;
+ for_each_cpu(cpu, rq->scx.cpus_to_sync) {
+ /*
+ * smp_load_acquire() pairs with smp_store_release() on
+ * kick_sync updates on the target CPUs.
+ */
+ if (cpu == cpu_of(rq) ||
+ smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
+ cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
+ continue;
+ }
+
+ raw_spin_rq_unlock_irq(rq);
+ while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {
+ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+ cpu_relax();
+ }
+ raw_spin_rq_lock_irq(rq);
+ waited = true;
+ }
+
+ if (waited)
+ goto retry;
+}
+
static struct task_struct *first_local_task(struct rq *rq)
{
return list_first_entry_or_null(&rq->scx.local_dsq.list,
@@ -3080,7 +3122,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
bool keep_prev;
struct task_struct *p;
- /* see kick_cpus_irq_workfn() */
+ /* see kick_sync_wait_bal_cb() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
rq_modified_begin(rq, &ext_sched_class);
@@ -3091,6 +3133,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
maybe_queue_balance_callback(rq);
/*
+ * Defer to a balance callback which can drop rq lock and enable
+ * IRQs. Waiting directly in the pick path would deadlock against
+ * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them.
+ */
+ if (unlikely(rq->scx.kick_sync_pending)) {
+ rq->scx.kick_sync_pending = false;
+ queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb,
+ kick_sync_wait_bal_cb);
+ }
+
+ /*
* If any higher-priority sched class enqueued a runnable task on
* this rq during balance_one(), abort and return RETRY_TASK, so
* that the scheduler loop can restart.
@@ -6219,6 +6272,9 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
if (!cpumask_empty(rq->scx.cpus_to_wait))
dump_line(&ns, " cpus_to_wait : %*pb",
cpumask_pr_args(rq->scx.cpus_to_wait));
+ if (!cpumask_empty(rq->scx.cpus_to_sync))
+ dump_line(&ns, " cpus_to_sync : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_sync));
used = seq_buf_used(&ns);
if (SCX_HAS_OP(sch, dump_cpu)) {
@@ -7583,11 +7639,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
if (cur_class == &ext_sched_class) {
+ cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
ksyncs[cpu] = rq->scx.kick_sync;
should_wait = true;
- } else {
- cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
}
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
}
resched_curr(rq);
@@ -7642,27 +7698,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
}
- if (!should_wait)
- return;
-
- for_each_cpu(cpu, this_scx->cpus_to_wait) {
- unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
-
- /*
- * Busy-wait until the task running at the time of kicking is no
- * longer running. This can be used to implement e.g. core
- * scheduling.
- *
- * smp_cond_load_acquire() pairs with store_releases in
- * pick_task_scx() and put_prev_task_scx(). The former breaks
- * the wait if SCX's scheduling path is entered even if the same
- * task is picked subsequently. The latter is necessary to break
- * the wait when $cpu is taken by a higher sched class.
- */
- if (cpu != cpu_of(this_rq))
- smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
-
- cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+ /*
+ * Can't wait in hardirq context: our own kick_sync can't advance here,
+ * so CPUs waiting on each other would deadlock. Defer the wait to
+ * kick_sync_wait_bal_cb().
+ */
+ if (should_wait) {
+ raw_spin_rq_lock(this_rq);
+ this_scx->kick_sync_pending = true;
+ resched_curr(this_rq);
+ raw_spin_rq_unlock(this_rq);
}
}
@@ -7780,6 +7824,7 @@ void __init init_sched_ext_class(void)
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
INIT_LIST_HEAD(&rq->scx.deferred_reenq_users);
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 857d8e902b44..a61339c36902 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -549,7 +549,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
* piled up on it even if there is an idle core elsewhere on
* the system.
*/
- waker_node = cpu_to_node(cpu);
+ waker_node = scx_cpu_node_if_enabled(cpu);
if (!(current->flags & PF_EXITING) &&
cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
(!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5b93f6190d31..ae0783e27c1e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -806,6 +806,8 @@ struct scx_rq {
cpumask_var_t cpus_to_kick_if_idle;
cpumask_var_t cpus_to_preempt;
cpumask_var_t cpus_to_wait;
+ cpumask_var_t cpus_to_sync;
+ bool kick_sync_pending;
unsigned long kick_sync;
struct task_struct *sub_dispatch_prev;
@@ -815,6 +817,7 @@ struct scx_rq {
struct list_head deferred_reenq_locals; /* scheds requesting reenq of local DSQ */
struct list_head deferred_reenq_users; /* user DSQs requesting reenq */
struct balance_callback deferred_bal_cb;
+ struct balance_callback kick_sync_bal_cb;
struct irq_work deferred_irq_work;
struct irq_work kick_cpus_irq_work;
};
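
As a side note on the waiting scheme introduced above: the property kick_sync_wait_bal_cb() relies on is that a waiting CPU keeps bumping its own kick_sync while polling the target's, so two CPUs that end up waiting on each other both observe progress and neither spins forever. Below is a minimal userspace sketch of that idea, not part of this commit: it uses plain C11 atomics and pthreads, and names such as NR_FAKE_CPUS and waiter() are made up for the illustration.

/*
 * Illustrative userspace model only, not kernel code. Each thread
 * stands in for one CPU's kick_sync counter. A waiter snapshots its
 * target's counter and spins until it changes, bumping its own
 * counter on every iteration so that a waiter targeting *us* also
 * sees progress. Two threads waiting on each other therefore both
 * complete instead of deadlocking.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_FAKE_CPUS 2

static atomic_ulong kick_sync[NR_FAKE_CPUS];

static void *waiter(void *arg)
{
	int self = (int)(long)arg;
	int target = !self;		/* wait on the other "CPU" */
	/* snapshot of the target's counter, like ksyncs[cpu] */
	unsigned long snap = atomic_load(&kick_sync[target]);

	/* spin until the target advances, advancing ourselves as we go */
	while (atomic_load_explicit(&kick_sync[target],
				    memory_order_acquire) == snap)
		atomic_fetch_add_explicit(&kick_sync[self], 1,
					  memory_order_release);

	/*
	 * Model the kick_sync bump the pick path does once this CPU
	 * reschedules; it releases any waiter that snapshotted us
	 * while we were still spinning.
	 */
	atomic_fetch_add_explicit(&kick_sync[self], 1, memory_order_release);

	printf("cpu%d: target's kick_sync advanced, wait complete\n", self);
	return NULL;
}

int main(void)
{
	pthread_t t[NR_FAKE_CPUS];

	for (int i = 0; i < NR_FAKE_CPUS; i++)
		pthread_create(&t[i], NULL, waiter, (void *)(long)i);
	for (int i = 0; i < NR_FAKE_CPUS; i++)
		pthread_join(t[i], NULL);
	return 0;
}

The extra increment after the loop stands in for the kick_sync bump the pick path performs once the kicked CPU reschedules; that final bump is what releases any waiter that took its snapshot while this CPU was still spinning.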