Commit 415cb19

sched_ext: Fix SCX_KICK_WAIT deadlock by deferring wait to balance callback
SCX_KICK_WAIT busy-waits in kick_cpus_irq_workfn() using
smp_cond_load_acquire() until the target CPU's kick_sync advances. Because
the irq_work runs in hardirq context, the waiting CPU cannot reschedule and
its own kick_sync never advances. If multiple CPUs form a wait cycle, all
CPUs deadlock.

Replace the busy-wait in kick_cpus_irq_workfn() with resched_curr() to force
the CPU through do_pick_task_scx(), which queues a balance callback to
perform the wait. The balance callback drops the rq lock and enables IRQs
following the sched_core_balance() pattern, so the CPU can process IPIs
while waiting. The local CPU's kick_sync is advanced on entry to
do_pick_task_scx() and continuously during the wait, ensuring any CPU that
starts waiting for us sees the advancement and cannot form cyclic
dependencies.

Fixes: 90e5516 ("sched_ext: Implement SCX_KICK_WAIT")
Cc: [email protected] # v6.12+
Reported-by: Christian Loehle <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Tejun Heo <[email protected]>
Tested-by: Christian Loehle <[email protected]>
1 parent db08b19

2 files changed, 73 additions & 25 deletions
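As an illustration of the failure mode described in the commit message (not part of the commit itself), the wait cycle can be modeled in plain userspace C: two threads stand in for CPUs whose irq_work handlers each wait for the other's kick_sync to advance. Because a handler stuck in "hardirq" never advances its own counter, neither wait can complete. Everything below is an invented toy model, not kernel code.

/*
 * Toy model of the old SCX_KICK_WAIT behaviour (illustration only, not
 * kernel code).  Each thread plays a CPU stuck in kick_cpus_irq_workfn():
 * it spins on the other side's counter but, being in "hardirq", never
 * advances its own, so a mutual wait can never complete.
 *
 * Build with: cc -O2 -pthread kick_wait_deadlock.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static _Atomic unsigned long kick_sync[2];

static void *hardirq_waiter(void *arg)
{
        int other = !(int)(long)arg;
        unsigned long snap = atomic_load(&kick_sync[other]);

        /* models smp_cond_load_acquire(): spin until the target advances */
        while (atomic_load(&kick_sync[other]) == snap)
                ;       /* never advances its own counter -> cycle deadlocks */
        return NULL;
}

int main(void)
{
        pthread_t t[2];

        for (long i = 0; i < 2; i++)
                pthread_create(&t[i], NULL, hardirq_waiter, (void *)i);

        sleep(1);       /* watchdog: neither wait can make progress */
        printf("stuck: kick_sync[0]=%lu kick_sync[1]=%lu\n",
               atomic_load(&kick_sync[0]), atomic_load(&kick_sync[1]));
        return 0;       /* exiting main() tears down the stuck waiters */
}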

kernel/sched/ext.c

Lines changed: 70 additions & 25 deletions
@@ -2404,7 +2404,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 {
         struct scx_sched *sch = scx_root;
 
-        /* see kick_cpus_irq_workfn() */
+        /* see kick_sync_wait_bal_cb() */
         smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
 
         update_curr_scx(rq);
@@ -2447,6 +2447,48 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
                 switch_class(rq, next);
 }
 
+static void kick_sync_wait_bal_cb(struct rq *rq)
+{
+        struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs);
+        unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
+        bool waited;
+        s32 cpu;
+
+        /*
+         * Drop the rq lock and enable IRQs while waiting. IRQs must be
+         * enabled because a target CPU may be waiting for us to process an
+         * IPI (e.g. a TLB flush) while we wait for its kick_sync to advance.
+         *
+         * Also, keep advancing our own kick_sync so that new kick_sync waits
+         * targeting us, which can start after we drop the lock, cannot form
+         * cyclic dependencies.
+         */
+retry:
+        waited = false;
+        for_each_cpu(cpu, rq->scx.cpus_to_sync) {
+                /*
+                 * smp_load_acquire() pairs with smp_store_release() on
+                 * kick_sync updates on the target CPUs.
+                 */
+                if (cpu == cpu_of(rq) ||
+                    smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
+                        cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
+                        continue;
+                }
+
+                raw_spin_rq_unlock_irq(rq);
+                while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {
+                        smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+                        cpu_relax();
+                }
+                raw_spin_rq_lock_irq(rq);
+                waited = true;
+        }
+
+        if (waited)
+                goto retry;
+}
+
 static struct task_struct *first_local_task(struct rq *rq)
 {
         return list_first_entry_or_null(&rq->scx.local_dsq.list,
@@ -2460,7 +2502,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
         bool keep_prev;
         struct task_struct *p;
 
-        /* see kick_cpus_irq_workfn() */
+        /* see kick_sync_wait_bal_cb() */
         smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
 
         rq_modified_begin(rq, &ext_sched_class);
@@ -2470,6 +2512,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
         rq_repin_lock(rq, rf);
         maybe_queue_balance_callback(rq);
 
+        /*
+         * Defer to a balance callback which can drop rq lock and enable
+         * IRQs. Waiting directly in the pick path would deadlock against
+         * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them.
+         */
+        if (unlikely(rq->scx.kick_sync_pending)) {
+                rq->scx.kick_sync_pending = false;
+                queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb,
+                                       kick_sync_wait_bal_cb);
+        }
+
         /*
          * If any higher-priority sched class enqueued a runnable task on
          * this rq during balance_one(), abort and return RETRY_TASK, so
@@ -4713,6 +4766,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
                 if (!cpumask_empty(rq->scx.cpus_to_wait))
                         dump_line(&ns, "  cpus_to_wait : %*pb",
                                   cpumask_pr_args(rq->scx.cpus_to_wait));
+                if (!cpumask_empty(rq->scx.cpus_to_sync))
+                        dump_line(&ns, "  cpus_to_sync : %*pb",
+                                  cpumask_pr_args(rq->scx.cpus_to_sync));
 
                 used = seq_buf_used(&ns);
                 if (SCX_HAS_OP(sch, dump_cpu)) {
@@ -5610,11 +5666,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
 
         if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
                 if (cur_class == &ext_sched_class) {
+                        cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
                         ksyncs[cpu] = rq->scx.kick_sync;
                         should_wait = true;
-                } else {
-                        cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
                 }
+                cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
         }
 
         resched_curr(rq);
@@ -5669,27 +5725,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
                 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
         }
 
-        if (!should_wait)
-                return;
-
-        for_each_cpu(cpu, this_scx->cpus_to_wait) {
-                unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
-
-                /*
-                 * Busy-wait until the task running at the time of kicking is no
-                 * longer running. This can be used to implement e.g. core
-                 * scheduling.
-                 *
-                 * smp_cond_load_acquire() pairs with store_releases in
-                 * pick_task_scx() and put_prev_task_scx(). The former breaks
-                 * the wait if SCX's scheduling path is entered even if the same
-                 * task is picked subsequently. The latter is necessary to break
-                 * the wait when $cpu is taken by a higher sched class.
-                 */
-                if (cpu != cpu_of(this_rq))
-                        smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
-
-                cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+        /*
+         * Can't wait in hardirq context as kick_sync can't advance there,
+         * deadlocking if CPUs wait for each other. Defer to kick_sync_wait_bal_cb().
+         */
+        if (should_wait) {
+                raw_spin_rq_lock(this_rq);
+                this_scx->kick_sync_pending = true;
+                resched_curr(this_rq);
+                raw_spin_rq_unlock(this_rq);
         }
 }
 
@@ -5794,6 +5838,7 @@ void __init init_sched_ext_class(void)
                 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
                 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
                 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
+                BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
                 rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
                 rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
 
kernel/sched/sched.h

Lines changed: 3 additions & 0 deletions
@@ -805,9 +805,12 @@ struct scx_rq {
         cpumask_var_t           cpus_to_kick_if_idle;
         cpumask_var_t           cpus_to_preempt;
         cpumask_var_t           cpus_to_wait;
+        cpumask_var_t           cpus_to_sync;
+        bool                    kick_sync_pending;
         unsigned long           kick_sync;
         local_t                 reenq_local_deferred;
         struct balance_callback deferred_bal_cb;
+        struct balance_callback kick_sync_bal_cb;
         struct irq_work         deferred_irq_work;
         struct irq_work         kick_cpus_irq_work;
         struct scx_dispatch_q   bypass_dsq;
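Purely as an illustration of the scheme the diff implements (again a userspace toy model with invented names, not kernel code): if each waiter keeps advancing its own counter while it spins, as kick_sync_wait_bal_cb() does, and the counter keeps advancing on later scheduling passes, as do_pick_task_scx() and put_prev_task_scx() do, then mutual waits resolve instead of deadlocking.

/*
 * Toy model of the deferred kick_sync wait (illustration only, not kernel
 * code).  Compare with the hardirq model above: the in-loop increment is
 * what breaks the cycle.
 *
 * Build with: cc -O2 -pthread kick_wait_fixed.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long kick_sync[2];

static void *deferred_waiter(void *arg)
{
        int me = (int)(long)arg, other = !me;

        /* the "kick": snapshot the target's counter, like kick_one_cpu() */
        unsigned long snap = atomic_load(&kick_sync[other]);

        /* balance-callback style wait: keep advancing our own counter so
         * that anyone waiting on us keeps making progress */
        while (atomic_load(&kick_sync[other]) == snap)
                atomic_fetch_add(&kick_sync[me], 1);

        /* scheduling continues afterwards and the counter keeps advancing,
         * like the next pass through do_pick_task_scx() */
        atomic_fetch_add(&kick_sync[me], 1);

        printf("cpu %d: wait complete\n", me);
        return NULL;
}

int main(void)
{
        pthread_t t[2];

        for (long i = 0; i < 2; i++)
                pthread_create(&t[i], NULL, deferred_waiter, (void *)i);
        for (int i = 0; i < 2; i++)
                pthread_join(t[i], NULL);
        return 0;
}

In the kernel, the "keeps advancing afterwards" part is what resched_curr() on the kicked CPU in kick_one_cpu() provides: the target is forced through the scheduler and advances its kick_sync at least once after the kicker takes its snapshot.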
