Commit 0eca95c

committed
sched_ext: Short-circuit sched_class operations on dead tasks
7900aa6 ("sched_ext: Fix cgroup exit ordering by moving sched_ext_free() to finish_task_switch()") moved sched_ext_free() to finish_task_switch() and renamed it to sched_ext_dead() to fix cgroup exit ordering issues. However, this created a race window where certain sched_class ops may be invoked on dead tasks, leading to failures - e.g. sched_setscheduler() may try to switch a task which has finished sched_ext_dead() back into SCX, triggering invalid SCX task state transitions.

Add task_dead_and_done(), which tests whether a task is TASK_DEAD and has completed its final context switch, and use it to short-circuit sched_class operations which may be called on dead tasks.

Fixes: 7900aa6 ("sched_ext: Fix cgroup exit ordering by moving sched_ext_free() to finish_task_switch()")
Reported-by: Andrea Righi <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Reviewed-by: Andrea Righi <[email protected]>
Signed-off-by: Tejun Heo <[email protected]>
1 parent c9894e6 commit 0eca95c

1 file changed: kernel/sched/ext.c
Lines changed: 48 additions & 0 deletions
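Before reading the diff, here is a minimal sketch of the guard pattern the patch applies at the top of the affected sched_class ops. The op name below is a hypothetical stand-in for illustration only; task_dead_and_done() is the helper this commit adds, and all of the guarded ops already run with the task's rq lock held.

/* Hypothetical sched_class op showing the guard this patch adds (sketch only). */
static void example_scx_op(struct rq *rq, struct task_struct *p)
{
	/* Every call site guarded by this patch holds @p's rq lock. */
	lockdep_assert_rq_held(rq);

	/*
	 * @p has set TASK_DEAD and completed its final context switch; only
	 * sched_ext_dead() remains, so skip any further SCX state changes.
	 */
	if (task_dead_and_done(p))
		return;

	/* ... normal SCX work on a live task ... */
}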
@@ -194,6 +194,7 @@ MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microsecond
 #include <trace/events/sched_ext.h>
 
 static void process_ddsp_deferred_locals(struct rq *rq);
+static bool task_dead_and_done(struct task_struct *p);
 static u32 reenq_local(struct rq *rq);
 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
 static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
@@ -2618,6 +2619,9 @@ static void set_cpus_allowed_scx(struct task_struct *p,
 
 	set_cpus_allowed_common(p, ac);
 
+	if (task_dead_and_done(p))
+		return;
+
 	/*
 	 * The effective cpumask is stored in @p->cpus_ptr which may temporarily
 	 * differ from the configured one in @p->cpus_mask. Always tell the bpf
@@ -3033,10 +3037,45 @@ void scx_cancel_fork(struct task_struct *p)
 	percpu_up_read(&scx_fork_rwsem);
 }
 
+/**
+ * task_dead_and_done - Is a task dead and done running?
+ * @p: target task
+ *
+ * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the
+ * task no longer exists from SCX's POV. However, certain sched_class ops may be
+ * invoked on these dead tasks leading to failures - e.g. sched_setscheduler()
+ * may try to switch a task which finished sched_ext_dead() back into SCX
+ * triggering invalid SCX task state transitions and worse.
+ *
+ * Once a task has finished the final switch, sched_ext_dead() is the only thing
+ * that needs to happen on the task. Use this test to short-circuit sched_class
+ * operations which may be called on dead tasks.
+ */
+static bool task_dead_and_done(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * In do_task_dead(), a dying task sets %TASK_DEAD with preemption
+	 * disabled and __schedule(). If @p has %TASK_DEAD set and off CPU, @p
+	 * won't ever run again.
+	 */
+	return unlikely(READ_ONCE(p->__state) == TASK_DEAD) &&
+		!task_on_cpu(rq, p);
+}
+
 void sched_ext_dead(struct task_struct *p)
 {
 	unsigned long flags;
 
+	/*
+	 * By the time control reaches here, @p has %TASK_DEAD set, switched out
+	 * for the last time and then dropped the rq lock - task_dead_and_done()
+	 * should be returning %true nullifying the straggling sched_class ops.
+	 * Remove from scx_tasks and exit @p.
+	 */
 	raw_spin_lock_irqsave(&scx_tasks_lock, flags);
 	list_del_init(&p->scx.tasks_node);
 	raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
@@ -3062,6 +3101,9 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
 
 	lockdep_assert_rq_held(task_rq(p));
 
+	if (task_dead_and_done(p))
+		return;
+
 	p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
 	if (SCX_HAS_OP(sch, set_weight))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
@@ -3076,6 +3118,9 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
 {
 	struct scx_sched *sch = scx_root;
 
+	if (task_dead_and_done(p))
+		return;
+
 	scx_enable_task(p);
 
 	/*
@@ -3089,6 +3134,9 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
 
 static void switched_from_scx(struct rq *rq, struct task_struct *p)
 {
+	if (task_dead_and_done(p))
+		return;
+
 	scx_disable_task(p);
 }
 
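For reference, the stability of the check rests on the dying-task ordering. The following is a simplified, non-verbatim sketch of that sequence (the real code lives in do_task_dead() and finish_task_switch() in kernel/sched/core.c and differs in detail):

/* Dying task side, sketch of do_task_dead() (preemption already disabled): */
	set_special_state(TASK_DEAD);		/* p->__state becomes TASK_DEAD */
	__schedule(/* ... */);			/* final switch away; never returns */

/* Next task side, sketch of finish_task_switch() after commit 7900aa6: */
	smp_store_release(&prev->on_cpu, 0);	/* prev is now off CPU */
	if (READ_ONCE(prev->__state) == TASK_DEAD)
		sched_ext_dead(prev);		/* the only remaining step for prev */

Once a task is observed under its rq lock with TASK_DEAD set and off CPU, it can never be picked to run again, so a true result from task_dead_and_done() cannot be invalidated later.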