Skip to content

Commit 1b164b8

Browse files
committed
cgroup: Wait for dying tasks to leave on rmdir
a72f73c ("cgroup: Don't expose dead tasks in cgroup") hid PF_EXITING tasks from cgroup.procs so that systemd doesn't see tasks that have already been reaped via waitpid(). However, the populated counter (nr_populated_csets) is only decremented when the task later passes through cgroup_task_dead() in finish_task_switch(). This means cgroup.procs can appear empty while the cgroup is still populated, causing rmdir to fail with -EBUSY.

Fix this by making cgroup_rmdir() wait for dying tasks to fully leave. If the cgroup is populated but all remaining tasks have PF_EXITING set (the task iterator returns none due to the existing filter), wait for a kick from cgroup_task_dead() and retry. The wait is brief, as tasks are removed from the cgroup's css_set between the PF_EXITING assertion in do_exit() and cgroup_task_dead() in finish_task_switch().

v2: The cgroup_is_populated() true-to-false transition happens under css_set_lock, not cgroup_mutex, so retest under css_set_lock before sleeping to avoid missed wakeups (Sebastian).

Fixes: a72f73c ("cgroup: Don't expose dead tasks in cgroup")
Reported-by: kernel test robot <[email protected]>
Closes: https://lore.kernel.org/oe-lkp/[email protected]
Reported-by: Sebastian Andrzej Siewior <[email protected]>
Signed-off-by: Tejun Heo <[email protected]>
Reviewed-by: Sebastian Andrzej Siewior <[email protected]>
Cc: Bert Karwatzki <[email protected]>
Cc: Michal Koutny <[email protected]>
Cc: [email protected]
1 parent a72f73c commit 1b164b8

2 files changed

Lines changed: 86 additions & 3 deletions

File tree

include/linux/cgroup-defs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,9 @@ struct cgroup {
609609
/* used to wait for offlining of csses */
610610
wait_queue_head_t offline_waitq;
611611

612+
/* used by cgroup_rmdir() to wait for dying tasks to leave */
613+
wait_queue_head_t dying_populated_waitq;
614+
612615
/* used to schedule release agent */
613616
struct work_struct release_agent_work;
614617

kernel/cgroup/cgroup.c

Lines changed: 83 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2126,6 +2126,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
21262126
#endif
21272127

21282128
init_waitqueue_head(&cgrp->offline_waitq);
2129+
init_waitqueue_head(&cgrp->dying_populated_waitq);
21292130
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
21302131
}
21312132

@@ -6224,6 +6225,76 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
62246225
return 0;
62256226
};
62266227

6228+
/**
6229+
* cgroup_drain_dying - wait for dying tasks to leave before rmdir
6230+
* @cgrp: the cgroup being removed
6231+
*
6232+
* The PF_EXITING filter in css_task_iter_advance() hides exiting tasks from
6233+
* cgroup.procs so that userspace (e.g. systemd) doesn't see tasks that have
6234+
* already been reaped via waitpid(). However, the populated counter
6235+
* (nr_populated_csets) is only decremented when the task later passes through
6236+
* cgroup_task_dead() in finish_task_switch(). This creates a window where
6237+
* cgroup.procs appears empty but cgroup_is_populated() is still true, causing
6238+
* rmdir to fail with -EBUSY.
6239+
*
6240+
* This function bridges that gap. If the cgroup is populated but all remaining
6241+
* tasks have PF_EXITING set, we wait for cgroup_task_dead() to process them.
6242+
* Tasks are removed from the cgroup's css_set in cgroup_task_dead() called from
6243+
* finish_task_switch(). As the window between PF_EXITING and cgroup_task_dead()
6244+
* is short, the number of PF_EXITING tasks on the list is small and the wait
6245+
* is brief.
6246+
*
6247+
* Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
6248+
* retry the full check from scratch.
6249+
*
6250+
* Must be called with cgroup_mutex held.
6251+
*/
6252+
static int cgroup_drain_dying(struct cgroup *cgrp)
6253+
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
6254+
{
6255+
struct css_task_iter it;
6256+
struct task_struct *task;
6257+
DEFINE_WAIT(wait);
6258+
6259+
lockdep_assert_held(&cgroup_mutex);
6260+
retry:
6261+
if (!cgroup_is_populated(cgrp))
6262+
return 0;
6263+
6264+
/* Same iterator as cgroup.threads - if any task is visible, it's busy */
6265+
css_task_iter_start(&cgrp->self, 0, &it);
6266+
task = css_task_iter_next(&it);
6267+
css_task_iter_end(&it);
6268+
6269+
if (task)
6270+
return -EBUSY;
6271+
6272+
/*
6273+
* All remaining tasks are PF_EXITING and will pass through
6274+
* cgroup_task_dead() shortly. Wait for a kick and retry.
6275+
*
6276+
* cgroup_is_populated() can't transition from false to true while
6277+
* we're holding cgroup_mutex, but the true to false transition
6278+
* happens under css_set_lock (via cgroup_task_dead()). We must
6279+
* retest and prepare_to_wait() under css_set_lock. Otherwise, the
6280+
* transition can happen between our first test and
6281+
* prepare_to_wait(), and we sleep with no one to wake us.
6282+
*/
6283+
spin_lock_irq(&css_set_lock);
6284+
if (!cgroup_is_populated(cgrp)) {
6285+
spin_unlock_irq(&css_set_lock);
6286+
return 0;
6287+
}
6288+
prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
6289+
TASK_UNINTERRUPTIBLE);
6290+
spin_unlock_irq(&css_set_lock);
6291+
mutex_unlock(&cgroup_mutex);
6292+
schedule();
6293+
finish_wait(&cgrp->dying_populated_waitq, &wait);
6294+
mutex_lock(&cgroup_mutex);
6295+
goto retry;
6296+
}
6297+
62276298
int cgroup_rmdir(struct kernfs_node *kn)
62286299
{
62296300
struct cgroup *cgrp;
@@ -6233,9 +6304,12 @@ int cgroup_rmdir(struct kernfs_node *kn)
62336304
if (!cgrp)
62346305
return 0;
62356306

6236-
ret = cgroup_destroy_locked(cgrp);
6237-
if (!ret)
6238-
TRACE_CGROUP_PATH(rmdir, cgrp);
6307+
ret = cgroup_drain_dying(cgrp);
6308+
if (!ret) {
6309+
ret = cgroup_destroy_locked(cgrp);
6310+
if (!ret)
6311+
TRACE_CGROUP_PATH(rmdir, cgrp);
6312+
}
62396313

62406314
cgroup_kn_unlock(kn);
62416315
return ret;
@@ -6995,6 +7069,7 @@ void cgroup_task_exit(struct task_struct *tsk)
69957069

69967070
static void do_cgroup_task_dead(struct task_struct *tsk)
69977071
{
7072+
struct cgrp_cset_link *link;
69987073
struct css_set *cset;
69997074
unsigned long flags;
70007075

@@ -7008,6 +7083,11 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
70087083
if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
70097084
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
70107085

7086+
/* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
7087+
list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
7088+
if (waitqueue_active(&link->cgrp->dying_populated_waitq))
7089+
wake_up(&link->cgrp->dying_populated_waitq);
7090+
70117091
if (dl_task(tsk))
70127092
dec_dl_tasks_cs(tsk);
70137093

0 commit comments

Comments
 (0)