Skip to content

Commit 53d85a2

Browse files
committed
Merge tag 'cgroup-for-7.0-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup fixes from Tejun Heo: - Fix cgroup rmdir racing with dying tasks. Deferred task cgroup unlink introduced a window where cgroup.procs is empty but the cgroup is still populated, causing rmdir to fail with -EBUSY and selftest failures. Make rmdir wait for dying tasks to fully leave and fix selftests to not depend on synchronous populated updates. - Fix cpuset v1 task migration failure from empty cpusets under strict security policies. When CPU hotplug removes the last CPU from a v1 cpuset, tasks must be migrated to an ancestor without a security_task_setscheduler() check that would block the migration. * tag 'cgroup-for-7.0-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: cgroup/cpuset: Skip security check for hotplug induced v1 task migration cgroup/cpuset: Simplify setsched decision check in task iteration loop of cpuset_can_attach() cgroup: Fix cgroup_drain_dying() testing the wrong condition selftests/cgroup: Don't require synchronous populated update on task exit cgroup: Wait for dying tasks to leave on rmdir
2 parents dbf00d8 + 089f3fc commit 53d85a2

7 files changed

Lines changed: 131 additions & 16 deletions

File tree

include/linux/cgroup-defs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,9 @@ struct cgroup {
609609
/* used to wait for offlining of csses */
610610
wait_queue_head_t offline_waitq;
611611

612+
/* used by cgroup_rmdir() to wait for dying tasks to leave */
613+
wait_queue_head_t dying_populated_waitq;
614+
612615
/* used to schedule release agent */
613616
struct work_struct release_agent_work;
614617

kernel/cgroup/cgroup.c

Lines changed: 85 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2126,6 +2126,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
21262126
#endif
21272127

21282128
init_waitqueue_head(&cgrp->offline_waitq);
2129+
init_waitqueue_head(&cgrp->dying_populated_waitq);
21292130
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
21302131
}
21312132

@@ -6224,6 +6225,78 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
62246225
return 0;
62256226
};
62266227

6228+
/**
6229+
* cgroup_drain_dying - wait for dying tasks to leave before rmdir
6230+
* @cgrp: the cgroup being removed
6231+
*
6232+
* cgroup.procs and cgroup.threads use css_task_iter which filters out
6233+
* PF_EXITING tasks so that userspace doesn't see tasks that have already been
6234+
* reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the
6235+
* cgroup has non-empty css_sets - is only updated when dying tasks pass through
6236+
* cgroup_task_dead() in finish_task_switch(). This creates a window where
6237+
* cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir
6238+
* fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no
6239+
* tasks.
6240+
*
6241+
* This function aligns cgroup_has_tasks() with what userspace can observe. If
6242+
* cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are
6243+
* PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the
6244+
* window between PF_EXITING and cgroup_task_dead() is short, the wait is brief.
6245+
*
6246+
* This function only concerns itself with this cgroup's own dying tasks.
6247+
* Whether the cgroup has children is cgroup_destroy_locked()'s problem.
6248+
*
6249+
* Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
6250+
* retry the full check from scratch.
6251+
*
6252+
* Must be called with cgroup_mutex held.
6253+
*/
6254+
static int cgroup_drain_dying(struct cgroup *cgrp)
6255+
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
6256+
{
6257+
struct css_task_iter it;
6258+
struct task_struct *task;
6259+
DEFINE_WAIT(wait);
6260+
6261+
lockdep_assert_held(&cgroup_mutex);
6262+
retry:
6263+
if (!cgroup_has_tasks(cgrp))
6264+
return 0;
6265+
6266+
/* Same iterator as cgroup.threads - if any task is visible, it's busy */
6267+
css_task_iter_start(&cgrp->self, 0, &it);
6268+
task = css_task_iter_next(&it);
6269+
css_task_iter_end(&it);
6270+
6271+
if (task)
6272+
return -EBUSY;
6273+
6274+
/*
6275+
* All remaining tasks are PF_EXITING and will pass through
6276+
* cgroup_task_dead() shortly. Wait for a kick and retry.
6277+
*
6278+
* cgroup_has_tasks() can't transition from false to true while we're
6279+
* holding cgroup_mutex, but the true to false transition happens
6280+
* under css_set_lock (via cgroup_task_dead()). We must retest and
6281+
* prepare_to_wait() under css_set_lock. Otherwise, the transition
6282+
* can happen between our first test and prepare_to_wait(), and we
6283+
* sleep with no one to wake us.
6284+
*/
6285+
spin_lock_irq(&css_set_lock);
6286+
if (!cgroup_has_tasks(cgrp)) {
6287+
spin_unlock_irq(&css_set_lock);
6288+
return 0;
6289+
}
6290+
prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
6291+
TASK_UNINTERRUPTIBLE);
6292+
spin_unlock_irq(&css_set_lock);
6293+
mutex_unlock(&cgroup_mutex);
6294+
schedule();
6295+
finish_wait(&cgrp->dying_populated_waitq, &wait);
6296+
mutex_lock(&cgroup_mutex);
6297+
goto retry;
6298+
}
6299+
62276300
int cgroup_rmdir(struct kernfs_node *kn)
62286301
{
62296302
struct cgroup *cgrp;
@@ -6233,9 +6306,12 @@ int cgroup_rmdir(struct kernfs_node *kn)
62336306
if (!cgrp)
62346307
return 0;
62356308

6236-
ret = cgroup_destroy_locked(cgrp);
6237-
if (!ret)
6238-
TRACE_CGROUP_PATH(rmdir, cgrp);
6309+
ret = cgroup_drain_dying(cgrp);
6310+
if (!ret) {
6311+
ret = cgroup_destroy_locked(cgrp);
6312+
if (!ret)
6313+
TRACE_CGROUP_PATH(rmdir, cgrp);
6314+
}
62396315

62406316
cgroup_kn_unlock(kn);
62416317
return ret;
@@ -6995,6 +7071,7 @@ void cgroup_task_exit(struct task_struct *tsk)
69957071

69967072
static void do_cgroup_task_dead(struct task_struct *tsk)
69977073
{
7074+
struct cgrp_cset_link *link;
69987075
struct css_set *cset;
69997076
unsigned long flags;
70007077

@@ -7008,6 +7085,11 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
70087085
if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
70097086
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
70107087

7088+
/* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
7089+
list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
7090+
if (waitqueue_active(&link->cgrp->dying_populated_waitq))
7091+
wake_up(&link->cgrp->dying_populated_waitq);
7092+
70117093
if (dl_task(tsk))
70127094
dec_dl_tasks_cs(tsk);
70137095

kernel/cgroup/cpuset.c

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2988,7 +2988,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
29882988
struct cgroup_subsys_state *css;
29892989
struct cpuset *cs, *oldcs;
29902990
struct task_struct *task;
2991-
bool cpus_updated, mems_updated;
2991+
bool setsched_check;
29922992
int ret;
29932993

29942994
/* used later by cpuset_attach() */
@@ -3003,20 +3003,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
30033003
if (ret)
30043004
goto out_unlock;
30053005

3006-
cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
3007-
mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
3006+
/*
3007+
* Skip rights over task setsched check in v2 when nothing changes,
3008+
* migration permission derives from hierarchy ownership in
3009+
* cgroup_procs_write_permission().
3010+
*/
3011+
setsched_check = !cpuset_v2() ||
3012+
!cpumask_equal(cs->effective_cpus, oldcs->effective_cpus) ||
3013+
!nodes_equal(cs->effective_mems, oldcs->effective_mems);
3014+
3015+
/*
3016+
* A v1 cpuset with tasks will have no CPU left only when CPU hotplug
3017+
* brings the last online CPU offline as users are not allowed to empty
3018+
* cpuset.cpus when there are active tasks inside. When that happens,
3019+
* we should allow tasks to migrate out without security check to make
3020+
* sure they will be able to run after migration.
3021+
*/
3022+
if (!is_in_v2_mode() && cpumask_empty(oldcs->effective_cpus))
3023+
setsched_check = false;
30083024

30093025
cgroup_taskset_for_each(task, css, tset) {
30103026
ret = task_can_attach(task);
30113027
if (ret)
30123028
goto out_unlock;
30133029

3014-
/*
3015-
* Skip rights over task check in v2 when nothing changes,
3016-
* migration permission derives from hierarchy ownership in
3017-
* cgroup_procs_write_permission()).
3018-
*/
3019-
if (!cpuset_v2() || (cpus_updated || mems_updated)) {
3030+
if (setsched_check) {
30203031
ret = security_task_setscheduler(task);
30213032
if (ret)
30223033
goto out_unlock;

tools/testing/selftests/cgroup/lib/cgroup_util.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,21 @@ int cg_read_strcmp(const char *cgroup, const char *control,
123123
return ret;
124124
}
125125

126+
int cg_read_strcmp_wait(const char *cgroup, const char *control,
127+
const char *expected)
128+
{
129+
int i, ret;
130+
131+
for (i = 0; i < 100; i++) {
132+
ret = cg_read_strcmp(cgroup, control, expected);
133+
if (!ret)
134+
return ret;
135+
usleep(10000);
136+
}
137+
138+
return ret;
139+
}
140+
126141
int cg_read_strstr(const char *cgroup, const char *control, const char *needle)
127142
{
128143
char buf[PAGE_SIZE];

tools/testing/selftests/cgroup/lib/include/cgroup_util.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ extern int cg_read(const char *cgroup, const char *control,
6161
char *buf, size_t len);
6262
extern int cg_read_strcmp(const char *cgroup, const char *control,
6363
const char *expected);
64+
extern int cg_read_strcmp_wait(const char *cgroup, const char *control,
65+
const char *expected);
6466
extern int cg_read_strstr(const char *cgroup, const char *control,
6567
const char *needle);
6668
extern long cg_read_long(const char *cgroup, const char *control);

tools/testing/selftests/cgroup/test_core.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,8 @@ static int test_cgcore_populated(const char *root)
233233
if (err)
234234
goto cleanup;
235235

236-
if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
236+
if (cg_read_strcmp_wait(cg_test_d, "cgroup.events",
237+
"populated 0\n"))
237238
goto cleanup;
238239

239240
/* Remove cgroup. */

tools/testing/selftests/cgroup/test_kill.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ static int test_cgkill_simple(const char *root)
8686
wait_for_pid(pids[i]);
8787

8888
if (ret == KSFT_PASS &&
89-
cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n"))
89+
cg_read_strcmp_wait(cgroup, "cgroup.events", "populated 0\n"))
9090
ret = KSFT_FAIL;
9191

9292
if (cgroup)
@@ -190,7 +190,8 @@ static int test_cgkill_tree(const char *root)
190190
wait_for_pid(pids[i]);
191191

192192
if (ret == KSFT_PASS &&
193-
cg_read_strcmp(cgroup[0], "cgroup.events", "populated 0\n"))
193+
cg_read_strcmp_wait(cgroup[0], "cgroup.events",
194+
"populated 0\n"))
194195
ret = KSFT_FAIL;
195196

196197
for (i = 9; i >= 0 && cgroup[i]; i--) {
@@ -251,7 +252,7 @@ static int test_cgkill_forkbomb(const char *root)
251252
wait_for_pid(pid);
252253

253254
if (ret == KSFT_PASS &&
254-
cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n"))
255+
cg_read_strcmp_wait(cgroup, "cgroup.events", "populated 0\n"))
255256
ret = KSFT_FAIL;
256257

257258
if (cgroup)

0 commit comments

Comments
 (0)