Skip to content

Commit 5c0f43e

Browse files
committed
Merge tag 'kernel-7.1-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull pid_namespace updates from Christian Brauner: - pid_namespace: make init creation more flexible Annotate ->child_reaper accesses with {READ,WRITE}_ONCE() to protect the unlocked readers from cpu/compiler reordering, and enforce that pid 1 in a pid namespace is always the first allocated pid (the set_tid path already required this). On top of that, allow opening pid_for_children before the pid namespace init has been created. This lets one process create the pid namespace and a different process create the init via setns(), which makes clone3(set_tid) usable in all cases evenly and is particularly useful to CRIU when restoring nested containers. A new selftest covers both the basic create-pidns-then-init flow and the cross-process variant, and a MAINTAINERS entry for the pid namespace code is added. - unrelated signal cleanup: update outdated comment for the removed freezable_schedule() * tag 'kernel-7.1-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: signal: update outdated comment for removed freezable_schedule() MAINTAINERS: add a pid namespace entry selftests: Add tests for creating pidns init via setns pid_namespace: allow opening pid_for_children before init was created pid: check init is created first after idr alloc pid_namespace: avoid optimization of accesses to ->child_reaper
2 parents 7c8a467 + 4c68d15 commit 5c0f43e

9 files changed

Lines changed: 269 additions & 24 deletions

File tree

MAINTAINERS

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18191,6 +18191,15 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git nand/next
1819118191
F: drivers/mtd/nand/
1819218192
F: include/linux/mtd/*nand*.h
1819318193

18194+
NAMESPACES:
18195+
M: Christian Brauner <[email protected]>
18196+
R: Pavel Tikhomirov <[email protected]>
18197+
18198+
S: Maintained
18199+
F: rust/kernel/pid_namespace.rs
18200+
F: kernel/pid_namespace.c
18201+
F: tools/testing/selftests/pid_namespace/
18202+
1819418203
NATIONAL INSTRUMENTS SERIAL DRIVER
1819518204
M: Chaitanya Vadrevu <[email protected]>
1819618205
@@ -20804,10 +20813,8 @@ M: Christian Brauner <[email protected]>
2080420813
2080520814
S: Maintained
2080620815
T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
20807-
F: rust/kernel/pid_namespace.rs
2080820816
F: samples/pidfd/
2080920817
F: tools/testing/selftests/clone3/
20810-
F: tools/testing/selftests/pid_namespace/
2081120818
F: tools/testing/selftests/pidfd/
2081220819
K: (?i)pidfd
2081320820
K: (?i)clone3

kernel/exit.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -608,7 +608,8 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
608608

609609
reaper = find_alive_thread(father);
610610
if (reaper) {
611-
pid_ns->child_reaper = reaper;
611+
ASSERT_EXCLUSIVE_WRITER(pid_ns->child_reaper);
612+
WRITE_ONCE(pid_ns->child_reaper, reaper);
612613
return reaper;
613614
}
614615

kernel/fork.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2469,7 +2469,10 @@ __latent_entropy struct task_struct *copy_process(
24692469
init_task_pid(p, PIDTYPE_SID, task_session(current));
24702470

24712471
if (is_child_reaper(pid)) {
2472-
ns_of_pid(pid)->child_reaper = p;
2472+
struct pid_namespace *ns = ns_of_pid(pid);
2473+
2474+
ASSERT_EXCLUSIVE_WRITER(ns->child_reaper);
2475+
WRITE_ONCE(ns->child_reaper, p);
24732476
p->signal->flags |= SIGNAL_UNKILLABLE;
24742477
}
24752478
p->signal->shared_pending.signal = delayed.signal;

kernel/pid.c

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ void free_pid(struct pid *pid)
128128
* is the reaper wake up the reaper. The reaper
129129
* may be sleeping in zap_pid_ns_processes().
130130
*/
131-
wake_up_process(ns->child_reaper);
131+
wake_up_process(READ_ONCE(ns->child_reaper));
132132
break;
133133
case PIDNS_ADDING:
134134
/* Handle a fork failure of the first process */
@@ -215,12 +215,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
215215
retval = -EINVAL;
216216
if (tid < 1 || tid >= pid_max[ns->level - i])
217217
goto out_abort;
218-
/*
219-
* Also fail if a PID != 1 is requested and
220-
* no PID 1 exists.
221-
*/
222-
if (tid != 1 && !tmp->child_reaper)
223-
goto out_abort;
224218
retval = -EPERM;
225219
if (!checkpoint_restore_ns_capable(tmp->user_ns))
226220
goto out_abort;
@@ -296,9 +290,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
296290

297291
pid->numbers[i].nr = nr;
298292
pid->numbers[i].ns = tmp;
299-
tmp = tmp->parent;
300293
i--;
301294
retried_preload = false;
295+
296+
/*
297+
* PID 1 (init) must be created first.
298+
*/
299+
if (!READ_ONCE(tmp->child_reaper) && nr != 1) {
300+
retval = -EINVAL;
301+
goto out_free;
302+
}
303+
304+
tmp = tmp->parent;
302305
}
303306

304307
/*

kernel/pid_namespace.c

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -369,15 +369,6 @@ static struct ns_common *pidns_for_children_get(struct task_struct *task)
369369
}
370370
task_unlock(task);
371371

372-
if (ns) {
373-
read_lock(&tasklist_lock);
374-
if (!ns->child_reaper) {
375-
put_pid_ns(ns);
376-
ns = NULL;
377-
}
378-
read_unlock(&tasklist_lock);
379-
}
380-
381372
return ns ? &ns->ns : NULL;
382373
}
383374

kernel/signal.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2818,8 +2818,9 @@ bool get_signal(struct ksignal *ksig)
28182818

28192819
/*
28202820
* Do this once, we can't return to user-mode if freezing() == T.
2821-
* do_signal_stop() and ptrace_stop() do freezable_schedule() and
2822-
* thus do not need another check after return.
2821+
* do_signal_stop() and ptrace_stop() set TASK_STOPPED/TASK_TRACED
2822+
* and the freezer handles those states via TASK_FROZEN, thus they
2823+
* do not need another check after return.
28232824
*/
28242825
try_to_freeze();
28252826

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pid_max
2+
pidns_init_via_setns
23
regression_enomem

tools/testing/selftests/pid_namespace/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# SPDX-License-Identifier: GPL-2.0
22
CFLAGS += -g $(KHDR_INCLUDES)
33

4-
TEST_GEN_PROGS = regression_enomem pid_max
4+
TEST_GEN_PROGS = regression_enomem pid_max pidns_init_via_setns
55

66
LOCAL_HDRS += $(selfdir)/pidfd/pidfd.h
77

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
#define _GNU_SOURCE
3+
#include <fcntl.h>
4+
#include <sched.h>
5+
#include <stdio.h>
6+
#include <sys/types.h>
7+
#include <unistd.h>
8+
9+
#include "kselftest_harness.h"
10+
#include "../pidfd/pidfd.h"
11+
12+
/*
13+
* Test that a process can become PID 1 (init) in a new PID namespace
14+
* created via unshare() and joined via setns().
15+
*
16+
* Flow:
17+
* 1. Parent creates a pipe for synchronization.
18+
* 2. Parent forks a child.
19+
* 3. Parent calls unshare(CLONE_NEWPID) to create a new PID namespace.
20+
* 4. Parent signals the child via the pipe.
21+
* 5. Child opens parent's /proc/<ppid>/ns/pid_for_children and calls
22+
* setns(fd, CLONE_NEWPID) to join the new namespace.
23+
* 6. Child forks a grandchild.
24+
* 7. Grandchild verifies getpid() == 1.
25+
*/
26+
TEST(pidns_init_via_setns)
{
	int pipe_fd[2];
	pid_t creator_pid;
	pid_t child;
	char buf;

	/* Unprivileged callers first get a user namespace of their own. */
	if (geteuid() != 0)
		ASSERT_EQ(0, unshare(CLONE_NEWUSER));

	/* The child looks this process up under /proc by this pid. */
	creator_pid = getpid();

	ASSERT_EQ(0, pipe(pipe_fd));

	child = fork();
	ASSERT_GE(child, 0);

	if (child == 0) {
		char ns_path[256];
		pid_t grandchild;
		int nsfd;

		/* The child only ever reads from the pipe. */
		close(pipe_fd[1]);

		/* Block until the parent reports that its unshare() is done. */
		ASSERT_EQ(1, read_nointr(pipe_fd[0], &buf, 1));
		close(pipe_fd[0]);

		/* Join the namespace the parent prepared for its children. */
		snprintf(ns_path, sizeof(ns_path),
			 "/proc/%d/ns/pid_for_children", creator_pid);
		nsfd = open(ns_path, O_RDONLY);
		ASSERT_GE(nsfd, 0);

		ASSERT_EQ(0, setns(nsfd, CLONE_NEWPID));
		close(nsfd);

		grandchild = fork();
		ASSERT_GE(grandchild, 0);

		/* The grandchild exits 0 only if it is PID 1 (init). */
		if (grandchild == 0)
			_exit(getpid() == 1 ? 0 : 1);

		ASSERT_EQ(0, wait_for_pid(grandchild));
		_exit(0);
	}

	/* The parent only ever writes to the pipe. */
	close(pipe_fd[0]);

	ASSERT_EQ(0, unshare(CLONE_NEWPID));

	/* One byte tells the child that the new PID namespace exists. */
	buf = 0;
	ASSERT_EQ(1, write_nointr(pipe_fd[1], &buf, 1));
	close(pipe_fd[1]);

	ASSERT_EQ(0, wait_for_pid(child));
}
86+
87+
/*
88+
* Similar to pidns_init_via_setns, but:
89+
* 1. Parent enters a new PID namespace right from the start to be able to
90+
* later freely use pid 1001 in it.
91+
* 2. After forking child, parent also calls unshare(CLONE_NEWUSER)
92+
* before unshare(CLONE_NEWPID) so that the old and new pid namespaces have
93+
* different user namespace owners.
94+
* 3. Child uses clone3() with set_tid={1, 1001} instead of fork() and
95+
* grandchild checks that it gets the desired pids.
96+
*
97+
* Flow:
98+
* 1. Test process creates a new PID namespace and forks a wrapper
99+
* (PID 1 in the outer namespace).
100+
* 2. Wrapper forks a child.
101+
* 3. Wrapper calls unshare(CLONE_NEWUSER) + unshare(CLONE_NEWPID)
102+
* to create an inner PID namespace.
103+
* 4. Wrapper signals the child via pipe.
104+
* 5. Child opens wrapper's /proc/<pid>/ns/pid_for_children and calls
105+
* setns(fd, CLONE_NEWPID) to join the inner namespace.
106+
* 6. Child calls clone3() with set_tid={1, 1001}.
107+
* 7. Grandchild verifies its NSpid ends with "1001 1".
108+
*/
109+
110+
/*
 * PIDs requested through clone3()'s set_tid. The grandchild compares the
 * last field of its NSpid line against set_tid[0] and the field before it
 * against set_tid[1].
 */
pid_t set_tid[] = {1, 1001};
111+
112+
static int pidns_init_via_setns_set_tid_grandchild(struct __test_metadata *_metadata)
113+
{
114+
char *line = NULL;
115+
size_t len = 0;
116+
int found = 0;
117+
FILE *gf;
118+
119+
gf = fopen("/proc/self/status", "r");
120+
ASSERT_NE(gf, NULL);
121+
122+
while (getline(&line, &len, gf) != -1) {
123+
if (strncmp(line, "NSpid:", 6) != 0)
124+
continue;
125+
126+
for (int i = 0; i < 2; i++) {
127+
char *last = strrchr(line, '\t');
128+
pid_t pid;
129+
130+
ASSERT_NE(last, NULL);
131+
ASSERT_EQ(sscanf(last, "%d", &pid), 1);
132+
ASSERT_EQ(pid, set_tid[i]);
133+
*last = '\0';
134+
}
135+
136+
found = true;
137+
break;
138+
}
139+
140+
free(line);
141+
fclose(gf);
142+
ASSERT_TRUE(found);
143+
return 0;
144+
}
145+
146+
static int pidns_init_via_setns_set_tid_child(struct __test_metadata *_metadata,
147+
pid_t parent_pid, int pipe_fd[2])
148+
{
149+
struct __clone_args args = {
150+
.exit_signal = SIGCHLD,
151+
.set_tid = ptr_to_u64(set_tid),
152+
.set_tid_size = 2,
153+
};
154+
pid_t grandchild;
155+
char path[256];
156+
char buf;
157+
int nsfd;
158+
159+
close(pipe_fd[1]);
160+
161+
ASSERT_EQ(1, read_nointr(pipe_fd[0], &buf, 1));
162+
close(pipe_fd[0]);
163+
164+
snprintf(path, sizeof(path),
165+
"/proc/%d/ns/pid_for_children", parent_pid);
166+
nsfd = open(path, O_RDONLY);
167+
ASSERT_GE(nsfd, 0);
168+
169+
ASSERT_EQ(0, setns(nsfd, CLONE_NEWPID));
170+
close(nsfd);
171+
172+
grandchild = sys_clone3(&args, sizeof(args));
173+
ASSERT_GE(grandchild, 0);
174+
175+
if (grandchild == 0)
176+
_exit(pidns_init_via_setns_set_tid_grandchild(_metadata));
177+
178+
ASSERT_EQ(0, wait_for_pid(grandchild));
179+
return 0;
180+
}
181+
182+
/*
 * Wrapper side of the set_tid test: fork the child, then
 * unshare(CLONE_NEWUSER) and unshare(CLONE_NEWPID), tell the child over the
 * pipe that this is done, and wait for it.
 *
 * Returns 0 once the child has been reaped successfully.
 */
static int pidns_init_via_setns_set_tid_wrapper(struct __test_metadata *_metadata)
{
	int pipe_fd[2];
	pid_t child, parent_pid;
	char buf;
	FILE *f;

	/*
	 * We are PID 1 inside the new namespace, but /proc is
	 * mounted from the host. Read our host-visible PID so
	 * the child can reach our pid_for_children via /proc.
	 */
	f = fopen("/proc/self/stat", "r");
	ASSERT_NE(f, NULL);
	ASSERT_EQ(fscanf(f, "%d", &parent_pid), 1);
	/*
	 * Nothing else is read from the stream, so close it now rather than
	 * letting the forked child inherit it.
	 */
	fclose(f);

	ASSERT_EQ(0, pipe(pipe_fd));

	child = fork();
	ASSERT_GE(child, 0);

	if (child == 0)
		_exit(pidns_init_via_setns_set_tid_child(_metadata, parent_pid, pipe_fd));

	/* The wrapper only ever writes to the pipe. */
	close(pipe_fd[0]);

	ASSERT_EQ(0, unshare(CLONE_NEWUSER));
	ASSERT_EQ(0, unshare(CLONE_NEWPID));

	/* One byte tells the child that both unshare() calls are done. */
	buf = 0;
	ASSERT_EQ(1, write_nointr(pipe_fd[1], &buf, 1));
	close(pipe_fd[1]);

	ASSERT_EQ(0, wait_for_pid(child));

	return 0;
}
219+
220+
TEST(pidns_init_via_setns_set_tid)
221+
{
222+
pid_t wrapper;
223+
224+
if (geteuid())
225+
SKIP(return, "This test needs root to run!");
226+
227+
ASSERT_EQ(0, unshare(CLONE_NEWPID));
228+
229+
wrapper = fork();
230+
ASSERT_GE(wrapper, 0);
231+
232+
if (wrapper == 0)
233+
_exit(pidns_init_via_setns_set_tid_wrapper(_metadata));
234+
235+
ASSERT_EQ(0, wait_for_pid(wrapper));
236+
}
237+
238+
TEST_HARNESS_MAIN

0 commit comments

Comments
 (0)