[v2,49/57] pid: Create pid_ns helpers

Submitted by Kirill Tkhai on March 28, 2017, 3:41 p.m.

Details

Message ID 149071568268.12770.2131105833018944846.stgit@localhost.localdomain
State New
Series "Nested pid namespaces support"
Headers show

Commit Message

Kirill Tkhai March 28, 2017, 3:41 p.m.
A task may set last_pid only for its active pid namespace,
so if the NSpid of a child contains more than one level, we
need external help to populate the whole pid hierarchy
(pid in parent pid_ns, pid in grandparent pid_ns, etc). Pid ns
helpers are used for that.

These are children of usernsd, which listen on a
socket and set the requested last pid in their active
pid_ns.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 criu/cr-restore.c         |    7 +
 criu/include/namespaces.h |    3 +
 criu/namespaces.c         |  236 +++++++++++++++++++++++++++++++++++++++++++++
 criu/ns-common.c          |   51 ++++++++++
 criu/pie/restorer.c       |    5 +
 5 files changed, 302 insertions(+)
 create mode 100644 criu/ns-common.c

Patch hide | download patch | download mbox

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index ab3d80a6..54814743 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -1461,6 +1461,8 @@  static int restore_task_with_children(void *_arg)
 			pr_err("Can't add fd to fdstore\n");
 			return -1;
 		}
+		if (create_pid_ns_helper(pid_ns) < 0)
+			goto err;
 	}
 
 	if (restore_task_mnt_ns(current))
@@ -1972,6 +1974,10 @@  static int restore_root_task(struct pstree_item *init)
 	if (ret < 0)
 		goto out_kill;
 
+	ret = destroy_pid_ns_helpers();
+	if (ret < 0)
+		goto out_kill;
+
 	ret = stop_usernsd();
 	if (ret < 0)
 		goto out_kill;
@@ -2071,6 +2077,7 @@  static int restore_root_task(struct pstree_item *init)
 	return 0;
 
 out_kill:
+	destroy_pid_ns_helpers();
 	/*
 	 * The processes can be killed only when all of them have been created,
 	 * otherwise an external proccesses can be killed.
diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
index 1cc47023..70e06386 100644
--- a/criu/include/namespaces.h
+++ b/criu/include/namespaces.h
@@ -267,5 +267,8 @@  static inline int pid_ns_root_off(void)
 	return 0;
 }
 extern int reserve_pid_ns_helpers(void);
+extern int create_pid_ns_helper(struct ns_id *ns);
+extern int destroy_pid_ns_helpers(void);
+extern int request_set_next_pid(int pid_ns_id, pid_t pid, int sk);
 
 #endif /* __CR_NS_H__ */
diff --git a/criu/namespaces.c b/criu/namespaces.c
index 1c19a631..cf7aa44b 100644
--- a/criu/namespaces.c
+++ b/criu/namespaces.c
@@ -15,6 +15,7 @@ 
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/ptrace.h>
+#include <sys/file.h>
 
 #include "page.h"
 #include "rst-malloc.h"
@@ -38,6 +39,11 @@ 
 #include "fdstore.h"
 #include "proc_parse.h"
 
+/*
+ * Glue for ns-common.c, which is included below and is written in terms
+ * of __sys()/__sys_err() so that it can be compiled in more than one
+ * environment. Here __sys(foo) expands to plain foo, and the error code
+ * of a failed call is reported as -errno.
+ */
+#define __sys(foo)	foo
+#define __sys_err(ret)	(-errno)
+
+#include "ns-common.c"
+
 static struct ns_desc *ns_desc_array[] = {
 	&net_ns_desc,
 	&uts_ns_desc,
@@ -49,6 +55,8 @@  static struct ns_desc *ns_desc_array[] = {
 };
 
 static unsigned int join_ns_flags;
+/* Creation of every helper is synchronized by userns_sync_lock */
+static int nr_pid_ns_helper_created = 0;
 
 int check_namespace_opts(void)
 {
@@ -2528,5 +2536,233 @@  int reserve_pid_ns_helpers(void)
 	return walk_namespaces(&pid_ns_desc, do_reserve_pid_ns_helpers, NULL);
 }
 
+/*
+ * Create the datagram UNIX socket a pid_ns helper listens on and bind
+ * it to the address produced by pid_ns_helper_socket_name() for ns->id.
+ *
+ * Returns the bound socket descriptor, or -1 on failure. On failure no
+ * descriptor is left open.
+ */
+static int pid_ns_helper_sock(struct ns_id *ns)
+{
+	struct sockaddr_un addr;
+	socklen_t len;
+	int sk;
+
+	sk = socket(AF_UNIX, SOCK_DGRAM, 0);
+	if (sk < 0) {
+		pr_perror("Can't create helper socket");
+		return -1;
+	}
+	pid_ns_helper_socket_name(&addr, &len, ns->id);
+
+	if (bind(sk, (struct sockaddr *)&addr, len) < 0) {
+		/* Report first: close() below may overwrite errno */
+		pr_perror("Can't bind pid_ns sock");
+		/* Don't leak the socket on the error path */
+		close(sk);
+		return -1;
+	}
+
+	return sk;
+}
+
+/*
+ * Main loop of a pid_ns helper.
+ *
+ * Each iteration receives one pid_t request on @sk, and, unless the
+ * requested pid is 0, passes it to __set_next_pid(). An int answer
+ * (0 on success, -1 if __set_next_pid() failed) is then sent back to
+ * the address the request came from. A request with pid 0 makes the
+ * helper answer and return 0.
+ *
+ * Returns 0 after a pid 0 request, -1 if receiving or sending fails.
+ * @ns is not used by the loop itself.
+ */
+static int pid_ns_helper(struct ns_id *ns, int sk)
+{
+	struct sockaddr_un peer;
+	struct iovec vec;
+	struct msghdr mh = {0};
+	pid_t req;
+
+	mh.msg_name = &peer;
+	mh.msg_iov = &vec;
+	mh.msg_iovlen = 1;
+
+	for (;;) {
+		int reply = 0;
+
+		/* recvmsg() updates msg_namelen, so rearm it every round */
+		mh.msg_namelen = sizeof(peer);
+		vec.iov_base = &req;
+		vec.iov_len = sizeof(req);
+
+		if (recvmsg(sk, &mh, 0) < 0) {
+			pr_perror("recv() failed to read pid");
+			return -1;
+		}
+
+		if (req != 0 && __set_next_pid(req) < 0) {
+			pr_err("Can't set next pid\n");
+			reply = -1;
+		}
+
+		/* Reuse the header: msg_name now holds the sender's address */
+		vec.iov_base = &reply;
+		vec.iov_len = sizeof(reply);
+		if (sendmsg(sk, &mh, 0) < 0) {
+			pr_perror("Can't send answer");
+			return -1;
+		}
+
+		if (req == 0)
+			return 0;
+	}
+}
+
+/*
+ * userns_call() handler which forks a pid_ns helper for the namespace
+ * passed in @arg (a pointer to a struct ns_id pointer).
+ *
+ * The caller's pid namespace for children is switched to the target one
+ * for the duration of the call, the helper socket is bound, the pids
+ * the helper must get in the ancestor namespaces are requested from the
+ * already running helpers, and then the helper is forked.
+ *
+ * Whatever the outcome, every descriptor opened here is closed and the
+ * original pid namespace is switched back before returning, so a failed
+ * request does not leave the calling process in a half-switched state.
+ *
+ * Returns 0 on success, -1 on failure. @unused_fd and @unused_pid are
+ * ignored.
+ */
+static int do_create_pid_ns_helper(void *arg, int unused_fd, pid_t unused_pid)
+{
+	int pid_ns_fd, mnt_ns_fd, fd, i, transport_fd;
+	int sk = -1, lock_fd = -1, ret = -1;
+	struct ns_id *ns, *tmp;
+	struct pid *pid;
+	pid_t child;
+
+	/* Remember the current pid_ns to get back to it at the end */
+	pid_ns_fd = open_proc(PROC_SELF, "ns/pid");
+	if (pid_ns_fd < 0) {
+		pr_perror("Can't open pid ns");
+		return -1;
+	}
+	ns = *(struct ns_id **)arg;
+
+	fd = fdstore_get(ns->pid.nsfd_id);
+	if (fd < 0) {
+		pr_err("Can't get pid_ns fd\n");
+		goto out_close_ns;
+	}
+	if (setns(fd, CLONE_NEWPID) < 0) {
+		pr_perror("Can't setns");
+		close(fd);
+		goto out_close_ns;
+	}
+	close(fd);
+
+	sk = pid_ns_helper_sock(ns);
+	if (sk < 0)
+		goto out;
+
+	pid = __pstree_pid_by_virt(ns, ns->ns_pid);
+	if (!pid) {
+		pr_err("Can't find helper reserved pid\n");
+		goto out;
+	}
+
+	/* The parent's helper is asked for a pid below, so it must exist */
+	tmp = ns->parent;
+	if (tmp) {
+		futex_t *f = &tmp->pid.helper_created;
+		futex_wait_while_eq(f, 0);
+	}
+
+	if (switch_ns(root_item->pid->real, &mnt_ns_desc, &mnt_ns_fd) < 0) {
+		pr_err("Can't set mnt_ns\n");
+		goto out;
+	}
+
+	lock_fd = open("/proc/" LAST_PID_PATH, O_RDONLY);
+	if (lock_fd < 0)
+		pr_perror("Can't open %s", LAST_PID_PATH);
+
+	/* Leave the temporary mnt_ns even if the open above has failed */
+	if (restore_ns(mnt_ns_fd, &mnt_ns_desc) < 0) {
+		pr_err("Can't restore ns\n");
+		goto out;
+	}
+	if (lock_fd < 0)
+		goto out;
+
+	if (flock(lock_fd, LOCK_EX)) {
+		pr_perror("Can't lock %s", LAST_PID_PATH);
+		goto out;
+	}
+
+	transport_fd = get_service_fd(TRANSPORT_FD_OFF);
+	/*
+	 * Starting not from pid->level - 1, as its helper has not been
+	 * created yet (we're creating it at the moment), and the true pid
+	 * for this level is set by the task, who does clone(CLONE_NEWPID)
+	 * (this task is sender of fd).
+	 */
+	for (i = pid->level - 2, tmp = ns->parent; i >= 0; i--, tmp = tmp->parent)
+		if (request_set_next_pid(tmp->id, pid->ns[i].virt, transport_fd)) {
+			pr_err("Can't set next pid using helper\n");
+			goto out_unlock;
+		}
+
+	child = fork();
+	if (child < 0) {
+		pr_perror("Can't fork");
+		goto out_unlock;
+	} else if (!child) {
+		close(lock_fd);
+		exit(pid_ns_helper(ns, sk));
+	}
+
+	futex_set_and_wake(&ns->pid.helper_created, 1);
+	nr_pid_ns_helper_created++;
+	ret = 0;
+
+out_unlock:
+	flock(lock_fd, LOCK_UN);
+out:
+	if (lock_fd >= 0)
+		close(lock_fd);
+	/* The helper owns its own copy of the socket after fork() */
+	if (sk >= 0)
+		close(sk);
+	if (setns(pid_ns_fd, CLONE_NEWPID) < 0) {
+		pr_perror("Restore ns");
+		ret = -1;
+	}
+out_close_ns:
+	close(pid_ns_fd);
+	return ret;
+}
+
+/*
+ * Task may set last_pid only for its active pid namespace,
+ * so if NSpid of a child contains more than one level, we
+ * need external help to populate the whole pid hierarchy
+ * (pid in parent pid_ns, pid in grandparent etc). Pid ns
+ * helpers are used for that.
+ *
+ * We need a task or tasks to be a parent of pid_ns helpers.
+ * To live in common hierarchy and to be a TASK_HELPER is not
+ * possible, because it introduces circular dependencies.
+ * The same is to be children of criu main task, because
+ * we already have dependencies between it and root_item
+ * (NO more dependencies!). So, we choose usernsd for that:
+ * it always exists and has a command interface.
+ *
+ * Must be called by the task whose pid is INIT_PID. It sets the
+ * next pid to ns->ns_pid and asks usernsd (via userns_call()) to
+ * create the helper for @ns.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int create_pid_ns_helper(struct ns_id *ns)
+{
+	BUG_ON(getpid() != INIT_PID);
+
+	if (__set_next_pid(ns->ns_pid) < 0) {
+		pr_err("Can't set next pid\n");
+		return -1;
+	}
+	if (userns_call(do_create_pid_ns_helper, 0, &ns, sizeof(ns), -1) < 0) {
+		pr_err("Can't create pid_ns helper\n");
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * userns_call() handler which stops all pid_ns helpers.
+ *
+ * A request with pid 0 is sent to the helper of every pid namespace in
+ * ns_ids, then one child is reaped per request that was answered
+ * successfully. A mismatch with the number of created helpers is only
+ * logged. The counter of created helpers is reset.
+ *
+ * Always returns 0; the arguments are not used.
+ */
+static int do_destroy_pid_ns_helper(void *arg, int fd, pid_t pid)
+{
+	int transport_sk, wstatus, left, stopped = 0;
+	struct ns_id *cur;
+
+	transport_sk = get_service_fd(TRANSPORT_FD_OFF);
+
+	for (cur = ns_ids; cur; cur = cur->next) {
+		if (cur->nd == &pid_ns_desc &&
+		    request_set_next_pid(cur->id, 0, transport_sk) == 0)
+			stopped++;
+	}
+
+	if (stopped != nr_pid_ns_helper_created)
+		pr_err("Not all pid_ns helpers killed\n");
+
+	/* Reap exactly as many children as helpers that agreed to exit */
+	for (left = stopped; left > 0; left--) {
+		if (waitpid(-1, &wstatus, 0) < 0)
+			pr_perror("Error during waiting pid_ns helper");
+	}
+
+	nr_pid_ns_helper_created = 0;
+	return 0;
+}
+
+/*
+ * Ask usernsd (via userns_call()) to stop and reap all pid_ns helpers.
+ * Does nothing when CLONE_NEWPID is not set in root_ns_mask.
+ *
+ * Returns 0 on success, -1 if the call to usernsd failed.
+ */
+int destroy_pid_ns_helpers(void)
+{
+	if (!(root_ns_mask & CLONE_NEWPID))
+		return 0;
+
+	if (userns_call(do_destroy_pid_ns_helper, 0, NULL, 0, -1) < 0) {
+		pr_err("Can't destroy pid_ns helpers\n");
+		return -1;
+	}
+	return 0;
+}
+
 struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
 struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
diff --git a/criu/ns-common.c b/criu/ns-common.c
new file mode 100644
index 00000000..a8e28aa0
--- /dev/null
+++ b/criu/ns-common.c
@@ -0,0 +1,51 @@ 
+#include <sys/socket.h>
+#include <sys/un.h>
+
+/*
+ * Fill @addr with the UNIX socket address of the pid_ns helper serving
+ * the namespace with the given @id, and store the address length to
+ * pass to bind()/sendto() in @len.
+ *
+ * sun_path is "<prefix><id in decimal>" with its first byte replaced by
+ * '\0' (a leading NUL selects the Linux abstract socket namespace, see
+ * unix(7)). The reported length always covers the prefix plus as many
+ * bytes as there are digits in INT_MAX, regardless of how many digits
+ * @id has; the unused tail of that field is zero-filled.
+ *
+ * NOTE(review): @id is unsigned but printed with %d; an id above
+ * INT_MAX would not fit the reserved field -- confirm ids stay in range.
+ */
+void pid_ns_helper_socket_name(struct sockaddr_un *addr, socklen_t *len, unsigned int id)
+{
+	/* The leading '0' is a placeholder, overwritten with '\0' below */
+	const char prefix[] = "0/criu-pid-ns-";
+	const char int_max[] = "2147483647";
+
+	/* Fixed-size name: header + prefix + widest possible number */
+	*len = sizeof(*addr) - sizeof(addr->sun_path) +
+	       sizeof(prefix) - 1 + sizeof(int_max) - 1;
+
+	addr->sun_family = AF_UNIX;
+
+	/* Zero the number field first so short ids are NUL-padded */
+	memset(addr->sun_path + sizeof(prefix) - 1, '\0', sizeof(int_max) - 1);
+#ifdef CR_NOGLIBC
+	std_sprintf(addr->sun_path, "%s%d", prefix, id);
+#else
+	sprintf(addr->sun_path, "%s%d", prefix, id);
+#endif
+	addr->sun_path[0] = '\0';
+}
+
+/*
+ * Send helper a request to set next pid and receive success.
+ *
+ * @pid_ns_id selects the helper: the request is sent to the address
+ * built by pid_ns_helper_socket_name() for that id.
+ * @pid is the pid_t sent as the request; it must not be -1.
+ * @sk is the socket used both to send the request and to receive the
+ * int answer.
+ *
+ * Returns 0 if the helper answered 0, -1 if sending or receiving
+ * failed, the answer was truncated, or the helper reported an error.
+ */
+int request_set_next_pid(int pid_ns_id, pid_t pid, int sk)
+{
+	struct sockaddr_un addr;
+	int answer, ret;
+	socklen_t len;
+
+	BUG_ON(pid == -1);
+
+	pid_ns_helper_socket_name(&addr, &len, pid_ns_id);
+	ret = __sys(sendto)(sk, &pid, sizeof(pid), 0, (struct sockaddr *)&addr, len);
+	if (ret	< 0) {
+		pr_err("Can't send request: err=%d\n", __sys_err(ret));
+		return -1;
+	}
+
+	ret = __sys(recvfrom)(sk, &answer, sizeof(answer), 0, NULL, NULL);
+	if (ret < 0) {
+		pr_err("Can't recv answer: err=%d\n", __sys_err(ret));
+		return -1;
+	}
+
+	/* A shorter datagram would leave answer (partly) uninitialized */
+	if (ret != (int)sizeof(answer)) {
+		pr_err("Short answer: %d bytes\n", ret);
+		return -1;
+	}
+
+	if (answer != 0) {
+		pr_err("Error answer\n");
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 53ecaa05..616141a0 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -47,6 +47,11 @@ 
 #include "restorer.h"
 #include "namespaces.h"
 
+/*
+ * Glue for ns-common.c, which is included below and is written in terms
+ * of __sys()/__sys_err(). Here __sys(foo) expands to sys_foo, and the
+ * value returned by that call is itself used as the error code.
+ */
+#define __sys(foo)	sys_##foo
+#define __sys_err(ret)	ret
+
+#include "../ns-common.c"
+
 #ifndef PR_SET_PDEATHSIG
 #define PR_SET_PDEATHSIG 1
 #endif