[v2,27/36] ns: Generate user_ns tree

Submitted by Kirill Tkhai on Feb. 3, 2017, 4:15 p.m.

Details

Message ID 148613853601.3612.17874616003950721388.stgit@localhost.localdomain
State New
Series "Nested user namespaces support"
Headers show

Commit Message

Kirill Tkhai Feb. 3, 2017, 4:15 p.m.
Create the user namespace hierarchy from the criu main task.
Open the namespaces' fds in it, so that they are visible to
everybody as /proc/[criu pid]/fd/[ns_fd].

Why we do it this way:
1) User namespaces are not correlated with the task
hierarchy. A parent task may live in a user namespace
nested deeper than that of its child task. So, we
can't restore the user namespaces just by
passing CLONE_NEWUSER in fork_with_pid().

2) We create namespaces from the criu main task to store
the open namespaces' fds. If we used root_item instead,
all open files would be cloned to the children, and the children
would have to close the unnecessary file descriptors, which
is just a waste of time.

3) CLONE_FS tasks will require that user_ns is set at the
moment of clone(), so we have to restore the target user_ns
in the vicinity of create_children_and_session().

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 criu/cr-restore.c         |    3 +
 criu/include/namespaces.h |    1 
 criu/namespaces.c         |  153 +++++++++++++++++++++++++++++++++++++++++++++
 criu/pstree.c             |    6 +-
 4 files changed, 161 insertions(+), 2 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index ab05ebfd1..07a966154 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -1818,7 +1818,8 @@  static int restore_root_task(struct pstree_item *init)
 	 * uid_map and gid_map must be filled from a parent user namespace.
 	 * prepare_userns_creds() must be called after filling mappings.
 	 */
-	if ((root_ns_mask & CLONE_NEWUSER) && prepare_userns(init->pid->real, userns_entry))
+	if ((root_ns_mask & CLONE_NEWUSER) &&
+	    (prepare_userns(init->pid->real, userns_entry) < 0 || create_ns_hierarhy() < 0))
 		goto out_kill;
 
 	pr_info("Wait until namespaces are created\n");
diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
index 546de7c5d..bf8b90eba 100644
--- a/criu/include/namespaces.h
+++ b/criu/include/namespaces.h
@@ -168,6 +168,7 @@  extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd);
 
 extern int collect_user_namespaces(bool for_dump);
 extern int prepare_userns(pid_t real_pid, UsernsEntry *e);
+extern int create_ns_hierarhy(void);
 extern int stop_usernsd(void);
 
 extern uid_t userns_uid(uid_t uid);
diff --git a/criu/namespaces.c b/criu/namespaces.c
index 6151219d8..fd390c938 100644
--- a/criu/namespaces.c
+++ b/criu/namespaces.c
@@ -30,6 +30,7 @@ 
 #include "protobuf.h"
 #include "util.h"
 #include "images/ns.pb-c.h"
+#include "common/scm.h"
 
 static struct ns_desc *ns_desc_array[] = {
 	&net_ns_desc,
@@ -2151,5 +2152,157 @@  int prepare_namespace_before_tasks(void)
 	return -1;
 }
 
+enum {
+	NS__CREATED = 1,
+	NS__MAPS_POPULATED,
+	NS__RESTORED,
+	NS__EXIT_HELPER,
+	NS__ERROR,
+};
+
+struct ns_arg {
+	struct ns_id *me;
+	futex_t *futex;
+	pid_t pid;
+};
+
+static int create_user_ns_hierarhy_fn(void *in_arg)
+{
+	char stack[128] __stack_aligned__;
+	struct ns_arg arg, *p_arg = in_arg;
+	futex_t *p_futex, *futex = NULL;
+	int status, fd, ret = -1;
+	struct ns_id *me, *child;
+	pid_t pid = -1;
+
+	p_futex = p_arg->futex;
+	me = p_arg->me;
+
+	if (p_futex) {
+		/* Temporary set ns owner to me to allow parent restore user_ns maps */
+		me->owner.pid = get_self_real_pid();
+		if (me->owner.pid < 0) {
+			pr_err("Can't self pid\n");
+			goto out;
+		}
+		futex_set_and_wake(p_futex, NS__CREATED);
+
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			pr_err("Can't get self user ns");
+			goto out;
+		}
+		/*
+		 * As we are cloned with CLONE_FILES,
+		 * parent task will see this fd too.
+		 */
+		me->owner.fd = fd;
+
+		futex_wait_while_lt(p_futex, NS__MAPS_POPULATED);
+		if (prepare_userns_creds()) {
+			pr_err("Can't prepare creds\n");
+			goto out;
+		}
+	}
+
+	futex = mmap(NULL, sizeof(*futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (futex == MAP_FAILED) {
+		pr_perror("Failed to mmap futex");
+		goto out;
+	}
+	arg.futex = futex;
+	arg.pid = p_arg->pid;
+
+	list_for_each_entry(child, &me->children, siblings) {
+		arg.me = child;
+		futex_init(futex);
+
+		pid = clone(create_user_ns_hierarhy_fn, stack + 127, CLONE_NEWUSER | CLONE_FILES | SIGCHLD, &arg);
+		if (pid < 0) {
+			pr_perror("Can't clone");
+			goto out;
+		}
+		futex_wait_while_lt(futex, NS__CREATED);
+		/* Get child real pid */
+		pid = child->owner.pid;
+		if (prepare_userns(pid, child->user.e) < 0) {
+			pr_err("Can't prepare child user_ns\n");
+			goto out;
+		}
+		/* Set ns owner to criu's virt pid */
+		child->owner.pid = p_arg->pid;
+		futex_set_and_wake(futex, NS__MAPS_POPULATED);
+
+		errno = 0;
+		if (wait(&status) < 0 || WEXITSTATUS(status)) {
+			pr_perror("Child process waiting: %d\n", WEXITSTATUS(status));
+			goto out;
+		}
+	}
+
+	ret = 0;
+out:
+	if (p_futex)
+		futex_set_and_wake(p_futex, ret ? NS__ERROR : NS__RESTORED);
+	if (futex)
+		munmap(futex, sizeof(*futex));
+	return ret ? 1 : 0;
+}
+
+static int do_create_ns_hierarhy(void *ppid)
+{
+	struct ns_arg arg;
+	char buf[128];
+	int fd;
+
+	arg.me = root_user_ns;
+	arg.futex = NULL;
+	arg.pid = (pid_t)(long)ppid;
+
+	fd = get_service_fd(CR_PROC_FD_OFF);
+	if (fd < 0)
+		exit(4);
+
+	snprintf(buf, sizeof(buf), "%d/ns/user", root_item->pid->real);
+	fd = openat(fd, buf, O_RDONLY);
+	if (fd < 0) {
+		pr_perror("Can't open %s", buf);
+		exit(5);
+	}
+	if (setns(fd, CLONE_NEWUSER) < 0) {
+		pr_perror("Can't setns()");
+		exit(6);
+	}
+	if (prepare_userns_creds() < 0) {
+		pr_err("Can't prepare creds\n");
+		exit(7);
+	}
+	exit(create_user_ns_hierarhy_fn(&arg));
+}
+
+int create_ns_hierarhy(void)
+{
+	char stack[128] __stack_aligned__;
+	int status;
+	pid_t pid;
+
+	if (!(root_ns_mask & CLONE_NEWUSER))
+		return 0;
+
+	pid = clone(do_create_ns_hierarhy, stack + 127, CLONE_FILES | SIGCHLD, (void *)(long)getpid());
+	if (pid < 0) {
+		pr_perror("Can't clone()");
+		return -1;
+	}
+
+	errno = 0;
+	if (waitpid(pid, &status, 0) < 0 || WEXITSTATUS(status)) {
+		pr_err("Can't create ns hierarhy: errno=%d, status=%d\n",
+			errno, WEXITSTATUS(status));
+		return -1;
+	}
+	return 0;
+}
+
 struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
 struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
diff --git a/criu/pstree.c b/criu/pstree.c
index 1ba762b80..d2d7339bc 100644
--- a/criu/pstree.c
+++ b/criu/pstree.c
@@ -873,8 +873,12 @@  static int prepare_pstree_kobj_ids(void)
 			 * be born in a fresh new mount namespace
 			 * which will be populated with all other
 			 * namespaces' entries.
+			 *
+			 * User namespaces are created in create_ns_hierarhy()
+			 * before the tasks, as their hierarchy does not correlate
+			 * with the task hierarchy in any way.
 			 */
-			rsti(item)->clone_flags &= ~CLONE_NEWNS;
+			rsti(item)->clone_flags &= ~(CLONE_NEWNS | CLONE_NEWUSER);
 
 		cflags &= CLONE_ALLNS;
 

Comments

Andrey Vagin Feb. 3, 2017, 8:43 p.m.
On Fri, Feb 03, 2017 at 07:15:36PM +0300, Kirill Tkhai wrote:
> Create user namespaces hierarhy from criu main task.
> Open ns'es fds in, so they are seen for everybody as
> /proc/[criu pid]/fd/[ns_fd].
> 
> Why we do it this way.
> 1)User namespaces are not correlated with task
> hierarhy. Parent task may have a user namespace
> of a level bigger, that a child task. So, we
> can't restore the user namespaces just by
> passing CLONE_NEWUSER in fork_with_pid().
> 
> 2)We create namespaces from criu main task to store
> open namespaces'es fds. If we used root_item instead,
> all open files would clone to children, and children
> would have close unnecessary file descriptors, which
> is just a time wasting.
> 
> 3)CLONE_FS tasks will require user_ns is set at the
> moment of clone(), so we have to restore target user_ns
> in locality of create_children_and_session().
> 
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> ---
>  criu/cr-restore.c         |    3 +
>  criu/include/namespaces.h |    1 
>  criu/namespaces.c         |  153 +++++++++++++++++++++++++++++++++++++++++++++
>  criu/pstree.c             |    6 +-
>  4 files changed, 161 insertions(+), 2 deletions(-)
> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index ab05ebfd1..07a966154 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -1818,7 +1818,8 @@ static int restore_root_task(struct pstree_item *init)
>  	 * uid_map and gid_map must be filled from a parent user namespace.
>  	 * prepare_userns_creds() must be called after filling mappings.
>  	 */
> -	if ((root_ns_mask & CLONE_NEWUSER) && prepare_userns(init->pid->real, userns_entry))
> +	if ((root_ns_mask & CLONE_NEWUSER) &&
> +	    (prepare_userns(init->pid->real, userns_entry) < 0 || create_ns_hierarhy() < 0))
>  		goto out_kill;
>  
>  	pr_info("Wait until namespaces are created\n");
> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
> index 546de7c5d..bf8b90eba 100644
> --- a/criu/include/namespaces.h
> +++ b/criu/include/namespaces.h
> @@ -168,6 +168,7 @@ extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd);
>  
>  extern int collect_user_namespaces(bool for_dump);
>  extern int prepare_userns(pid_t real_pid, UsernsEntry *e);
> +extern int create_ns_hierarhy(void);
>  extern int stop_usernsd(void);
>  
>  extern uid_t userns_uid(uid_t uid);
> diff --git a/criu/namespaces.c b/criu/namespaces.c
> index 6151219d8..fd390c938 100644
> --- a/criu/namespaces.c
> +++ b/criu/namespaces.c
> @@ -30,6 +30,7 @@
>  #include "protobuf.h"
>  #include "util.h"
>  #include "images/ns.pb-c.h"
> +#include "common/scm.h"
>  
>  static struct ns_desc *ns_desc_array[] = {
>  	&net_ns_desc,
> @@ -2151,5 +2152,157 @@ int prepare_namespace_before_tasks(void)
>  	return -1;
>  }
>  
> +enum {
> +	NS__CREATED = 1,
> +	NS__MAPS_POPULATED,
> +	NS__RESTORED,
> +	NS__EXIT_HELPER,
> +	NS__ERROR,
> +};
> +
> +struct ns_arg {
> +	struct ns_id *me;
> +	futex_t *futex;
> +	pid_t pid;
> +};
> +
> +static int create_user_ns_hierarhy_fn(void *in_arg)
> +{
> +	char stack[128] __stack_aligned__;
> +	struct ns_arg arg, *p_arg = in_arg;
> +	futex_t *p_futex, *futex = NULL;
> +	int status, fd, ret = -1;
> +	struct ns_id *me, *child;
> +	pid_t pid = -1;
> +
> +	p_futex = p_arg->futex;
> +	me = p_arg->me;
> +
> +	if (p_futex) {
> +		/* Temporary set ns owner to me to allow parent restore user_ns maps */
> +		me->owner.pid = get_self_real_pid();
> +		if (me->owner.pid < 0) {
> +			pr_err("Can't self pid\n");
> +			goto out;
> +		}
> +		futex_set_and_wake(p_futex, NS__CREATED);
> +
> +		fd = open("/proc/self/ns/user", O_RDONLY);
> +		if (fd < 0) {
> +			pr_err("Can't get self user ns");
> +			goto out;
> +		}
> +		/*
> +		 * As we are cloned with CLONE_FILES,
> +		 * parent task will see this fd too.
> +		 */
> +		me->owner.fd = fd;
> +
> +		futex_wait_while_lt(p_futex, NS__MAPS_POPULATED);
> +		if (prepare_userns_creds()) {
> +			pr_err("Can't prepare creds\n");
> +			goto out;
> +		}
> +	}
> +
> +	futex = mmap(NULL, sizeof(*futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
> +	if (futex == MAP_FAILED) {
> +		pr_perror("Failed to mmap futex");
> +		goto out;
> +	}
> +	arg.futex = futex;
> +	arg.pid = p_arg->pid;
> +
> +	list_for_each_entry(child, &me->children, siblings) {
> +		arg.me = child;
> +		futex_init(futex);
> +
> +		pid = clone(create_user_ns_hierarhy_fn, stack + 127, CLONE_NEWUSER | CLONE_FILES | SIGCHLD, &arg);

stack has to be aligned. I think stack + 128 should be used. You have to
guarantee that arg will be placed after stack; please take a look at
"struct cr_clone_arg". I think we need to do something similar here.

> +		if (pid < 0) {
> +			pr_perror("Can't clone");
> +			goto out;
> +		}
> +		futex_wait_while_lt(futex, NS__CREATED);
> +		/* Get child real pid */
> +		pid = child->owner.pid;
> +		if (prepare_userns(pid, child->user.e) < 0) {
> +			pr_err("Can't prepare child user_ns\n");
> +			goto out;
> +		}
> +		/* Set ns owner to criu's virt pid */
> +		child->owner.pid = p_arg->pid;
> +		futex_set_and_wake(futex, NS__MAPS_POPULATED);
> +
> +		errno = 0;
> +		if (wait(&status) < 0 || WEXITSTATUS(status)) {

If a process was killed, WEXITSTATUS(status) will be 0.

		status = -1;
		if (waitpid(pid, &status, 0) < 0 || status) {

> +			pr_perror("Child process waiting: %d\n", WEXITSTATUS(status));
> +			goto out;
> +		}
> +	}
> +
> +	ret = 0;
> +out:
> +	if (p_futex)
> +		futex_set_and_wake(p_futex, ret ? NS__ERROR : NS__RESTORED);
> +	if (futex)
> +		munmap(futex, sizeof(*futex));
> +	return ret ? 1 : 0;
> +}
> +
> +static int do_create_ns_hierarhy(void *ppid)
> +{
> +	struct ns_arg arg;
> +	char buf[128];
> +	int fd;
> +
> +	arg.me = root_user_ns;
> +	arg.futex = NULL;
> +	arg.pid = (pid_t)(long)ppid;
> +
> +	fd = get_service_fd(CR_PROC_FD_OFF);
> +	if (fd < 0)
> +		exit(4);
> +
> +	snprintf(buf, sizeof(buf), "%d/ns/user", root_item->pid->real);
> +	fd = openat(fd, buf, O_RDONLY);
> +	if (fd < 0) {
> +		pr_perror("Can't open %s", buf);
> +		exit(5);
> +	}
> +	if (setns(fd, CLONE_NEWUSER) < 0) {
> +		pr_perror("Can't setns()");
> +		exit(6);
> +	}
> +	if (prepare_userns_creds() < 0) {
> +		pr_err("Can't prepare creds\n");
> +		exit(7);
> +	}
> +	exit(create_user_ns_hierarhy_fn(&arg));
> +}
> +
> +int create_ns_hierarhy(void)
> +{
> +	char stack[128] __stack_aligned__;
> +	int status;
> +	pid_t pid;
> +
> +	if (!(root_ns_mask & CLONE_NEWUSER))
> +		return 0;
> +
> +	pid = clone(do_create_ns_hierarhy, stack + 127, CLONE_FILES | SIGCHLD, (void *)(long)getpid());
> +	if (pid < 0) {
> +		pr_perror("Can't clone()");
> +		return -1;
> +	}
> +
> +	errno = 0;
> +	if (waitpid(pid, &status, 0) < 0 || WEXITSTATUS(status)) {
> +		pr_err("Can't create ns hierarhy: errno=%d, status=%d\n",
> +			errno, WEXITSTATUS(status));
> +		return -1;
> +	}
> +	return 0;
> +}
> +
>  struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
>  struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
> diff --git a/criu/pstree.c b/criu/pstree.c
> index 1ba762b80..d2d7339bc 100644
> --- a/criu/pstree.c
> +++ b/criu/pstree.c
> @@ -873,8 +873,12 @@ static int prepare_pstree_kobj_ids(void)
>  			 * be born in a fresh new mount namespace
>  			 * which will be populated with all other
>  			 * namespaces' entries.
> +			 *
> +			 * User namespaces are created in create_ns_hierarhy()
> +			 * before the tasks, as their hierarhy does not correlated
> +			 * with tasks hierarhy in any way.
>  			 */
> -			rsti(item)->clone_flags &= ~CLONE_NEWNS;
> +			rsti(item)->clone_flags &= ~(CLONE_NEWNS | CLONE_NEWUSER);
>  
>  		cflags &= CLONE_ALLNS;
>  
> 
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
Kirill Tkhai Feb. 6, 2017, 9:13 a.m.
On 03.02.2017 23:43, Andrei Vagin wrote:
> On Fri, Feb 03, 2017 at 07:15:36PM +0300, Kirill Tkhai wrote:
>> Create user namespaces hierarhy from criu main task.
>> Open ns'es fds in, so they are seen for everybody as
>> /proc/[criu pid]/fd/[ns_fd].
>>
>> Why we do it this way.
>> 1)User namespaces are not correlated with task
>> hierarhy. Parent task may have a user namespace
>> of a level bigger, that a child task. So, we
>> can't restore the user namespaces just by
>> passing CLONE_NEWUSER in fork_with_pid().
>>
>> 2)We create namespaces from criu main task to store
>> open namespaces'es fds. If we used root_item instead,
>> all open files would clone to children, and children
>> would have close unnecessary file descriptors, which
>> is just a time wasting.
>>
>> 3)CLONE_FS tasks will require user_ns is set at the
>> moment of clone(), so we have to restore target user_ns
>> in locality of create_children_and_session().
>>
>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>> ---
>>  criu/cr-restore.c         |    3 +
>>  criu/include/namespaces.h |    1 
>>  criu/namespaces.c         |  153 +++++++++++++++++++++++++++++++++++++++++++++
>>  criu/pstree.c             |    6 +-
>>  4 files changed, 161 insertions(+), 2 deletions(-)
>>
>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>> index ab05ebfd1..07a966154 100644
>> --- a/criu/cr-restore.c
>> +++ b/criu/cr-restore.c
>> @@ -1818,7 +1818,8 @@ static int restore_root_task(struct pstree_item *init)
>>  	 * uid_map and gid_map must be filled from a parent user namespace.
>>  	 * prepare_userns_creds() must be called after filling mappings.
>>  	 */
>> -	if ((root_ns_mask & CLONE_NEWUSER) && prepare_userns(init->pid->real, userns_entry))
>> +	if ((root_ns_mask & CLONE_NEWUSER) &&
>> +	    (prepare_userns(init->pid->real, userns_entry) < 0 || create_ns_hierarhy() < 0))
>>  		goto out_kill;
>>  
>>  	pr_info("Wait until namespaces are created\n");
>> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
>> index 546de7c5d..bf8b90eba 100644
>> --- a/criu/include/namespaces.h
>> +++ b/criu/include/namespaces.h
>> @@ -168,6 +168,7 @@ extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd);
>>  
>>  extern int collect_user_namespaces(bool for_dump);
>>  extern int prepare_userns(pid_t real_pid, UsernsEntry *e);
>> +extern int create_ns_hierarhy(void);
>>  extern int stop_usernsd(void);
>>  
>>  extern uid_t userns_uid(uid_t uid);
>> diff --git a/criu/namespaces.c b/criu/namespaces.c
>> index 6151219d8..fd390c938 100644
>> --- a/criu/namespaces.c
>> +++ b/criu/namespaces.c
>> @@ -30,6 +30,7 @@
>>  #include "protobuf.h"
>>  #include "util.h"
>>  #include "images/ns.pb-c.h"
>> +#include "common/scm.h"
>>  
>>  static struct ns_desc *ns_desc_array[] = {
>>  	&net_ns_desc,
>> @@ -2151,5 +2152,157 @@ int prepare_namespace_before_tasks(void)
>>  	return -1;
>>  }
>>  
>> +enum {
>> +	NS__CREATED = 1,
>> +	NS__MAPS_POPULATED,
>> +	NS__RESTORED,
>> +	NS__EXIT_HELPER,
>> +	NS__ERROR,
>> +};
>> +
>> +struct ns_arg {
>> +	struct ns_id *me;
>> +	futex_t *futex;
>> +	pid_t pid;
>> +};
>> +
>> +static int create_user_ns_hierarhy_fn(void *in_arg)
>> +{
>> +	char stack[128] __stack_aligned__;
>> +	struct ns_arg arg, *p_arg = in_arg;
>> +	futex_t *p_futex, *futex = NULL;
>> +	int status, fd, ret = -1;
>> +	struct ns_id *me, *child;
>> +	pid_t pid = -1;
>> +
>> +	p_futex = p_arg->futex;
>> +	me = p_arg->me;
>> +
>> +	if (p_futex) {
>> +		/* Temporary set ns owner to me to allow parent restore user_ns maps */
>> +		me->owner.pid = get_self_real_pid();
>> +		if (me->owner.pid < 0) {
>> +			pr_err("Can't self pid\n");
>> +			goto out;
>> +		}
>> +		futex_set_and_wake(p_futex, NS__CREATED);
>> +
>> +		fd = open("/proc/self/ns/user", O_RDONLY);
>> +		if (fd < 0) {
>> +			pr_err("Can't get self user ns");
>> +			goto out;
>> +		}
>> +		/*
>> +		 * As we are cloned with CLONE_FILES,
>> +		 * parent task will see this fd too.
>> +		 */
>> +		me->owner.fd = fd;
>> +
>> +		futex_wait_while_lt(p_futex, NS__MAPS_POPULATED);
>> +		if (prepare_userns_creds()) {
>> +			pr_err("Can't prepare creds\n");
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	futex = mmap(NULL, sizeof(*futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
>> +	if (futex == MAP_FAILED) {
>> +		pr_perror("Failed to mmap futex");
>> +		goto out;
>> +	}
>> +	arg.futex = futex;
>> +	arg.pid = p_arg->pid;
>> +
>> +	list_for_each_entry(child, &me->children, siblings) {
>> +		arg.me = child;
>> +		futex_init(futex);
>> +
>> +		pid = clone(create_user_ns_hierarhy_fn, stack + 127, CLONE_NEWUSER | CLONE_FILES | SIGCHLD, &arg);
> 
> stack has to be aligned. I think stack + 128 should be used. You have to
> gurantee, that arg will be placed after stack, pls take a look at
> "struct cr_clone_arg". I think we need to do something similar here.

I saw it and was even inspired by that code... Strange. Ok, 128 bytes.
 
>> +		if (pid < 0) {
>> +			pr_perror("Can't clone");
>> +			goto out;
>> +		}
>> +		futex_wait_while_lt(futex, NS__CREATED);
>> +		/* Get child real pid */
>> +		pid = child->owner.pid;
>> +		if (prepare_userns(pid, child->user.e) < 0) {
>> +			pr_err("Can't prepare child user_ns\n");
>> +			goto out;
>> +		}
>> +		/* Set ns owner to criu's virt pid */
>> +		child->owner.pid = p_arg->pid;
>> +		futex_set_and_wake(futex, NS__MAPS_POPULATED);
>> +
>> +		errno = 0;
>> +		if (wait(&status) < 0 || WEXITSTATUS(status)) {
> 
> If a process was killed, WEXITSTATUS(status) will be 0.

Good point, thanks.
 
> 		status = -1;
> 		if (waitpid(pid, &status, 0) < 0 || status) {
> 
>> +			pr_perror("Child process waiting: %d\n", WEXITSTATUS(status));
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	ret = 0;
>> +out:
>> +	if (p_futex)
>> +		futex_set_and_wake(p_futex, ret ? NS__ERROR : NS__RESTORED);
>> +	if (futex)
>> +		munmap(futex, sizeof(*futex));
>> +	return ret ? 1 : 0;
>> +}
>> +
>> +static int do_create_ns_hierarhy(void *ppid)
>> +{
>> +	struct ns_arg arg;
>> +	char buf[128];
>> +	int fd;
>> +
>> +	arg.me = root_user_ns;
>> +	arg.futex = NULL;
>> +	arg.pid = (pid_t)(long)ppid;
>> +
>> +	fd = get_service_fd(CR_PROC_FD_OFF);
>> +	if (fd < 0)
>> +		exit(4);
>> +
>> +	snprintf(buf, sizeof(buf), "%d/ns/user", root_item->pid->real);
>> +	fd = openat(fd, buf, O_RDONLY);
>> +	if (fd < 0) {
>> +		pr_perror("Can't open %s", buf);
>> +		exit(5);
>> +	}
>> +	if (setns(fd, CLONE_NEWUSER) < 0) {
>> +		pr_perror("Can't setns()");
>> +		exit(6);
>> +	}
>> +	if (prepare_userns_creds() < 0) {
>> +		pr_err("Can't prepare creds\n");
>> +		exit(7);
>> +	}
>> +	exit(create_user_ns_hierarhy_fn(&arg));
>> +}
>> +
>> +int create_ns_hierarhy(void)
>> +{
>> +	char stack[128] __stack_aligned__;
>> +	int status;
>> +	pid_t pid;
>> +
>> +	if (!(root_ns_mask & CLONE_NEWUSER))
>> +		return 0;
>> +
>> +	pid = clone(do_create_ns_hierarhy, stack + 127, CLONE_FILES | SIGCHLD, (void *)(long)getpid());
>> +	if (pid < 0) {
>> +		pr_perror("Can't clone()");
>> +		return -1;
>> +	}
>> +
>> +	errno = 0;
>> +	if (waitpid(pid, &status, 0) < 0 || WEXITSTATUS(status)) {
>> +		pr_err("Can't create ns hierarhy: errno=%d, status=%d\n",
>> +			errno, WEXITSTATUS(status));
>> +		return -1;
>> +	}
>> +	return 0;
>> +}
>> +
>>  struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
>>  struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
>> diff --git a/criu/pstree.c b/criu/pstree.c
>> index 1ba762b80..d2d7339bc 100644
>> --- a/criu/pstree.c
>> +++ b/criu/pstree.c
>> @@ -873,8 +873,12 @@ static int prepare_pstree_kobj_ids(void)
>>  			 * be born in a fresh new mount namespace
>>  			 * which will be populated with all other
>>  			 * namespaces' entries.
>> +			 *
>> +			 * User namespaces are created in create_ns_hierarhy()
>> +			 * before the tasks, as their hierarhy does not correlated
>> +			 * with tasks hierarhy in any way.
>>  			 */
>> -			rsti(item)->clone_flags &= ~CLONE_NEWNS;
>> +			rsti(item)->clone_flags &= ~(CLONE_NEWNS | CLONE_NEWUSER);
>>  
>>  		cflags &= CLONE_ALLNS;
>>  
>>
>> _______________________________________________
>> CRIU mailing list
>> CRIU@openvz.org
>> https://lists.openvz.org/mailman/listinfo/criu
Andrey Vagin Feb. 6, 2017, 10:28 p.m.
On Mon, Feb 06, 2017 at 12:13:47PM +0300, Kirill Tkhai wrote:
> On 03.02.2017 23:43, Andrei Vagin wrote:
> > On Fri, Feb 03, 2017 at 07:15:36PM +0300, Kirill Tkhai wrote:
> >> Create user namespaces hierarhy from criu main task.
> >> Open ns'es fds in, so they are seen for everybody as
> >> /proc/[criu pid]/fd/[ns_fd].
> >>
> >> Why we do it this way.
> >> 1)User namespaces are not correlated with task
> >> hierarhy. Parent task may have a user namespace
> >> of a level bigger, that a child task. So, we
> >> can't restore the user namespaces just by
> >> passing CLONE_NEWUSER in fork_with_pid().
> >>
> >> 2)We create namespaces from criu main task to store
> >> open namespaces'es fds. If we used root_item instead,
> >> all open files would clone to children, and children
> >> would have close unnecessary file descriptors, which
> >> is just a time wasting.
> >>
> >> 3)CLONE_FS tasks will require user_ns is set at the
> >> moment of clone(), so we have to restore target user_ns
> >> in locality of create_children_and_session().
> >>
> >> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> >> ---
> >>  criu/cr-restore.c         |    3 +
> >>  criu/include/namespaces.h |    1 
> >>  criu/namespaces.c         |  153 +++++++++++++++++++++++++++++++++++++++++++++
> >>  criu/pstree.c             |    6 +-
> >>  4 files changed, 161 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> >> index ab05ebfd1..07a966154 100644
> >> --- a/criu/cr-restore.c
> >> +++ b/criu/cr-restore.c
> >> @@ -1818,7 +1818,8 @@ static int restore_root_task(struct pstree_item *init)
> >>  	 * uid_map and gid_map must be filled from a parent user namespace.
> >>  	 * prepare_userns_creds() must be called after filling mappings.
> >>  	 */
> >> -	if ((root_ns_mask & CLONE_NEWUSER) && prepare_userns(init->pid->real, userns_entry))
> >> +	if ((root_ns_mask & CLONE_NEWUSER) &&
> >> +	    (prepare_userns(init->pid->real, userns_entry) < 0 || create_ns_hierarhy() < 0))
> >>  		goto out_kill;
> >>  
> >>  	pr_info("Wait until namespaces are created\n");
> >> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
> >> index 546de7c5d..bf8b90eba 100644
> >> --- a/criu/include/namespaces.h
> >> +++ b/criu/include/namespaces.h
> >> @@ -168,6 +168,7 @@ extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd);
> >>  
> >>  extern int collect_user_namespaces(bool for_dump);
> >>  extern int prepare_userns(pid_t real_pid, UsernsEntry *e);
> >> +extern int create_ns_hierarhy(void);
> >>  extern int stop_usernsd(void);
> >>  
> >>  extern uid_t userns_uid(uid_t uid);
> >> diff --git a/criu/namespaces.c b/criu/namespaces.c
> >> index 6151219d8..fd390c938 100644
> >> --- a/criu/namespaces.c
> >> +++ b/criu/namespaces.c
> >> @@ -30,6 +30,7 @@
> >>  #include "protobuf.h"
> >>  #include "util.h"
> >>  #include "images/ns.pb-c.h"
> >> +#include "common/scm.h"
> >>  
> >>  static struct ns_desc *ns_desc_array[] = {
> >>  	&net_ns_desc,
> >> @@ -2151,5 +2152,157 @@ int prepare_namespace_before_tasks(void)
> >>  	return -1;
> >>  }
> >>  
> >> +enum {
> >> +	NS__CREATED = 1,
> >> +	NS__MAPS_POPULATED,
> >> +	NS__RESTORED,
> >> +	NS__EXIT_HELPER,
> >> +	NS__ERROR,
> >> +};
> >> +
> >> +struct ns_arg {
> >> +	struct ns_id *me;
> >> +	futex_t *futex;
> >> +	pid_t pid;
> >> +};
> >> +
> >> +static int create_user_ns_hierarhy_fn(void *in_arg)
> >> +{
> >> +	char stack[128] __stack_aligned__;
> >> +	struct ns_arg arg, *p_arg = in_arg;
> >> +	futex_t *p_futex, *futex = NULL;
> >> +	int status, fd, ret = -1;
> >> +	struct ns_id *me, *child;
> >> +	pid_t pid = -1;
> >> +
> >> +	p_futex = p_arg->futex;
> >> +	me = p_arg->me;
> >> +
> >> +	if (p_futex) {
> >> +		/* Temporary set ns owner to me to allow parent restore user_ns maps */
> >> +		me->owner.pid = get_self_real_pid();
> >> +		if (me->owner.pid < 0) {
> >> +			pr_err("Can't self pid\n");
> >> +			goto out;
> >> +		}
> >> +		futex_set_and_wake(p_futex, NS__CREATED);
> >> +
> >> +		fd = open("/proc/self/ns/user", O_RDONLY);
> >> +		if (fd < 0) {
> >> +			pr_err("Can't get self user ns");
> >> +			goto out;
> >> +		}
> >> +		/*
> >> +		 * As we are cloned with CLONE_FILES,
> >> +		 * parent task will see this fd too.
> >> +		 */
> >> +		me->owner.fd = fd;
> >> +
> >> +		futex_wait_while_lt(p_futex, NS__MAPS_POPULATED);
> >> +		if (prepare_userns_creds()) {
> >> +			pr_err("Can't prepare creds\n");
> >> +			goto out;
> >> +		}
> >> +	}
> >> +
> >> +	futex = mmap(NULL, sizeof(*futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
> >> +	if (futex == MAP_FAILED) {
> >> +		pr_perror("Failed to mmap futex");
> >> +		goto out;
> >> +	}
> >> +	arg.futex = futex;
> >> +	arg.pid = p_arg->pid;
> >> +
> >> +	list_for_each_entry(child, &me->children, siblings) {
> >> +		arg.me = child;
> >> +		futex_init(futex);
> >> +
> >> +		pid = clone(create_user_ns_hierarhy_fn, stack + 127, CLONE_NEWUSER | CLONE_FILES | SIGCHLD, &arg);
> > 
> > stack has to be aligned. I think stack + 128 should be used. You have to
> > gurantee, that arg will be placed after stack, pls take a look at
> > "struct cr_clone_arg". I think we need to do something similar here.
> 
> I saw it and even was inspiring this code... Strange. Ok, 128 bytes.

What exactly is strange here?

>  
> >> +		if (pid < 0) {
> >> +			pr_perror("Can't clone");
> >> +			goto out;
> >> +		}
> >> +		futex_wait_while_lt(futex, NS__CREATED);
> >> +		/* Get child real pid */
> >> +		pid = child->owner.pid;
> >> +		if (prepare_userns(pid, child->user.e) < 0) {
> >> +			pr_err("Can't prepare child user_ns\n");
> >> +			goto out;
> >> +		}
> >> +		/* Set ns owner to criu's virt pid */
> >> +		child->owner.pid = p_arg->pid;
> >> +		futex_set_and_wake(futex, NS__MAPS_POPULATED);
> >> +
> >> +		errno = 0;
> >> +		if (wait(&status) < 0 || WEXITSTATUS(status)) {
> > 
> > If a process was killed, WEXITSTATUS(status) will be 0.
> 
> Good point, thanks.
>  
> > 		status = -1;
> > 		if (waitpid(pid, &status, 0) < 0 || status) {
> > 
> >> +			pr_perror("Child process waiting: %d\n", WEXITSTATUS(status));
> >> +			goto out;
> >> +		}
> >> +	}
> >> +
> >> +	ret = 0;
> >> +out:
> >> +	if (p_futex)
> >> +		futex_set_and_wake(p_futex, ret ? NS__ERROR : NS__RESTORED);
> >> +	if (futex)
> >> +		munmap(futex, sizeof(*futex));
> >> +	return ret ? 1 : 0;
> >> +}
> >> +
> >> +static int do_create_ns_hierarhy(void *ppid)
> >> +{
> >> +	struct ns_arg arg;
> >> +	char buf[128];
> >> +	int fd;
> >> +
> >> +	arg.me = root_user_ns;
> >> +	arg.futex = NULL;
> >> +	arg.pid = (pid_t)(long)ppid;
> >> +
> >> +	fd = get_service_fd(CR_PROC_FD_OFF);
> >> +	if (fd < 0)
> >> +		exit(4);
> >> +
> >> +	snprintf(buf, sizeof(buf), "%d/ns/user", root_item->pid->real);
> >> +	fd = openat(fd, buf, O_RDONLY);
> >> +	if (fd < 0) {
> >> +		pr_perror("Can't open %s", buf);
> >> +		exit(5);
> >> +	}
> >> +	if (setns(fd, CLONE_NEWUSER) < 0) {
> >> +		pr_perror("Can't setns()");
> >> +		exit(6);
> >> +	}
> >> +	if (prepare_userns_creds() < 0) {
> >> +		pr_err("Can't prepare creds\n");
> >> +		exit(7);
> >> +	}
> >> +	exit(create_user_ns_hierarhy_fn(&arg));
> >> +}
> >> +
> >> +int create_ns_hierarhy(void)
> >> +{
> >> +	char stack[128] __stack_aligned__;
> >> +	int status;
> >> +	pid_t pid;
> >> +
> >> +	if (!(root_ns_mask & CLONE_NEWUSER))
> >> +		return 0;
> >> +
> >> +	pid = clone(do_create_ns_hierarhy, stack + 127, CLONE_FILES | SIGCHLD, (void *)(long)getpid());
> >> +	if (pid < 0) {
> >> +		pr_perror("Can't clone()");
> >> +		return -1;
> >> +	}
> >> +
> >> +	errno = 0;
> >> +	if (waitpid(pid, &status, 0) < 0 || WEXITSTATUS(status)) {
> >> +		pr_err("Can't create ns hierarhy: errno=%d, status=%d\n",
> >> +			errno, WEXITSTATUS(status));
> >> +		return -1;
> >> +	}
> >> +	return 0;
> >> +}
> >> +
> >>  struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
> >>  struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
> >> diff --git a/criu/pstree.c b/criu/pstree.c
> >> index 1ba762b80..d2d7339bc 100644
> >> --- a/criu/pstree.c
> >> +++ b/criu/pstree.c
> >> @@ -873,8 +873,12 @@ static int prepare_pstree_kobj_ids(void)
> >>  			 * be born in a fresh new mount namespace
> >>  			 * which will be populated with all other
> >>  			 * namespaces' entries.
> >> +			 *
> >> +			 * User namespaces are created in create_ns_hierarhy()
> >> +			 * before the tasks, as their hierarhy does not correlated
> >> +			 * with tasks hierarhy in any way.
> >>  			 */
> >> -			rsti(item)->clone_flags &= ~CLONE_NEWNS;
> >> +			rsti(item)->clone_flags &= ~(CLONE_NEWNS | CLONE_NEWUSER);
> >>  
> >>  		cflags &= CLONE_ALLNS;
> >>  
> >>
> >> _______________________________________________
> >> CRIU mailing list
> >> CRIU@openvz.org
> >> https://lists.openvz.org/mailman/listinfo/criu
Kirill Tkhai Feb. 7, 2017, 8:05 a.m.
On 07.02.2017 01:28, Andrei Vagin wrote:
> On Mon, Feb 06, 2017 at 12:13:47PM +0300, Kirill Tkhai wrote:
>> On 03.02.2017 23:43, Andrei Vagin wrote:
>>> On Fri, Feb 03, 2017 at 07:15:36PM +0300, Kirill Tkhai wrote:
>>>> Create the user namespace hierarchy from the criu main task.
>>>> Open the namespaces' fds in it, so that they are visible to
>>>> everybody as /proc/[criu pid]/fd/[ns_fd].
>>>>
>>>> Why we do it this way.
>>>> 1) User namespaces are not correlated with the task
>>>> hierarchy. A parent task may have a user namespace
>>>> of a deeper level than a child task. So, we
>>>> can't restore the user namespaces just by
>>>> passing CLONE_NEWUSER in fork_with_pid().
>>>>
>>>> 2) We create namespaces from the criu main task to store
>>>> the open namespaces' fds there. If we used root_item instead,
>>>> all open files would be cloned to the children, and the children
>>>> would have to close unnecessary file descriptors, which
>>>> is just a waste of time.
>>>>
>>>> 3) CLONE_FS tasks will require that user_ns is set at the
>>>> moment of clone(), so we have to restore the target user_ns
>>>> in the vicinity of create_children_and_session().
>>>>
>>>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>>>> ---
>>>>  criu/cr-restore.c         |    3 +
>>>>  criu/include/namespaces.h |    1 
>>>>  criu/namespaces.c         |  153 +++++++++++++++++++++++++++++++++++++++++++++
>>>>  criu/pstree.c             |    6 +-
>>>>  4 files changed, 161 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>>>> index ab05ebfd1..07a966154 100644
>>>> --- a/criu/cr-restore.c
>>>> +++ b/criu/cr-restore.c
>>>> @@ -1818,7 +1818,8 @@ static int restore_root_task(struct pstree_item *init)
>>>>  	 * uid_map and gid_map must be filled from a parent user namespace.
>>>>  	 * prepare_userns_creds() must be called after filling mappings.
>>>>  	 */
>>>> -	if ((root_ns_mask & CLONE_NEWUSER) && prepare_userns(init->pid->real, userns_entry))
>>>> +	if ((root_ns_mask & CLONE_NEWUSER) &&
>>>> +	    (prepare_userns(init->pid->real, userns_entry) < 0 || create_ns_hierarhy() < 0))
>>>>  		goto out_kill;
>>>>  
>>>>  	pr_info("Wait until namespaces are created\n");
>>>> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
>>>> index 546de7c5d..bf8b90eba 100644
>>>> --- a/criu/include/namespaces.h
>>>> +++ b/criu/include/namespaces.h
>>>> @@ -168,6 +168,7 @@ extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd);
>>>>  
>>>>  extern int collect_user_namespaces(bool for_dump);
>>>>  extern int prepare_userns(pid_t real_pid, UsernsEntry *e);
>>>> +extern int create_ns_hierarhy(void);
>>>>  extern int stop_usernsd(void);
>>>>  
>>>>  extern uid_t userns_uid(uid_t uid);
>>>> diff --git a/criu/namespaces.c b/criu/namespaces.c
>>>> index 6151219d8..fd390c938 100644
>>>> --- a/criu/namespaces.c
>>>> +++ b/criu/namespaces.c
>>>> @@ -30,6 +30,7 @@
>>>>  #include "protobuf.h"
>>>>  #include "util.h"
>>>>  #include "images/ns.pb-c.h"
>>>> +#include "common/scm.h"
>>>>  
>>>>  static struct ns_desc *ns_desc_array[] = {
>>>>  	&net_ns_desc,
>>>> @@ -2151,5 +2152,157 @@ int prepare_namespace_before_tasks(void)
>>>>  	return -1;
>>>>  }
>>>>  
>>>> +enum {
>>>> +	NS__CREATED = 1,
>>>> +	NS__MAPS_POPULATED,
>>>> +	NS__RESTORED,
>>>> +	NS__EXIT_HELPER,
>>>> +	NS__ERROR,
>>>> +};
>>>> +
>>>> +struct ns_arg {
>>>> +	struct ns_id *me;
>>>> +	futex_t *futex;
>>>> +	pid_t pid;
>>>> +};
>>>> +
>>>> +static int create_user_ns_hierarhy_fn(void *in_arg)
>>>> +{
>>>> +	char stack[128] __stack_aligned__;
>>>> +	struct ns_arg arg, *p_arg = in_arg;
>>>> +	futex_t *p_futex, *futex = NULL;
>>>> +	int status, fd, ret = -1;
>>>> +	struct ns_id *me, *child;
>>>> +	pid_t pid = -1;
>>>> +
>>>> +	p_futex = p_arg->futex;
>>>> +	me = p_arg->me;
>>>> +
>>>> +	if (p_futex) {
>>>> +		/* Temporary set ns owner to me to allow parent restore user_ns maps */
>>>> +		me->owner.pid = get_self_real_pid();
>>>> +		if (me->owner.pid < 0) {
>>>> +			pr_err("Can't self pid\n");
>>>> +			goto out;
>>>> +		}
>>>> +		futex_set_and_wake(p_futex, NS__CREATED);
>>>> +
>>>> +		fd = open("/proc/self/ns/user", O_RDONLY);
>>>> +		if (fd < 0) {
>>>> +			pr_err("Can't get self user ns");
>>>> +			goto out;
>>>> +		}
>>>> +		/*
>>>> +		 * As we are cloned with CLONE_FILES,
>>>> +		 * parent task will see this fd too.
>>>> +		 */
>>>> +		me->owner.fd = fd;
>>>> +
>>>> +		futex_wait_while_lt(p_futex, NS__MAPS_POPULATED);
>>>> +		if (prepare_userns_creds()) {
>>>> +			pr_err("Can't prepare creds\n");
>>>> +			goto out;
>>>> +		}
>>>> +	}
>>>> +
>>>> +	futex = mmap(NULL, sizeof(*futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
>>>> +	if (futex == MAP_FAILED) {
>>>> +		pr_perror("Failed to mmap futex");
>>>> +		goto out;
>>>> +	}
>>>> +	arg.futex = futex;
>>>> +	arg.pid = p_arg->pid;
>>>> +
>>>> +	list_for_each_entry(child, &me->children, siblings) {
>>>> +		arg.me = child;
>>>> +		futex_init(futex);
>>>> +
>>>> +		pid = clone(create_user_ns_hierarhy_fn, stack + 127, CLONE_NEWUSER | CLONE_FILES | SIGCHLD, &arg);
>>>
>>> The stack has to be aligned. I think stack + 128 should be used. You have to
>>> guarantee that arg will be placed after the stack; please take a look at
>>> "struct cr_clone_arg". I think we need to do something similar here.
>>
>> I saw it and was even inspired by that code... Strange. Ok, 128 bytes.
> 
> What is exactly strange here?

What is strange is that I saw that code and was inspired by it, but somehow I still ended up with 127 bytes.
 
>>  
>>>> +		if (pid < 0) {
>>>> +			pr_perror("Can't clone");
>>>> +			goto out;
>>>> +		}
>>>> +		futex_wait_while_lt(futex, NS__CREATED);
>>>> +		/* Get child real pid */
>>>> +		pid = child->owner.pid;
>>>> +		if (prepare_userns(pid, child->user.e) < 0) {
>>>> +			pr_err("Can't prepare child user_ns\n");
>>>> +			goto out;
>>>> +		}
>>>> +		/* Set ns owner to criu's virt pid */
>>>> +		child->owner.pid = p_arg->pid;
>>>> +		futex_set_and_wake(futex, NS__MAPS_POPULATED);
>>>> +
>>>> +		errno = 0;
>>>> +		if (wait(&status) < 0 || WEXITSTATUS(status)) {
>>>
>>> If a process was killed, WEXITSTATUS(status) will be 0.
>>
>> Good point, thanks.
>>  
>>> 		status = -1;
>>> 		if (waitpid(pid, &status, 0) < 0 || status) {
>>>
>>>> +			pr_perror("Child process waiting: %d\n", WEXITSTATUS(status));
>>>> +			goto out;
>>>> +		}
>>>> +	}
>>>> +
>>>> +	ret = 0;
>>>> +out:
>>>> +	if (p_futex)
>>>> +		futex_set_and_wake(p_futex, ret ? NS__ERROR : NS__RESTORED);
>>>> +	if (futex)
>>>> +		munmap(futex, sizeof(*futex));
>>>> +	return ret ? 1 : 0;
>>>> +}
>>>> +
>>>> +static int do_create_ns_hierarhy(void *ppid)
>>>> +{
>>>> +	struct ns_arg arg;
>>>> +	char buf[128];
>>>> +	int fd;
>>>> +
>>>> +	arg.me = root_user_ns;
>>>> +	arg.futex = NULL;
>>>> +	arg.pid = (pid_t)(long)ppid;
>>>> +
>>>> +	fd = get_service_fd(CR_PROC_FD_OFF);
>>>> +	if (fd < 0)
>>>> +		exit(4);
>>>> +
>>>> +	snprintf(buf, sizeof(buf), "%d/ns/user", root_item->pid->real);
>>>> +	fd = openat(fd, buf, O_RDONLY);
>>>> +	if (fd < 0) {
>>>> +		pr_perror("Can't open %s", buf);
>>>> +		exit(5);
>>>> +	}
>>>> +	if (setns(fd, CLONE_NEWUSER) < 0) {
>>>> +		pr_perror("Can't setns()");
>>>> +		exit(6);
>>>> +	}
>>>> +	if (prepare_userns_creds() < 0) {
>>>> +		pr_err("Can't prepare creds\n");
>>>> +		exit(7);
>>>> +	}
>>>> +	exit(create_user_ns_hierarhy_fn(&arg));
>>>> +}
>>>> +
>>>> +int create_ns_hierarhy(void)
>>>> +{
>>>> +	char stack[128] __stack_aligned__;
>>>> +	int status;
>>>> +	pid_t pid;
>>>> +
>>>> +	if (!(root_ns_mask & CLONE_NEWUSER))
>>>> +		return 0;
>>>> +
>>>> +	pid = clone(do_create_ns_hierarhy, stack + 127, CLONE_FILES | SIGCHLD, (void *)(long)getpid());
>>>> +	if (pid < 0) {
>>>> +		pr_perror("Can't clone()");
>>>> +		return -1;
>>>> +	}
>>>> +
>>>> +	errno = 0;
>>>> +	if (waitpid(pid, &status, 0) < 0 || WEXITSTATUS(status)) {
>>>> +		pr_err("Can't create ns hierarhy: errno=%d, status=%d\n",
>>>> +			errno, WEXITSTATUS(status));
>>>> +		return -1;
>>>> +	}
>>>> +	return 0;
>>>> +}
>>>> +
>>>>  struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
>>>>  struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
>>>> diff --git a/criu/pstree.c b/criu/pstree.c
>>>> index 1ba762b80..d2d7339bc 100644
>>>> --- a/criu/pstree.c
>>>> +++ b/criu/pstree.c
>>>> @@ -873,8 +873,12 @@ static int prepare_pstree_kobj_ids(void)
>>>>  			 * be born in a fresh new mount namespace
>>>>  			 * which will be populated with all other
>>>>  			 * namespaces' entries.
>>>> +			 *
>>>> +			 * User namespaces are created in create_ns_hierarhy()
>>>> +			 * before the tasks, as their hierarhy does not correlated
>>>> +			 * with tasks hierarhy in any way.
>>>>  			 */
>>>> -			rsti(item)->clone_flags &= ~CLONE_NEWNS;
>>>> +			rsti(item)->clone_flags &= ~(CLONE_NEWNS | CLONE_NEWUSER);
>>>>  
>>>>  		cflags &= CLONE_ALLNS;
>>>>  
>>>>
>>>> _______________________________________________
>>>> CRIU mailing list
>>>> CRIU@openvz.org
>>>> https://lists.openvz.org/mailman/listinfo/criu