[v2,36/36] ns: Allow nested user namespaces

Submitted by Kirill Tkhai on Feb. 3, 2017, 4:16 p.m.

Details

Message ID 148613861585.3612.18184441265176875016.stgit@localhost.localdomain
State New
Series "Nested user namespaces support"
Headers show

Commit Message

Kirill Tkhai Feb. 3, 2017, 4:16 p.m.
Everything is prepared for nested user namespaces support.
The only thing, we should do more, is to enter to dumped
user namespace's parent before the dump.
We use CLONE_VM for child tasks, so they may populate
user_ns maps in parent memory without any tricks.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 criu/include/namespaces.h |    2 +
 criu/namespaces.c         |   65 ++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 62 insertions(+), 5 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
index 67a96d6b3..c7fc0a05e 100644
--- a/criu/include/namespaces.h
+++ b/criu/include/namespaces.h
@@ -39,7 +39,7 @@ 
 #define CLONE_ALLNS	(CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP)
 
 /* Nested namespaces are supported only for these types */
-#define CLONE_SUBNS	(CLONE_NEWNS)
+#define CLONE_SUBNS	(CLONE_NEWNS | CLONE_NEWUSER)
 #define EXTRA_SIZE	20
 
 #ifndef NSIO
diff --git a/criu/namespaces.c b/criu/namespaces.c
index 0bbd57e96..5cf46ed8f 100644
--- a/criu/namespaces.c
+++ b/criu/namespaces.c
@@ -31,6 +31,7 @@ 
 #include "util.h"
 #include "images/ns.pb-c.h"
 #include "common/scm.h"
+#include "proc_parse.h"
 
 static struct ns_desc *ns_desc_array[] = {
 	&net_ns_desc,
@@ -970,11 +971,29 @@  static int parse_id_map(pid_t pid, char *name, UidGidExtent ***pb_exts)
 	return -1;
 }
 
-static int dump_user_ns(struct ns_id *ns);
+static int __dump_user_ns(struct ns_id *ns);
+
+static int dump_user_ns(void *arg)
+{
+	struct ns_id *ns = arg;
+
+	if (switch_ns(ns->parent->ns_pid, &user_ns_desc, NULL) < 0) {
+		pr_err("Can't enter user namespace\n");
+		return -1;
+	}
+
+	return __dump_user_ns(ns);
+}
 
 int collect_user_ns(struct ns_id *ns, void *oarg)
 {
+	int status, stack_size;
+	struct ns_id *p_ns;
+	pid_t pid = -1;
 	UsernsEntry *e;
+	char *stack;
+
+	p_ns = ns->parent;
 
 	e = xmalloc(sizeof(*e));
 	if (!e)
@@ -990,8 +1009,43 @@  int collect_user_ns(struct ns_id *ns, void *oarg)
 	 * mappings, which are used for convirting local id-s to
 	 * userns id-s (userns_uid(), userns_gid())
 	 */
-	if (dump_user_ns(ns))
-		return -1;
+	if (p_ns) {
+		/*
+		 * Currently, we are in NS_CRIU. To dump a NS_OTHER ns,
+		 * we need to enter its parent ns. As entered to user_ns
+		 * task has no a way back, we create a child for that.
+		 * NS_ROOT is dumped w/o clone(), it's xids maps is relatively
+		 * to NS_CRIU. We use CLONE_VM to make child share our memory,
+		 * and to allow us see allocated maps, he do. Child's open_proc()
+		 * may do changes about CRIU's internal files states in memory,
+		 * so pass CLONE_FILES to reflect that.
+		 */
+		stack_size = PAGE_SIZE;
+		stack = mmap(NULL, stack_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0);
+		if (stack == MAP_FAILED) {
+			pr_perror("Can't allocate stack");
+			return -1;
+		}
+		pid = clone(dump_user_ns, stack + stack_size - 1, CLONE_VM | CLONE_FILES | SIGCHLD, ns);
+		if (pid == -1) {
+			pr_perror("Can't clone");
+			return -1;
+		}
+		if (waitpid(pid, &status, 0) != pid) {
+			pr_perror("Unable to wait the %d process", pid);
+			return -1;
+		}
+		if (status) {
+			pr_err("Can't dump nested user_ns\n");
+			return -1;
+		}
+		stack_size = find_vma_size((unsigned long *)&stack);
+		munmap(stack, stack_size);
+		return 0;
+	} else {
+		if (__dump_user_ns(ns))
+			return -1;
+	}
 
 	return 0;
 }
@@ -1030,6 +1084,9 @@  static int check_user_ns(struct ns_id *ns)
 	int status;
 	pid_t chld;
 
+	if (ns->type != NS_ROOT)
+		return 0;
+
 	chld = fork();
 	if (chld == -1) {
 		pr_perror("Unable to fork a process");
@@ -1125,7 +1182,7 @@  static int check_user_ns(struct ns_id *ns)
 	return 0;
 }
 
-static int dump_user_ns(struct ns_id *ns)
+static int __dump_user_ns(struct ns_id *ns)
 {
 	int ret, exit_code = -1;
 	pid_t pid = ns->ns_pid;

Comments

Andrey Vagin Feb. 7, 2017, 12:51 a.m.
On Fri, Feb 03, 2017 at 07:16:55PM +0300, Kirill Tkhai wrote:
> Everything is prepared for nested user namespaces support.
> The only thing, we should do more, is to enter to dumped
> user namespace's parent before the dump.
> We use CLONE_VM for child tasks, so they may populate
> user_ns maps in parent memory without any tricks.
> 
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> ---
>  criu/include/namespaces.h |    2 +
>  criu/namespaces.c         |   65 ++++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 62 insertions(+), 5 deletions(-)
> 
> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
> index 67a96d6b3..c7fc0a05e 100644
> --- a/criu/include/namespaces.h
> +++ b/criu/include/namespaces.h
> @@ -39,7 +39,7 @@
>  #define CLONE_ALLNS	(CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP)
>  
>  /* Nested namespaces are supported only for these types */
> -#define CLONE_SUBNS	(CLONE_NEWNS)
> +#define CLONE_SUBNS	(CLONE_NEWNS | CLONE_NEWUSER)
>  #define EXTRA_SIZE	20
>  
>  #ifndef NSIO
> diff --git a/criu/namespaces.c b/criu/namespaces.c
> index 0bbd57e96..5cf46ed8f 100644
> --- a/criu/namespaces.c
> +++ b/criu/namespaces.c
> @@ -31,6 +31,7 @@
>  #include "util.h"
>  #include "images/ns.pb-c.h"
>  #include "common/scm.h"
> +#include "proc_parse.h"
>  
>  static struct ns_desc *ns_desc_array[] = {
>  	&net_ns_desc,
> @@ -970,11 +971,29 @@ static int parse_id_map(pid_t pid, char *name, UidGidExtent ***pb_exts)
>  	return -1;
>  }
>  
> -static int dump_user_ns(struct ns_id *ns);
> +static int __dump_user_ns(struct ns_id *ns);
> +
> +static int dump_user_ns(void *arg)
> +{
> +	struct ns_id *ns = arg;
> +
> +	if (switch_ns(ns->parent->ns_pid, &user_ns_desc, NULL) < 0) {
> +		pr_err("Can't enter user namespace\n");
> +		return -1;
> +	}
> +
> +	return __dump_user_ns(ns);
> +}
>  
>  int collect_user_ns(struct ns_id *ns, void *oarg)
>  {
> +	int status, stack_size;
> +	struct ns_id *p_ns;
> +	pid_t pid = -1;
>  	UsernsEntry *e;
> +	char *stack;
> +
> +	p_ns = ns->parent;
>  
>  	e = xmalloc(sizeof(*e));
>  	if (!e)
> @@ -990,8 +1009,43 @@ int collect_user_ns(struct ns_id *ns, void *oarg)
>  	 * mappings, which are used for convirting local id-s to
>  	 * userns id-s (userns_uid(), userns_gid())
>  	 */
> -	if (dump_user_ns(ns))
> -		return -1;
> +	if (p_ns) {
> +		/*
> +		 * Currently, we are in NS_CRIU. To dump a NS_OTHER ns,
> +		 * we need to enter its parent ns. As entered to user_ns
> +		 * task has no a way back, we create a child for that.
> +		 * NS_ROOT is dumped w/o clone(), it's xids maps is relatively
> +		 * to NS_CRIU. We use CLONE_VM to make child share our memory,
> +		 * and to allow us see allocated maps, he do. Child's open_proc()
> +		 * may do changes about CRIU's internal files states in memory,
> +		 * so pass CLONE_FILES to reflect that.
> +		 */
> +		stack_size = PAGE_SIZE;
> +		stack = mmap(NULL, stack_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0);

Maybe it is easier to allocate a big enough stack without MAP_GROWSDOWN.
In this case we will not need to read /proc/self/maps to get its size,
we will not need to care about guard pages, etc.

> +		if (stack == MAP_FAILED) {
> +			pr_perror("Can't allocate stack");
> +			return -1;
> +		}
> +		pid = clone(dump_user_ns, stack + stack_size - 1, CLONE_VM | CLONE_FILES | SIGCHLD, ns);
> +		if (pid == -1) {
> +			pr_perror("Can't clone");
> +			return -1;
> +		}
> +		if (waitpid(pid, &status, 0) != pid) {
> +			pr_perror("Unable to wait the %d process", pid);
> +			return -1;
> +		}
> +		if (status) {
> +			pr_err("Can't dump nested user_ns\n");
> +			return -1;
> +		}
> +		stack_size = find_vma_size((unsigned long *)&stack);
> +		munmap(stack, stack_size);
> +		return 0;
> +	} else {
> +		if (__dump_user_ns(ns))
> +			return -1;
> +	}
>  
>  	return 0;
>  }
> @@ -1030,6 +1084,9 @@ static int check_user_ns(struct ns_id *ns)
>  	int status;
>  	pid_t chld;
>  
> +	if (ns->type != NS_ROOT)
> +		return 0;
> +
>  	chld = fork();
>  	if (chld == -1) {
>  		pr_perror("Unable to fork a process");
> @@ -1125,7 +1182,7 @@ static int check_user_ns(struct ns_id *ns)
>  	return 0;
>  }
>  
> -static int dump_user_ns(struct ns_id *ns)
> +static int __dump_user_ns(struct ns_id *ns)
>  {
>  	int ret, exit_code = -1;
>  	pid_t pid = ns->ns_pid;
> 
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
Kirill Tkhai Feb. 7, 2017, 8:14 a.m.
On 07.02.2017 03:51, Andrei Vagin wrote:
> On Fri, Feb 03, 2017 at 07:16:55PM +0300, Kirill Tkhai wrote:
>> Everything is prepared for nested user namespaces support.
>> The only thing, we should do more, is to enter to dumped
>> user namespace's parent before the dump.
>> We use CLONE_VM for child tasks, so they may populate
>> user_ns maps in parent memory without any tricks.
>>
>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>> ---
>>  criu/include/namespaces.h |    2 +
>>  criu/namespaces.c         |   65 ++++++++++++++++++++++++++++++++++++++++++---
>>  2 files changed, 62 insertions(+), 5 deletions(-)
>>
>> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
>> index 67a96d6b3..c7fc0a05e 100644
>> --- a/criu/include/namespaces.h
>> +++ b/criu/include/namespaces.h
>> @@ -39,7 +39,7 @@
>>  #define CLONE_ALLNS	(CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP)
>>  
>>  /* Nested namespaces are supported only for these types */
>> -#define CLONE_SUBNS	(CLONE_NEWNS)
>> +#define CLONE_SUBNS	(CLONE_NEWNS | CLONE_NEWUSER)
>>  #define EXTRA_SIZE	20
>>  
>>  #ifndef NSIO
>> diff --git a/criu/namespaces.c b/criu/namespaces.c
>> index 0bbd57e96..5cf46ed8f 100644
>> --- a/criu/namespaces.c
>> +++ b/criu/namespaces.c
>> @@ -31,6 +31,7 @@
>>  #include "util.h"
>>  #include "images/ns.pb-c.h"
>>  #include "common/scm.h"
>> +#include "proc_parse.h"
>>  
>>  static struct ns_desc *ns_desc_array[] = {
>>  	&net_ns_desc,
>> @@ -970,11 +971,29 @@ static int parse_id_map(pid_t pid, char *name, UidGidExtent ***pb_exts)
>>  	return -1;
>>  }
>>  
>> -static int dump_user_ns(struct ns_id *ns);
>> +static int __dump_user_ns(struct ns_id *ns);
>> +
>> +static int dump_user_ns(void *arg)
>> +{
>> +	struct ns_id *ns = arg;
>> +
>> +	if (switch_ns(ns->parent->ns_pid, &user_ns_desc, NULL) < 0) {
>> +		pr_err("Can't enter user namespace\n");
>> +		return -1;
>> +	}
>> +
>> +	return __dump_user_ns(ns);
>> +}
>>  
>>  int collect_user_ns(struct ns_id *ns, void *oarg)
>>  {
>> +	int status, stack_size;
>> +	struct ns_id *p_ns;
>> +	pid_t pid = -1;
>>  	UsernsEntry *e;
>> +	char *stack;
>> +
>> +	p_ns = ns->parent;
>>  
>>  	e = xmalloc(sizeof(*e));
>>  	if (!e)
>> @@ -990,8 +1009,43 @@ int collect_user_ns(struct ns_id *ns, void *oarg)
>>  	 * mappings, which are used for convirting local id-s to
>>  	 * userns id-s (userns_uid(), userns_gid())
>>  	 */
>> -	if (dump_user_ns(ns))
>> -		return -1;
>> +	if (p_ns) {
>> +		/*
>> +		 * Currently, we are in NS_CRIU. To dump a NS_OTHER ns,
>> +		 * we need to enter its parent ns. As entered to user_ns
>> +		 * task has no a way back, we create a child for that.
>> +		 * NS_ROOT is dumped w/o clone(), it's xids maps is relatively
>> +		 * to NS_CRIU. We use CLONE_VM to make child share our memory,
>> +		 * and to allow us see allocated maps, he do. Child's open_proc()
>> +		 * may do changes about CRIU's internal files states in memory,
>> +		 * so pass CLONE_FILES to reflect that.
>> +		 */
>> +		stack_size = PAGE_SIZE;
>> +		stack = mmap(NULL, stack_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0);
> 
> Maybe it is easier to allocate a big enough stack without MAP_GROWSDOWN.
> In this case we will not need to read /proc/self/maps to get its size,
> we will not need to care about guard pages, etc.

I don't think so. We call several nested functions there, and they are used
in generic code. It wouldn't be useful to have to check over all function
call tree to determine, if some places like this should enlarge their
stack allocation or not, when we modify these function. Big enough stack
is uncountable concept.
 
>> +		if (stack == MAP_FAILED) {
>> +			pr_perror("Can't allocate stack");
>> +			return -1;
>> +		}
>> +		pid = clone(dump_user_ns, stack + stack_size - 1, CLONE_VM | CLONE_FILES | SIGCHLD, ns);
>> +		if (pid == -1) {
>> +			pr_perror("Can't clone");
>> +			return -1;
>> +		}
>> +		if (waitpid(pid, &status, 0) != pid) {
>> +			pr_perror("Unable to wait the %d process", pid);
>> +			return -1;
>> +		}
>> +		if (status) {
>> +			pr_err("Can't dump nested user_ns\n");
>> +			return -1;
>> +		}
>> +		stack_size = find_vma_size((unsigned long *)&stack);
>> +		munmap(stack, stack_size);
>> +		return 0;
>> +	} else {
>> +		if (__dump_user_ns(ns))
>> +			return -1;
>> +	}
>>  
>>  	return 0;
>>  }
>> @@ -1030,6 +1084,9 @@ static int check_user_ns(struct ns_id *ns)
>>  	int status;
>>  	pid_t chld;
>>  
>> +	if (ns->type != NS_ROOT)
>> +		return 0;
>> +
>>  	chld = fork();
>>  	if (chld == -1) {
>>  		pr_perror("Unable to fork a process");
>> @@ -1125,7 +1182,7 @@ static int check_user_ns(struct ns_id *ns)
>>  	return 0;
>>  }
>>  
>> -static int dump_user_ns(struct ns_id *ns)
>> +static int __dump_user_ns(struct ns_id *ns)
>>  {
>>  	int ret, exit_code = -1;
>>  	pid_t pid = ns->ns_pid;
>>
>> _______________________________________________
>> CRIU mailing list
>> CRIU@openvz.org
>> https://lists.openvz.org/mailman/listinfo/criu
Andrey Vagin Feb. 7, 2017, 4:31 p.m.
On Tue, Feb 07, 2017 at 11:14:07AM +0300, Kirill Tkhai wrote:
> On 07.02.2017 03:51, Andrei Vagin wrote:
> > On Fri, Feb 03, 2017 at 07:16:55PM +0300, Kirill Tkhai wrote:
> >> Everything is prepared for nested user namespaces support.
> >> The only thing, we should do more, is to enter to dumped
> >> user namespace's parent before the dump.
> >> We use CLONE_VM for child tasks, so they may populate
> >> user_ns maps in parent memory without any tricks.
> >>
> >> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> >> ---
> >>  criu/include/namespaces.h |    2 +
> >>  criu/namespaces.c         |   65 ++++++++++++++++++++++++++++++++++++++++++---
> >>  2 files changed, 62 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
> >> index 67a96d6b3..c7fc0a05e 100644
> >> --- a/criu/include/namespaces.h
> >> +++ b/criu/include/namespaces.h
> >> @@ -39,7 +39,7 @@
> >>  #define CLONE_ALLNS	(CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP)
> >>  
> >>  /* Nested namespaces are supported only for these types */
> >> -#define CLONE_SUBNS	(CLONE_NEWNS)
> >> +#define CLONE_SUBNS	(CLONE_NEWNS | CLONE_NEWUSER)
> >>  #define EXTRA_SIZE	20
> >>  
> >>  #ifndef NSIO
> >> diff --git a/criu/namespaces.c b/criu/namespaces.c
> >> index 0bbd57e96..5cf46ed8f 100644
> >> --- a/criu/namespaces.c
> >> +++ b/criu/namespaces.c
> >> @@ -31,6 +31,7 @@
> >>  #include "util.h"
> >>  #include "images/ns.pb-c.h"
> >>  #include "common/scm.h"
> >> +#include "proc_parse.h"
> >>  
> >>  static struct ns_desc *ns_desc_array[] = {
> >>  	&net_ns_desc,
> >> @@ -970,11 +971,29 @@ static int parse_id_map(pid_t pid, char *name, UidGidExtent ***pb_exts)
> >>  	return -1;
> >>  }
> >>  
> >> -static int dump_user_ns(struct ns_id *ns);
> >> +static int __dump_user_ns(struct ns_id *ns);
> >> +
> >> +static int dump_user_ns(void *arg)
> >> +{
> >> +	struct ns_id *ns = arg;
> >> +
> >> +	if (switch_ns(ns->parent->ns_pid, &user_ns_desc, NULL) < 0) {
> >> +		pr_err("Can't enter user namespace\n");
> >> +		return -1;
> >> +	}
> >> +
> >> +	return __dump_user_ns(ns);
> >> +}
> >>  
> >>  int collect_user_ns(struct ns_id *ns, void *oarg)
> >>  {
> >> +	int status, stack_size;
> >> +	struct ns_id *p_ns;
> >> +	pid_t pid = -1;
> >>  	UsernsEntry *e;
> >> +	char *stack;
> >> +
> >> +	p_ns = ns->parent;
> >>  
> >>  	e = xmalloc(sizeof(*e));
> >>  	if (!e)
> >> @@ -990,8 +1009,43 @@ int collect_user_ns(struct ns_id *ns, void *oarg)
> >>  	 * mappings, which are used for convirting local id-s to
> >>  	 * userns id-s (userns_uid(), userns_gid())
> >>  	 */
> >> -	if (dump_user_ns(ns))
> >> -		return -1;
> >> +	if (p_ns) {
> >> +		/*
> >> +		 * Currently, we are in NS_CRIU. To dump a NS_OTHER ns,
> >> +		 * we need to enter its parent ns. As entered to user_ns
> >> +		 * task has no a way back, we create a child for that.
> >> +		 * NS_ROOT is dumped w/o clone(), it's xids maps is relatively
> >> +		 * to NS_CRIU. We use CLONE_VM to make child share our memory,
> >> +		 * and to allow us see allocated maps, he do. Child's open_proc()
> >> +		 * may do changes about CRIU's internal files states in memory,
> >> +		 * so pass CLONE_FILES to reflect that.
> >> +		 */
> >> +		stack_size = PAGE_SIZE;
> >> +		stack = mmap(NULL, stack_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0);
> > 
> > Maybe it is easier to allocate a big enough stack without MAP_GROWSDOWN.
> > In this case we will not need to read /proc/self/maps to get its size,
> > we will not need to care about guard pages, etc.
> 
> I don't think so. We call several nested functions there, and they are used
> in generic code. It wouldn't be useful to have to check over all function
> call tree to determine, if some places like this should enlarge their
> stack allocation or not, when we modify these function. Big enough stack
> is uncountable concept.

In a kernel space we live with a fixed stack. Posix threads use fixed
stacks. Why is it a problem for us? We can allocate a few megabytes, the
real memory is allocated only when someone touch it.

>  
> >> +		if (stack == MAP_FAILED) {
> >> +			pr_perror("Can't allocate stack");
> >> +			return -1;
> >> +		}
> >> +		pid = clone(dump_user_ns, stack + stack_size - 1, CLONE_VM | CLONE_FILES | SIGCHLD, ns);
> >> +		if (pid == -1) {
> >> +			pr_perror("Can't clone");
> >> +			return -1;
> >> +		}
> >> +		if (waitpid(pid, &status, 0) != pid) {
> >> +			pr_perror("Unable to wait the %d process", pid);
> >> +			return -1;
> >> +		}
> >> +		if (status) {
> >> +			pr_err("Can't dump nested user_ns\n");
> >> +			return -1;
> >> +		}
> >> +		stack_size = find_vma_size((unsigned long *)&stack);
> >> +		munmap(stack, stack_size);
> >> +		return 0;
> >> +	} else {
> >> +		if (__dump_user_ns(ns))
> >> +			return -1;
> >> +	}
> >>  
> >>  	return 0;
> >>  }
> >> @@ -1030,6 +1084,9 @@ static int check_user_ns(struct ns_id *ns)
> >>  	int status;
> >>  	pid_t chld;
> >>  
> >> +	if (ns->type != NS_ROOT)
> >> +		return 0;
> >> +
> >>  	chld = fork();
> >>  	if (chld == -1) {
> >>  		pr_perror("Unable to fork a process");
> >> @@ -1125,7 +1182,7 @@ static int check_user_ns(struct ns_id *ns)
> >>  	return 0;
> >>  }
> >>  
> >> -static int dump_user_ns(struct ns_id *ns)
> >> +static int __dump_user_ns(struct ns_id *ns)
> >>  {
> >>  	int ret, exit_code = -1;
> >>  	pid_t pid = ns->ns_pid;
> >>
> >> _______________________________________________
> >> CRIU mailing list
> >> CRIU@openvz.org
> >> https://lists.openvz.org/mailman/listinfo/criu
Kirill Tkhai Feb. 8, 2017, 10:57 a.m.
On 07.02.2017 19:31, Andrei Vagin wrote:
> On Tue, Feb 07, 2017 at 11:14:07AM +0300, Kirill Tkhai wrote:
>> On 07.02.2017 03:51, Andrei Vagin wrote:
>>> On Fri, Feb 03, 2017 at 07:16:55PM +0300, Kirill Tkhai wrote:
>>>> Everything is prepared for nested user namespaces support.
>>>> The only thing, we should do more, is to enter to dumped
>>>> user namespace's parent before the dump.
>>>> We use CLONE_VM for child tasks, so they may populate
>>>> user_ns maps in parent memory without any tricks.
>>>>
>>>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>>>> ---
>>>>  criu/include/namespaces.h |    2 +
>>>>  criu/namespaces.c         |   65 ++++++++++++++++++++++++++++++++++++++++++---
>>>>  2 files changed, 62 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
>>>> index 67a96d6b3..c7fc0a05e 100644
>>>> --- a/criu/include/namespaces.h
>>>> +++ b/criu/include/namespaces.h
>>>> @@ -39,7 +39,7 @@
>>>>  #define CLONE_ALLNS	(CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP)
>>>>  
>>>>  /* Nested namespaces are supported only for these types */
>>>> -#define CLONE_SUBNS	(CLONE_NEWNS)
>>>> +#define CLONE_SUBNS	(CLONE_NEWNS | CLONE_NEWUSER)
>>>>  #define EXTRA_SIZE	20
>>>>  
>>>>  #ifndef NSIO
>>>> diff --git a/criu/namespaces.c b/criu/namespaces.c
>>>> index 0bbd57e96..5cf46ed8f 100644
>>>> --- a/criu/namespaces.c
>>>> +++ b/criu/namespaces.c
>>>> @@ -31,6 +31,7 @@
>>>>  #include "util.h"
>>>>  #include "images/ns.pb-c.h"
>>>>  #include "common/scm.h"
>>>> +#include "proc_parse.h"
>>>>  
>>>>  static struct ns_desc *ns_desc_array[] = {
>>>>  	&net_ns_desc,
>>>> @@ -970,11 +971,29 @@ static int parse_id_map(pid_t pid, char *name, UidGidExtent ***pb_exts)
>>>>  	return -1;
>>>>  }
>>>>  
>>>> -static int dump_user_ns(struct ns_id *ns);
>>>> +static int __dump_user_ns(struct ns_id *ns);
>>>> +
>>>> +static int dump_user_ns(void *arg)
>>>> +{
>>>> +	struct ns_id *ns = arg;
>>>> +
>>>> +	if (switch_ns(ns->parent->ns_pid, &user_ns_desc, NULL) < 0) {
>>>> +		pr_err("Can't enter user namespace\n");
>>>> +		return -1;
>>>> +	}
>>>> +
>>>> +	return __dump_user_ns(ns);
>>>> +}
>>>>  
>>>>  int collect_user_ns(struct ns_id *ns, void *oarg)
>>>>  {
>>>> +	int status, stack_size;
>>>> +	struct ns_id *p_ns;
>>>> +	pid_t pid = -1;
>>>>  	UsernsEntry *e;
>>>> +	char *stack;
>>>> +
>>>> +	p_ns = ns->parent;
>>>>  
>>>>  	e = xmalloc(sizeof(*e));
>>>>  	if (!e)
>>>> @@ -990,8 +1009,43 @@ int collect_user_ns(struct ns_id *ns, void *oarg)
>>>>  	 * mappings, which are used for convirting local id-s to
>>>>  	 * userns id-s (userns_uid(), userns_gid())
>>>>  	 */
>>>> -	if (dump_user_ns(ns))
>>>> -		return -1;
>>>> +	if (p_ns) {
>>>> +		/*
>>>> +		 * Currently, we are in NS_CRIU. To dump a NS_OTHER ns,
>>>> +		 * we need to enter its parent ns. As entered to user_ns
>>>> +		 * task has no a way back, we create a child for that.
>>>> +		 * NS_ROOT is dumped w/o clone(), it's xids maps is relatively
>>>> +		 * to NS_CRIU. We use CLONE_VM to make child share our memory,
>>>> +		 * and to allow us see allocated maps, he do. Child's open_proc()
>>>> +		 * may do changes about CRIU's internal files states in memory,
>>>> +		 * so pass CLONE_FILES to reflect that.
>>>> +		 */
>>>> +		stack_size = PAGE_SIZE;
>>>> +		stack = mmap(NULL, stack_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0);
>>>
>>> Maybe it is easier to allocate a big enough stack without MAP_GROWSDOWN.
>>> In this case we will not need to read /proc/self/maps to get its size,
>>> we will not need to care about guard pages, etc.
>>
>> I don't think so. We call several nested functions there, and they are used
>> in generic code. It wouldn't be useful to have to check over all function
>> call tree to determine, if some places like this should enlarge their
>> stack allocation or not, when we modify these function. Big enough stack
>> is uncountable concept.
> 
> In a kernel space we live with a fixed stack. Posix threads use fixed
> stacks. Why is it a problem for us? We can allocate a few megabytes, the
> real memory is allocated only when someone touch it.

Ok, sounds reasonable.
 
>>  
>>>> +		if (stack == MAP_FAILED) {
>>>> +			pr_perror("Can't allocate stack");
>>>> +			return -1;
>>>> +		}
>>>> +		pid = clone(dump_user_ns, stack + stack_size - 1, CLONE_VM | CLONE_FILES | SIGCHLD, ns);
>>>> +		if (pid == -1) {
>>>> +			pr_perror("Can't clone");
>>>> +			return -1;
>>>> +		}
>>>> +		if (waitpid(pid, &status, 0) != pid) {
>>>> +			pr_perror("Unable to wait the %d process", pid);
>>>> +			return -1;
>>>> +		}
>>>> +		if (status) {
>>>> +			pr_err("Can't dump nested user_ns\n");
>>>> +			return -1;
>>>> +		}
>>>> +		stack_size = find_vma_size((unsigned long *)&stack);
>>>> +		munmap(stack, stack_size);
>>>> +		return 0;
>>>> +	} else {
>>>> +		if (__dump_user_ns(ns))
>>>> +			return -1;
>>>> +	}
>>>>  
>>>>  	return 0;
>>>>  }
>>>> @@ -1030,6 +1084,9 @@ static int check_user_ns(struct ns_id *ns)
>>>>  	int status;
>>>>  	pid_t chld;
>>>>  
>>>> +	if (ns->type != NS_ROOT)
>>>> +		return 0;
>>>> +
>>>>  	chld = fork();
>>>>  	if (chld == -1) {
>>>>  		pr_perror("Unable to fork a process");
>>>> @@ -1125,7 +1182,7 @@ static int check_user_ns(struct ns_id *ns)
>>>>  	return 0;
>>>>  }
>>>>  
>>>> -static int dump_user_ns(struct ns_id *ns)
>>>> +static int __dump_user_ns(struct ns_id *ns)
>>>>  {
>>>>  	int ret, exit_code = -1;
>>>>  	pid_t pid = ns->ns_pid;
>>>>
>>>> _______________________________________________
>>>> CRIU mailing list
>>>> CRIU@openvz.org
>>>> https://lists.openvz.org/mailman/listinfo/criu