[v3,49/55] pid_ns: Set user_ns before creation of pid_ns

Submitted by Kirill Tkhai on April 10, 2017, 8:23 a.m.

Details

Message ID 149181259332.17342.3512219954488642381.stgit@localhost.localdomain
State New
Series "Nested pid namespaces support"
Headers show

Commit Message

Kirill Tkhai April 10, 2017, 8:23 a.m.
Since child's pid_ns may have user_ns not equal
to parent's, and we do not want to lose parent's
user_ns (as it's not impossible to restore it back),
create the child from a sub-process.

v3: New

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 criu/cr-restore.c |  131 +++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 115 insertions(+), 16 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 780efed18..a73545fbd 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -1046,6 +1046,120 @@  static void maybe_clone_parent(struct pstree_item *item,
 	}
 }
 
+static int call_clone_fn(void *arg)
+{
+	struct cr_clone_arg *ca = arg;
+	struct ns_id *pid_ns;
+	pid_t pid;
+	int fd;
+
+	pid_ns = lookup_ns_by_id(ca->item->ids->pid_ns_id, &pid_ns_desc);
+	BUG_ON(!pid_ns);
+
+	if (set_next_pid(pid_ns, ca->item->pid) < 0) {
+		pr_err("Can't set next pid\n");
+		return -1;
+	}
+
+	fd = fdstore_get(pid_ns->user_ns->user.nsfd_id);
+	if (fd < 0) {
+		pr_err("Can't get ns fd\n");
+		return -1;
+	}
+
+	if (setns(fd, CLONE_NEWUSER) < 0) {
+		pr_perror("Can't set user ns");
+		close(fd);
+		return -1;
+	}
+	close(fd);
+	ca->item->user_ns = pid_ns->user_ns;
+
+	if (ca->clone_flags & CLONE_FILES)
+		close_pid_proc();
+
+	pid = clone_noasan(restore_task_with_children, ca->clone_flags | CLONE_PARENT | SIGCHLD, ca);
+
+	if (ca->item == root_item) {
+		ca->item->pid->real = pid;
+		pr_debug("PID: real %d virt %d\n", pid, vpid(ca->item));
+	}
+
+	return pid > 0 ? 0 : -1;
+}
+
+static int do_fork_with_pid(struct pstree_item *item, struct ns_id *pid_ns, struct cr_clone_arg *ca)
+{
+	int status, i, sz, ret = 0;
+	struct pid *hlp_pid;
+	sigset_t sig_mask;
+	pid_t pid;
+
+	if (!(ca->clone_flags & CLONE_NEWPID) || !current || current->user_ns == pid_ns->user_ns) {
+		if (set_next_pid(pid_ns, item->pid) < 0) {
+			pr_err("Can't set next pid\n");
+			return -1;
+		}
+		if (ca->clone_flags & CLONE_FILES)
+			close_pid_proc();
+		pid = clone_noasan(restore_task_with_children, ca->clone_flags | SIGCHLD, ca);
+		if (item == root_item) {
+			item->pid->real = pid;
+			pr_debug("PID: real %d virt %d\n", pid, vpid(item));
+		}
+		return pid > 0 ? 0 : -1;
+	}
+
+	sz = sizeof(struct pid) + (item->pid->level - 2) * sizeof(((struct pid *)NULL)->ns[0]);
+	hlp_pid = xmalloc(sz);
+	if (!hlp_pid)
+		return -1;
+	memcpy(hlp_pid, item->pid, sz);
+	hlp_pid->level--;
+
+	for (i = 0; i < hlp_pid->level; i++) {
+		/*
+		 * Choose helper's pid[] as child's pid[] + 1.
+		 * This should guarantee the helper will not
+		 * occupy child's pids.
+		 */
+		hlp_pid->ns[i].virt++;
+		if (hlp_pid->ns[i].virt < 0)
+			hlp_pid->ns[i].virt = INIT_PID + 1;
+	}
+	if (set_next_pid(pid_ns->parent, hlp_pid) < 0) {
+		pr_err("Can't set next pid\n");
+		xfree(hlp_pid);
+		return -1;
+	}
+	xfree(hlp_pid);
+
+	if (ca->clone_flags & CLONE_FILES)
+		close_pid_proc();
+
+	if (block_sigmask(&sig_mask, SIGCHLD) < 0)
+		return -1;
+
+	pid = clone_noasan(call_clone_fn, SIGCHLD, ca);
+	if (pid < 0) {
+		pr_perror("Can't clone()");
+		ret = -1;
+		goto unblock;
+	}
+
+	errno = 0;
+	if (waitpid(pid, &status, 0) < 0 || !WIFEXITED(status) || WEXITSTATUS(status)) {
+		pr_perror("Child process waiting: %d", status);
+		ret = -1;
+		goto unblock;
+	}
+
+unblock:
+	if (restore_sigmask(&sig_mask))
+		ret = -1;
+	return ret;
+}
+
 static inline int fork_with_pid(struct pstree_item *item)
 {
 	struct cr_clone_arg ca;
@@ -1114,27 +1228,12 @@  static inline int fork_with_pid(struct pstree_item *item)
 		goto err_close;
 	}
 
-	if (set_next_pid(pid_ns, item->pid) < 0) {
-		pr_err("Can't set next pid\n");
-		goto err_unlock;
-	}
-
-	if (ca.clone_flags & CLONE_FILES)
-		close_pid_proc();
-
-	ret = clone_noasan(restore_task_with_children, ca.clone_flags | SIGCHLD, &ca);
+	ret = do_fork_with_pid(item, pid_ns, &ca);
 	if (ret < 0) {
 		pr_perror("Can't fork for %d", pid);
 		goto err_unlock;
 	}
 
-
-	if (item == root_item) {
-		item->pid->real = ret;
-		pr_debug("PID: real %d virt %d\n",
-				item->pid->real, vpid(item));
-	}
-
 err_unlock:
 	if (flock(ca.fd, LOCK_UN))
 		pr_perror("%d: Can't unlock %s", pid, LAST_PID_PATH);

Comments

Andrey Vagin April 25, 2017, 5:26 a.m.
On Mon, Apr 10, 2017 at 11:23:13AM +0300, Kirill Tkhai wrote:
> Since child's pid_ns may have user_ns not equal
> to parent's, and we do not want to lose parent's
> user_ns (as it's not impossible to restore it back),
> create the child from a sub-process.
>

I don't understand what you do here. Could you elaborate and add
comments in code?

Thanks,
Andrei
 
> v3: New
> 
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> ---
>  criu/cr-restore.c |  131 +++++++++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 115 insertions(+), 16 deletions(-)
> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 780efed18..a73545fbd 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -1046,6 +1046,120 @@ static void maybe_clone_parent(struct pstree_item *item,
>  	}
>  }
>  
> +static int call_clone_fn(void *arg)
> +{
> +	struct cr_clone_arg *ca = arg;
> +	struct ns_id *pid_ns;
> +	pid_t pid;
> +	int fd;
> +
> +	pid_ns = lookup_ns_by_id(ca->item->ids->pid_ns_id, &pid_ns_desc);
> +	BUG_ON(!pid_ns);
> +
> +	if (set_next_pid(pid_ns, ca->item->pid) < 0) {
> +		pr_err("Can't set next pid\n");
> +		return -1;
> +	}
> +
> +	fd = fdstore_get(pid_ns->user_ns->user.nsfd_id);
> +	if (fd < 0) {
> +		pr_err("Can't get ns fd\n");
> +		return -1;
> +	}
> +
> +	if (setns(fd, CLONE_NEWUSER) < 0) {
> +		pr_perror("Can't set user ns");
> +		close(fd);
> +		return -1;
> +	}
> +	close(fd);
> +	ca->item->user_ns = pid_ns->user_ns;
> +
> +	if (ca->clone_flags & CLONE_FILES)
> +		close_pid_proc();
> +
> +	pid = clone_noasan(restore_task_with_children, ca->clone_flags | CLONE_PARENT | SIGCHLD, ca);
> +
> +	if (ca->item == root_item) {
> +		ca->item->pid->real = pid;
> +		pr_debug("PID: real %d virt %d\n", pid, vpid(ca->item));
> +	}
> +
> +	return pid > 0 ? 0 : -1;
> +}
> +
> +static int do_fork_with_pid(struct pstree_item *item, struct ns_id *pid_ns, struct cr_clone_arg *ca)
> +{
> +	int status, i, sz, ret = 0;
> +	struct pid *hlp_pid;
> +	sigset_t sig_mask;
> +	pid_t pid;
> +
> +	if (!(ca->clone_flags & CLONE_NEWPID) || !current || current->user_ns == pid_ns->user_ns) {
> +		if (set_next_pid(pid_ns, item->pid) < 0) {
> +			pr_err("Can't set next pid\n");
> +			return -1;
> +		}
> +		if (ca->clone_flags & CLONE_FILES)
> +			close_pid_proc();
> +		pid = clone_noasan(restore_task_with_children, ca->clone_flags | SIGCHLD, ca);
> +		if (item == root_item) {
> +			item->pid->real = pid;
> +			pr_debug("PID: real %d virt %d\n", pid, vpid(item));
> +		}
> +		return pid > 0 ? 0 : -1;
> +	}
> +
> +	sz = sizeof(struct pid) + (item->pid->level - 2) * sizeof(((struct pid *)NULL)->ns[0]);
> +	hlp_pid = xmalloc(sz);
> +	if (!hlp_pid)
> +		return -1;
> +	memcpy(hlp_pid, item->pid, sz);
> +	hlp_pid->level--;
> +
> +	for (i = 0; i < hlp_pid->level; i++) {
> +		/*
> +		 * Choose helper's pid[] as child's pid[] + 1.
> +		 * This should guarantee the helper will not
> +		 * occupy child's pids.
> +		 */
> +		hlp_pid->ns[i].virt++;
> +		if (hlp_pid->ns[i].virt < 0)
> +			hlp_pid->ns[i].virt = INIT_PID + 1;
> +	}
> +	if (set_next_pid(pid_ns->parent, hlp_pid) < 0) {
> +		pr_err("Can't set next pid\n");
> +		xfree(hlp_pid);
> +		return -1;
> +	}
> +	xfree(hlp_pid);
> +
> +	if (ca->clone_flags & CLONE_FILES)
> +		close_pid_proc();
> +
> +	if (block_sigmask(&sig_mask, SIGCHLD) < 0)
> +		return -1;
> +
> +	pid = clone_noasan(call_clone_fn, SIGCHLD, ca);
> +	if (pid < 0) {
> +		pr_perror("Can't clone()");
> +		ret = -1;
> +		goto unblock;
> +	}
> +
> +	errno = 0;
> +	if (waitpid(pid, &status, 0) < 0 || !WIFEXITED(status) || WEXITSTATUS(status)) {
> +		pr_perror("Child process waiting: %d", status);
> +		ret = -1;
> +		goto unblock;
> +	}
> +
> +unblock:
> +	if (restore_sigmask(&sig_mask))
> +		ret = -1;
> +	return ret;
> +}
> +
>  static inline int fork_with_pid(struct pstree_item *item)
>  {
>  	struct cr_clone_arg ca;
> @@ -1114,27 +1228,12 @@ static inline int fork_with_pid(struct pstree_item *item)
>  		goto err_close;
>  	}
>  
> -	if (set_next_pid(pid_ns, item->pid) < 0) {
> -		pr_err("Can't set next pid\n");
> -		goto err_unlock;
> -	}
> -
> -	if (ca.clone_flags & CLONE_FILES)
> -		close_pid_proc();
> -
> -	ret = clone_noasan(restore_task_with_children, ca.clone_flags | SIGCHLD, &ca);
> +	ret = do_fork_with_pid(item, pid_ns, &ca);
>  	if (ret < 0) {
>  		pr_perror("Can't fork for %d", pid);
>  		goto err_unlock;
>  	}
>  
> -
> -	if (item == root_item) {
> -		item->pid->real = ret;
> -		pr_debug("PID: real %d virt %d\n",
> -				item->pid->real, vpid(item));
> -	}
> -
>  err_unlock:
>  	if (flock(ca.fd, LOCK_UN))
>  		pr_perror("%d: Can't unlock %s", pid, LAST_PID_PATH);
>
Kirill Tkhai April 25, 2017, 12:11 p.m.
On 25.04.2017 08:26, Andrei Vagin wrote:
> On Mon, Apr 10, 2017 at 11:23:13AM +0300, Kirill Tkhai wrote:
>> Since child's pid_ns may have user_ns not equal
>> to parent's, and we do not want to lose parent's
>> user_ns (as it's not impossible to restore it back),
>> create the child from a sub-process.
>>
> 
> I don't understand what you do here. Could you elaborate and add
> comments in code?
> 
> Thanks,
> Andrei
>  
>> v3: New
>>
>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>> ---
>>  criu/cr-restore.c |  131 +++++++++++++++++++++++++++++++++++++++++++++++------
>>  1 file changed, 115 insertions(+), 16 deletions(-)
>>
>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>> index 780efed18..a73545fbd 100644
>> --- a/criu/cr-restore.c
>> +++ b/criu/cr-restore.c
>> @@ -1046,6 +1046,120 @@ static void maybe_clone_parent(struct pstree_item *item,
>>  	}
>>  }
>>  
>> +static int call_clone_fn(void *arg)
>> +{
>> +	struct cr_clone_arg *ca = arg;
>> +	struct ns_id *pid_ns;
>> +	pid_t pid;
>> +	int fd;
>> +
>> +	pid_ns = lookup_ns_by_id(ca->item->ids->pid_ns_id, &pid_ns_desc);
>> +	BUG_ON(!pid_ns);
>> +
>> +	if (set_next_pid(pid_ns, ca->item->pid) < 0) {
>> +		pr_err("Can't set next pid\n");
>> +		return -1;
>> +	}
>> +
>> +	fd = fdstore_get(pid_ns->user_ns->user.nsfd_id);
>> +	if (fd < 0) {
>> +		pr_err("Can't get ns fd\n");
>> +		return -1;
>> +	}
>> +

Lower user_ns:

>> +	if (setns(fd, CLONE_NEWUSER) < 0) {
>> +		pr_perror("Can't set user ns");
>> +		close(fd);
>> +		return -1;
>> +	}
>> +	close(fd);
>> +	ca->item->user_ns = pid_ns->user_ns;
>> +
>> +	if (ca->clone_flags & CLONE_FILES)
>> +		close_pid_proc();
>> +

Create children with CLONE_PARENT:

>> +	pid = clone_noasan(restore_task_with_children, ca->clone_flags | CLONE_PARENT | SIGCHLD, ca);
>> +
>> +	if (ca->item == root_item) {
>> +		ca->item->pid->real = pid;
>> +		pr_debug("PID: real %d virt %d\n", pid, vpid(ca->item));
>> +	}
>> +

Exit:

>> +	return pid > 0 ? 0 : -1;
>> +}
>> +
>> +static int do_fork_with_pid(struct pstree_item *item, struct ns_id *pid_ns, struct cr_clone_arg *ca)
>> +{
>> +	int status, i, sz, ret = 0;
>> +	struct pid *hlp_pid;
>> +	sigset_t sig_mask;
>> +	pid_t pid;
>> +
>> +	if (!(ca->clone_flags & CLONE_NEWPID) || !current || current->user_ns == pid_ns->user_ns) {

If we do not create a pid_ns, or user_ns of the pid_ns is similar to current's user_ns,
then create the pid_ns without any tricks.

>> +		if (set_next_pid(pid_ns, item->pid) < 0) {
>> +			pr_err("Can't set next pid\n");
>> +			return -1;
>> +		}
>> +		if (ca->clone_flags & CLONE_FILES)
>> +			close_pid_proc();
>> +		pid = clone_noasan(restore_task_with_children, ca->clone_flags | SIGCHLD, ca);
>> +		if (item == root_item) {
>> +			item->pid->real = pid;
>> +			pr_debug("PID: real %d virt %d\n", pid, vpid(item));
>> +		}
>> +		return pid > 0 ? 0 : -1;
>> +	}

Otherwise, create a helper, lower user_ns from current->user_ns to pid_ns->user_ns
and create the pid_ns.

Alloc memory for helper pid:
>> +
>> +	sz = sizeof(struct pid) + (item->pid->level - 2) * sizeof(((struct pid *)NULL)->ns[0]);
>> +	hlp_pid = xmalloc(sz);
>> +	if (!hlp_pid)
>> +		return -1;
>> +	memcpy(hlp_pid, item->pid, sz);
>> +	hlp_pid->level--;
>> +

Choose helper pid (see comment, how it's choosed):

>> +	for (i = 0; i < hlp_pid->level; i++) {
>> +		/*
>> +		 * Choose helper's pid[] as child's pid[] + 1.
>> +		 * This should guarantee the helper will not
>> +		 * occupy child's pids.
>> +		 */
>> +		hlp_pid->ns[i].virt++;
>> +		if (hlp_pid->ns[i].virt < 0)
>> +			hlp_pid->ns[i].virt = INIT_PID + 1;
>> +	}

Set helper pid as next:

>> +	if (set_next_pid(pid_ns->parent, hlp_pid) < 0) {
>> +		pr_err("Can't set next pid\n");
>> +		xfree(hlp_pid);
>> +		return -1;
>> +	}
>> +	xfree(hlp_pid);
>> +
>> +	if (ca->clone_flags & CLONE_FILES)
>> +		close_pid_proc();
>> +
>> +	if (block_sigmask(&sig_mask, SIGCHLD) < 0)
>> +		return -1;
>> +

Create helper:

>> +	pid = clone_noasan(call_clone_fn, SIGCHLD, ca);
>> +	if (pid < 0) {
>> +		pr_perror("Can't clone()");
>> +		ret = -1;
>> +		goto unblock;
>> +	}
>> +

Wait till helper dies:

>> +	errno = 0;
>> +	if (waitpid(pid, &status, 0) < 0 || !WIFEXITED(status) || WEXITSTATUS(status)) {
>> +		pr_perror("Child process waiting: %d", status);
>> +		ret = -1;
>> +		goto unblock;
>> +	}
>> +
>> +unblock:
>> +	if (restore_sigmask(&sig_mask))
>> +		ret = -1;
>> +	return ret;
>> +}
>> +
>>  static inline int fork_with_pid(struct pstree_item *item)
>>  {
>>  	struct cr_clone_arg ca;
>> @@ -1114,27 +1228,12 @@ static inline int fork_with_pid(struct pstree_item *item)
>>  		goto err_close;
>>  	}
>>  
>> -	if (set_next_pid(pid_ns, item->pid) < 0) {
>> -		pr_err("Can't set next pid\n");
>> -		goto err_unlock;
>> -	}
>> -
>> -	if (ca.clone_flags & CLONE_FILES)
>> -		close_pid_proc();
>> -
>> -	ret = clone_noasan(restore_task_with_children, ca.clone_flags | SIGCHLD, &ca);
>> +	ret = do_fork_with_pid(item, pid_ns, &ca);
>>  	if (ret < 0) {
>>  		pr_perror("Can't fork for %d", pid);
>>  		goto err_unlock;
>>  	}
>>  
>> -
>> -	if (item == root_item) {
>> -		item->pid->real = ret;
>> -		pr_debug("PID: real %d virt %d\n",
>> -				item->pid->real, vpid(item));
>> -	}
>> -
>>  err_unlock:
>>  	if (flock(ca.fd, LOCK_UN))
>>  		pr_perror("%d: Can't unlock %s", pid, LAST_PID_PATH);
>>
Andrey Vagin April 26, 2017, 7:53 a.m.
On Tue, Apr 25, 2017 at 03:11:32PM +0300, Kirill Tkhai wrote:
> On 25.04.2017 08:26, Andrei Vagin wrote:
> > On Mon, Apr 10, 2017 at 11:23:13AM +0300, Kirill Tkhai wrote:
> >> Since child's pid_ns may have user_ns not equal
> >> to parent's, and we do not want to lose parent's
> >> user_ns (as it's not impossible to restore it back),
> >> create the child from a sub-process.
> >>
> > 
> > I don't understand what you do here. Could you elaborate and add
> > comments in code?

Ok, now I understand the idea. And I have one more question. Now we
restore userns for processes before restoring files, so we can meet a
situation whe a process will not be able to restore a file due to lack
of permissions. How are you going to handle this case?

> > 
> > Thanks,
> > Andrei
> >  
> >> v3: New
> >>
> >> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> >> ---
> >>  criu/cr-restore.c |  131 +++++++++++++++++++++++++++++++++++++++++++++++------
> >>  1 file changed, 115 insertions(+), 16 deletions(-)
> >>
> >> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> >> index 780efed18..a73545fbd 100644
> >> --- a/criu/cr-restore.c
> >> +++ b/criu/cr-restore.c
> >> @@ -1046,6 +1046,120 @@ static void maybe_clone_parent(struct pstree_item *item,
> >>  	}
> >>  }
> >>  
> >> +static int call_clone_fn(void *arg)
> >> +{
> >> +	struct cr_clone_arg *ca = arg;
> >> +	struct ns_id *pid_ns;
> >> +	pid_t pid;
> >> +	int fd;
> >> +
> >> +	pid_ns = lookup_ns_by_id(ca->item->ids->pid_ns_id, &pid_ns_desc);
> >> +	BUG_ON(!pid_ns);
> >> +
> >> +	if (set_next_pid(pid_ns, ca->item->pid) < 0) {
> >> +		pr_err("Can't set next pid\n");
> >> +		return -1;
> >> +	}
> >> +
> >> +	fd = fdstore_get(pid_ns->user_ns->user.nsfd_id);
> >> +	if (fd < 0) {
> >> +		pr_err("Can't get ns fd\n");
> >> +		return -1;
> >> +	}
> >> +
> 
> Lower user_ns:
> 
> >> +	if (setns(fd, CLONE_NEWUSER) < 0) {
> >> +		pr_perror("Can't set user ns");
> >> +		close(fd);
> >> +		return -1;
> >> +	}
> >> +	close(fd);
> >> +	ca->item->user_ns = pid_ns->user_ns;
> >> +
> >> +	if (ca->clone_flags & CLONE_FILES)
> >> +		close_pid_proc();
> >> +
> 
> Create children with CLONE_PARENT:
> 
> >> +	pid = clone_noasan(restore_task_with_children, ca->clone_flags | CLONE_PARENT | SIGCHLD, ca);
> >> +
> >> +	if (ca->item == root_item) {
> >> +		ca->item->pid->real = pid;
> >> +		pr_debug("PID: real %d virt %d\n", pid, vpid(ca->item));
> >> +	}
> >> +
> 
> Exit:
> 
> >> +	return pid > 0 ? 0 : -1;
> >> +}
> >> +
> >> +static int do_fork_with_pid(struct pstree_item *item, struct ns_id *pid_ns, struct cr_clone_arg *ca)
> >> +{
> >> +	int status, i, sz, ret = 0;
> >> +	struct pid *hlp_pid;
> >> +	sigset_t sig_mask;
> >> +	pid_t pid;
> >> +
> >> +	if (!(ca->clone_flags & CLONE_NEWPID) || !current || current->user_ns == pid_ns->user_ns) {
> 
> If we do not create a pid_ns, or user_ns of the pid_ns is similar to current's user_ns,
> then create the pid_ns without any tricks.
> 
> >> +		if (set_next_pid(pid_ns, item->pid) < 0) {
> >> +			pr_err("Can't set next pid\n");
> >> +			return -1;
> >> +		}
> >> +		if (ca->clone_flags & CLONE_FILES)
> >> +			close_pid_proc();
> >> +		pid = clone_noasan(restore_task_with_children, ca->clone_flags | SIGCHLD, ca);
> >> +		if (item == root_item) {
> >> +			item->pid->real = pid;
> >> +			pr_debug("PID: real %d virt %d\n", pid, vpid(item));
> >> +		}
> >> +		return pid > 0 ? 0 : -1;
> >> +	}
> 
> Otherwise, create a helper, lower user_ns from current->user_ns to pid_ns->user_ns
> and create the pid_ns.
> 
> Alloc memory for helper pid:
> >> +
> >> +	sz = sizeof(struct pid) + (item->pid->level - 2) * sizeof(((struct pid *)NULL)->ns[0]);
> >> +	hlp_pid = xmalloc(sz);
> >> +	if (!hlp_pid)
> >> +		return -1;
> >> +	memcpy(hlp_pid, item->pid, sz);
> >> +	hlp_pid->level--;
> >> +
> 
> Choose helper pid (see comment, how it's choosed):
> 
> >> +	for (i = 0; i < hlp_pid->level; i++) {
> >> +		/*
> >> +		 * Choose helper's pid[] as child's pid[] + 1.
> >> +		 * This should guarantee the helper will not
> >> +		 * occupy child's pids.
> >> +		 */
> >> +		hlp_pid->ns[i].virt++;
> >> +		if (hlp_pid->ns[i].virt < 0)
> >> +			hlp_pid->ns[i].virt = INIT_PID + 1;
> >> +	}
> 
> Set helper pid as next:
> 
> >> +	if (set_next_pid(pid_ns->parent, hlp_pid) < 0) {
> >> +		pr_err("Can't set next pid\n");
> >> +		xfree(hlp_pid);
> >> +		return -1;
> >> +	}
> >> +	xfree(hlp_pid);
> >> +
> >> +	if (ca->clone_flags & CLONE_FILES)
> >> +		close_pid_proc();
> >> +
> >> +	if (block_sigmask(&sig_mask, SIGCHLD) < 0)
> >> +		return -1;
> >> +
> 
> Create helper:
> 
> >> +	pid = clone_noasan(call_clone_fn, SIGCHLD, ca);
> >> +	if (pid < 0) {
> >> +		pr_perror("Can't clone()");
> >> +		ret = -1;
> >> +		goto unblock;
> >> +	}
> >> +
> 
> Wait till helper dies:
> 
> >> +	errno = 0;
> >> +	if (waitpid(pid, &status, 0) < 0 || !WIFEXITED(status) || WEXITSTATUS(status)) {
> >> +		pr_perror("Child process waiting: %d", status);
> >> +		ret = -1;
> >> +		goto unblock;
> >> +	}
> >> +
> >> +unblock:
> >> +	if (restore_sigmask(&sig_mask))
> >> +		ret = -1;
> >> +	return ret;
> >> +}
> >> +
> >>  static inline int fork_with_pid(struct pstree_item *item)
> >>  {
> >>  	struct cr_clone_arg ca;
> >> @@ -1114,27 +1228,12 @@ static inline int fork_with_pid(struct pstree_item *item)
> >>  		goto err_close;
> >>  	}
> >>  
> >> -	if (set_next_pid(pid_ns, item->pid) < 0) {
> >> -		pr_err("Can't set next pid\n");
> >> -		goto err_unlock;
> >> -	}
> >> -
> >> -	if (ca.clone_flags & CLONE_FILES)
> >> -		close_pid_proc();
> >> -
> >> -	ret = clone_noasan(restore_task_with_children, ca.clone_flags | SIGCHLD, &ca);
> >> +	ret = do_fork_with_pid(item, pid_ns, &ca);
> >>  	if (ret < 0) {
> >>  		pr_perror("Can't fork for %d", pid);
> >>  		goto err_unlock;
> >>  	}
> >>  
> >> -
> >> -	if (item == root_item) {
> >> -		item->pid->real = ret;
> >> -		pr_debug("PID: real %d virt %d\n",
> >> -				item->pid->real, vpid(item));
> >> -	}
> >> -
> >>  err_unlock:
> >>  	if (flock(ca.fd, LOCK_UN))
> >>  		pr_perror("%d: Can't unlock %s", pid, LAST_PID_PATH);
> >>
Kirill Tkhai April 26, 2017, 10:13 a.m.
On 26.04.2017 10:53, Andrei Vagin wrote:
> On Tue, Apr 25, 2017 at 03:11:32PM +0300, Kirill Tkhai wrote:
>> On 25.04.2017 08:26, Andrei Vagin wrote:
>>> On Mon, Apr 10, 2017 at 11:23:13AM +0300, Kirill Tkhai wrote:
>>>> Since child's pid_ns may have user_ns not equal
>>>> to parent's, and we do not want to lose parent's
>>>> user_ns (as it's not impossible to restore it back),
>>>> create the child from a sub-process.
>>>>
>>>
>>> I don't understand what you do here. Could you elaborate and add
>>> comments in code?
> 
> Ok, now I understand the idea. And I have one more question. Now we
> restore userns for processes before restoring files, so we can meet a
> situation whe a process will not be able to restore a file due to lack
> of permissions. How are you going to handle this case?

There are two ways:
1)use usernsd to open the file
2)restore such files in processes, who have the permissions (name their fle master).

It does not contradict the current ns restore scheme, but now I'm far from choosing
a variant (since even ns-support sceleton is not officially experimental supported yet)
 
>>>
>>> Thanks,
>>> Andrei
>>>  
>>>> v3: New
>>>>
>>>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>>>> ---
>>>>  criu/cr-restore.c |  131 +++++++++++++++++++++++++++++++++++++++++++++++------
>>>>  1 file changed, 115 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>>>> index 780efed18..a73545fbd 100644
>>>> --- a/criu/cr-restore.c
>>>> +++ b/criu/cr-restore.c
>>>> @@ -1046,6 +1046,120 @@ static void maybe_clone_parent(struct pstree_item *item,
>>>>  	}
>>>>  }
>>>>  
>>>> +static int call_clone_fn(void *arg)
>>>> +{
>>>> +	struct cr_clone_arg *ca = arg;
>>>> +	struct ns_id *pid_ns;
>>>> +	pid_t pid;
>>>> +	int fd;
>>>> +
>>>> +	pid_ns = lookup_ns_by_id(ca->item->ids->pid_ns_id, &pid_ns_desc);
>>>> +	BUG_ON(!pid_ns);
>>>> +
>>>> +	if (set_next_pid(pid_ns, ca->item->pid) < 0) {
>>>> +		pr_err("Can't set next pid\n");
>>>> +		return -1;
>>>> +	}
>>>> +
>>>> +	fd = fdstore_get(pid_ns->user_ns->user.nsfd_id);
>>>> +	if (fd < 0) {
>>>> +		pr_err("Can't get ns fd\n");
>>>> +		return -1;
>>>> +	}
>>>> +
>>
>> Lower user_ns:
>>
>>>> +	if (setns(fd, CLONE_NEWUSER) < 0) {
>>>> +		pr_perror("Can't set user ns");
>>>> +		close(fd);
>>>> +		return -1;
>>>> +	}
>>>> +	close(fd);
>>>> +	ca->item->user_ns = pid_ns->user_ns;
>>>> +
>>>> +	if (ca->clone_flags & CLONE_FILES)
>>>> +		close_pid_proc();
>>>> +
>>
>> Create children with CLONE_PARENT:
>>
>>>> +	pid = clone_noasan(restore_task_with_children, ca->clone_flags | CLONE_PARENT | SIGCHLD, ca);
>>>> +
>>>> +	if (ca->item == root_item) {
>>>> +		ca->item->pid->real = pid;
>>>> +		pr_debug("PID: real %d virt %d\n", pid, vpid(ca->item));
>>>> +	}
>>>> +
>>
>> Exit:
>>
>>>> +	return pid > 0 ? 0 : -1;
>>>> +}
>>>> +
>>>> +static int do_fork_with_pid(struct pstree_item *item, struct ns_id *pid_ns, struct cr_clone_arg *ca)
>>>> +{
>>>> +	int status, i, sz, ret = 0;
>>>> +	struct pid *hlp_pid;
>>>> +	sigset_t sig_mask;
>>>> +	pid_t pid;
>>>> +
>>>> +	if (!(ca->clone_flags & CLONE_NEWPID) || !current || current->user_ns == pid_ns->user_ns) {
>>
>> If we do not create a pid_ns, or user_ns of the pid_ns is similar to current's user_ns,
>> then create the pid_ns without any tricks.
>>
>>>> +		if (set_next_pid(pid_ns, item->pid) < 0) {
>>>> +			pr_err("Can't set next pid\n");
>>>> +			return -1;
>>>> +		}
>>>> +		if (ca->clone_flags & CLONE_FILES)
>>>> +			close_pid_proc();
>>>> +		pid = clone_noasan(restore_task_with_children, ca->clone_flags | SIGCHLD, ca);
>>>> +		if (item == root_item) {
>>>> +			item->pid->real = pid;
>>>> +			pr_debug("PID: real %d virt %d\n", pid, vpid(item));
>>>> +		}
>>>> +		return pid > 0 ? 0 : -1;
>>>> +	}
>>
>> Otherwise, create a helper, lower user_ns from current->user_ns to pid_ns->user_ns
>> and create the pid_ns.
>>
>> Alloc memory for helper pid:
>>>> +
>>>> +	sz = sizeof(struct pid) + (item->pid->level - 2) * sizeof(((struct pid *)NULL)->ns[0]);
>>>> +	hlp_pid = xmalloc(sz);
>>>> +	if (!hlp_pid)
>>>> +		return -1;
>>>> +	memcpy(hlp_pid, item->pid, sz);
>>>> +	hlp_pid->level--;
>>>> +
>>
>> Choose helper pid (see comment, how it's choosed):
>>
>>>> +	for (i = 0; i < hlp_pid->level; i++) {
>>>> +		/*
>>>> +		 * Choose helper's pid[] as child's pid[] + 1.
>>>> +		 * This should guarantee the helper will not
>>>> +		 * occupy child's pids.
>>>> +		 */
>>>> +		hlp_pid->ns[i].virt++;
>>>> +		if (hlp_pid->ns[i].virt < 0)
>>>> +			hlp_pid->ns[i].virt = INIT_PID + 1;
>>>> +	}
>>
>> Set helper pid as next:
>>
>>>> +	if (set_next_pid(pid_ns->parent, hlp_pid) < 0) {
>>>> +		pr_err("Can't set next pid\n");
>>>> +		xfree(hlp_pid);
>>>> +		return -1;
>>>> +	}
>>>> +	xfree(hlp_pid);
>>>> +
>>>> +	if (ca->clone_flags & CLONE_FILES)
>>>> +		close_pid_proc();
>>>> +
>>>> +	if (block_sigmask(&sig_mask, SIGCHLD) < 0)
>>>> +		return -1;
>>>> +
>>
>> Create helper:
>>
>>>> +	pid = clone_noasan(call_clone_fn, SIGCHLD, ca);
>>>> +	if (pid < 0) {
>>>> +		pr_perror("Can't clone()");
>>>> +		ret = -1;
>>>> +		goto unblock;
>>>> +	}
>>>> +
>>
>> Wait till helper dies:
>>
>>>> +	errno = 0;
>>>> +	if (waitpid(pid, &status, 0) < 0 || !WIFEXITED(status) || WEXITSTATUS(status)) {
>>>> +		pr_perror("Child process waiting: %d", status);
>>>> +		ret = -1;
>>>> +		goto unblock;
>>>> +	}
>>>> +
>>>> +unblock:
>>>> +	if (restore_sigmask(&sig_mask))
>>>> +		ret = -1;
>>>> +	return ret;
>>>> +}
>>>> +
>>>>  static inline int fork_with_pid(struct pstree_item *item)
>>>>  {
>>>>  	struct cr_clone_arg ca;
>>>> @@ -1114,27 +1228,12 @@ static inline int fork_with_pid(struct pstree_item *item)
>>>>  		goto err_close;
>>>>  	}
>>>>  
>>>> -	if (set_next_pid(pid_ns, item->pid) < 0) {
>>>> -		pr_err("Can't set next pid\n");
>>>> -		goto err_unlock;
>>>> -	}
>>>> -
>>>> -	if (ca.clone_flags & CLONE_FILES)
>>>> -		close_pid_proc();
>>>> -
>>>> -	ret = clone_noasan(restore_task_with_children, ca.clone_flags | SIGCHLD, &ca);
>>>> +	ret = do_fork_with_pid(item, pid_ns, &ca);
>>>>  	if (ret < 0) {
>>>>  		pr_perror("Can't fork for %d", pid);
>>>>  		goto err_unlock;
>>>>  	}
>>>>  
>>>> -
>>>> -	if (item == root_item) {
>>>> -		item->pid->real = ret;
>>>> -		pr_debug("PID: real %d virt %d\n",
>>>> -				item->pid->real, vpid(item));
>>>> -	}
>>>> -
>>>>  err_unlock:
>>>>  	if (flock(ca.fd, LOCK_UN))
>>>>  		pr_perror("%d: Can't unlock %s", pid, LAST_PID_PATH);
>>>>