mount: restore mounts in the root mount namespace

Submitted by Andrei Vagin on Feb. 14, 2017, 9:49 p.m.

Details

Message ID 1487108965-26093-1-git-send-email-avagin@openvz.org
State Accepted
Series "mount: restore mounts in the root mount namespace"
Headers show

Commit Message

Andrei Vagin Feb. 14, 2017, 9:49 p.m.
From: Andrei Vagin <avagin@virtuozzo.com>

Currently mounts are restored in a mount namespace, then
this namespace is cloned to create mount namespaces for
processes and finaly the first namespace is destroyed after
cleaning remap files.

But it doesn't work in a case, when we have to open file
descriptores to restore mounts (e g to restore bind-mount
pty slaves), because on the next iteration criu will not
find mounts for this files.

In this patch the first namespace is restored as the root
mount namespace and we clone the first namespace to create
a namespace whichc is used to clean remap files.

$ ./zdtm.py run -t zdtm/static/pty-console --iter 5

Patch hide | download patch | download mbox

====================== Run zdtm/static/pty-console in ns =======================
Start test
Test is SUID
./pty-console --pidfile=pty-console.pid --outfile=pty-console.out
Run criu dump
Run criu restore
Run criu dump
=[log]=> dump/zdtm/static/pty-console/36/2/dump.log
------------------------ grep Error ------------------------
(00.106521) Error (criu/files-reg.c:1132): Can't lookup mount=563 for fd=4 path=/ptmx
(00.106585) Error (criu/cr-dump.c:1325): Dump files (pid: 70) failed with -1
(00.129041) Error (criu/cr-dump.c:1674): Dumping FAILED.
------------------------ ERROR OVER ------------------------

Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
---
 criu/cr-restore.c       | 21 +++++++++++++++++----
 criu/include/restorer.h |  5 +++++
 criu/mount.c            | 32 +++++++++++++++++++++++++++-----
 3 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index d75aa0d..e20036b 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -1431,6 +1431,9 @@  static int restore_task_with_children(void *_arg)
 		if (prepare_namespace(current, ca->clone_flags))
 			goto err;
 
+		if (restore_finish_stage(task_entries, CR_STATE_POST_RESTORE_NS) < 0)
+			goto err;
+
 		if (root_prepare_shared())
 			goto err;
 
@@ -1494,6 +1497,7 @@  static inline int stage_participants(int next_stage)
 	case CR_STATE_FAIL:
 		return 0;
 	case CR_STATE_RESTORE_NS:
+	case CR_STATE_POST_RESTORE_NS:
 	case CR_STATE_RESTORE_SHARED:
 		return 1;
 	case CR_STATE_FORKING:
@@ -1850,16 +1854,25 @@  static int restore_root_task(struct pstree_item *init)
 	if (ret)
 		goto out_kill;
 
+	ret = run_scripts(ACT_SETUP_NS);
+	if (ret)
+		goto out_kill;
+
+	ret = restore_switch_stage(CR_STATE_POST_RESTORE_NS);
+	if (ret < 0)
+		goto out_kill;
+
+	pr_info("Wait until namespaces are created\n");
+	ret = restore_wait_inprogress_tasks();
+	if (ret)
+		goto out_kill;
+
 	if (root_ns_mask & CLONE_NEWNS) {
 		mnt_ns_fd = open_proc(init->pid->real, "ns/mnt");
 		if (mnt_ns_fd < 0)
 			goto out_kill;
 	}
 
-	ret = run_scripts(ACT_SETUP_NS);
-	if (ret)
-		goto out_kill;
-
 	if (opts.empty_ns & CLONE_NEWNET) {
 		/*
 		 * Local TCP connections were locked by network_lock_internal()
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index 23c7e24..7c65538 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -203,6 +203,11 @@  static inline unsigned long restorer_stack(struct restore_mem_zone *mz)
 enum {
 	CR_STATE_FAIL		= -1,
 	CR_STATE_RESTORE_NS	= 0, /* is used for executing "setup-namespace" scripts */
+	/*
+	 * Need to wait a mount namespace which
+	 * will be used to clean up remap files.
+	 */
+	CR_STATE_POST_RESTORE_NS,
 	CR_STATE_RESTORE_SHARED,
 	CR_STATE_FORKING,
 	CR_STATE_RESTORE,
diff --git a/criu/mount.c b/criu/mount.c
index 8f66b4e..013b17e 100644
--- a/criu/mount.c
+++ b/criu/mount.c
@@ -2948,17 +2948,39 @@  int prepare_mnt_ns(void)
 			goto err;
 		}
 
+		if (nsid->type == NS_ROOT) {
+			int fd;
+
+			/*
+			 * We need to create a mount namespace which will be
+			 * used to clean up remap files
+			 * (depopulate_roots_yard).  The namespace where mounts
+			 * was restored has to be restored as a root mount
+			 * namespace, because there are file descriptors
+			 * linked with it (e.g. to bind-mount slave pty-s).
+			 */
+			fd = open_proc(PROC_SELF, "ns/mnt");
+			if (fd < 0)
+				goto err;
+			if (setns(rst, CLONE_NEWNS)) {
+				pr_perror("Can't restore mntns back");
+				goto err;
+			}
+			nsid->mnt.ns_fd = rst;
+			rst = fd;
+		} else {
+			/* Pin one with a file descriptor */
+			nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
+			if (nsid->mnt.ns_fd < 0)
+				goto err;
+		}
+
 		/* Set its root */
 		path[0] = '/';
 		print_ns_root(nsid, 0, path + 1, sizeof(path) - 1);
 		if (cr_pivot_root(path))
 			goto err;
 
-		/* Pin one with a file descriptor */
-		nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
-		if (nsid->mnt.ns_fd < 0)
-			goto err;
-
 		/* root_fd is used to restore file mappings */
 		nsid->mnt.root_fd = open_proc(PROC_SELF, "root");
 		if (nsid->mnt.root_fd < 0)

Comments

Pavel Emelianov Feb. 15, 2017, 10:26 a.m.
On 02/15/2017 12:49 AM, Andrei Vagin wrote:
> From: Andrei Vagin <avagin@virtuozzo.com>
> 
> Currently mounts are restored in a mount namespace, then
> this namespace is cloned to create mount namespaces for
> processes and finaly the first namespace is destroyed after
> cleaning remap files.
> 
> But it doesn't work in a case, when we have to open file
> descriptores to restore mounts (e g to restore bind-mount
> pty slaves), because on the next iteration criu will not
> find mounts for this files.

I don't get this. Would you describe the problem in more detail?

> In this patch the first namespace is restored as the root
> mount namespace and we clone the first namespace to create
> a namespace whichc is used to clean remap files.
> 
> $ ./zdtm.py run -t zdtm/static/pty-console --iter 5
> ====================== Run zdtm/static/pty-console in ns =======================
> Start test
> Test is SUID
> ./pty-console --pidfile=pty-console.pid --outfile=pty-console.out
> Run criu dump
> Run criu restore
> Run criu dump
> =[log]=> dump/zdtm/static/pty-console/36/2/dump.log
> ------------------------ grep Error ------------------------
> (00.106521) Error (criu/files-reg.c:1132): Can't lookup mount=563 for fd=4 path=/ptmx
> (00.106585) Error (criu/cr-dump.c:1325): Dump files (pid: 70) failed with -1
> (00.129041) Error (criu/cr-dump.c:1674): Dumping FAILED.
> ------------------------ ERROR OVER ------------------------
> 
> Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
> ---
>  criu/cr-restore.c       | 21 +++++++++++++++++----
>  criu/include/restorer.h |  5 +++++
>  criu/mount.c            | 32 +++++++++++++++++++++++++++-----
>  3 files changed, 49 insertions(+), 9 deletions(-)
> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index d75aa0d..e20036b 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -1431,6 +1431,9 @@ static int restore_task_with_children(void *_arg)
>  		if (prepare_namespace(current, ca->clone_flags))
>  			goto err;
>  
> +		if (restore_finish_stage(task_entries, CR_STATE_POST_RESTORE_NS) < 0)
> +			goto err;
> +
>  		if (root_prepare_shared())
>  			goto err;
>  
> @@ -1494,6 +1497,7 @@ static inline int stage_participants(int next_stage)
>  	case CR_STATE_FAIL:
>  		return 0;
>  	case CR_STATE_RESTORE_NS:
> +	case CR_STATE_POST_RESTORE_NS:
>  	case CR_STATE_RESTORE_SHARED:
>  		return 1;
>  	case CR_STATE_FORKING:
> @@ -1850,16 +1854,25 @@ static int restore_root_task(struct pstree_item *init)
>  	if (ret)
>  		goto out_kill;
>  
> +	ret = run_scripts(ACT_SETUP_NS);
> +	if (ret)
> +		goto out_kill;
> +
> +	ret = restore_switch_stage(CR_STATE_POST_RESTORE_NS);
> +	if (ret < 0)
> +		goto out_kill;
> +
> +	pr_info("Wait until namespaces are created\n");
> +	ret = restore_wait_inprogress_tasks();
> +	if (ret)
> +		goto out_kill;
> +
>  	if (root_ns_mask & CLONE_NEWNS) {
>  		mnt_ns_fd = open_proc(init->pid->real, "ns/mnt");
>  		if (mnt_ns_fd < 0)
>  			goto out_kill;
>  	}
>  
> -	ret = run_scripts(ACT_SETUP_NS);
> -	if (ret)
> -		goto out_kill;
> -
>  	if (opts.empty_ns & CLONE_NEWNET) {
>  		/*
>  		 * Local TCP connections were locked by network_lock_internal()
> diff --git a/criu/include/restorer.h b/criu/include/restorer.h
> index 23c7e24..7c65538 100644
> --- a/criu/include/restorer.h
> +++ b/criu/include/restorer.h
> @@ -203,6 +203,11 @@ static inline unsigned long restorer_stack(struct restore_mem_zone *mz)
>  enum {
>  	CR_STATE_FAIL		= -1,
>  	CR_STATE_RESTORE_NS	= 0, /* is used for executing "setup-namespace" scripts */
> +	/*
> +	 * Need to wait a mount namespace which
> +	 * will be used to clean up remap files.
> +	 */
> +	CR_STATE_POST_RESTORE_NS,
>  	CR_STATE_RESTORE_SHARED,
>  	CR_STATE_FORKING,
>  	CR_STATE_RESTORE,
> diff --git a/criu/mount.c b/criu/mount.c
> index 8f66b4e..013b17e 100644
> --- a/criu/mount.c
> +++ b/criu/mount.c
> @@ -2948,17 +2948,39 @@ int prepare_mnt_ns(void)
>  			goto err;
>  		}
>  
> +		if (nsid->type == NS_ROOT) {
> +			int fd;
> +
> +			/*
> +			 * We need to create a mount namespace which will be
> +			 * used to clean up remap files
> +			 * (depopulate_roots_yard).  The namespace where mounts
> +			 * was restored has to be restored as a root mount
> +			 * namespace, because there are file descriptors
> +			 * linked with it (e.g. to bind-mount slave pty-s).
> +			 */
> +			fd = open_proc(PROC_SELF, "ns/mnt");
> +			if (fd < 0)
> +				goto err;
> +			if (setns(rst, CLONE_NEWNS)) {
> +				pr_perror("Can't restore mntns back");
> +				goto err;
> +			}
> +			nsid->mnt.ns_fd = rst;
> +			rst = fd;
> +		} else {
> +			/* Pin one with a file descriptor */
> +			nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
> +			if (nsid->mnt.ns_fd < 0)
> +				goto err;
> +		}
> +
>  		/* Set its root */
>  		path[0] = '/';
>  		print_ns_root(nsid, 0, path + 1, sizeof(path) - 1);
>  		if (cr_pivot_root(path))
>  			goto err;
>  
> -		/* Pin one with a file descriptor */
> -		nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
> -		if (nsid->mnt.ns_fd < 0)
> -			goto err;
> -
>  		/* root_fd is used to restore file mappings */
>  		nsid->mnt.root_fd = open_proc(PROC_SELF, "root");
>  		if (nsid->mnt.root_fd < 0)
>
Andrey Vagin Feb. 15, 2017, 6:53 p.m.
On Wed, Feb 15, 2017 at 01:26:52PM +0300, Pavel Emelyanov wrote:
> On 02/15/2017 12:49 AM, Andrei Vagin wrote:
> > From: Andrei Vagin <avagin@virtuozzo.com>
> > 
> > Currently mounts are restored in a mount namespace, then
> > this namespace is cloned to create mount namespaces for
> > processes and finaly the first namespace is destroyed after
> > cleaning remap files.
> > 
> > But it doesn't work in a case, when we have to open file
> > descriptores to restore mounts (e g to restore bind-mount
> > pty slaves), because on the next iteration criu will not
> > find mounts for this files.
> 
> I don't get this. Would you describe the problem in more detail?

Look at this example:

The origin mount namespace has a bind mount /dev/pts/0 -> /dev/console

And one of processes has /dev/ptmx which is linked with /dev/pts/0

How we restore this construction:
1. Fork the init process into a new mount namespace (lets call it A)
2. Restore mounts
   - open /dev/ptmx to bind-mount /dev/pts/0 and save the master fd in
     fdstore
   - bind-mount /dev/pts/0 into /dev/console
3. Create namespace which will be installed to processes
4. Restore file descriptros
   - get the master fd for /dev/pts/0 from fdstore

This patch change the third step. You can see that we open a file
desctriptor in the A namespace, so if we don't intall it to one of
processes, it will be extern next time, becase criu will not be able to
find its mount by mntid.

Currently CRIU uses the A namespace to clean up remap files and clone
this namespace to restore namespaces for processes.

With this patch, the A namespace will be used as a root mount namespace
and we will clone it to create a mount namespace, which will be used to
clean up remap files.

> 
> > In this patch the first namespace is restored as the root
> > mount namespace and we clone the first namespace to create
> > a namespace whichc is used to clean remap files.
> > 
> > $ ./zdtm.py run -t zdtm/static/pty-console --iter 5
> > ====================== Run zdtm/static/pty-console in ns =======================
> > Start test
> > Test is SUID
> > ./pty-console --pidfile=pty-console.pid --outfile=pty-console.out
> > Run criu dump
> > Run criu restore
> > Run criu dump
> > =[log]=> dump/zdtm/static/pty-console/36/2/dump.log
> > ------------------------ grep Error ------------------------
> > (00.106521) Error (criu/files-reg.c:1132): Can't lookup mount=563 for fd=4 path=/ptmx
> > (00.106585) Error (criu/cr-dump.c:1325): Dump files (pid: 70) failed with -1
> > (00.129041) Error (criu/cr-dump.c:1674): Dumping FAILED.
> > ------------------------ ERROR OVER ------------------------
> > 
> > Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
> > ---
> >  criu/cr-restore.c       | 21 +++++++++++++++++----
> >  criu/include/restorer.h |  5 +++++
> >  criu/mount.c            | 32 +++++++++++++++++++++++++++-----
> >  3 files changed, 49 insertions(+), 9 deletions(-)
> > 
> > diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> > index d75aa0d..e20036b 100644
> > --- a/criu/cr-restore.c
> > +++ b/criu/cr-restore.c
> > @@ -1431,6 +1431,9 @@ static int restore_task_with_children(void *_arg)
> >  		if (prepare_namespace(current, ca->clone_flags))
> >  			goto err;
> >  
> > +		if (restore_finish_stage(task_entries, CR_STATE_POST_RESTORE_NS) < 0)
> > +			goto err;
> > +
> >  		if (root_prepare_shared())
> >  			goto err;
> >  
> > @@ -1494,6 +1497,7 @@ static inline int stage_participants(int next_stage)
> >  	case CR_STATE_FAIL:
> >  		return 0;
> >  	case CR_STATE_RESTORE_NS:
> > +	case CR_STATE_POST_RESTORE_NS:
> >  	case CR_STATE_RESTORE_SHARED:
> >  		return 1;
> >  	case CR_STATE_FORKING:
> > @@ -1850,16 +1854,25 @@ static int restore_root_task(struct pstree_item *init)
> >  	if (ret)
> >  		goto out_kill;
> >  
> > +	ret = run_scripts(ACT_SETUP_NS);
> > +	if (ret)
> > +		goto out_kill;
> > +
> > +	ret = restore_switch_stage(CR_STATE_POST_RESTORE_NS);
> > +	if (ret < 0)
> > +		goto out_kill;
> > +
> > +	pr_info("Wait until namespaces are created\n");
> > +	ret = restore_wait_inprogress_tasks();
> > +	if (ret)
> > +		goto out_kill;
> > +
> >  	if (root_ns_mask & CLONE_NEWNS) {
> >  		mnt_ns_fd = open_proc(init->pid->real, "ns/mnt");
> >  		if (mnt_ns_fd < 0)
> >  			goto out_kill;
> >  	}
> >  
> > -	ret = run_scripts(ACT_SETUP_NS);
> > -	if (ret)
> > -		goto out_kill;
> > -
> >  	if (opts.empty_ns & CLONE_NEWNET) {
> >  		/*
> >  		 * Local TCP connections were locked by network_lock_internal()
> > diff --git a/criu/include/restorer.h b/criu/include/restorer.h
> > index 23c7e24..7c65538 100644
> > --- a/criu/include/restorer.h
> > +++ b/criu/include/restorer.h
> > @@ -203,6 +203,11 @@ static inline unsigned long restorer_stack(struct restore_mem_zone *mz)
> >  enum {
> >  	CR_STATE_FAIL		= -1,
> >  	CR_STATE_RESTORE_NS	= 0, /* is used for executing "setup-namespace" scripts */
> > +	/*
> > +	 * Need to wait a mount namespace which
> > +	 * will be used to clean up remap files.
> > +	 */
> > +	CR_STATE_POST_RESTORE_NS,
> >  	CR_STATE_RESTORE_SHARED,
> >  	CR_STATE_FORKING,
> >  	CR_STATE_RESTORE,
> > diff --git a/criu/mount.c b/criu/mount.c
> > index 8f66b4e..013b17e 100644
> > --- a/criu/mount.c
> > +++ b/criu/mount.c
> > @@ -2948,17 +2948,39 @@ int prepare_mnt_ns(void)
> >  			goto err;
> >  		}
> >  
> > +		if (nsid->type == NS_ROOT) {
> > +			int fd;
> > +
> > +			/*
> > +			 * We need to create a mount namespace which will be
> > +			 * used to clean up remap files
> > +			 * (depopulate_roots_yard).  The namespace where mounts
> > +			 * was restored has to be restored as a root mount
> > +			 * namespace, because there are file descriptors
> > +			 * linked with it (e.g. to bind-mount slave pty-s).
> > +			 */
> > +			fd = open_proc(PROC_SELF, "ns/mnt");
> > +			if (fd < 0)
> > +				goto err;
> > +			if (setns(rst, CLONE_NEWNS)) {
> > +				pr_perror("Can't restore mntns back");
> > +				goto err;
> > +			}
> > +			nsid->mnt.ns_fd = rst;
> > +			rst = fd;
> > +		} else {
> > +			/* Pin one with a file descriptor */
> > +			nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
> > +			if (nsid->mnt.ns_fd < 0)
> > +				goto err;
> > +		}
> > +
> >  		/* Set its root */
> >  		path[0] = '/';
> >  		print_ns_root(nsid, 0, path + 1, sizeof(path) - 1);
> >  		if (cr_pivot_root(path))
> >  			goto err;
> >  
> > -		/* Pin one with a file descriptor */
> > -		nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
> > -		if (nsid->mnt.ns_fd < 0)
> > -			goto err;
> > -
> >  		/* root_fd is used to restore file mappings */
> >  		nsid->mnt.root_fd = open_proc(PROC_SELF, "root");
> >  		if (nsid->mnt.root_fd < 0)
> > 
>
Pavel Emelianov Feb. 15, 2017, 7:17 p.m.
On 02/15/2017 09:53 PM, Andrei Vagin wrote:
> On Wed, Feb 15, 2017 at 01:26:52PM +0300, Pavel Emelyanov wrote:
>> On 02/15/2017 12:49 AM, Andrei Vagin wrote:
>>> From: Andrei Vagin <avagin@virtuozzo.com>
>>>
>>> Currently mounts are restored in a mount namespace, then
>>> this namespace is cloned to create mount namespaces for
>>> processes and finaly the first namespace is destroyed after
>>> cleaning remap files.
>>>
>>> But it doesn't work in a case, when we have to open file
>>> descriptores to restore mounts (e g to restore bind-mount
>>> pty slaves), because on the next iteration criu will not
>>> find mounts for this files.
>>
>> I don't get this. Would you describe the problem in more detail?
> 
> Look at this example:
> 
> The origin mount namespace has a bind mount /dev/pts/0 -> /dev/console
> 
> And one of processes has /dev/ptmx which is linked with /dev/pts/0
> 
> How we restore this construction:
> 1. Fork the init process into a new mount namespace (lets call it A)
> 2. Restore mounts
>    - open /dev/ptmx to bind-mount /dev/pts/0 and save the master fd in
>      fdstore
>    - bind-mount /dev/pts/0 into /dev/console
> 3. Create namespace which will be installed to processes

Let's call it B :)

> 4. Restore file descriptros
>    - get the master fd for /dev/pts/0 from fdstore
> 
> This patch change the third step. You can see that we open a file
> desctriptor in the A namespace, so if we don't intall it to one of
> processes, it will be extern next time, becase criu will not be able to
> find its mount by mntid.

So the problem I see from this description is that in fdstore we keep
the descriptor for A, while should for B. Is that correct?

> Currently CRIU uses the A namespace to clean up remap files and clone
> this namespace to restore namespaces for processes.
> 
> With this patch, the A namespace will be used as a root mount namespace
> and we will clone it to create a mount namespace, which will be used to
> clean up remap files.
> 
>>
>>> In this patch the first namespace is restored as the root
>>> mount namespace and we clone the first namespace to create
>>> a namespace whichc is used to clean remap files.
>>>
>>> $ ./zdtm.py run -t zdtm/static/pty-console --iter 5
>>> ====================== Run zdtm/static/pty-console in ns =======================
>>> Start test
>>> Test is SUID
>>> ./pty-console --pidfile=pty-console.pid --outfile=pty-console.out
>>> Run criu dump
>>> Run criu restore
>>> Run criu dump
>>> =[log]=> dump/zdtm/static/pty-console/36/2/dump.log
>>> ------------------------ grep Error ------------------------
>>> (00.106521) Error (criu/files-reg.c:1132): Can't lookup mount=563 for fd=4 path=/ptmx
>>> (00.106585) Error (criu/cr-dump.c:1325): Dump files (pid: 70) failed with -1
>>> (00.129041) Error (criu/cr-dump.c:1674): Dumping FAILED.
>>> ------------------------ ERROR OVER ------------------------
>>>
>>> Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
>>> ---
>>>  criu/cr-restore.c       | 21 +++++++++++++++++----
>>>  criu/include/restorer.h |  5 +++++
>>>  criu/mount.c            | 32 +++++++++++++++++++++++++++-----
>>>  3 files changed, 49 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>>> index d75aa0d..e20036b 100644
>>> --- a/criu/cr-restore.c
>>> +++ b/criu/cr-restore.c
>>> @@ -1431,6 +1431,9 @@ static int restore_task_with_children(void *_arg)
>>>  		if (prepare_namespace(current, ca->clone_flags))
>>>  			goto err;
>>>  
>>> +		if (restore_finish_stage(task_entries, CR_STATE_POST_RESTORE_NS) < 0)
>>> +			goto err;
>>> +
>>>  		if (root_prepare_shared())
>>>  			goto err;
>>>  
>>> @@ -1494,6 +1497,7 @@ static inline int stage_participants(int next_stage)
>>>  	case CR_STATE_FAIL:
>>>  		return 0;
>>>  	case CR_STATE_RESTORE_NS:
>>> +	case CR_STATE_POST_RESTORE_NS:
>>>  	case CR_STATE_RESTORE_SHARED:
>>>  		return 1;
>>>  	case CR_STATE_FORKING:
>>> @@ -1850,16 +1854,25 @@ static int restore_root_task(struct pstree_item *init)
>>>  	if (ret)
>>>  		goto out_kill;
>>>  
>>> +	ret = run_scripts(ACT_SETUP_NS);
>>> +	if (ret)
>>> +		goto out_kill;
>>> +
>>> +	ret = restore_switch_stage(CR_STATE_POST_RESTORE_NS);
>>> +	if (ret < 0)
>>> +		goto out_kill;
>>> +
>>> +	pr_info("Wait until namespaces are created\n");
>>> +	ret = restore_wait_inprogress_tasks();
>>> +	if (ret)
>>> +		goto out_kill;
>>> +
>>>  	if (root_ns_mask & CLONE_NEWNS) {
>>>  		mnt_ns_fd = open_proc(init->pid->real, "ns/mnt");
>>>  		if (mnt_ns_fd < 0)
>>>  			goto out_kill;
>>>  	}
>>>  
>>> -	ret = run_scripts(ACT_SETUP_NS);
>>> -	if (ret)
>>> -		goto out_kill;
>>> -
>>>  	if (opts.empty_ns & CLONE_NEWNET) {
>>>  		/*
>>>  		 * Local TCP connections were locked by network_lock_internal()
>>> diff --git a/criu/include/restorer.h b/criu/include/restorer.h
>>> index 23c7e24..7c65538 100644
>>> --- a/criu/include/restorer.h
>>> +++ b/criu/include/restorer.h
>>> @@ -203,6 +203,11 @@ static inline unsigned long restorer_stack(struct restore_mem_zone *mz)
>>>  enum {
>>>  	CR_STATE_FAIL		= -1,
>>>  	CR_STATE_RESTORE_NS	= 0, /* is used for executing "setup-namespace" scripts */
>>> +	/*
>>> +	 * Need to wait a mount namespace which
>>> +	 * will be used to clean up remap files.
>>> +	 */
>>> +	CR_STATE_POST_RESTORE_NS,
>>>  	CR_STATE_RESTORE_SHARED,
>>>  	CR_STATE_FORKING,
>>>  	CR_STATE_RESTORE,
>>> diff --git a/criu/mount.c b/criu/mount.c
>>> index 8f66b4e..013b17e 100644
>>> --- a/criu/mount.c
>>> +++ b/criu/mount.c
>>> @@ -2948,17 +2948,39 @@ int prepare_mnt_ns(void)
>>>  			goto err;
>>>  		}
>>>  
>>> +		if (nsid->type == NS_ROOT) {
>>> +			int fd;
>>> +
>>> +			/*
>>> +			 * We need to create a mount namespace which will be
>>> +			 * used to clean up remap files
>>> +			 * (depopulate_roots_yard).  The namespace where mounts
>>> +			 * was restored has to be restored as a root mount
>>> +			 * namespace, because there are file descriptors
>>> +			 * linked with it (e.g. to bind-mount slave pty-s).
>>> +			 */
>>> +			fd = open_proc(PROC_SELF, "ns/mnt");
>>> +			if (fd < 0)
>>> +				goto err;
>>> +			if (setns(rst, CLONE_NEWNS)) {
>>> +				pr_perror("Can't restore mntns back");
>>> +				goto err;
>>> +			}
>>> +			nsid->mnt.ns_fd = rst;
>>> +			rst = fd;
>>> +		} else {
>>> +			/* Pin one with a file descriptor */
>>> +			nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
>>> +			if (nsid->mnt.ns_fd < 0)
>>> +				goto err;
>>> +		}
>>> +
>>>  		/* Set its root */
>>>  		path[0] = '/';
>>>  		print_ns_root(nsid, 0, path + 1, sizeof(path) - 1);
>>>  		if (cr_pivot_root(path))
>>>  			goto err;
>>>  
>>> -		/* Pin one with a file descriptor */
>>> -		nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
>>> -		if (nsid->mnt.ns_fd < 0)
>>> -			goto err;
>>> -
>>>  		/* root_fd is used to restore file mappings */
>>>  		nsid->mnt.root_fd = open_proc(PROC_SELF, "root");
>>>  		if (nsid->mnt.root_fd < 0)
>>>
>>
> .
>
Andrey Vagin Feb. 15, 2017, 7:25 p.m.
On Wed, Feb 15, 2017 at 10:17:53PM +0300, Pavel Emelyanov wrote:
> On 02/15/2017 09:53 PM, Andrei Vagin wrote:
> > On Wed, Feb 15, 2017 at 01:26:52PM +0300, Pavel Emelyanov wrote:
> >> On 02/15/2017 12:49 AM, Andrei Vagin wrote:
> >>> From: Andrei Vagin <avagin@virtuozzo.com>
> >>>
> >>> Currently mounts are restored in a mount namespace, then
> >>> this namespace is cloned to create mount namespaces for
> >>> processes and finaly the first namespace is destroyed after
> >>> cleaning remap files.
> >>>
> >>> But it doesn't work in a case, when we have to open file
> >>> descriptores to restore mounts (e g to restore bind-mount
> >>> pty slaves), because on the next iteration criu will not
> >>> find mounts for this files.
> >>
> >> I don't get this. Would you describe the problem in more detail?
> > 
> > Look at this example:
> > 
> > The origin mount namespace has a bind mount /dev/pts/0 -> /dev/console
> > 
> > And one of processes has /dev/ptmx which is linked with /dev/pts/0
> > 
> > How we restore this construction:
> > 1. Fork the init process into a new mount namespace (lets call it A)
> > 2. Restore mounts
> >    - open /dev/ptmx to bind-mount /dev/pts/0 and save the master fd in
> >      fdstore
> >    - bind-mount /dev/pts/0 into /dev/console
> > 3. Create namespace which will be installed to processes
> 
> Let's call it B :)
> 
> > 4. Restore file descriptros
> >    - get the master fd for /dev/pts/0 from fdstore
> > 
> > This patch change the third step. You can see that we open a file
> > desctriptor in the A namespace, so if we don't intall it to one of
> > processes, it will be extern next time, becase criu will not be able to
> > find its mount by mntid.
> 
> So the problem I see from this description is that in fdstore we keep
> the descriptor for A, while should for B. Is that correct?

or A should be installed as a restore namespace, what I do in this
patch.

> 
> > Currently CRIU uses the A namespace to clean up remap files and clone
> > this namespace to restore namespaces for processes.
> > 
> > With this patch, the A namespace will be used as a root mount namespace
> > and we will clone it to create a mount namespace, which will be used to
> > clean up remap files.
> > 
> >>
> >>> In this patch the first namespace is restored as the root
> >>> mount namespace and we clone the first namespace to create
> >>> a namespace whichc is used to clean remap files.
> >>>
> >>> $ ./zdtm.py run -t zdtm/static/pty-console --iter 5
> >>> ====================== Run zdtm/static/pty-console in ns =======================
> >>> Start test
> >>> Test is SUID
> >>> ./pty-console --pidfile=pty-console.pid --outfile=pty-console.out
> >>> Run criu dump
> >>> Run criu restore
> >>> Run criu dump
> >>> =[log]=> dump/zdtm/static/pty-console/36/2/dump.log
> >>> ------------------------ grep Error ------------------------
> >>> (00.106521) Error (criu/files-reg.c:1132): Can't lookup mount=563 for fd=4 path=/ptmx
> >>> (00.106585) Error (criu/cr-dump.c:1325): Dump files (pid: 70) failed with -1
> >>> (00.129041) Error (criu/cr-dump.c:1674): Dumping FAILED.
> >>> ------------------------ ERROR OVER ------------------------
> >>>
> >>> Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
> >>> ---
> >>>  criu/cr-restore.c       | 21 +++++++++++++++++----
> >>>  criu/include/restorer.h |  5 +++++
> >>>  criu/mount.c            | 32 +++++++++++++++++++++++++++-----
> >>>  3 files changed, 49 insertions(+), 9 deletions(-)
> >>>
> >>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> >>> index d75aa0d..e20036b 100644
> >>> --- a/criu/cr-restore.c
> >>> +++ b/criu/cr-restore.c
> >>> @@ -1431,6 +1431,9 @@ static int restore_task_with_children(void *_arg)
> >>>  		if (prepare_namespace(current, ca->clone_flags))
> >>>  			goto err;
> >>>  
> >>> +		if (restore_finish_stage(task_entries, CR_STATE_POST_RESTORE_NS) < 0)
> >>> +			goto err;
> >>> +
> >>>  		if (root_prepare_shared())
> >>>  			goto err;
> >>>  
> >>> @@ -1494,6 +1497,7 @@ static inline int stage_participants(int next_stage)
> >>>  	case CR_STATE_FAIL:
> >>>  		return 0;
> >>>  	case CR_STATE_RESTORE_NS:
> >>> +	case CR_STATE_POST_RESTORE_NS:
> >>>  	case CR_STATE_RESTORE_SHARED:
> >>>  		return 1;
> >>>  	case CR_STATE_FORKING:
> >>> @@ -1850,16 +1854,25 @@ static int restore_root_task(struct pstree_item *init)
> >>>  	if (ret)
> >>>  		goto out_kill;
> >>>  
> >>> +	ret = run_scripts(ACT_SETUP_NS);
> >>> +	if (ret)
> >>> +		goto out_kill;
> >>> +
> >>> +	ret = restore_switch_stage(CR_STATE_POST_RESTORE_NS);
> >>> +	if (ret < 0)
> >>> +		goto out_kill;
> >>> +
> >>> +	pr_info("Wait until namespaces are created\n");
> >>> +	ret = restore_wait_inprogress_tasks();
> >>> +	if (ret)
> >>> +		goto out_kill;
> >>> +
> >>>  	if (root_ns_mask & CLONE_NEWNS) {
> >>>  		mnt_ns_fd = open_proc(init->pid->real, "ns/mnt");
> >>>  		if (mnt_ns_fd < 0)
> >>>  			goto out_kill;
> >>>  	}
> >>>  
> >>> -	ret = run_scripts(ACT_SETUP_NS);
> >>> -	if (ret)
> >>> -		goto out_kill;
> >>> -
> >>>  	if (opts.empty_ns & CLONE_NEWNET) {
> >>>  		/*
> >>>  		 * Local TCP connections were locked by network_lock_internal()
> >>> diff --git a/criu/include/restorer.h b/criu/include/restorer.h
> >>> index 23c7e24..7c65538 100644
> >>> --- a/criu/include/restorer.h
> >>> +++ b/criu/include/restorer.h
> >>> @@ -203,6 +203,11 @@ static inline unsigned long restorer_stack(struct restore_mem_zone *mz)
> >>>  enum {
> >>>  	CR_STATE_FAIL		= -1,
> >>>  	CR_STATE_RESTORE_NS	= 0, /* is used for executing "setup-namespace" scripts */
> >>> +	/*
> >>> +	 * Need to wait a mount namespace which
> >>> +	 * will be used to clean up remap files.
> >>> +	 */
> >>> +	CR_STATE_POST_RESTORE_NS,
> >>>  	CR_STATE_RESTORE_SHARED,
> >>>  	CR_STATE_FORKING,
> >>>  	CR_STATE_RESTORE,
> >>> diff --git a/criu/mount.c b/criu/mount.c
> >>> index 8f66b4e..013b17e 100644
> >>> --- a/criu/mount.c
> >>> +++ b/criu/mount.c
> >>> @@ -2948,17 +2948,39 @@ int prepare_mnt_ns(void)
> >>>  			goto err;
> >>>  		}
> >>>  
> >>> +		if (nsid->type == NS_ROOT) {
> >>> +			int fd;
> >>> +
> >>> +			/*
> >>> +			 * We need to create a mount namespace which will be
> >>> +			 * used to clean up remap files
> >>> +			 * (depopulate_roots_yard).  The namespace where mounts
> >>> +			 * was restored has to be restored as a root mount
> >>> +			 * namespace, because there are file descriptors
> >>> +			 * linked with it (e.g. to bind-mount slave pty-s).
> >>> +			 */
> >>> +			fd = open_proc(PROC_SELF, "ns/mnt");
> >>> +			if (fd < 0)
> >>> +				goto err;
> >>> +			if (setns(rst, CLONE_NEWNS)) {
> >>> +				pr_perror("Can't restore mntns back");
> >>> +				goto err;
> >>> +			}
> >>> +			nsid->mnt.ns_fd = rst;
> >>> +			rst = fd;
> >>> +		} else {
> >>> +			/* Pin one with a file descriptor */
> >>> +			nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
> >>> +			if (nsid->mnt.ns_fd < 0)
> >>> +				goto err;
> >>> +		}
> >>> +
> >>>  		/* Set its root */
> >>>  		path[0] = '/';
> >>>  		print_ns_root(nsid, 0, path + 1, sizeof(path) - 1);
> >>>  		if (cr_pivot_root(path))
> >>>  			goto err;
> >>>  
> >>> -		/* Pin one with a file descriptor */
> >>> -		nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
> >>> -		if (nsid->mnt.ns_fd < 0)
> >>> -			goto err;
> >>> -
> >>>  		/* root_fd is used to restore file mappings */
> >>>  		nsid->mnt.root_fd = open_proc(PROC_SELF, "root");
> >>>  		if (nsid->mnt.root_fd < 0)
> >>>
> >>
> > .
> > 
>
Pavel Emelianov Feb. 15, 2017, 7:34 p.m.
On 02/15/2017 10:25 PM, Andrei Vagin wrote:
> On Wed, Feb 15, 2017 at 10:17:53PM +0300, Pavel Emelyanov wrote:
>> On 02/15/2017 09:53 PM, Andrei Vagin wrote:
>>> On Wed, Feb 15, 2017 at 01:26:52PM +0300, Pavel Emelyanov wrote:
>>>> On 02/15/2017 12:49 AM, Andrei Vagin wrote:
>>>>> From: Andrei Vagin <avagin@virtuozzo.com>
>>>>>
>>>>> Currently mounts are restored in a mount namespace, then
>>>>> this namespace is cloned to create mount namespaces for
>>>>> processes and finaly the first namespace is destroyed after
>>>>> cleaning remap files.
>>>>>
>>>>> But it doesn't work in a case, when we have to open file
>>>>> descriptores to restore mounts (e g to restore bind-mount
>>>>> pty slaves), because on the next iteration criu will not
>>>>> find mounts for this files.
>>>>
>>>> I don't get this. Would you describe the problem in more detail?
>>>
>>> Look at this example:
>>>
>>> The origin mount namespace has a bind mount /dev/pts/0 -> /dev/console
>>>
>>> And one of processes has /dev/ptmx which is linked with /dev/pts/0
>>>
>>> How we restore this construction:
>>> 1. Fork the init process into a new mount namespace (lets call it A)
>>> 2. Restore mounts
>>>    - open /dev/ptmx to bind-mount /dev/pts/0 and save the master fd in
>>>      fdstore
>>>    - bind-mount /dev/pts/0 into /dev/console
>>> 3. Create namespace which will be installed to processes
>>
>> Let's call it B :)
>>
>>> 4. Restore file descriptros
>>>    - get the master fd for /dev/pts/0 from fdstore
>>>
>>> This patch change the third step. You can see that we open a file
>>> desctriptor in the A namespace, so if we don't intall it to one of
>>> processes, it will be extern next time, becase criu will not be able to
>>> find its mount by mntid.
>>
>> So the problem I see from this description is that in fdstore we keep
>> the descriptor for A, while should for B. Is that correct?
> 
> or A should be installed as a restore namespace, what I do in this
> patch.

Yes, but ... with the need for yet another "stage" :\ Maybe fixing fdstore
fd would be simpler?

BTW, why do you say fdstore, we don't keep mnt ns fds in fdstore, do we?

>>
>>> Currently CRIU uses the A namespace to clean up remap files and clone
>>> this namespace to restore namespaces for processes.
>>>
>>> With this patch, the A namespace will be used as a root mount namespace
>>> and we will clone it to create a mount namespace, which will be used to
>>> clean up remap files.
>>>
>>>>
>>>>> In this patch the first namespace is restored as the root
>>>>> mount namespace and we clone the first namespace to create
>>>>> a namespace whichc is used to clean remap files.
>>>>>
>>>>> $ ./zdtm.py run -t zdtm/static/pty-console --iter 5
>>>>> ====================== Run zdtm/static/pty-console in ns =======================
>>>>> Start test
>>>>> Test is SUID
>>>>> ./pty-console --pidfile=pty-console.pid --outfile=pty-console.out
>>>>> Run criu dump
>>>>> Run criu restore
>>>>> Run criu dump
>>>>> =[log]=> dump/zdtm/static/pty-console/36/2/dump.log
>>>>> ------------------------ grep Error ------------------------
>>>>> (00.106521) Error (criu/files-reg.c:1132): Can't lookup mount=563 for fd=4 path=/ptmx
>>>>> (00.106585) Error (criu/cr-dump.c:1325): Dump files (pid: 70) failed with -1
>>>>> (00.129041) Error (criu/cr-dump.c:1674): Dumping FAILED.
>>>>> ------------------------ ERROR OVER ------------------------
>>>>>
>>>>> Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
>>>>> ---
>>>>>  criu/cr-restore.c       | 21 +++++++++++++++++----
>>>>>  criu/include/restorer.h |  5 +++++
>>>>>  criu/mount.c            | 32 +++++++++++++++++++++++++++-----
>>>>>  3 files changed, 49 insertions(+), 9 deletions(-)
>>>>>
>>>>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>>>>> index d75aa0d..e20036b 100644
>>>>> --- a/criu/cr-restore.c
>>>>> +++ b/criu/cr-restore.c
>>>>> @@ -1431,6 +1431,9 @@ static int restore_task_with_children(void *_arg)
>>>>>  		if (prepare_namespace(current, ca->clone_flags))
>>>>>  			goto err;
>>>>>  
>>>>> +		if (restore_finish_stage(task_entries, CR_STATE_POST_RESTORE_NS) < 0)
>>>>> +			goto err;
>>>>> +
>>>>>  		if (root_prepare_shared())
>>>>>  			goto err;
>>>>>  
>>>>> @@ -1494,6 +1497,7 @@ static inline int stage_participants(int next_stage)
>>>>>  	case CR_STATE_FAIL:
>>>>>  		return 0;
>>>>>  	case CR_STATE_RESTORE_NS:
>>>>> +	case CR_STATE_POST_RESTORE_NS:
>>>>>  	case CR_STATE_RESTORE_SHARED:
>>>>>  		return 1;
>>>>>  	case CR_STATE_FORKING:
>>>>> @@ -1850,16 +1854,25 @@ static int restore_root_task(struct pstree_item *init)
>>>>>  	if (ret)
>>>>>  		goto out_kill;
>>>>>  
>>>>> +	ret = run_scripts(ACT_SETUP_NS);
>>>>> +	if (ret)
>>>>> +		goto out_kill;
>>>>> +
>>>>> +	ret = restore_switch_stage(CR_STATE_POST_RESTORE_NS);
>>>>> +	if (ret < 0)
>>>>> +		goto out_kill;
>>>>> +
>>>>> +	pr_info("Wait until namespaces are created\n");
>>>>> +	ret = restore_wait_inprogress_tasks();
>>>>> +	if (ret)
>>>>> +		goto out_kill;
>>>>> +
>>>>>  	if (root_ns_mask & CLONE_NEWNS) {
>>>>>  		mnt_ns_fd = open_proc(init->pid->real, "ns/mnt");
>>>>>  		if (mnt_ns_fd < 0)
>>>>>  			goto out_kill;
>>>>>  	}
>>>>>  
>>>>> -	ret = run_scripts(ACT_SETUP_NS);
>>>>> -	if (ret)
>>>>> -		goto out_kill;
>>>>> -
>>>>>  	if (opts.empty_ns & CLONE_NEWNET) {
>>>>>  		/*
>>>>>  		 * Local TCP connections were locked by network_lock_internal()
>>>>> diff --git a/criu/include/restorer.h b/criu/include/restorer.h
>>>>> index 23c7e24..7c65538 100644
>>>>> --- a/criu/include/restorer.h
>>>>> +++ b/criu/include/restorer.h
>>>>> @@ -203,6 +203,11 @@ static inline unsigned long restorer_stack(struct restore_mem_zone *mz)
>>>>>  enum {
>>>>>  	CR_STATE_FAIL		= -1,
>>>>>  	CR_STATE_RESTORE_NS	= 0, /* is used for executing "setup-namespace" scripts */
>>>>> +	/*
>>>>> +	 * Need to wait a mount namespace which
>>>>> +	 * will be used to clean up remap files.
>>>>> +	 */
>>>>> +	CR_STATE_POST_RESTORE_NS,
>>>>>  	CR_STATE_RESTORE_SHARED,
>>>>>  	CR_STATE_FORKING,
>>>>>  	CR_STATE_RESTORE,
>>>>> diff --git a/criu/mount.c b/criu/mount.c
>>>>> index 8f66b4e..013b17e 100644
>>>>> --- a/criu/mount.c
>>>>> +++ b/criu/mount.c
>>>>> @@ -2948,17 +2948,39 @@ int prepare_mnt_ns(void)
>>>>>  			goto err;
>>>>>  		}
>>>>>  
>>>>> +		if (nsid->type == NS_ROOT) {
>>>>> +			int fd;
>>>>> +
>>>>> +			/*
>>>>> +			 * We need to create a mount namespace which will be
>>>>> +			 * used to clean up remap files
>>>>> +			 * (depopulate_roots_yard).  The namespace where mounts
>>>>> +			 * was restored has to be restored as a root mount
>>>>> +			 * namespace, because there are file descriptors
>>>>> +			 * linked with it (e.g. to bind-mount slave pty-s).
>>>>> +			 */
>>>>> +			fd = open_proc(PROC_SELF, "ns/mnt");
>>>>> +			if (fd < 0)
>>>>> +				goto err;
>>>>> +			if (setns(rst, CLONE_NEWNS)) {
>>>>> +				pr_perror("Can't restore mntns back");
>>>>> +				goto err;
>>>>> +			}
>>>>> +			nsid->mnt.ns_fd = rst;
>>>>> +			rst = fd;
>>>>> +		} else {
>>>>> +			/* Pin one with a file descriptor */
>>>>> +			nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
>>>>> +			if (nsid->mnt.ns_fd < 0)
>>>>> +				goto err;
>>>>> +		}
>>>>> +
>>>>>  		/* Set its root */
>>>>>  		path[0] = '/';
>>>>>  		print_ns_root(nsid, 0, path + 1, sizeof(path) - 1);
>>>>>  		if (cr_pivot_root(path))
>>>>>  			goto err;
>>>>>  
>>>>> -		/* Pin one with a file descriptor */
>>>>> -		nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
>>>>> -		if (nsid->mnt.ns_fd < 0)
>>>>> -			goto err;
>>>>> -
>>>>>  		/* root_fd is used to restore file mappings */
>>>>>  		nsid->mnt.root_fd = open_proc(PROC_SELF, "root");
>>>>>  		if (nsid->mnt.root_fd < 0)
>>>>>
>>>>
>>> .
>>>
>>
> .
>
Pavel Emelianov Feb. 16, 2017, 10:44 a.m.
On 02/15/2017 10:25 PM, Andrei Vagin wrote:
> On Wed, Feb 15, 2017 at 10:17:53PM +0300, Pavel Emelyanov wrote:
>> On 02/15/2017 09:53 PM, Andrei Vagin wrote:
>>> On Wed, Feb 15, 2017 at 01:26:52PM +0300, Pavel Emelyanov wrote:
>>>> On 02/15/2017 12:49 AM, Andrei Vagin wrote:
>>>>> From: Andrei Vagin <avagin@virtuozzo.com>
>>>>>
>>>>> Currently mounts are restored in a mount namespace, then
>>>>> this namespace is cloned to create mount namespaces for
>>>>> processes and finaly the first namespace is destroyed after
>>>>> cleaning remap files.
>>>>>
>>>>> But it doesn't work in a case, when we have to open file
>>>>> descriptores to restore mounts (e g to restore bind-mount
>>>>> pty slaves), because on the next iteration criu will not
>>>>> find mounts for this files.
>>>>
>>>> I don't get this. Would you describe the problem in more detail?
>>>
>>> Look at this example:
>>>
>>> The origin mount namespace has a bind mount /dev/pts/0 -> /dev/console
>>>
>>> And one of processes has /dev/ptmx which is linked with /dev/pts/0
>>>
>>> How we restore this construction:
>>> 1. Fork the init process into a new mount namespace (lets call it A)
>>> 2. Restore mounts
>>>    - open /dev/ptmx to bind-mount /dev/pts/0 and save the master fd in
>>>      fdstore
>>>    - bind-mount /dev/pts/0 into /dev/console
>>> 3. Create namespace which will be installed to processes
>>
>> Let's call it B :)
>>
>>> 4. Restore file descriptros
>>>    - get the master fd for /dev/pts/0 from fdstore
>>>
>>> This patch change the third step. You can see that we open a file
>>> desctriptor in the A namespace, so if we don't intall it to one of
>>> processes, it will be extern next time, becase criu will not be able to
>>> find its mount by mntid.
>>
>> So the problem I see from this description is that in fdstore we keep
>> the descriptor for A, while should for B. Is that correct?
> 
> or A should be installed as a restore namespace, what I do in this
> patch.

Good. So let's do this:

1. Fixup patch comment to describe the problem in more details :)
2. Add checks for external tty-s in sub-namespaces (namespace C in
   currently accepted terminology) and abort dump.

-- Pavel

>>
>>> Currently CRIU uses the A namespace to clean up remap files and clone
>>> this namespace to restore namespaces for processes.
>>>
>>> With this patch, the A namespace will be used as a root mount namespace
>>> and we will clone it to create a mount namespace, which will be used to
>>> clean up remap files.
>>>
>>>>
>>>>> In this patch the first namespace is restored as the root
>>>>> mount namespace and we clone the first namespace to create
>>>>> a namespace whichc is used to clean remap files.
>>>>>
>>>>> $ ./zdtm.py run -t zdtm/static/pty-console --iter 5
>>>>> ====================== Run zdtm/static/pty-console in ns =======================
>>>>> Start test
>>>>> Test is SUID
>>>>> ./pty-console --pidfile=pty-console.pid --outfile=pty-console.out
>>>>> Run criu dump
>>>>> Run criu restore
>>>>> Run criu dump
>>>>> =[log]=> dump/zdtm/static/pty-console/36/2/dump.log
>>>>> ------------------------ grep Error ------------------------
>>>>> (00.106521) Error (criu/files-reg.c:1132): Can't lookup mount=563 for fd=4 path=/ptmx
>>>>> (00.106585) Error (criu/cr-dump.c:1325): Dump files (pid: 70) failed with -1
>>>>> (00.129041) Error (criu/cr-dump.c:1674): Dumping FAILED.
>>>>> ------------------------ ERROR OVER ------------------------
>>>>>
>>>>> Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
>>>>> ---
>>>>>  criu/cr-restore.c       | 21 +++++++++++++++++----
>>>>>  criu/include/restorer.h |  5 +++++
>>>>>  criu/mount.c            | 32 +++++++++++++++++++++++++++-----
>>>>>  3 files changed, 49 insertions(+), 9 deletions(-)
>>>>>
>>>>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>>>>> index d75aa0d..e20036b 100644
>>>>> --- a/criu/cr-restore.c
>>>>> +++ b/criu/cr-restore.c
>>>>> @@ -1431,6 +1431,9 @@ static int restore_task_with_children(void *_arg)
>>>>>  		if (prepare_namespace(current, ca->clone_flags))
>>>>>  			goto err;
>>>>>  
>>>>> +		if (restore_finish_stage(task_entries, CR_STATE_POST_RESTORE_NS) < 0)
>>>>> +			goto err;
>>>>> +
>>>>>  		if (root_prepare_shared())
>>>>>  			goto err;
>>>>>  
>>>>> @@ -1494,6 +1497,7 @@ static inline int stage_participants(int next_stage)
>>>>>  	case CR_STATE_FAIL:
>>>>>  		return 0;
>>>>>  	case CR_STATE_RESTORE_NS:
>>>>> +	case CR_STATE_POST_RESTORE_NS:
>>>>>  	case CR_STATE_RESTORE_SHARED:
>>>>>  		return 1;
>>>>>  	case CR_STATE_FORKING:
>>>>> @@ -1850,16 +1854,25 @@ static int restore_root_task(struct pstree_item *init)
>>>>>  	if (ret)
>>>>>  		goto out_kill;
>>>>>  
>>>>> +	ret = run_scripts(ACT_SETUP_NS);
>>>>> +	if (ret)
>>>>> +		goto out_kill;
>>>>> +
>>>>> +	ret = restore_switch_stage(CR_STATE_POST_RESTORE_NS);
>>>>> +	if (ret < 0)
>>>>> +		goto out_kill;
>>>>> +
>>>>> +	pr_info("Wait until namespaces are created\n");
>>>>> +	ret = restore_wait_inprogress_tasks();
>>>>> +	if (ret)
>>>>> +		goto out_kill;
>>>>> +
>>>>>  	if (root_ns_mask & CLONE_NEWNS) {
>>>>>  		mnt_ns_fd = open_proc(init->pid->real, "ns/mnt");
>>>>>  		if (mnt_ns_fd < 0)
>>>>>  			goto out_kill;
>>>>>  	}
>>>>>  
>>>>> -	ret = run_scripts(ACT_SETUP_NS);
>>>>> -	if (ret)
>>>>> -		goto out_kill;
>>>>> -
>>>>>  	if (opts.empty_ns & CLONE_NEWNET) {
>>>>>  		/*
>>>>>  		 * Local TCP connections were locked by network_lock_internal()
>>>>> diff --git a/criu/include/restorer.h b/criu/include/restorer.h
>>>>> index 23c7e24..7c65538 100644
>>>>> --- a/criu/include/restorer.h
>>>>> +++ b/criu/include/restorer.h
>>>>> @@ -203,6 +203,11 @@ static inline unsigned long restorer_stack(struct restore_mem_zone *mz)
>>>>>  enum {
>>>>>  	CR_STATE_FAIL		= -1,
>>>>>  	CR_STATE_RESTORE_NS	= 0, /* is used for executing "setup-namespace" scripts */
>>>>> +	/*
>>>>> +	 * Need to wait a mount namespace which
>>>>> +	 * will be used to clean up remap files.
>>>>> +	 */
>>>>> +	CR_STATE_POST_RESTORE_NS,
>>>>>  	CR_STATE_RESTORE_SHARED,
>>>>>  	CR_STATE_FORKING,
>>>>>  	CR_STATE_RESTORE,
>>>>> diff --git a/criu/mount.c b/criu/mount.c
>>>>> index 8f66b4e..013b17e 100644
>>>>> --- a/criu/mount.c
>>>>> +++ b/criu/mount.c
>>>>> @@ -2948,17 +2948,39 @@ int prepare_mnt_ns(void)
>>>>>  			goto err;
>>>>>  		}
>>>>>  
>>>>> +		if (nsid->type == NS_ROOT) {
>>>>> +			int fd;
>>>>> +
>>>>> +			/*
>>>>> +			 * We need to create a mount namespace which will be
>>>>> +			 * used to clean up remap files
>>>>> +			 * (depopulate_roots_yard).  The namespace where mounts
>>>>> +			 * was restored has to be restored as a root mount
>>>>> +			 * namespace, because there are file descriptors
>>>>> +			 * linked with it (e.g. to bind-mount slave pty-s).
>>>>> +			 */
>>>>> +			fd = open_proc(PROC_SELF, "ns/mnt");
>>>>> +			if (fd < 0)
>>>>> +				goto err;
>>>>> +			if (setns(rst, CLONE_NEWNS)) {
>>>>> +				pr_perror("Can't restore mntns back");
>>>>> +				goto err;
>>>>> +			}
>>>>> +			nsid->mnt.ns_fd = rst;
>>>>> +			rst = fd;
>>>>> +		} else {
>>>>> +			/* Pin one with a file descriptor */
>>>>> +			nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
>>>>> +			if (nsid->mnt.ns_fd < 0)
>>>>> +				goto err;
>>>>> +		}
>>>>> +
>>>>>  		/* Set its root */
>>>>>  		path[0] = '/';
>>>>>  		print_ns_root(nsid, 0, path + 1, sizeof(path) - 1);
>>>>>  		if (cr_pivot_root(path))
>>>>>  			goto err;
>>>>>  
>>>>> -		/* Pin one with a file descriptor */
>>>>> -		nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
>>>>> -		if (nsid->mnt.ns_fd < 0)
>>>>> -			goto err;
>>>>> -
>>>>>  		/* root_fd is used to restore file mappings */
>>>>>  		nsid->mnt.root_fd = open_proc(PROC_SELF, "root");
>>>>>  		if (nsid->mnt.root_fd < 0)
>>>>>
>>>>
>>> .
>>>
>>
> .
>