[1/2] mount: delay restoring readonly mount flag untill all files ready

Submitted by Pavel Tikhomirov on Sept. 11, 2018, 3:53 p.m.

Details

Message ID 20180911155331.28983-2-ptikhomirov@virtuozzo.com
State New
Series "support restoring ghost files on readonly mounts"
Headers show

Commit Message

Pavel Tikhomirov Sept. 11, 2018, 3:53 p.m.
We can have ghost-files on readonly mounts, for them we will need to
recreate the file on restore, and we can't do that if mount is readonly,
so the idea is to move restoring readonly mount flags after all files are
restored, before that restore process will see mounts writable.

There is an exception for overmounted mounts as it is not so easy to
set flags on them at these late point. Other exception is if the mount
has readonly superblock - there can be no ghost-files on such a mount.

The first point where we delay readonly is do_new_mount and the second
is do_bind_mount. The latter is a bit more complex as we need to handle
nesting from source mount which can be also delayed/undelayed.

https://jira.sw.ru/browse/PSBM-82991
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
---
 criu/cr-restore.c    |   2 +
 criu/include/mount.h |   2 +
 criu/mount.c         | 112 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 114 insertions(+), 2 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 6f22fb787..258ab916d 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -3235,6 +3235,8 @@  static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 		/* Wait when all tasks restored all files */
 		if (restore_wait_other_tasks())
 			goto err_nv;
+		if (remount_readonly_mounts())
+			goto err_nv;
 	}
 
 	/*
diff --git a/criu/include/mount.h b/criu/include/mount.h
index ca17b059a..3dec0ec0f 100644
--- a/criu/include/mount.h
+++ b/criu/include/mount.h
@@ -126,4 +126,6 @@  extern struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool fo
 
 extern int check_mnt_id(void);
 
+extern int remount_readonly_mounts(void);
+
 #endif /* __CR_MOUNT_H__ */
diff --git a/criu/mount.c b/criu/mount.c
index 781870eea..32d9c08b6 100644
--- a/criu/mount.c
+++ b/criu/mount.c
@@ -2084,6 +2084,16 @@  static int do_new_mount(struct mount_info *mi)
 		close(fd);
 	}
 
+	/*
+	 * Restoring ghost files on readonly mounts requires write access,
+	 * remount_readonly_mounts() will set these flags after all files
+	 * restored
+	 */
+	if (!mnt_is_overmounted(mi))
+		mflags &= ~MS_RDONLY;
+	else if (mflags & MS_RDONLY)
+		pr_warn("Can't delay making mount readonly for overmounted mount\n");
+
 	if (mflags && mount(NULL, mi->mountpoint, NULL,
 				MS_REMOUNT | MS_BIND | mflags, NULL)) {
 		pr_perror("Unable to apply bind-mount options");
@@ -2161,7 +2171,7 @@  static int do_bind_mount(struct mount_info *mi)
 {
 	char mnt_fd_path[PSFDS];
 	char *root, *cut_root, rpath[PATH_MAX];
-	unsigned long mflags;
+	unsigned long mflags, bmflags = 0;
 	int exit_code = -1, mp_len;
 	bool shared = false;
 	bool master = false;
@@ -2281,7 +2291,15 @@  static int do_bind_mount(struct mount_info *mi)
 	}
 
 	mflags = mi->flags & (~MS_PROPAGATE);
-	if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE)))
+	if (!mnt_is_overmounted(mi) && !(mi->sb_flags & MS_RDONLY))
+		mflags &= ~MS_RDONLY;
+	if (mi->bind) {
+		bmflags = mi->bind->flags & (~MS_PROPAGATE);
+		if (!mnt_is_overmounted(mi->bind) && !(mi->bind->sb_flags & MS_RDONLY))
+			bmflags &= ~MS_RDONLY;
+	}
+
+	if (!mi->bind || mflags != bmflags)
 		if (mount(NULL, mi->mountpoint, NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) {
 			pr_perror("Can't mount at %s", mi->mountpoint);
 			goto err;
@@ -3654,3 +3672,93 @@  void clean_cr_time_mounts(void)
 }
 
 struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt");
+
+int __remount_readonly_mounts(struct ns_id *ns)
+{
+	struct mount_info *mnt;
+
+	for (mnt = mntinfo; mnt; mnt = mnt->next) {
+		if (ns && mnt->nsid != ns)
+			continue;
+
+		if (mnt_is_overmounted(mnt))
+			continue;
+
+		if (mnt->sb_flags & MS_RDONLY)
+			continue;
+
+		if (!(mnt->flags & MS_RDONLY))
+			continue;
+
+		if (mount(NULL, mnt->ns_mountpoint, NULL,
+			  MS_REMOUNT | MS_BIND | (mnt->flags & (~MS_PROPAGATE)),
+			  NULL)) {
+			pr_perror("Failed to restore %d:%s mount flags %x",
+				  mnt->mnt_id, mnt->mountpoint, mnt->flags);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+int ns_remount_readonly_mounts(void *arg)
+{
+	struct ns_id *nsid;
+
+	for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
+		int mntns_fd;
+
+		if (nsid->nd != &mnt_ns_desc)
+			continue;
+
+		mntns_fd = fdstore_get(nsid->mnt.nsfd_id);
+		if (mntns_fd < 0)
+			return 1;
+
+		if (setns(mntns_fd, CLONE_NEWNS) < 0) {
+			pr_perror("`- Can't switch");
+			close(mntns_fd);
+			return 1;
+		}
+		close(mntns_fd);
+
+		pr_info("Switched to mntns %u:%u/n", nsid->id, nsid->kid);
+
+		if (__remount_readonly_mounts(nsid))
+			return 1;
+	}
+
+	return 0;
+}
+
+int remount_readonly_mounts(void)
+{
+	int pid, status;
+
+	if (!(root_ns_mask & CLONE_NEWNS))
+		return __remount_readonly_mounts(NULL);
+
+	/*
+	 * Need a helper process because the root task can share fs via
+	 * CLONE_FS and we would not be able to enter mount namespaces
+	 */
+	pid = clone_noasan(ns_remount_readonly_mounts,
+			   CLONE_VFORK | CLONE_VM | CLONE_FILES
+			   | CLONE_IO | CLONE_SIGHAND
+			   | CLONE_SYSVSEM, NULL);
+	if (pid == -1) {
+		pr_perror("Can't clone helper process");
+		return -1;
+	}
+
+	errno = 0;
+	if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status)
+	    || WEXITSTATUS(status)) {
+		pr_err("Can't wait or bad status: errno=%d, status=%d\n",
+		       errno, status);
+		return -1;
+	}
+
+	return 0;
+}

Comments

Andrey Vagin Sept. 12, 2018, 6:04 p.m.
On Tue, Sep 11, 2018 at 06:53:30PM +0300, Pavel Tikhomirov wrote:
> We can have ghost-files on readonly mounts, for them we will need to
> recreate the file on restore, and we can't do that if mount is readonly,
> so the idea is to move restoring readonly mount flags after all files are
> restored, before that restore process will see mounts writable.

I don't like the idea to remount all readonly mounts. Can we don't this
hack only for mounts with ghost files?

What is about link-remap files?

> 
> There is an exception for overmounted mounts as it is not so easy to
> set flags on them at these late point. Other exception is if the mount
> has readonly superblock - there can be no ghost-files on such a mount.
> 
> The first point where we delay readonly is do_new_mount and the second
> is do_bind_mount. The latter is a bit more complex as we need to handle
> nesting from source mount which can be also delayed/undelayed.
> 
> https://jira.sw.ru/browse/PSBM-82991

Pls, don't add private links into a commit message.

> Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> ---
>  criu/cr-restore.c    |   2 +
>  criu/include/mount.h |   2 +
>  criu/mount.c         | 112 ++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 114 insertions(+), 2 deletions(-)
> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 6f22fb787..258ab916d 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -3235,6 +3235,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>  		/* Wait when all tasks restored all files */
>  		if (restore_wait_other_tasks())
>  			goto err_nv;
> +		if (remount_readonly_mounts())
> +			goto err_nv;
>  	}
>  
>  	/*
> diff --git a/criu/include/mount.h b/criu/include/mount.h
> index ca17b059a..3dec0ec0f 100644
> --- a/criu/include/mount.h
> +++ b/criu/include/mount.h
> @@ -126,4 +126,6 @@ extern struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool fo
>  
>  extern int check_mnt_id(void);
>  
> +extern int remount_readonly_mounts(void);
> +
>  #endif /* __CR_MOUNT_H__ */
> diff --git a/criu/mount.c b/criu/mount.c
> index 781870eea..32d9c08b6 100644
> --- a/criu/mount.c
> +++ b/criu/mount.c
> @@ -2084,6 +2084,16 @@ static int do_new_mount(struct mount_info *mi)
>  		close(fd);
>  	}
>  
> +	/*
> +	 * Restoring ghost files on readonly mounts requires write access,
> +	 * remount_readonly_mounts() will set these flags after all files
> +	 * restored
> +	 */
> +	if (!mnt_is_overmounted(mi))
> +		mflags &= ~MS_RDONLY;
> +	else if (mflags & MS_RDONLY)
> +		pr_warn("Can't delay making mount readonly for overmounted mount\n");
> +
>  	if (mflags && mount(NULL, mi->mountpoint, NULL,
>  				MS_REMOUNT | MS_BIND | mflags, NULL)) {
>  		pr_perror("Unable to apply bind-mount options");
> @@ -2161,7 +2171,7 @@ static int do_bind_mount(struct mount_info *mi)
>  {
>  	char mnt_fd_path[PSFDS];
>  	char *root, *cut_root, rpath[PATH_MAX];
> -	unsigned long mflags;
> +	unsigned long mflags, bmflags = 0;
>  	int exit_code = -1, mp_len;
>  	bool shared = false;
>  	bool master = false;
> @@ -2281,7 +2291,15 @@ static int do_bind_mount(struct mount_info *mi)
>  	}
>  
>  	mflags = mi->flags & (~MS_PROPAGATE);
> -	if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE)))
> +	if (!mnt_is_overmounted(mi) && !(mi->sb_flags & MS_RDONLY))
> +		mflags &= ~MS_RDONLY;
> +	if (mi->bind) {
> +		bmflags = mi->bind->flags & (~MS_PROPAGATE);
> +		if (!mnt_is_overmounted(mi->bind) && !(mi->bind->sb_flags & MS_RDONLY))
> +			bmflags &= ~MS_RDONLY;
> +	}
> +
> +	if (!mi->bind || mflags != bmflags)
>  		if (mount(NULL, mi->mountpoint, NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) {
>  			pr_perror("Can't mount at %s", mi->mountpoint);
>  			goto err;
> @@ -3654,3 +3672,93 @@ void clean_cr_time_mounts(void)
>  }
>  
>  struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt");
> +
> +int __remount_readonly_mounts(struct ns_id *ns)
> +{
> +	struct mount_info *mnt;
> +
> +	for (mnt = mntinfo; mnt; mnt = mnt->next) {
> +		if (ns && mnt->nsid != ns)
> +			continue;
> +
> +		if (mnt_is_overmounted(mnt))
> +			continue;
> +
> +		if (mnt->sb_flags & MS_RDONLY)
> +			continue;
> +
> +		if (!(mnt->flags & MS_RDONLY))
> +			continue;
> +
> +		if (mount(NULL, mnt->ns_mountpoint, NULL,
> +			  MS_REMOUNT | MS_BIND | (mnt->flags & (~MS_PROPAGATE)),
> +			  NULL)) {
> +			pr_perror("Failed to restore %d:%s mount flags %x",
> +				  mnt->mnt_id, mnt->mountpoint, mnt->flags);
> +			return -1;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +int ns_remount_readonly_mounts(void *arg)
> +{
> +	struct ns_id *nsid;
> +
> +	for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
> +		int mntns_fd;
> +
> +		if (nsid->nd != &mnt_ns_desc)
> +			continue;
> +
> +		mntns_fd = fdstore_get(nsid->mnt.nsfd_id);
> +		if (mntns_fd < 0)
> +			return 1;
> +
> +		if (setns(mntns_fd, CLONE_NEWNS) < 0) {
> +			pr_perror("`- Can't switch");
> +			close(mntns_fd);
> +			return 1;
> +		}
> +		close(mntns_fd);
> +
> +		pr_info("Switched to mntns %u:%u/n", nsid->id, nsid->kid);
> +
> +		if (__remount_readonly_mounts(nsid))
> +			return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +int remount_readonly_mounts(void)
> +{
> +	int pid, status;
> +
> +	if (!(root_ns_mask & CLONE_NEWNS))
> +		return __remount_readonly_mounts(NULL);
> +
> +	/*
> +	 * Need a helper process because the root task can share fs via
> +	 * CLONE_FS and we would not be able to enter mount namespaces
> +	 */
> +	pid = clone_noasan(ns_remount_readonly_mounts,
> +			   CLONE_VFORK | CLONE_VM | CLONE_FILES
> +			   | CLONE_IO | CLONE_SIGHAND
> +			   | CLONE_SYSVSEM, NULL);
> +	if (pid == -1) {
> +		pr_perror("Can't clone helper process");
> +		return -1;
> +	}
> +
> +	errno = 0;
> +	if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status)
> +	    || WEXITSTATUS(status)) {
> +		pr_err("Can't wait or bad status: errno=%d, status=%d\n",
> +		       errno, status);
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> -- 
> 2.17.1
>
Pavel Tikhomirov Sept. 13, 2018, 8:17 a.m.
On 09/12/2018 09:04 PM, Andrew Vagin wrote:
> On Tue, Sep 11, 2018 at 06:53:30PM +0300, Pavel Tikhomirov wrote:
>> We can have ghost-files on readonly mounts, for them we will need to
>> recreate the file on restore, and we can't do that if mount is readonly,
>> so the idea is to move restoring readonly mount flags after all files are
>> restored, before that restore process will see mounts writable.
> 
> I don't like the idea to remount all readonly mounts. 

Not all, but only the once with writable superblock, note the exceptions 
below.

> Can we don't this hack only for mounts with ghost files?

When restoring ghost files we can write-access single ghost file from 
different mounts of it's superblock:

1) create_ghost, clean_one_remap - we create it on the mount from which 
it was opened and if it was open multiple times from different mounts - 
the one we've seen first on dump

2) rfi_remap - from "common bind-mount"

So we need the hack for each of the group of mounts with same 
superblock, when one from the group has ghost files open.

> 
> What is about link-remap files?

Surely we have same problem for all write accesses to fs on restore, so 
for you previous question, the best way is to leave all mounts writable 
where possible until we finish with files restore.

> 
>>
>> There is an exception for overmounted mounts as it is not so easy to
>> set flags on them at these late point. Other exception is if the mount
>> has readonly superblock - there can be no ghost-files on such a mount.
>>
>> The first point where we delay readonly is do_new_mount and the second
>> is do_bind_mount. The latter is a bit more complex as we need to handle
>> nesting from source mount which can be also delayed/undelayed.
>>
>> https://jira.sw.ru/browse/PSBM-82991
> 
> Pls, don't add private links into a commit message.

Ok

> 
>> Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
>> ---
>>   criu/cr-restore.c    |   2 +
>>   criu/include/mount.h |   2 +
>>   criu/mount.c         | 112 ++++++++++++++++++++++++++++++++++++++++++-
>>   3 files changed, 114 insertions(+), 2 deletions(-)
>>
>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>> index 6f22fb787..258ab916d 100644
>> --- a/criu/cr-restore.c
>> +++ b/criu/cr-restore.c
>> @@ -3235,6 +3235,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>>   		/* Wait when all tasks restored all files */
>>   		if (restore_wait_other_tasks())
>>   			goto err_nv;
>> +		if (remount_readonly_mounts())
>> +			goto err_nv;
>>   	}
>>   
>>   	/*
>> diff --git a/criu/include/mount.h b/criu/include/mount.h
>> index ca17b059a..3dec0ec0f 100644
>> --- a/criu/include/mount.h
>> +++ b/criu/include/mount.h
>> @@ -126,4 +126,6 @@ extern struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool fo
>>   
>>   extern int check_mnt_id(void);
>>   
>> +extern int remount_readonly_mounts(void);
>> +
>>   #endif /* __CR_MOUNT_H__ */
>> diff --git a/criu/mount.c b/criu/mount.c
>> index 781870eea..32d9c08b6 100644
>> --- a/criu/mount.c
>> +++ b/criu/mount.c
>> @@ -2084,6 +2084,16 @@ static int do_new_mount(struct mount_info *mi)
>>   		close(fd);
>>   	}
>>   
>> +	/*
>> +	 * Restoring ghost files on readonly mounts requires write access,
>> +	 * remount_readonly_mounts() will set these flags after all files
>> +	 * restored
>> +	 */
>> +	if (!mnt_is_overmounted(mi))
>> +		mflags &= ~MS_RDONLY;
>> +	else if (mflags & MS_RDONLY)
>> +		pr_warn("Can't delay making mount readonly for overmounted mount\n");
>> +
>>   	if (mflags && mount(NULL, mi->mountpoint, NULL,
>>   				MS_REMOUNT | MS_BIND | mflags, NULL)) {
>>   		pr_perror("Unable to apply bind-mount options");
>> @@ -2161,7 +2171,7 @@ static int do_bind_mount(struct mount_info *mi)
>>   {
>>   	char mnt_fd_path[PSFDS];
>>   	char *root, *cut_root, rpath[PATH_MAX];
>> -	unsigned long mflags;
>> +	unsigned long mflags, bmflags = 0;
>>   	int exit_code = -1, mp_len;
>>   	bool shared = false;
>>   	bool master = false;
>> @@ -2281,7 +2291,15 @@ static int do_bind_mount(struct mount_info *mi)
>>   	}
>>   
>>   	mflags = mi->flags & (~MS_PROPAGATE);
>> -	if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE)))
>> +	if (!mnt_is_overmounted(mi) && !(mi->sb_flags & MS_RDONLY))
>> +		mflags &= ~MS_RDONLY;
>> +	if (mi->bind) {
>> +		bmflags = mi->bind->flags & (~MS_PROPAGATE);
>> +		if (!mnt_is_overmounted(mi->bind) && !(mi->bind->sb_flags & MS_RDONLY))
>> +			bmflags &= ~MS_RDONLY;
>> +	}
>> +
>> +	if (!mi->bind || mflags != bmflags)
>>   		if (mount(NULL, mi->mountpoint, NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) {
>>   			pr_perror("Can't mount at %s", mi->mountpoint);
>>   			goto err;
>> @@ -3654,3 +3672,93 @@ void clean_cr_time_mounts(void)
>>   }
>>   
>>   struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt");
>> +
>> +int __remount_readonly_mounts(struct ns_id *ns)
>> +{
>> +	struct mount_info *mnt;
>> +
>> +	for (mnt = mntinfo; mnt; mnt = mnt->next) {
>> +		if (ns && mnt->nsid != ns)
>> +			continue;
>> +
>> +		if (mnt_is_overmounted(mnt))
>> +			continue;
>> +
>> +		if (mnt->sb_flags & MS_RDONLY)
>> +			continue;
>> +
>> +		if (!(mnt->flags & MS_RDONLY))
>> +			continue;
>> +
>> +		if (mount(NULL, mnt->ns_mountpoint, NULL,
>> +			  MS_REMOUNT | MS_BIND | (mnt->flags & (~MS_PROPAGATE)),
>> +			  NULL)) {
>> +			pr_perror("Failed to restore %d:%s mount flags %x",
>> +				  mnt->mnt_id, mnt->mountpoint, mnt->flags);
>> +			return -1;
>> +		}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +int ns_remount_readonly_mounts(void *arg)
>> +{
>> +	struct ns_id *nsid;
>> +
>> +	for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
>> +		int mntns_fd;
>> +
>> +		if (nsid->nd != &mnt_ns_desc)
>> +			continue;
>> +
>> +		mntns_fd = fdstore_get(nsid->mnt.nsfd_id);
>> +		if (mntns_fd < 0)
>> +			return 1;
>> +
>> +		if (setns(mntns_fd, CLONE_NEWNS) < 0) {
>> +			pr_perror("`- Can't switch");
>> +			close(mntns_fd);
>> +			return 1;
>> +		}
>> +		close(mntns_fd);
>> +
>> +		pr_info("Switched to mntns %u:%u/n", nsid->id, nsid->kid);
>> +
>> +		if (__remount_readonly_mounts(nsid))
>> +			return 1;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +int remount_readonly_mounts(void)
>> +{
>> +	int pid, status;
>> +
>> +	if (!(root_ns_mask & CLONE_NEWNS))
>> +		return __remount_readonly_mounts(NULL);
>> +
>> +	/*
>> +	 * Need a helper process because the root task can share fs via
>> +	 * CLONE_FS and we would not be able to enter mount namespaces
>> +	 */
>> +	pid = clone_noasan(ns_remount_readonly_mounts,
>> +			   CLONE_VFORK | CLONE_VM | CLONE_FILES
>> +			   | CLONE_IO | CLONE_SIGHAND
>> +			   | CLONE_SYSVSEM, NULL);
>> +	if (pid == -1) {
>> +		pr_perror("Can't clone helper process");
>> +		return -1;
>> +	}
>> +
>> +	errno = 0;
>> +	if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status)
>> +	    || WEXITSTATUS(status)) {
>> +		pr_err("Can't wait or bad status: errno=%d, status=%d\n",
>> +		       errno, status);
>> +		return -1;
>> +	}
>> +
>> +	return 0;
>> +}
>> -- 
>> 2.17.1
>>