Punch holes in input files when restoring anonymous non-shared memory if --auto-dedup is enabled.

Submitted by Pawel Stradomski on July 24, 2018, 10:12 a.m.

Details

Message ID 000000000000a47e340571d2843c@google.com
State Accepted
Series "Punch holes in input files when restoring anonymous non-shared memory"
Headers show

Commit Message

Pawel Stradomski July 24, 2018, 10:12 a.m.
This reduces memory usage if image files are stored on tmpfs.

Signed-off-by: Pawel Stradomski <pstradomski@google.com>
---
 .../arch/arm/plugins/std/syscalls/syscall.def |  1 +
 .../plugins/std/syscalls/syscall-ppc64.tbl    |  1 +
 .../plugins/std/syscalls/syscall-s390.tbl     |  1 +
 .../x86/plugins/std/syscalls/syscall_32.tbl   |  1 +
 .../x86/plugins/std/syscalls/syscall_64.tbl   |  1 +
 criu/mem.c                                    |  6 +++-
 criu/pie/restorer.c                           | 31 +++++++++++++++++++
 7 files changed, 41 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def
index b68f9f2f..bcd61d4a 100644
--- a/compel/arch/arm/plugins/std/syscalls/syscall.def
+++ b/compel/arch/arm/plugins/std/syscalls/syscall.def
@@ -109,3 +109,4 @@  seccomp				277	383	(unsigned int op, unsigned int flags, const char *uargs)
 gettimeofday			169	78	(struct timeval *tv, struct timezone *tz)
 preadv_raw			69	361	(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
 userfaultfd			282	388	(int flags)
+fallocate			47	352	(int fd, int mode, loff_t offset, loff_t len)
diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
index fa0b034e..62e0bc1a 100644
--- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
+++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
@@ -89,6 +89,7 @@  __NR_set_robust_list	300		sys_set_robust_list	(struct robust_list_head *head, si
 __NR_get_robust_list	299		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
 __NR_vmsplice		285		sys_vmsplice		(int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
 __NR_openat		286		sys_openat		(int dfd, const char *filename, int flags, int mode)
+__NR_fallocate		309		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
 __NR_timerfd_settime	311		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
 __NR_signalfd4		313		sys_signalfd		(int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
 __NR_rt_tgsigqueueinfo	322		sys_rt_tgsigqueueinfo	(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
index bc77ae97..3521e915 100644
--- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
+++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
@@ -89,6 +89,7 @@  __NR_set_robust_list	304		sys_set_robust_list	(struct robust_list_head *head, si
 __NR_get_robust_list	305		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
 __NR_vmsplice		309		sys_vmsplice		(int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
 __NR_openat		288		sys_openat		(int dfd, const char *filename, int flags, int mode)
+__NR_fallocate		314		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
 __NR_timerfd_settime	320		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
 __NR_signalfd4		322		sys_signalfd		(int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
 __NR_rt_tgsigqueueinfo	330		sys_rt_tgsigqueueinfo	(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
index 9e1de281..a6c55b83 100644
--- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
+++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
@@ -83,6 +83,7 @@  __NR_set_robust_list	311		sys_set_robust_list	(struct robust_list_head *head, si
 __NR_get_robust_list	312		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
 __NR_vmsplice		316		sys_vmsplice		(int fd, const struct iovec *iov, unsigned int nr_segs, unsigned int flags)
 __NR_signalfd		321		sys_signalfd		(int ufd, const k_rtsigset_t *sigmask, size_t sigsetsize)
+__NR_fallocate		324		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
 __NR_timerfd_settime	325		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
 __NR_preadv		333		sys_preadv_raw		(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
 __NR_rt_tgsigqueueinfo	335		sys_rt_tgsigqueueinfo	(pid_t tgid, pid_t pid, int sig, siginfo_t *uinfo)
diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
index 726fa797..64271514 100644
--- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
+++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
@@ -94,6 +94,7 @@  __NR_set_robust_list		273		sys_set_robust_list	(struct robust_list_head *head, s
 __NR_get_robust_list		274		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
 __NR_seccomp			317		sys_seccomp		(unsigned int op, unsigned int flags, const char *uargs)
 __NR_vmsplice			278		sys_vmsplice		(int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
+__NR_fallocate			285		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
 __NR_timerfd_settime		286		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
 __NR_signalfd4			289		sys_signalfd		(int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
 __NR_preadv			295		sys_preadv_raw		(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
diff --git a/criu/mem.c b/criu/mem.c
index d020b7fd..44d0e258 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -1271,7 +1271,11 @@  static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta)
 {
 	struct cr_img *pages;
 
-	pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id);
+	/* if auto-dedup is on we need RDWR mode to be able to punch holes
+	 * in the input files (in restorer.c)
+	 */
+	pages = open_image(CR_FD_PAGES, opts.auto_dedup ? O_RDWR : O_RSTR,
+				rsti(t)->pages_img_id);
 	if (!pages)
 		return -1;
 
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index f990e9b7..7c70d02a 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -50,6 +50,15 @@ 
 #define PR_SET_PDEATHSIG 1
 #endif
 
+#ifndef FALLOC_FL_KEEP_SIZE
+#define FALLOC_FL_KEEP_SIZE     0x01
+#endif
+
+#ifndef FALLOC_FL_PUNCH_HOLE
+#define FALLOC_FL_PUNCH_HOLE    0x02
+#endif
+
+
 #define sys_prctl_safe(opcode, val1, val2, val3)			\
 	({								\
 		long __ret = sys_prctl(opcode, val1, val2, val3, 0);	\
@@ -646,6 +655,14 @@  static unsigned long restore_mapping(VmaEntry *vma_entry)
 			!(vma_entry->status & VMA_NO_PROT_WRITE))
 		prot |= PROT_WRITE;
 
+	/* TODO: Drop MAP_LOCKED bit and restore it after reading memory.
+	 *
+	 * Code below tries to limit memory usage by running fallocate()
+	 * after each preadv() to avoid doubling memory usage (once in
+	 * image files, once in process). Unfortunately, MAP_LOCKED defeats
+	 * that mechanism as it causes the process to be charged for memory
+	 * immediately upon mmap, not later upon preadv().
+	 */
 	pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n",
 			vma_entry->start, vma_entry->end,
 			prot, flags, (int)vma_entry->fd);
@@ -1355,6 +1372,11 @@  long __export_restore_task(struct task_restore_args *args)
 		struct iovec *iovs = rio->iovs;
 		int nr = rio->nr_iovs;
 		ssize_t r;
+		int file_flags = sys_fcntl(args->vma_ios_fd, F_GETFL, 0);
+		if (file_flags < 0) {
+			pr_err("Can't check file flags\n");
+			file_flags = 0;
+		}
 
 		while (nr) {
 			pr_debug("Preadv %lx:%d... (%d iovs)\n",
@@ -1367,6 +1389,15 @@  long __export_restore_task(struct task_restore_args *args)
 			}
 
 			pr_debug("`- returned %ld\n", (long)r);
+			/* If the file is open for writing, then it means we should punch holes
+			 * in it. */
+			if (r > 0 && (file_flags & O_RDWR)) {
+				int fr = sys_fallocate(args->vma_ios_fd, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE,
+					rio->off, r);
+				if (fr < 0) {
+					pr_debug("Failed to punch holes with fallocate: %d\n", fr);
+				}
+			}
 			rio->off += r;
 			/* Advance the iovecs */
 			do {

Comments

Andrey Vagin July 27, 2018, 1:59 a.m.
Applied, thanks! Here is one inline comment:

On Tue, Jul 24, 2018 at 12:12:27PM +0200, Pawel Stradomski wrote:
> This reduces memory usage if image files are stored on tmpfs.
> 
> Signed-off-by: Pawel Stradomski <pstradomski@google.com>
> ---
>  .../arch/arm/plugins/std/syscalls/syscall.def |  1 +
>  .../plugins/std/syscalls/syscall-ppc64.tbl    |  1 +
>  .../plugins/std/syscalls/syscall-s390.tbl     |  1 +
>  .../x86/plugins/std/syscalls/syscall_32.tbl   |  1 +
>  .../x86/plugins/std/syscalls/syscall_64.tbl   |  1 +
>  criu/mem.c                                    |  6 +++-
>  criu/pie/restorer.c                           | 31 +++++++++++++++++++
>  7 files changed, 41 insertions(+), 1 deletion(-)
> 
> diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def
> index b68f9f2f..bcd61d4a 100644
> --- a/compel/arch/arm/plugins/std/syscalls/syscall.def
> +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def
> @@ -109,3 +109,4 @@ seccomp				277	383	(unsigned int op, unsigned int flags, const char *uargs)
>  gettimeofday			169	78	(struct timeval *tv, struct timezone *tz)
>  preadv_raw			69	361	(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
>  userfaultfd			282	388	(int flags)
> +fallocate			47	352	(int fd, int mode, loff_t offset, loff_t len)
> diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> index fa0b034e..62e0bc1a 100644
> --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> @@ -89,6 +89,7 @@ __NR_set_robust_list	300		sys_set_robust_list	(struct robust_list_head *head, si
>  __NR_get_robust_list	299		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
>  __NR_vmsplice		285		sys_vmsplice		(int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
>  __NR_openat		286		sys_openat		(int dfd, const char *filename, int flags, int mode)
> +__NR_fallocate		309		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
>  __NR_timerfd_settime	311		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
>  __NR_signalfd4		313		sys_signalfd		(int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
>  __NR_rt_tgsigqueueinfo	322		sys_rt_tgsigqueueinfo	(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
> diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> index bc77ae97..3521e915 100644
> --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> @@ -89,6 +89,7 @@ __NR_set_robust_list	304		sys_set_robust_list	(struct robust_list_head *head, si
>  __NR_get_robust_list	305		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
>  __NR_vmsplice		309		sys_vmsplice		(int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
>  __NR_openat		288		sys_openat		(int dfd, const char *filename, int flags, int mode)
> +__NR_fallocate		314		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
>  __NR_timerfd_settime	320		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
>  __NR_signalfd4		322		sys_signalfd		(int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
>  __NR_rt_tgsigqueueinfo	330		sys_rt_tgsigqueueinfo	(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
> diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> index 9e1de281..a6c55b83 100644
> --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> @@ -83,6 +83,7 @@ __NR_set_robust_list	311		sys_set_robust_list	(struct robust_list_head *head, si
>  __NR_get_robust_list	312		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
>  __NR_vmsplice		316		sys_vmsplice		(int fd, const struct iovec *iov, unsigned int nr_segs, unsigned int flags)
>  __NR_signalfd		321		sys_signalfd		(int ufd, const k_rtsigset_t *sigmask, size_t sigsetsize)
> +__NR_fallocate		324		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
>  __NR_timerfd_settime	325		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
>  __NR_preadv		333		sys_preadv_raw		(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
>  __NR_rt_tgsigqueueinfo	335		sys_rt_tgsigqueueinfo	(pid_t tgid, pid_t pid, int sig, siginfo_t *uinfo)
> diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> index 726fa797..64271514 100644
> --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> @@ -94,6 +94,7 @@ __NR_set_robust_list		273		sys_set_robust_list	(struct robust_list_head *head, s
>  __NR_get_robust_list		274		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
>  __NR_seccomp			317		sys_seccomp		(unsigned int op, unsigned int flags, const char *uargs)
>  __NR_vmsplice			278		sys_vmsplice		(int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
> +__NR_fallocate			285		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
>  __NR_timerfd_settime		286		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
>  __NR_signalfd4			289		sys_signalfd		(int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
>  __NR_preadv			295		sys_preadv_raw		(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
> diff --git a/criu/mem.c b/criu/mem.c
> index d020b7fd..44d0e258 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -1271,7 +1271,11 @@ static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta)
>  {
>  	struct cr_img *pages;
>  
> -	pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id);
> +	/* if auto-dedup is on we need RDWR mode to be able to punch holes
> +	 * in the input files (in restorer.c)
> +	 */
> +	pages = open_image(CR_FD_PAGES, opts.auto_dedup ? O_RDWR : O_RSTR,
> +				rsti(t)->pages_img_id);

This doesn't work for userns tests:

[root@fc24 criu]# python test/zdtm.py run -t zdtm/static/env00 -f uns --dedup
=== Run 1/1 ================ zdtm/static/env00
========================= Run zdtm/static/env00 in uns =========================
Start test
./env00 --pidfile=env00.pid --outfile=env00.out --envname=ENV_00_TEST
Run criu dump
files stat: fs/file-max 400903, fs/nr_open 1048576
rlimit: RLIMIT_NOFILE unlimited for self
Loaded kdat cache from /run/criu.kdat
Run criu restore
files stat: fs/file-max 400903, fs/nr_open 1048576
rlimit: RLIMIT_NOFILE unlimited for self
Loaded kdat cache from /run/criu.kdat
=[log]=> dump/zdtm/static/env00/43/1/restore.log
------------------------ grep Error ------------------------
(00.195834)      1:    `- FD 1 pid 4
(00.195841)      1:    `- FD 2 pid 4
(00.195848)      1:  `- type 1 ID 0xa
(00.195855)      1:    `- FD 3 pid 4
(00.278816)      1: Error (criu/image.c:432): Unable to open pages-1.img: Permission denied
(00.296743) uns: calling exit_usernsd (-1, 1)
(00.296822) uns: daemon calls 0x4675b0 (62, -1, 1)
(00.296836) uns: `- daemon exits w/ 0
(00.298277) uns: daemon stopped
(00.298298) Error (criu/cr-restore.c:2308): Restoring FAILED.
------------------------ ERROR OVER ------------------------
################# Test zdtm/static/env00 FAIL at CRIU restore ##################
##################################### FAIL #####################################

CRIU opens the image from inside the target userns and fails to open it read-write:
-rw-r--r-- 1 root root 106496 Jul 27 04:54 test/dump/zdtm/static/env00/43/1/pages-1.img

Probably, we need to use userns_call() to open images in this case.

>  	if (!pages)
>  		return -1;
>  
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index f990e9b7..7c70d02a 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -50,6 +50,15 @@
>  #define PR_SET_PDEATHSIG 1
>  #endif
>  
> +#ifndef FALLOC_FL_KEEP_SIZE
> +#define FALLOC_FL_KEEP_SIZE     0x01
> +#endif
> +
> +#ifndef FALLOC_FL_PUNCH_HOLE
> +#define FALLOC_FL_PUNCH_HOLE    0x02
> +#endif
> +
> +
>  #define sys_prctl_safe(opcode, val1, val2, val3)			\
>  	({								\
>  		long __ret = sys_prctl(opcode, val1, val2, val3, 0);	\
> @@ -646,6 +655,14 @@ static unsigned long restore_mapping(VmaEntry *vma_entry)
>  			!(vma_entry->status & VMA_NO_PROT_WRITE))
>  		prot |= PROT_WRITE;
>  
> +	/* TODO: Drop MAP_LOCKED bit and restore it after reading memory.
> +	 *
> +	 * Code below tries to limit memory usage by running fallocate()
> +	 * after each preadv() to avoid doubling memory usage (once in
> +	 * image files, once in process). Unfortunately, MAP_LOCKED defeats
> +	 * that mechanism as it causes the process to be charged for memory
> +	 * immediately upon mmap, not later upon preadv().
> +	 */
>  	pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n",
>  			vma_entry->start, vma_entry->end,
>  			prot, flags, (int)vma_entry->fd);
> @@ -1355,6 +1372,11 @@ long __export_restore_task(struct task_restore_args *args)
>  		struct iovec *iovs = rio->iovs;
>  		int nr = rio->nr_iovs;
>  		ssize_t r;
> +		int file_flags = sys_fcntl(args->vma_ios_fd, F_GETFL, 0);
> +		if (file_flags < 0) {
> +			pr_err("Can't check file flags\n");
> +			file_flags = 0;
> +		}
>  
>  		while (nr) {
>  			pr_debug("Preadv %lx:%d... (%d iovs)\n",
> @@ -1367,6 +1389,15 @@ long __export_restore_task(struct task_restore_args *args)
>  			}
>  
>  			pr_debug("`- returned %ld\n", (long)r);
> +			/* If the file is open for writing, then it means we should punch holes
> +			 * in it. */
> +			if (r > 0 && (file_flags & O_RDWR)) {
> +				int fr = sys_fallocate(args->vma_ios_fd, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE,
> +					rio->off, r);
> +				if (fr < 0) {
> +					pr_debug("Failed to punch holes with fallocate: %d\n", fr);
> +				}
> +			}
>  			rio->off += r;
>  			/* Advance the iovecs */
>  			do {
> -- 
> 2.18.0.233.g985f88cf7e-goog
> 
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
Pawel Stradomski July 30, 2018, 4:18 p.m.
pt., 27 lip 2018 o 04:00 Andrei Vagin <avagin@virtuozzo.com> wrote:
> Applied, thanks! Here is one in-line comment
>
> > --- a/criu/mem.c
> > +++ b/criu/mem.c
> > @@ -1271,7 +1271,11 @@ static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta)
> >  {
> >       struct cr_img *pages;
> >
> > -     pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id);
> > +     /* if auto-dedup is on we need RDWR mode to be able to punch holes
> > +      * in the input files (in restorer.c)
> > +      */
> > +     pages = open_image(CR_FD_PAGES, opts.auto_dedup ? O_RDWR : O_RSTR,
> > +                             rsti(t)->pages_img_id);
>
> This doesn't work for userns tests:
>
> CRIU opens an image from a target userns and fails to open it for read-write:
> -rw-r--r-- 1 root root 106496 Jul 27 04:54 test/dump/zdtm/static/env00/43/1/pages-1.img
>
> Probably, we need to use userns_call() to open images in this case.

I've added a fallback to opening the image read-only if opening it read-write fails.
Using userns_call() seems like a good idea, but since open_image() not only
opens the file but also allocates a few objects, we'd need to break it
apart first. I'd like to do that in a separate commit.
Pawel Stradomski July 30, 2018, 4:47 p.m.
Ah, this one was merged already. Never mind then. I'll first send a fix that
falls back to opening read-only, and work on using userns_call() later.
pon., 30 lip 2018 o 18:18 Paweł Stradomski <pstradomski@google.com> napisał(a):
>
> pt., 27 lip 2018 o 04:00 Andrei Vagin <avagin@virtuozzo.com> wrote:
> > Applied, thanks! Here is one in-line comment
> >
> > > --- a/criu/mem.c
> > > +++ b/criu/mem.c
> > > @@ -1271,7 +1271,11 @@ static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta)
> > >  {
> > >       struct cr_img *pages;
> > >
> > > -     pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id);
> > > +     /* if auto-dedup is on we need RDWR mode to be able to punch holes
> > > +      * in the input files (in restorer.c)
> > > +      */
> > > +     pages = open_image(CR_FD_PAGES, opts.auto_dedup ? O_RDWR : O_RSTR,
> > > +                             rsti(t)->pages_img_id);
> >
> > This doesn't work for userns tests:
> >
> > CRIU opens an image from a target userns and fails to open it for read-write:
> > -rw-r--r-- 1 root root 106496 Jul 27 04:54 test/dump/zdtm/static/env00/43/1/pages-1.img
> >
> > Probably, we need to use userns_call() to open images in this case.
>
> I've added fallback to opening read-only if read-write fails.
> userns_call seems like a good idea, but given that open_image not only
> opens file but also allocates a few objects, we'd need to break it
> apart first. I'd like to do that in a separate commit.
>
> --
> Paweł Stradomski