Punch holes in input files when restoring anonymous non-shared memory

Submitted by Pawel Stradomski on July 17, 2018, 1:38 p.m.

Details

Message ID 000000000000c557450571324f00@google.com
State New
Series "Punch holes in input files when restoring anonymous non-shared memory"
Headers show

Commit Message

Pawel Stradomski July 17, 2018, 1:38 p.m.
Punch holes in input files when restoring anonymous non-shared memory
if --auto-dedup is enabled.

This reduces memory usage if image files are stored on tmpfs.

Signed-off-by: Pawel Stradomski <pstradomski@google.com>

---
 compel/arch/arm/plugins/std/syscalls/syscall.def |  1 +
 .../ppc64/plugins/std/syscalls/syscall-ppc64.tbl |  1 +
 .../s390/plugins/std/syscalls/syscall-s390.tbl   |  1 +
 .../arch/x86/plugins/std/syscalls/syscall_32.tbl |  1 +
 .../arch/x86/plugins/std/syscalls/syscall_64.tbl |  1 +
 criu/mem.c                                       |  3 ++-
 criu/pie/restorer.c                              | 16 ++++++++++++++++
 7 files changed, 23 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def
index b68f9f2f..bcd61d4a 100644
--- a/compel/arch/arm/plugins/std/syscalls/syscall.def
+++ b/compel/arch/arm/plugins/std/syscalls/syscall.def
@@ -109,3 +109,4 @@  seccomp				277	383	(unsigned int op, unsigned int flags, const char *uargs)
 gettimeofday			169	78	(struct timeval *tv, struct timezone *tz)
 preadv_raw			69	361	(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
 userfaultfd			282	388	(int flags)
+fallocate			47	352	(int fd, int mode, loff_t offset, loff_t len)
diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
index fa0b034e..62e0bc1a 100644
--- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
+++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
@@ -89,6 +89,7 @@  __NR_set_robust_list	300		sys_set_robust_list	(struct robust_list_head *head, si
 __NR_get_robust_list	299		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
 __NR_vmsplice		285		sys_vmsplice		(int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
 __NR_openat		286		sys_openat		(int dfd, const char *filename, int flags, int mode)
+__NR_fallocate		309		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
 __NR_timerfd_settime	311		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
 __NR_signalfd4		313		sys_signalfd		(int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
 __NR_rt_tgsigqueueinfo	322		sys_rt_tgsigqueueinfo	(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
index bc77ae97..3521e915 100644
--- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
+++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
@@ -89,6 +89,7 @@  __NR_set_robust_list	304		sys_set_robust_list	(struct robust_list_head *head, si
 __NR_get_robust_list	305		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
 __NR_vmsplice		309		sys_vmsplice		(int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
 __NR_openat		288		sys_openat		(int dfd, const char *filename, int flags, int mode)
+__NR_fallocate		314		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
 __NR_timerfd_settime	320		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
 __NR_signalfd4		322		sys_signalfd		(int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
 __NR_rt_tgsigqueueinfo	330		sys_rt_tgsigqueueinfo	(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
index 9e1de281..a6c55b83 100644
--- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
+++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
@@ -83,6 +83,7 @@  __NR_set_robust_list	311		sys_set_robust_list	(struct robust_list_head *head, si
 __NR_get_robust_list	312		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
 __NR_vmsplice		316		sys_vmsplice		(int fd, const struct iovec *iov, unsigned int nr_segs, unsigned int flags)
 __NR_signalfd		321		sys_signalfd		(int ufd, const k_rtsigset_t *sigmask, size_t sigsetsize)
+__NR_fallocate		324		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
 __NR_timerfd_settime	325		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
 __NR_preadv		333		sys_preadv_raw		(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
 __NR_rt_tgsigqueueinfo	335		sys_rt_tgsigqueueinfo	(pid_t tgid, pid_t pid, int sig, siginfo_t *uinfo)
diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
index 726fa797..64271514 100644
--- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
+++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
@@ -94,6 +94,7 @@  __NR_set_robust_list		273		sys_set_robust_list	(struct robust_list_head *head, s
 __NR_get_robust_list		274		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
 __NR_seccomp			317		sys_seccomp		(unsigned int op, unsigned int flags, const char *uargs)
 __NR_vmsplice			278		sys_vmsplice		(int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
+__NR_fallocate			285		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
 __NR_timerfd_settime		286		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
 __NR_signalfd4			289		sys_signalfd		(int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
 __NR_preadv			295		sys_preadv_raw		(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
diff --git a/criu/mem.c b/criu/mem.c
index d020b7fd..c3d604a5 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -1271,7 +1271,8 @@  static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta)
 {
 	struct cr_img *pages;
 
-	pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id);
+	pages = open_image(CR_FD_PAGES, opts.auto_dedup ? O_RDWR : O_RSTR,
+				rsti(t)->pages_img_id);
 	if (!pages)
 		return -1;
 
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index f990e9b7..3f1a8a6b 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -646,6 +646,15 @@  static unsigned long restore_mapping(VmaEntry *vma_entry)
 			!(vma_entry->status & VMA_NO_PROT_WRITE))
 		prot |= PROT_WRITE;
 
+	/* TODO: if the mapping had MAP_LOCKED bit set, then the mmap will
+	 * cause immediate page-in and increase in process memory usage,
+	 * thus defeating attempts to conserve memory by running fallocate after
+	 * each preadv.
+	 *
+	 * This could be fixed by zeroing MAP_LOCKED bit here and restoring it
+	 * after all the contents is already loaded and the tmpfs files released
+	 * by fallocate.
+	 */
 	pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n",
 			vma_entry->start, vma_entry->end,
 			prot, flags, (int)vma_entry->fd);
@@ -1367,6 +1376,13 @@  long __export_restore_task(struct task_restore_args *args)
 			}
 
 			pr_debug("`- returned %ld\n", (long)r);
+			/* TODO: Check if auto-dedup is enabled instead of trusting fallocate to fail
+			 * if the file is not opened for writing. */
+			if (r > 0) {
+				pr_debug("   `fallocate %d %ld %ld\n", args->vma_ios_fd,  rio->off, r);
+				sys_fallocate(args->vma_ios_fd, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE,
+					rio->off, r);
+			}
 			rio->off += r;
 			/* Advance the iovecs */
 			do {

Comments

Andrei Vagin July 25, 2018, 12:11 a.m.
On Tue, Jul 17, 2018 at 03:38:57PM +0200, Pawel Stradomski wrote:
> Punch holes in input files when restoring anonymous non-shared memory
> if --auto-dedup is enabled.

We need to update the description for --auto-dedup and describe this new
behaviour there.

With these changes, criu restore can be executed only once. I think we
need to do something so that a second attempt at restoring from these
images fails with an error.

We need to think how to test this functionality.

> 
> This reduces memory usage if image files are stored on tmpfs.
> 
> Signed-off-by: Pawel Stradomski <pstradomski@google.com>
> 
> ---
>  compel/arch/arm/plugins/std/syscalls/syscall.def |  1 +
>  .../ppc64/plugins/std/syscalls/syscall-ppc64.tbl |  1 +
>  .../s390/plugins/std/syscalls/syscall-s390.tbl   |  1 +
>  .../arch/x86/plugins/std/syscalls/syscall_32.tbl |  1 +
>  .../arch/x86/plugins/std/syscalls/syscall_64.tbl |  1 +
>  criu/mem.c                                       |  3 ++-
>  criu/pie/restorer.c                              | 16 ++++++++++++++++
>  7 files changed, 23 insertions(+), 1 deletion(-)
> 
> diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def
> index b68f9f2f..bcd61d4a 100644
> --- a/compel/arch/arm/plugins/std/syscalls/syscall.def
> +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def
> @@ -109,3 +109,4 @@ seccomp				277	383	(unsigned int op, unsigned int flags, const char *uargs)
>  gettimeofday			169	78	(struct timeval *tv, struct timezone *tz)
>  preadv_raw			69	361	(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
>  userfaultfd			282	388	(int flags)
> +fallocate			47	352	(int fd, int mode, loff_t offset, loff_t len)
> diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> index fa0b034e..62e0bc1a 100644
> --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> @@ -89,6 +89,7 @@ __NR_set_robust_list	300		sys_set_robust_list	(struct robust_list_head *head, si
>  __NR_get_robust_list	299		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
>  __NR_vmsplice		285		sys_vmsplice		(int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
>  __NR_openat		286		sys_openat		(int dfd, const char *filename, int flags, int mode)
> +__NR_fallocate		309		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
>  __NR_timerfd_settime	311		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
>  __NR_signalfd4		313		sys_signalfd		(int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
>  __NR_rt_tgsigqueueinfo	322		sys_rt_tgsigqueueinfo	(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
> diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> index bc77ae97..3521e915 100644
> --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> @@ -89,6 +89,7 @@ __NR_set_robust_list	304		sys_set_robust_list	(struct robust_list_head *head, si
>  __NR_get_robust_list	305		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
>  __NR_vmsplice		309		sys_vmsplice		(int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
>  __NR_openat		288		sys_openat		(int dfd, const char *filename, int flags, int mode)
> +__NR_fallocate		314		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
>  __NR_timerfd_settime	320		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
>  __NR_signalfd4		322		sys_signalfd		(int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
>  __NR_rt_tgsigqueueinfo	330		sys_rt_tgsigqueueinfo	(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
> diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> index 9e1de281..a6c55b83 100644
> --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> @@ -83,6 +83,7 @@ __NR_set_robust_list	311		sys_set_robust_list	(struct robust_list_head *head, si
>  __NR_get_robust_list	312		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
>  __NR_vmsplice		316		sys_vmsplice		(int fd, const struct iovec *iov, unsigned int nr_segs, unsigned int flags)
>  __NR_signalfd		321		sys_signalfd		(int ufd, const k_rtsigset_t *sigmask, size_t sigsetsize)
> +__NR_fallocate		324		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
>  __NR_timerfd_settime	325		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
>  __NR_preadv		333		sys_preadv_raw		(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
>  __NR_rt_tgsigqueueinfo	335		sys_rt_tgsigqueueinfo	(pid_t tgid, pid_t pid, int sig, siginfo_t *uinfo)
> diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> index 726fa797..64271514 100644
> --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> @@ -94,6 +94,7 @@ __NR_set_robust_list		273		sys_set_robust_list	(struct robust_list_head *head, s
>  __NR_get_robust_list		274		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
>  __NR_seccomp			317		sys_seccomp		(unsigned int op, unsigned int flags, const char *uargs)
>  __NR_vmsplice			278		sys_vmsplice		(int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
> +__NR_fallocate			285		sys_fallocate		(int fd, int mode, loff_t offset, loff_t len)
>  __NR_timerfd_settime		286		sys_timerfd_settime	(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
>  __NR_signalfd4			289		sys_signalfd		(int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
>  __NR_preadv			295		sys_preadv_raw		(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
> diff --git a/criu/mem.c b/criu/mem.c
> index d020b7fd..c3d604a5 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -1271,7 +1271,8 @@ static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta)
>  {
>  	struct cr_img *pages;
>  
> -	pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id);
> +	pages = open_image(CR_FD_PAGES, opts.auto_dedup ? O_RDWR : O_RSTR,
> +				rsti(t)->pages_img_id);

Could you add a comment here that explains why O_RDWR is required?

>  	if (!pages)
>  		return -1;
>  
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index f990e9b7..3f1a8a6b 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -646,6 +646,15 @@ static unsigned long restore_mapping(VmaEntry *vma_entry)
>  			!(vma_entry->status & VMA_NO_PROT_WRITE))
>  		prot |= PROT_WRITE;
>  
> +	/* TODO: if the mapping had MAP_LOCKED bit set, then the mmap will
> +	 * cause immediate page-in and increase in process memory usage,
> +	 * thus defeating attempts to conserve memory by running fallocate after
> +	 * each preadv.
> +	 *
> +	 * This could be fixed by zeroing MAP_LOCKED bit here and restoring it
> +	 * after all the contents is already loaded and the tmpfs files released
> +	 * by fallocate.

Unfortunately, I don't understand this comment. Maybe you could elaborate
in more detail.

> +	 */
>  	pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n",
>  			vma_entry->start, vma_entry->end,
>  			prot, flags, (int)vma_entry->fd);

There is one more place where private memory is restored:
restore_priv_vma_content(). Do we want to do the same for shared memory?

> @@ -1367,6 +1376,13 @@ long __export_restore_task(struct task_restore_args *args)
>  			}
>  
>  			pr_debug("`- returned %ld\n", (long)r);
> +			/* TODO: Check if auto-dedup is enabled instead of trusting fallocate to fail
> +			 * if the file is not opened for writing. */
> +			if (r > 0) {
> +				pr_debug("   `fallocate %d %ld %ld\n", args->vma_ios_fd,  rio->off, r);
> +				sys_fallocate(args->vma_ios_fd, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE,
> +					rio->off, r);

We have to check the return code of fallocate and print an error.

> +			}
>  			rio->off += r;
>  			/* Advance the iovecs */
>  			do {
> -- 
> 2.18.0.203.gfac676dfb9-goog
> 
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
Pawel Stradomski July 25, 2018, 1:02 p.m.
śr., 25 lip 2018 o 02:11 Andrei Vagin <avagin@virtuozzo.com> napisał(a):
>
> On Tue, Jul 17, 2018 at 03:38:57PM +0200, Pawel Stradomski wrote:
> > Punch holes in input files when restoring anonymous non-shared memory
> > if --auto-dedup is enabled.
>
> We need to update the description for --auto-dedup and describe this new
> behaviour there.
>
> With these changes, criu restore can be executed only once. I think we
> need to do something, so that a second attempt of restoring from these
> images fails with a error.

For the record, this has already been the case. With --auto-dedup, criu was
already destroying the image files, but so far only for shared memory - see
punch_hole(), which calls fallocate().
This change only makes the same thing happen for anonymous non-shared
memory, which is read by pie/restorer.c from within the restored process
rather than from the main criu process.

> We need to think how to test this functionality.
>
> >
> > This reduces memory usage if image files are stored on tmpfs.
> >
> > Signed-off-by: Pawel Stradomski <pstradomski@google.com>
> >
> > ---
> >  compel/arch/arm/plugins/std/syscalls/syscall.def |  1 +
> >  .../ppc64/plugins/std/syscalls/syscall-ppc64.tbl |  1 +
> >  .../s390/plugins/std/syscalls/syscall-s390.tbl   |  1 +
> >  .../arch/x86/plugins/std/syscalls/syscall_32.tbl |  1 +
> >  .../arch/x86/plugins/std/syscalls/syscall_64.tbl |  1 +
> >  criu/mem.c                                       |  3 ++-
> >  criu/pie/restorer.c                              | 16 ++++++++++++++++
> >  7 files changed, 23 insertions(+), 1 deletion(-)
> >
> > diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def
b/compel/arch/arm/plugins/std/syscalls/syscall.def
> > index b68f9f2f..bcd61d4a 100644
> > --- a/compel/arch/arm/plugins/std/syscalls/syscall.def
> > +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def
> > @@ -109,3 +109,4 @@ seccomp                           277     383
(unsigned int op, unsigned int flags, const char *uargs)
> >  gettimeofday                 169     78      (struct timeval *tv,
struct timezone *tz)
> >  preadv_raw                   69      361     (int fd, struct iovec
*iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
> >  userfaultfd                  282     388     (int flags)
> > +fallocate                    47      352     (int fd, int mode, loff_t
offset, loff_t len)
> > diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> > index fa0b034e..62e0bc1a 100644
> > --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> > +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> > @@ -89,6 +89,7 @@ __NR_set_robust_list        300
sys_set_robust_list     (struct robust_list_head *head, si
> >  __NR_get_robust_list 299             sys_get_robust_list     (int pid,
struct robust_list_head **head_ptr, size_t *len_ptr)
> >  __NR_vmsplice                285             sys_vmsplice
 (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int
flags)
> >  __NR_openat          286             sys_openat              (int dfd,
const char *filename, int flags, int mode)
> > +__NR_fallocate               309             sys_fallocate
(int fd, int mode, loff_t offset, loff_t len)
> >  __NR_timerfd_settime 311             sys_timerfd_settime     (int ufd,
int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
> >  __NR_signalfd4               313             sys_signalfd
 (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
> >  __NR_rt_tgsigqueueinfo       322             sys_rt_tgsigqueueinfo
(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
> > diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> > index bc77ae97..3521e915 100644
> > --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> > +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> > @@ -89,6 +89,7 @@ __NR_set_robust_list        304
sys_set_robust_list     (struct robust_list_head *head, si
> >  __NR_get_robust_list 305             sys_get_robust_list     (int pid,
struct robust_list_head **head_ptr, size_t *len_ptr)
> >  __NR_vmsplice                309             sys_vmsplice
 (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int
flags)
> >  __NR_openat          288             sys_openat              (int dfd,
const char *filename, int flags, int mode)
> > +__NR_fallocate               314             sys_fallocate
(int fd, int mode, loff_t offset, loff_t len)
> >  __NR_timerfd_settime 320             sys_timerfd_settime     (int ufd,
int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
> >  __NR_signalfd4               322             sys_signalfd
 (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
> >  __NR_rt_tgsigqueueinfo       330             sys_rt_tgsigqueueinfo
(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
> > diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> > index 9e1de281..a6c55b83 100644
> > --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> > +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> > @@ -83,6 +83,7 @@ __NR_set_robust_list        311
sys_set_robust_list     (struct robust_list_head *head, si
> >  __NR_get_robust_list 312             sys_get_robust_list     (int pid,
struct robust_list_head **head_ptr, size_t *len_ptr)
> >  __NR_vmsplice                316             sys_vmsplice
 (int fd, const struct iovec *iov, unsigned int nr_segs, unsigned int flags)
> >  __NR_signalfd                321             sys_signalfd
 (int ufd, const k_rtsigset_t *sigmask, size_t sigsetsize)
> > +__NR_fallocate               324             sys_fallocate
(int fd, int mode, loff_t offset, loff_t len)
> >  __NR_timerfd_settime 325             sys_timerfd_settime     (int ufd,
int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
> >  __NR_preadv          333             sys_preadv_raw          (int fd,
struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long
pos_h)
> >  __NR_rt_tgsigqueueinfo       335             sys_rt_tgsigqueueinfo
(pid_t tgid, pid_t pid, int sig, siginfo_t *uinfo)
> > diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> > index 726fa797..64271514 100644
> > --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> > +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> > @@ -94,6 +94,7 @@ __NR_set_robust_list                273
sys_set_robust_list     (struct robust_list_head *head, s
> >  __NR_get_robust_list         274             sys_get_robust_list
(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
> >  __NR_seccomp                 317             sys_seccomp
(unsigned int op, unsigned int flags, const char *uargs)
> >  __NR_vmsplice                        278             sys_vmsplice
       (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned
int flags)
> > +__NR_fallocate                       285             sys_fallocate
      (int fd, int mode, loff_t offset, loff_t len)
> >  __NR_timerfd_settime         286             sys_timerfd_settime
(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
> >  __NR_signalfd4                       289             sys_signalfd
       (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
> >  __NR_preadv                  295             sys_preadv_raw
 (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l,
unsigned long pos_h)
> > diff --git a/criu/mem.c b/criu/mem.c
> > index d020b7fd..c3d604a5 100644
> > --- a/criu/mem.c
> > +++ b/criu/mem.c
> > @@ -1271,7 +1271,8 @@ static int prepare_vma_ios(struct pstree_item *t,
struct task_restore_args *ta)
> >  {
> >       struct cr_img *pages;
> >
> > -     pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id);
> > +     pages = open_image(CR_FD_PAGES, opts.auto_dedup ? O_RDWR : O_RSTR,
> > +                             rsti(t)->pages_img_id);
>
> Could you add a comment here which explain why O_RDWR is required?

Done.

>
> >       if (!pages)
> >               return -1;
> >
> > diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> > index f990e9b7..3f1a8a6b 100644
> > --- a/criu/pie/restorer.c
> > +++ b/criu/pie/restorer.c
> > @@ -646,6 +646,15 @@ static unsigned long restore_mapping(VmaEntry
*vma_entry)
> >                       !(vma_entry->status & VMA_NO_PROT_WRITE))
> >               prot |= PROT_WRITE;
> >
> > +     /* TODO: if the mapping had MAP_LOCKED bit set, then the mmap will
> > +      * cause immediate page-in and increase in process memory usage,
> > +      * thus defeating attempts to conserve memory by running
fallocate after
> > +      * each preadv.
> > +      *
> > +      * This could be fixed by zeroing MAP_LOCKED bit here and
restoring it
> > +      * after all the contents is already loaded and the tmpfs files
released
> > +      * by fallocate.
>
> Unfortunately, I don't understand  this comment. Maybe you can elaborate
> with more details.

Done

>
> > +      */
> >       pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n",
> >                       vma_entry->start, vma_entry->end,
> >                       prot, flags, (int)vma_entry->fd);
>
> There is one more place where a private memory is restored. It is in
> restore_priv_vma_content(). Do we want to do the same for shared memory?

restore_priv_vma_content() does one of the following:
a) uses the COW code, where punching holes would be tricky;
b) calls pr->read_pages, which ends up (via maybe_read_page) calling
read_local_page, which already has
if (opts.auto_dedup) {
      ret = punch_hole(pr, pr->pi_off, len, false);
}
c) ends up doing async reads, which call fallocate() in
process_async_reads(); or
d) uses render_pagemap to defer the actual reading to pie/restorer.c, which
is the path I'm fixing here.

>
> > @@ -1367,6 +1376,13 @@ long __export_restore_task(struct
task_restore_args *args)
> >                       }
> >
> >                       pr_debug("`- returned %ld\n", (long)r);
> > +                     /* TODO: Check if auto-dedup is enabled instead
of trusting fallocate to fail
> > +                      * if the file is not opened for writing. */
> > +                     if (r > 0) {
> > +                             pr_debug("   `fallocate %d %ld %ld\n",
args->vma_ios_fd,  rio->off, r);
> > +                             sys_fallocate(args->vma_ios_fd,
FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE,
> > +                                     rio->off, r);
>
> We have to check a return code and print an error.

Done.

> > +                     }
> >                       rio->off += r;
> >                       /* Advance the iovecs */
> >                       do {
> > --
> > 2.18.0.203.gfac676dfb9-goog
> >
> > _______________________________________________
> > CRIU mailing list
> > CRIU@openvz.org
> > https://lists.openvz.org/mailman/listinfo/criu