[11/11] mem: Delayed vma/pr restore

Submitted by Pavel Emelianov on May 5, 2017, 4:04 p.m.

Details

Message ID 7cd20f7f-885a-74b7-42d1-993c232edbc1@virtuozzo.com
State New
Series "Do not remap vmas when not needed"

Commit Message

Pavel Emelianov May 5, 2017, 4:04 p.m.
Performance experiments show that we spend (relatively) a lot of time
mremap-ing areas from the premap area into their proper places. This time
depends on the task being restored, but for tasks with many vmas it
can reach 20%.
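
For reference, the pattern being paid for here is roughly the following
(a minimal standalone illustration, not CRIU code): each private VMA is
first mmap()ed at a temporary "premap" address, filled with data, and then
mremap()ed into its final place, i.e. one extra syscall plus page-table
rework per VMA.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        const size_t len = 4096;
        /* The VMA's final place, and a temporary "premap" area. */
        void *dst = mmap(NULL, len, PROT_NONE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        void *tmp = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (dst == MAP_FAILED || tmp == MAP_FAILED)
                return 1;

        /* Contents are restored into the premap area first... */
        strcpy(tmp, "restored page contents");

        /* ...then moved into place -- one extra mremap() per VMA. */
        if (mremap(tmp, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, dst) == MAP_FAILED)
                return 1;

        printf("%s @ %p\n", (char *)dst, dst);
        munmap(dst, len);
        return 0;
}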

The thing is that premapping is only needed to restore COW pages, since
there is no kernel API for sharing a page between two or more anonymous
vmas. Non-COW areas can be mmap()ed directly into place. But in that case
the pages' contents also have to be restored from the PIE code.

Running the whole page-read engine from PIE is way too complex (for now),
so the proposal is to optimize only the case where there is a single local
pagemap layer. This is what the pr.pieok boolean indicates.
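
The restorer side then becomes a plain read loop: the iovecs queued for
each such VMA are rendered into restore_vma_io records, and the PIE code
preadv()s the pages image into them, advancing the iovec array on short
reads. Below is a minimal standalone sketch of that loop; an ordinary
preadv() on a temp file stands in for sys_preadv() on args->vma_ios_fd,
and everything outside the loop is made up for illustration.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
        char path[] = "/tmp/vma-io-XXXXXX";
        const char data[] = "0123456789abcdef";
        char a[4], b[6], c[6];
        struct iovec vec[] = {
                { .iov_base = a, .iov_len = sizeof(a) },
                { .iov_base = b, .iov_len = sizeof(b) },
                { .iov_base = c, .iov_len = sizeof(c) },
        };
        struct iovec *iovs = vec;
        int nr = 3, fd;
        off_t off = 0;

        fd = mkstemp(path);
        if (fd < 0 || write(fd, data, sizeof(data) - 1) < 0)
                return 1;

        /*
         * Same shape as the loop added to __export_restore_task(): one
         * preadv() may fill only part of the iovecs, so advance the array
         * past the bytes actually read and retry with what is left.
         */
        while (nr) {
                ssize_t r = preadv(fd, iovs, nr, off);

                if (r <= 0)
                        return 1;

                off += r;
                while (nr > 0) {
                        if ((size_t)r >= iovs->iov_len) {
                                r -= iovs->iov_len;
                                iovs++;
                                nr--;
                                continue;
                        }
                        iovs->iov_base = (char *)iovs->iov_base + r;
                        iovs->iov_len -= r;
                        break;
                }
        }

        printf("%.4s %.6s %.6s\n", a, b, c);
        unlink(path);
        close(fd);
        return 0;
}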

Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
---
 criu/cr-restore.c       |  1 +
 criu/include/pagemap.h  |  6 ++++++
 criu/include/restorer.h | 10 ++++++++++
 criu/include/rst_info.h |  2 ++
 criu/mem.c              | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 criu/pagemap.c          | 35 +++++++++++++++++++++++++++++++++--
 criu/pie/restorer.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 criu/pstree.c           |  1 +
 8 files changed, 138 insertions(+), 3 deletions(-)

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 92945f3..f9dc091 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -3236,6 +3236,7 @@  static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 	RST_MEM_FIXUP_PPTR(task_args->helpers);
 	RST_MEM_FIXUP_PPTR(task_args->zombies);
 	RST_MEM_FIXUP_PPTR(task_args->seccomp_filters);
+	RST_MEM_FIXUP_PPTR(task_args->vma_ios);
 
 	if (core->tc->has_seccomp_mode)
 		task_args->seccomp_mode = core->tc->seccomp_mode;
diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h
index aa3c4aa..08633ef 100644
--- a/criu/include/pagemap.h
+++ b/criu/include/pagemap.h
@@ -58,6 +58,9 @@  struct page_read {
 	int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr,
 			       int nr, void *buf, unsigned flags);
 
+	/* Whether or not pages can be read in PIE code */
+	bool pieok;
+
 	/* Private data of reader */
 	struct cr_img *pmi;
 	struct cr_img *pi;
@@ -104,8 +107,11 @@  extern int open_page_read(int pid, struct page_read *, int pr_flags);
 extern int open_page_read_at(int dfd, int pid, struct page_read *pr,
 		int pr_flags);
 
+struct task_restore_args;
+
 int pagemap_enqueue_iovec(struct page_read *pr, void *buf,
 			      unsigned long len, struct list_head *to);
+int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta);
 
 /*
  * Create a shallow copy of page_read object.
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index 454181e..e2205d8 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -101,6 +101,12 @@  struct thread_restore_args {
 
 typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
 
+struct restore_vma_io {
+	int nr_iovs;
+	loff_t off;
+	struct iovec iovs[0];
+};
+
 struct task_restore_args {
 	struct thread_restore_args	*t;			/* thread group leader */
 
@@ -123,6 +129,10 @@  struct task_restore_args {
 	VmaEntry			*vmas;
 	unsigned int			vmas_n;
 
+	int				vma_ios_fd;
+	struct restore_vma_io		*vma_ios;
+	unsigned int			vma_ios_n;
+
 	struct restore_posix_timer	*posix_timers;
 	unsigned int			posix_timers_n;
 
diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h
index 92dfc9d..c3dbe2d 100644
--- a/criu/include/rst_info.h
+++ b/criu/include/rst_info.h
@@ -39,6 +39,8 @@  struct rst_info {
 
 	struct vm_area_list	vmas;
 	struct _MmEntry		*mm;
+	struct list_head	vma_io;
+	unsigned int		pages_img_id;
 
 	u32			cg_set;
 
diff --git a/criu/mem.c b/criu/mem.c
index fe62b41..6ed0801 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -758,6 +758,13 @@  static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
 		if (!vma_area_is_private(vma, kdat.task_size))
 			continue;
 
+		if (vma->pvma == NULL && pr->pieok)
+			/*
+			 * VMA in question is not shared with anyone. We'll
+			 * restore it with its contents in restorer.
+			 */
+			continue;
+
 		ret = premap_private_vma(t, vma, &at);
 		if (ret < 0)
 			break;
@@ -771,6 +778,7 @@  static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
 	struct vma_area *vma;
 	int ret = 0;
 	struct list_head *vmas = &rsti(t)->vmas.h;
+	struct list_head *vma_io = &rsti(t)->vma_io;
 
 	unsigned int nr_restored = 0;
 	unsigned int nr_shared = 0;
@@ -785,6 +793,7 @@  static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
 	}
 
 	vma = list_first_entry(vmas, struct vma_area, list);
+	rsti(t)->pages_img_id = pr->pages_img_id;
 
 	/*
 	 * Read page contents.
@@ -837,6 +846,28 @@  static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
 				goto err_addr;
 			}
 
+			if (vma->pvma == NULL && pr->pieok) {
+				unsigned long len = min_t(unsigned long,
+						(nr_pages - i) * PAGE_SIZE,
+						vma->e->end - va);
+
+				if (pagemap_enqueue_iovec(pr, (void *)va, len, vma_io))
+					return -1;
+
+				pr->skip_pages(pr, len);
+
+				va += len;
+				len >>= PAGE_SHIFT;
+				nr_restored += len;
+				i += len - 1;
+				pr_debug("Enqueue page-read\n");
+				continue;
+			}
+
+			/*
+			 * Otherwise to the COW restore
+			 */
+
 			off = (va - vma->e->start) / PAGE_SIZE;
 			p = decode_pointer((off) * PAGE_SIZE +
 					vma->premmaped_addr);
@@ -1042,6 +1073,18 @@  int open_vmas(struct pstree_item *t)
 	return 0;
 }
 
+static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta)
+{
+	struct cr_img *pages;
+
+	pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id);
+	if (!pages)
+		return -1;
+
+	ta->vma_ios_fd = img_raw_fd(pages);
+	return pagemap_render_iovec(&rsti(t)->vma_io, ta);
+}
+
 int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta)
 {
 	struct vma_area *vma;
@@ -1067,6 +1110,6 @@  int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta)
 			vma_premmaped_start(vme) = vma->premmaped_addr;
 	}
 
-	return 0;
+	return prepare_vma_ios(t, ta);
 }
 
diff --git a/criu/pagemap.c b/criu/pagemap.c
index dcc1332..79076d9 100644
--- a/criu/pagemap.c
+++ b/criu/pagemap.c
@@ -11,7 +11,8 @@ 
 #include "servicefd.h"
 #include "pagemap.h"
 #include "page-xfer.h"
-
+#include "restorer.h"
+#include "rst-malloc.h"
 #include "fault-injection.h"
 #include "xmalloc.h"
 #include "protobuf.h"
@@ -309,6 +310,32 @@  static int enqueue_async_iov(struct page_read *pr, void *buf,
 	return 0;
 }
 
+int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta)
+{
+	struct page_read_iov *piov;
+
+	ta->vma_ios = (struct restore_vma_io *)rst_mem_align_cpos(RM_PRIVATE);
+	ta->vma_ios_n = 0;
+
+	list_for_each_entry(piov, from, l) {
+		struct restore_vma_io *rio;
+
+		pr_info("`- render %d iovs (%p:%zd...)\n", piov->nr,
+				piov->to[0].iov_base, piov->to[0].iov_len);
+		rio = rst_mem_alloc(sizeof(*rio) + piov->nr * sizeof(struct iovec), RM_PRIVATE);
+		if (!rio)
+			return -1;
+
+		rio->nr_iovs = piov->nr;
+		rio->off = piov->from;
+		memcpy(rio->iovs, piov->to, piov->nr * sizeof(struct iovec));
+
+		ta->vma_ios_n++;
+	}
+
+	return 0;
+}
+
 int pagemap_enqueue_iovec(struct page_read *pr, void *buf,
 			      unsigned long len, struct list_head *to)
 {
@@ -795,6 +822,7 @@  int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
 	pr->bunch.iov_len = 0;
 	pr->bunch.iov_base = NULL;
 	pr->pmes = NULL;
+	pr->pieok = false;
 
 	pr->pmi = open_image_at(dfd, i_typ, O_RSTR, (long)pid);
 	if (!pr->pmi)
@@ -836,8 +864,11 @@  int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
 		pr->maybe_read_page = maybe_read_page_img_cache;
 	else if (remote)
 		pr->maybe_read_page = maybe_read_page_remote;
-	else
+	else {
 		pr->maybe_read_page = maybe_read_page_local;
+		if (!pr->parent)
+			pr->pieok = true;
+	}
 
 	pr_debug("Opened %s page read %u (parent %u)\n",
 		 remote ? "remote" : "local", pr->id,
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index dc8a4d0..4c91ddd 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -1270,6 +1270,47 @@  long __export_restore_task(struct task_restore_args *args)
 		}
 	}
 
+	/*
+	 * Now read the contents (if any)
+	 */
+
+	for (i = 0; i < args->vma_ios_n; i++) {
+		struct restore_vma_io *rio = args->vma_ios + i;
+		struct iovec *iovs = rio->iovs;
+		int nr = rio->nr_iovs;
+		ssize_t r;
+
+		while (nr) {
+			pr_debug("Preadv %lx:%d... (%d iovs)\n",
+					(unsigned long)iovs->iov_base,
+					(int)iovs->iov_len, nr);
+			r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off);
+			if (r < 0) {
+				pr_err("Can't read pages data (%d)\n", (int)r);
+				goto core_restore_end;
+			}
+
+			pr_debug("`- returned %ld\n", (long)r);
+			rio->off += r;
+			/* Advance the iovecs */
+			do {
+				if (iovs->iov_len <= r) {
+					pr_debug("   `- skip pagemap\n");
+					r -= iovs->iov_len;
+					iovs++;
+					nr--;
+					continue;
+				}
+
+				iovs->iov_base += r;
+				iovs->iov_len -= r;
+				break;
+			} while (nr > 0);
+		}
+	}
+
+	sys_close(args->vma_ios_fd);
+
 #ifdef CONFIG_VDSO
 	/*
 	 * Proxify vDSO.
diff --git a/criu/pstree.c b/criu/pstree.c
index b512e43..dee5f3d 100644
--- a/criu/pstree.c
+++ b/criu/pstree.c
@@ -224,6 +224,7 @@  struct pstree_item *__alloc_pstree_item(bool rst, int level)
 			return NULL;
 		memset(item, 0, sz);
 		vm_area_list_init(&rsti(item)->vmas);
+		INIT_LIST_HEAD(&rsti(item)->vma_io);
 		/*
 		 * On restore we never expand pid level,
 		 * so allocate them all at once.

Comments

Mike Rapoport May 7, 2017, 10:52 a.m.
On Fri, May 05, 2017 at 07:04:22PM +0300, Pavel Emelyanov wrote:
> Performance experiments show that we spend (relatively) a lot of time
> mremap-ing areas from the premap area into their proper places. This time
> depends on the task being restored, but for tasks with many vmas it
> can reach 20%.
> 
> The thing is that premapping is only needed to restore COW pages, since
> there is no kernel API for sharing a page between two or more anonymous
> vmas. Non-COW areas can be mmap()ed directly into place. But in that case
> the pages' contents also have to be restored from the PIE code.
> 
> Running the whole page-read engine from PIE is way too complex (for now),
> so the proposal is to optimize only the case where there is a single local
> pagemap layer. This is what the pr.pieok boolean indicates.
> 
> Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
> ---
>  criu/cr-restore.c       |  1 +
>  criu/include/pagemap.h  |  6 ++++++
>  criu/include/restorer.h | 10 ++++++++++
>  criu/include/rst_info.h |  2 ++
>  criu/mem.c              | 45 ++++++++++++++++++++++++++++++++++++++++++++-
>  criu/pagemap.c          | 35 +++++++++++++++++++++++++++++++++--
>  criu/pie/restorer.c     | 41 +++++++++++++++++++++++++++++++++++++++++
>  criu/pstree.c           |  1 +
>  8 files changed, 138 insertions(+), 3 deletions(-)
> 

[...]

> diff --git a/criu/pagemap.c b/criu/pagemap.c
> index dcc1332..79076d9 100644
> --- a/criu/pagemap.c
> +++ b/criu/pagemap.c
> @@ -795,6 +822,7 @@ int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
>  	pr->bunch.iov_len = 0;
>  	pr->bunch.iov_base = NULL;
>  	pr->pmes = NULL;
> +	pr->pieok = false;
> 
>  	pr->pmi = open_image_at(dfd, i_typ, O_RSTR, (long)pid);
>  	if (!pr->pmi)
> @@ -836,8 +864,11 @@ int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
>  		pr->maybe_read_page = maybe_read_page_img_cache;
>  	else if (remote)
>  		pr->maybe_read_page = maybe_read_page_remote;
> -	else
> +	else {
>  		pr->maybe_read_page = maybe_read_page_local;
> +		if (!pr->parent)
> +			pr->pieok = true;
> +	}

checkpatch.pl would be unhappy about this chunk ;-)
It seems that some of the previous patches also have if-else clauses with
braces added only for some of the arms.
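
For reference, the style checkpatch.pl asks for is braces on every arm once
any arm of an if/else chain needs them, e.g. (illustration only, not CRIU
code):

#include <stdio.h>

/* When one branch needs braces, every branch of the chain gets them. */
static const char *pick_reader(int img_cache, int remote, int has_parent)
{
        const char *how;

        if (img_cache) {
                how = "img-cache";
        } else if (remote) {
                how = "remote";
        } else {
                how = "local";
                if (!has_parent)
                        how = "local, pie-ok";
        }

        return how;
}

int main(void)
{
        printf("%s\n", pick_reader(0, 0, 0));
        return 0;
}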

> 
>  	pr_debug("Opened %s page read %u (parent %u)\n",
>  		 remote ? "remote" : "local", pr->id,
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index dc8a4d0..4c91ddd 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -1270,6 +1270,47 @@ long __export_restore_task(struct task_restore_args *args)
>  		}
>  	}
> 
> +	/*
> +	 * Now read the contents (if any)
> +	 */
> +
> +	for (i = 0; i < args->vma_ios_n; i++) {
> +		struct restore_vma_io *rio = args->vma_ios + i;
> +		struct iovec *iovs = rio->iovs;
> +		int nr = rio->nr_iovs;
> +		ssize_t r;
> +
> +		while (nr) {
> +			pr_debug("Preadv %lx:%d... (%d iovs)\n",
> +					(unsigned long)iovs->iov_base,
> +					(int)iovs->iov_len, nr);
> +			r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off);
> +			if (r < 0) {
> +				pr_err("Can't read pages data (%d)\n", (int)r);
> +				goto core_restore_end;
> +			}
> +
> +			pr_debug("`- returned %ld\n", (long)r);
> +			rio->off += r;
> +			/* Advance the iovecs */
> +			do {
> +				if (iovs->iov_len <= r) {
> +					pr_debug("   `- skip pagemap\n");
> +					r -= iovs->iov_len;
> +					iovs++;
> +					nr--;
> +					continue;
> +				}
> +
> +				iovs->iov_base += r;
> +				iovs->iov_len -= r;
> +				break;
> +			} while (nr > 0);
> +		}
> +	}
> +
> +	sys_close(args->vma_ios_fd);
> +
>  #ifdef CONFIG_VDSO
>  	/*
>  	 * Proxify vDSO.
> diff --git a/criu/pstree.c b/criu/pstree.c
> index b512e43..dee5f3d 100644
> --- a/criu/pstree.c
> +++ b/criu/pstree.c
> @@ -224,6 +224,7 @@ struct pstree_item *__alloc_pstree_item(bool rst, int level)
>  			return NULL;
>  		memset(item, 0, sz);
>  		vm_area_list_init(&rsti(item)->vmas);
> +		INIT_LIST_HEAD(&rsti(item)->vma_io);
>  		/*
>  		 * On restore we never expand pid level,
>  		 * so allocate them all at once.
> -- 
> 2.5.5