[v2,2/2] lazy-pages: add support to combine pre-copy and post-copy

Submitted by Adrian Reber on Sept. 15, 2016, 8:34 a.m.

Details

Message ID 1473928480-19567-2-git-send-email-adrian@lisas.de
State Rejected
Series "Series without cover letter"
Headers show

Commit Message

Adrian Reber Sept. 15, 2016, 8:34 a.m.
From: Adrian Reber <areber@redhat.com>

To combine pre-copy (pre-dump) and post-copy (lazy-pages) mode the
lazy-page mode must be made aware of pages which are only in the parent
image and not in the current checkpoint image.

As the restorer only works on VmaEntry-s and knows nothing about PageMap
entries the VmaEntry-s need to be adapted to match the PageMap entries.

This changes the lazy-page detection to not only rely on
vma_entry_can_be_lazy() but to also check if the page is available in
the parent. If the page is available in a parent checkpoint the page is
not marked as lazy via a new VmaEntry field (optional bool lazy = 11).

If the VmaEntry does not have the same size as the PageMap entry the
VmaEntry needs to be adapted to match the PageMap entry and then the new
lazy flag can be set in the VmaEntry.

The restorer then additionally has to check if the VmaEntry has the lazy
flag. If the lazy flag is not set, then the page is available in a
parent checkpoint.

This code additionally adds a 'return 0;' to unmap_guard_pages() as the
VmaEntry splitting can create multiple VmaEntry-s with MAP_GROWSDOWN and
only the first entry needs the guard page to be unmapped.

Following steps to migrate a process are now possible:

Source system:

 * criu pre-dump -D /tmp/cp/1 -t <PID>
 * rsync -a /tmp/cp <destination>:/tmp
 * criu dump -D /tmp/cp/2 -t <PID> --port 27 --lazy-pages \
   --prev-images-dir ../1/ --track-mem

Destination system:

 * rsync -a <source>:/tmp/cp /tmp/
 * criu lazy-pages --page-server --address <source> --port 27 \
   -D /tmp/cp/2 &
 * criu restore --lazy-pages -D /tmp/cp/2

This will now restore all pages from the parent checkpoint if they
are not marked as lazy in the second checkpoint.

v2:
 - changed parent detection to use pagemap_in_parent()

Signed-off-by: Adrian Reber <areber@redhat.com>
---
 criu/mem.c          | 117 ++++++++++++++++++++++++++++++++++++++++++++++++----
 criu/pie/restorer.c |   2 +-
 criu/uffd.c         |   3 +-
 images/vma.proto    |   1 +
 4 files changed, 113 insertions(+), 10 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/mem.c b/criu/mem.c
index 7f7e87c..597f7e4 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -684,6 +684,44 @@  static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo
 	return ret;
 }
 
+/*
+ * Split the private VMA @vma at address @addr: @vma is truncated to
+ * end at @addr and a newly allocated vma_area covering [@addr, old end)
+ * is inserted right after it in the list.  The new tail is marked
+ * non-lazy, as its pages are present in a parent image.
+ * Returns 0 on success, -1 on allocation failure.
+ */
+static int split_priv_vma(unsigned long addr, struct vma_area *vma)
+{
+	struct vma_area *new_vma;
+	VmaEntry *e;
+
+	/* create new VMA Area */
+	new_vma = alloc_vma_area();
+	if (new_vma == NULL)
+		return -1;
+	/* Store address of new VMA Entry ... */
+	e = new_vma->e;
+	/* ... as the next memcpy overwrites new_vma->e with the old pointer */
+	memcpy(new_vma, vma, sizeof(struct vma_area));
+	/* Fill new VMA Entry with old values */
+	memcpy(e, vma->e, sizeof(VmaEntry));
+	/* The new entry starts where the old one is cut */
+	e->start = addr;
+	/* Truncate the old entry at the split point */
+	vma->e->end = addr;
+	/* The tail's pages come from the parent image, so they are not lazy */
+	e->has_lazy = true;
+	e->lazy = false;
+	new_vma->e = e;
+	new_vma->page_bitmap = xzalloc(BITS_TO_LONGS(vma_entry_len(new_vma->e) / PAGE_SIZE) * sizeof(long));
+	if (new_vma->page_bitmap == NULL)
+		return -1;
+
+	/* The tail is premapped right after the (now truncated) head part */
+	new_vma->premmaped_addr += vma_entry_len(vma->e);
+	list_add(&new_vma->list, &vma->list);
+	return 0;
+}
+
 static int restore_priv_vma_content(struct pstree_item *t)
 {
 	struct vma_area *vma;
@@ -745,20 +774,84 @@  static int restore_priv_vma_content(struct pstree_item *t)
 				goto err_addr;
 			}
 
-			off = (va - vma->e->start) / PAGE_SIZE;
-			p = decode_pointer((off) * PAGE_SIZE +
-					vma->premmaped_addr);
-
 			/*
 			 * This means that userfaultfd is used to load the pages
 			 * on demand.
 			 */
 			if (opts.lazy_pages && vma_entry_can_be_lazy(vma->e)) {
-				pr_debug("Lazy restore skips %#016"PRIx64"\n", vma->e->start);
-				pr.skip_pages(&pr, PAGE_SIZE);
-				nr_lazy++;
-				continue;
+				pr_debug("Lazy restore skips %#016"PRIx64"\n", va);
+				if (!pagemap_in_parent(pr.pe)) {
+					pr_debug("%#016"PRIx64" not in parent\n", va);
+					pr.skip_pages(&pr, PAGE_SIZE);
+					nr_lazy++;
+					vma->e->has_lazy = true;
+					vma->e->lazy = true;
+					continue;
+				} else {
+					unsigned long new_addr;
+					/*
+					 * First check if the PageMap Entry and the
+					 * VMA Entry are the same size. That is the easy
+					 * case where the whole VMA Entry can be marked
+					 * as non-lazy as it is present in the parent.
+					 */
+					if (pr.pe->vaddr == vma->e->start &&
+							pr.pe->vaddr + (pr.pe->nr_pages * PAGE_SIZE) == vma->e->end) {
+						pr_debug("VMA Entry and PageMap Entry matches\n");
+						/*
+						 * lazy defaults to false; explicitly set it for
+						 * better readability.
+						 */
+						vma->e->has_lazy = true;
+						vma->e->lazy = false;
+						goto read_pages;
+					}
+					/*
+					 * Only those pages in the VMA Entry which
+					 * are not available in the parent, should be
+					 * marked as lazy.
+					 * As only the PageMap Entry knows if the pages
+					 * are available in the parent, the VMA Entry needs
+					 * to be split into pages which actually should
+					 * be loaded lazily and pages which are in the
+					 * parent. This is necessary as the restore only
+					 * knows about VMAs and not PageMap Entries.
+					 */
+
+					/* Check if this is the last page of the VMA Entry */
+					if (vma->e->end == va + PAGE_SIZE) {
+						pr_debug("VMA Entry end has already been reached\n");
+						goto read_pages;
+					}
+
+					/*
+					 * Check if the current address is the same
+					 * as the current VMA Entry's start address.
+					 * If not a VMA Entry at the beginning has to be
+					 * split off.
+					 */
+					if (va != vma->e->start) {
+						pr_debug("Replacing VMA start address\n");
+						new_addr = va;
+					} else {
+						new_addr = pr.pe->vaddr + (pr.pe->nr_pages * PAGE_SIZE);
+						if (new_addr > vma->e->end) {
+							pr_debug("VMA Entry smaller than PageMap Entry\n");
+							new_addr = va;
+						}
+					}
+
+					ret = split_priv_vma(new_addr, vma);
+					if (ret)
+						return -1;
+					rsti(t)->vmas.nr++;
+				}
 			}
+read_pages:
+
+			off = (va - vma->e->start) / PAGE_SIZE;
+			p = decode_pointer((off) * PAGE_SIZE +
+					vma->premmaped_addr);
 
 			set_bit(off, vma->page_bitmap);
 			if (vma->ppage_bitmap) { /* inherited vma */
@@ -920,6 +1013,14 @@  int unmap_guard_pages(struct pstree_item *t)
 				pr_perror("Can't unmap guard page");
 				return -1;
 			}
+
+			/*
+			 * The code to combine pre-copy and post-copy
+			 * can split existing MAP_GROWSDOWN VMA areas
+			 * into two. Therefore returning once a guard
+			 * page has been unmapped.
+			 */
+			return 0;
 		}
 	}
 
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index d84d316..39719d3 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -841,7 +841,7 @@  static int vma_remap(VmaEntry *vma_entry, int uffd)
 	 * pages, so that the processes will hang until the memory is
 	 * injected via userfaultfd.
 	 */
-	if (vma_entry_can_be_lazy(vma_entry))
+	if (vma_entry_can_be_lazy(vma_entry) && vma_entry->lazy)
 		if (enable_uffd(uffd, dst, len) != 0)
 			return -1;
 
diff --git a/criu/uffd.c b/criu/uffd.c
index 81dc7ae..8bae84b 100644
--- a/criu/uffd.c
+++ b/criu/uffd.c
@@ -508,7 +508,8 @@  static int collect_uffd_pages(struct page_read *pr, struct lazy_pages_info *lpi)
 			 */
 			if (base >= vma->e->start && base < vma->e->end) {
 				if (vma_entry_can_be_lazy(vma->e)) {
-					uffd_page = true;
+					if (!pagemap_in_parent(pr->pe))
+						uffd_page = true;
 					break;
 				}
 			}
diff --git a/images/vma.proto b/images/vma.proto
index 7085f42..843ba2b 100644
--- a/images/vma.proto
+++ b/images/vma.proto
@@ -22,4 +22,6 @@  message vma_entry {
 
 	/* file status flags */
 	optional uint32		fdflags	= 10 [(criu).hex = true];
+	/* pages of this VMA are to be faulted in lazily via userfaultfd */
+	optional bool		lazy	= 11;
 }