[v5,5/8] criu: pagemap: add entries for zero pages

Submitted by Mike Rapoport on June 29, 2016, 5:55 a.m.

Details

Message ID 1467179713-14400-6-git-send-email-rppt@linux.vnet.ibm.com
State Rejected
Series "criu: make pagemap friendlier to random access"
Headers show

Commit Message

Mike Rapoport June 29, 2016, 5:55 a.m.
The pages that are mapped to zero_page_pfn are not dumped but information
where are they located is required for lazy restore.
Note that get_pagemap users presumed that zero pages are not a part of the
pagemap and these pages were just silently skipped during memory restore.
At the moment I preserve this semantics and force get_pagemap to skip zero
pages.

Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
---
 criu/include/page-pipe.h | 13 +++++++++-
 criu/include/page-xfer.h |  2 +-
 criu/include/stats.h     |  1 +
 criu/mem.c               | 22 +++++++++++------
 criu/page-pipe.c         | 15 +++++++++---
 criu/page-xfer.c         | 63 +++++++++++++++++++++++++++++++++---------------
 criu/pagemap.c           | 27 ++++++++++++++-------
 criu/stats.c             |  1 +
 images/pagemap.proto     |  1 +
 images/stats.proto       |  2 ++
 10 files changed, 106 insertions(+), 41 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h
index f11071c..ed6067c 100644
--- a/criu/include/page-pipe.h
+++ b/criu/include/page-pipe.h
@@ -97,6 +97,7 @@  struct page_pipe {
 					   have their iov-s in there */
 
 	struct page_pipe_iovs holes;	/* iovs for holes */
+	struct page_pipe_iovs zeros;	/* iovs for zero pages */
 
 	bool chunk_mode;	/* Restrict the maximum buffer size of pipes
 				   and dump memory for a few iterations */
@@ -108,7 +109,17 @@  extern struct page_pipe *create_page_pipe(unsigned int nr,
 extern void destroy_page_pipe(struct page_pipe *p);
 extern int page_pipe_add_page(struct page_pipe *p, unsigned long addr,
 			      unsigned int flags);
-extern int page_pipe_add_hole(struct page_pipe *p, unsigned long addr);
+extern int __page_pipe_add_hole(struct page_pipe_iovs *i, unsigned long addr);
+
+static inline int page_pipe_add_hole(struct page_pipe *p, unsigned long addr)
+{
+	return __page_pipe_add_hole(&p->holes, addr);
+}
+
+static inline int page_pipe_add_zero(struct page_pipe *p, unsigned long addr)
+{
+	return __page_pipe_add_hole(&p->zeros, addr);
+}
 
 extern void debug_show_page_pipe(struct page_pipe *pp);
 void page_pipe_reinit(struct page_pipe *pp);
diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
index d19671b..3ba61ed 100644
--- a/criu/include/page-xfer.h
+++ b/criu/include/page-xfer.h
@@ -16,7 +16,7 @@  struct page_xfer {
 	/* transfers pages related to previous pagemap */
 	int (*write_pages)(struct page_xfer *self, int pipe, unsigned long len);
 	/* transfers one hole -- vaddr:len entry w/o pages */
-	int (*write_hole)(struct page_xfer *self, struct iovec *iov);
+	int (*write_hole)(struct page_xfer *self, struct iovec *iov, int type);
 	void (*close)(struct page_xfer *self);
 
 	/* private data for every page-xfer engine */
diff --git a/criu/include/stats.h b/criu/include/stats.h
index e417636..c0effa7 100644
--- a/criu/include/stats.h
+++ b/criu/include/stats.h
@@ -25,6 +25,7 @@  enum {
 	CNT_PAGES_SCANNED,
 	CNT_PAGES_SKIPPED_PARENT,
 	CNT_PAGES_WRITTEN,
+	CNT_PAGES_ZERO,
 
 	DUMP_CNT_NR_STATS,
 };
diff --git a/criu/mem.c b/criu/mem.c
index d619a72..c4af568 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -107,14 +107,17 @@  static inline bool should_dump_page(VmaEntry *vmae, u64 pme)
 		return false;
 	if (vma_entry_is(vmae, VMA_AREA_AIORING))
 		return true;
-	if (pme & PME_SWAP)
-		return true;
-	if ((pme & PME_PRESENT) && ((pme & PME_PFRAME_MASK) != kdat.zero_page_pfn))
+	if (pme & (PME_PRESENT | PME_SWAP))
 		return true;
 
 	return false;
 }
 
+static inline bool page_is_zero(u64 pme)
+{
+	return (pme & PME_PFRAME_MASK) == kdat.zero_page_pfn;
+}
+
 static inline bool page_in_parent(u64 pme)
 {
 	/*
@@ -138,7 +141,7 @@  static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
 {
 	u64 *at = &map[PAGE_PFN(*off)];
 	unsigned long pfn, nr_to_scan;
-	unsigned long pages[2] = {};
+	unsigned long pages[3] = {};
 
 	nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE;
 
@@ -165,9 +168,12 @@  static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
 		if (has_parent && page_in_parent(at[pfn])) {
 			ret = page_pipe_add_hole(pp, vaddr);
 			pages[0]++;
+		} else if (page_is_zero(at[pfn])) {
+			ret = page_pipe_add_zero(pp, vaddr);
+			pages[1]++;
 		} else {
 			ret = page_pipe_add_page(pp, vaddr, ppb_flags);
-			pages[1]++;
+			pages[2]++;
 		}
 
 		if (ret) {
@@ -180,9 +186,11 @@  static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
 
 	cnt_add(CNT_PAGES_SCANNED, nr_to_scan);
 	cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]);
-	cnt_add(CNT_PAGES_WRITTEN, pages[1]);
+	cnt_add(CNT_PAGES_ZERO, pages[1]);
+	cnt_add(CNT_PAGES_WRITTEN, pages[2]);
 
-	pr_info("Pagemap generated: %lu pages %lu holes\n", pages[1], pages[0]);
+	pr_info("Pagemap generated: %lu pages %lu zeros %lu holes\n",
+		pages[2], pages[1], pages[0]);
 	return 0;
 }
 
diff --git a/criu/page-pipe.c b/criu/page-pipe.c
index d002c3a..93a6b65 100644
--- a/criu/page-pipe.c
+++ b/criu/page-pipe.c
@@ -151,6 +151,11 @@  struct page_pipe *create_page_pipe(unsigned int nr_segs,
 	pp->pages.free_iov = 0;
 	pp->pages.busy_iov = 0;
 
+	pp->zeros.nr_iovs = 0;
+	pp->zeros.free_iov = 0;
+	pp->zeros.busy_iov = 0;
+	pp->zeros.iovs = NULL;
+
 	pp->holes.nr_iovs = 0;
 	pp->holes.free_iov = 0;
 	pp->holes.iovs = NULL;
@@ -267,10 +272,8 @@  int page_pipe_add_page(struct page_pipe *pp, unsigned long addr,
 
 #define PP_HOLES_BATCH	32
 
-int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
+int __page_pipe_add_hole(struct page_pipe_iovs *hole, unsigned long addr)
 {
-	struct page_pipe_iovs *hole = &pp->holes;
-
 	if (hole->free_iov >= hole->nr_iovs) {
 		hole->iovs = xrealloc(hole->iovs,
 				(hole->nr_iovs + PP_HOLES_BATCH) * sizeof(struct iovec));
@@ -476,4 +479,10 @@  void debug_show_page_pipe(struct page_pipe *pp)
 		iov = &pp->holes.iovs[i];
 		pr_debug("\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE);
 	}
+
+	pr_debug("* %u zeros:\n", pp->zeros.free_iov);
+	for (i = 0; i < pp->zeros.free_iov; i++) {
+		iov = &pp->zeros.iovs[i];
+		pr_debug("\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE);
+	}
 }
diff --git a/criu/page-xfer.c b/criu/page-xfer.c
index 32b50f7..1164efc 100644
--- a/criu/page-xfer.c
+++ b/criu/page-xfer.c
@@ -37,6 +37,7 @@  static void psi2iovec(struct page_server_iov *ps, struct iovec *iov)
 #define PS_IOV_OPEN	3
 #define PS_IOV_OPEN2	4
 #define PS_IOV_PARENT	5
+#define PS_IOV_ZERO	6
 
 #define PS_IOV_FLUSH		0x1023
 #define PS_IOV_FLUSH_N_CLOSE	0x1024
@@ -104,9 +105,10 @@  static int write_pages_to_server(struct page_xfer *xfer,
 	return 0;
 }
 
-static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov)
+static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov,
+				int type)
 {
-	return send_iov(xfer->sk, PS_IOV_HOLE, xfer->dst_id, iov);
+	return send_iov(xfer->sk, type, xfer->dst_id, iov);
 }
 
 static void close_server_xfer(struct page_xfer *xfer)
@@ -223,24 +225,35 @@  static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov)
 	}
 }
 
-static int write_pagehole_loc(struct page_xfer *xfer, struct iovec *iov)
+static int write_hole_loc(struct page_xfer *xfer, struct iovec *iov, int type)
 {
 	PagemapEntry pe = PAGEMAP_ENTRY__INIT;
 
-	if (xfer->parent != NULL) {
-		int ret;
+	iovec2pagemap(iov, &pe);
 
-		ret = check_pagehole_in_parent(xfer->parent, iov);
-		if (ret) {
-			pr_err("Hole %p/%zu not found in parent\n",
-					iov->iov_base, iov->iov_len);
-			return -1;
+	switch (type) {
+	case PS_IOV_HOLE:
+		if (xfer->parent != NULL) {
+			int ret;
+
+			ret = check_pagehole_in_parent(xfer->parent, iov);
+			if (ret) {
+				pr_err("Hole %p/%zu not found in parent\n",
+				       iov->iov_base, iov->iov_len);
+				return -1;
+			}
 		}
-	}
 
-	iovec2pagemap(iov, &pe);
-	pe.has_in_parent = true;
-	pe.in_parent = true;
+		pe.has_in_parent = true;
+		pe.in_parent = true;
+		break;
+	case PS_IOV_ZERO:
+		pe.has_zero = true;
+		pe.zero = true;
+		break;
+	default:
+		return -1;
+	}
 
 	if (pb_write_one(xfer->pmi, &pe, PB_PAGEMAP) < 0)
 		return -1;
@@ -307,7 +320,7 @@  static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
 out:
 	xfer->write_pagemap = write_pagemap_loc;
 	xfer->write_pages = write_pages_loc;
-	xfer->write_hole = write_pagehole_loc;
+	xfer->write_hole = write_hole_loc;
 	xfer->close = close_page_xfer;
 	return 0;
 }
@@ -321,7 +334,7 @@  int open_page_xfer(struct page_xfer *xfer, int fd_type, long id)
 }
 
 static int dump_hole(struct page_xfer *xfer, struct page_pipe_iovs *iov,
-		     void *limit, unsigned long off)
+		     void *limit, unsigned long off, int type)
 {
 	while (iov->busy_iov < iov->free_iov) {
 		struct iovec *hole = &iov->iovs[iov->busy_iov];
@@ -333,7 +346,7 @@  static int dump_hole(struct page_xfer *xfer, struct page_pipe_iovs *iov,
 		hole->iov_base -= off;
 		pr_debug("\th %p [%u]\n", hole->iov_base,
 			 (unsigned int)(hole->iov_len / PAGE_SIZE));
-		if (xfer->write_hole(xfer, hole))
+		if (xfer->write_hole(xfer, hole, type))
 			return -1;
 
 		iov->busy_iov++;
@@ -342,6 +355,15 @@  static int dump_hole(struct page_xfer *xfer, struct page_pipe_iovs *iov,
 	return 0;
 }
 
+static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp,
+		      void *limit, unsigned long off)
+{
+	if (dump_hole(xfer, &pp->holes, limit, off, PS_IOV_HOLE))
+		return -1;
+
+	return dump_hole(xfer, &pp->zeros, limit, off, PS_IOV_ZERO);
+}
+
 int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp,
 		unsigned long off)
 {
@@ -357,7 +379,7 @@  int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp,
 		for (i = 0; i < ppb->nr_segs; i++) {
 			struct iovec *iov = &ppb->iov[i];
 
-			if (dump_hole(xfer, &pp->holes, iov->iov_base, off))
+			if (dump_holes(xfer, pp, iov->iov_base, off))
 				return -1;
 
 			BUG_ON(iov->iov_base < (void *)off);
@@ -372,7 +394,7 @@  int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp,
 		}
 	}
 
-	if (dump_hole(xfer, &pp->holes, NULL, off))
+	if (dump_holes(xfer, pp, NULL, off))
 		return -1;
 
 	return 0;
@@ -562,7 +584,7 @@  static int page_server_hole(int sk, struct page_server_iov *pi)
 		return -1;
 
 	psi2iovec(pi, &iov);
-	if (lxfer->write_hole(lxfer, &iov))
+	if (lxfer->write_hole(lxfer, &iov, pi->cmd))
 		return -1;
 
 	return 0;
@@ -618,6 +640,7 @@  static int page_server_serve(int sk)
 			ret = page_server_add(sk, &pi);
 			break;
 		case PS_IOV_HOLE:
+		case PS_IOV_ZERO:
 			ret = page_server_hole(sk, &pi);
 			break;
 		case PS_IOV_FLUSH:
diff --git a/criu/pagemap.c b/criu/pagemap.c
index fded268..2416259 100644
--- a/criu/pagemap.c
+++ b/criu/pagemap.c
@@ -121,14 +121,25 @@  int dedup_one_iovec(struct page_read *pr, struct iovec *iov)
 	return 0;
 }
 
+static void put_pagemap(struct page_read *pr)
+{
+	pr->curr_pme++;
+}
+
 static int get_pagemap(struct page_read *pr, struct iovec *iov)
 {
 	PagemapEntry *pe;
 
-	if (pr->curr_pme >= pr->nr_pmes)
-		return 0;
+	for (;;) {
+		if (pr->curr_pme >= pr->nr_pmes)
+			return 0;
+
+		pe = pr->pmes[pr->curr_pme];
 
-	pe = pr->pmes[pr->curr_pme];
+		if (!pe->zero)
+			break;
+		put_pagemap(pr);
+	}
 
 	pagemap2iovec(pe, iov);
 
@@ -143,18 +154,13 @@  static int get_pagemap(struct page_read *pr, struct iovec *iov)
 	return 1;
 }
 
-static void put_pagemap(struct page_read *pr)
-{
-	pr->curr_pme++;
-}
-
 static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
 {
 	if (!len)
 		return;
 
 	pr_debug("\tpr%u Skip %lu bytes from page-dump\n", pr->id, len);
-	if (!pr->pe->in_parent)
+	if (!pr->pe->in_parent && !pr->pe->zero)
 		pr->pi_off += len;
 	pr->cvaddr += len;
 }
@@ -256,6 +262,9 @@  static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr,
 			vaddr += p_nr * PAGE_SIZE;
 			buf += p_nr * PAGE_SIZE;
 		} while (nr);
+	} else if (pr->pe->zero) {
+		/* zero mappings should be skipped by get_pagemap */
+		BUG();
 	} else {
 		int fd = img_raw_fd(pr->pi);
 		off_t current_vaddr = lseek(fd, pr->pi_off, SEEK_SET);
diff --git a/criu/stats.c b/criu/stats.c
index 12b7d05..c01e010 100644
--- a/criu/stats.c
+++ b/criu/stats.c
@@ -122,6 +122,7 @@  void write_stats(int what)
 		ds_entry.pages_scanned = dstats->counts[CNT_PAGES_SCANNED];
 		ds_entry.pages_skipped_parent = dstats->counts[CNT_PAGES_SKIPPED_PARENT];
 		ds_entry.pages_written = dstats->counts[CNT_PAGES_WRITTEN];
+		ds_entry.pages_zero = dstats->counts[CNT_PAGES_ZERO];
 
 		name = "dump";
 	} else if (what == RESTORE_STATS) {
diff --git a/images/pagemap.proto b/images/pagemap.proto
index b492488..f80afeb 100644
--- a/images/pagemap.proto
+++ b/images/pagemap.proto
@@ -8,4 +8,5 @@  message pagemap_entry {
 	required uint64 vaddr		= 1 [(criu).hex = true];
 	required uint32 nr_pages	= 2;
 	optional bool	in_parent	= 3;
+	optional bool	zero		= 4;
 }
diff --git a/images/stats.proto b/images/stats.proto
index 16acf34..307bb29 100644
--- a/images/stats.proto
+++ b/images/stats.proto
@@ -10,6 +10,8 @@  message dump_stats_entry {
 	required uint64			pages_written		= 7;
 
 	optional uint32			irmap_resolve		= 8;
+
+	required uint64			pages_zero		= 9;
 }
 
 message restore_stats_entry {

Comments

Pavel Emelianov July 6, 2016, 8:10 a.m.
> @@ -165,9 +168,12 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
>  		if (has_parent && page_in_parent(at[pfn])) {
>  			ret = page_pipe_add_hole(pp, vaddr);
>  			pages[0]++;
> +		} else if (page_is_zero(at[pfn])) {
> +			ret = page_pipe_add_zero(pp, vaddr);
> +			pages[1]++;

This means, that zero holes that are in parent will be written as ... has_parent :\
Hm... Is this correct? Presumably yes, but ...

>  		} else {
>  			ret = page_pipe_add_page(pp, vaddr, ppb_flags);
> -			pages[1]++;
> +			pages[2]++;
>  		}
>  
>  		if (ret) {

> @@ -342,6 +355,15 @@ static int dump_hole(struct page_xfer *xfer, struct page_pipe_iovs *iov,
>  	return 0;
>  }
>  
> +static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp,
> +		      void *limit, unsigned long off)
> +{
> +	if (dump_hole(xfer, &pp->holes, limit, off, PS_IOV_HOLE))
> +		return -1;
> +
> +	return dump_hole(xfer, &pp->zeros, limit, off, PS_IOV_ZERO);

This generates unsorted pagemap.img. Restore part currently assumes that it's sorted.

> +}
> +
>  int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp,
>  		unsigned long off)
>  {
Mike Rapoport July 6, 2016, 1:27 p.m.
On Wed, Jul 6, 2016 at 11:10 AM, Pavel Emelyanov <xemul@virtuozzo.com> wrote:
>
>> @@ -165,9 +168,12 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
>>               if (has_parent && page_in_parent(at[pfn])) {
>>                       ret = page_pipe_add_hole(pp, vaddr);
>>                       pages[0]++;
>> +             } else if (page_is_zero(at[pfn])) {
>> +                     ret = page_pipe_add_zero(pp, vaddr);
>> +                     pages[1]++;
>
> This means, that zero holes that are in parent will be written as ... has_parent :\
> Hm... Is this correct? Presumably yes, but ...

Well, we can change the order of if'else here, and put the zero hole
in the image no matter if they are in parent or not.
I don't know what is "correct" here. Maybe having zero pages in each
image is slightly faster.

>>               } else {
>>                       ret = page_pipe_add_page(pp, vaddr, ppb_flags);
>> -                     pages[1]++;
>> +                     pages[2]++;
>>               }
>>
>>               if (ret) {
>
>> @@ -342,6 +355,15 @@ static int dump_hole(struct page_xfer *xfer, struct page_pipe_iovs *iov,
>>       return 0;
>>  }
>>
>> +static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp,
>> +                   void *limit, unsigned long off)
>> +{
>> +     if (dump_hole(xfer, &pp->holes, limit, off, PS_IOV_HOLE))
>> +             return -1;
>> +
>> +     return dump_hole(xfer, &pp->zeros, limit, off, PS_IOV_ZERO);
>
> This generates unsorted pagemap.img. Restore part currently assumes that it's sorted.

Why do you think so? Obviously, it was no the intention :)
The dump_hole anyway only dumps holes that are below limit, which is
in turn the beginning of the next IOV with pages...

>> +}
>> +
>>  int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp,
>>               unsigned long off)
>>  {
>
Pavel Emelianov July 7, 2016, 10:45 a.m.
>>> @@ -342,6 +355,15 @@ static int dump_hole(struct page_xfer *xfer, struct page_pipe_iovs *iov,
>>>       return 0;
>>>  }
>>>
>>> +static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp,
>>> +                   void *limit, unsigned long off)
>>> +{
>>> +     if (dump_hole(xfer, &pp->holes, limit, off, PS_IOV_HOLE))
>>> +             return -1;
>>> +
>>> +     return dump_hole(xfer, &pp->zeros, limit, off, PS_IOV_ZERO);
>>
>> This generates unsorted pagemap.img. Restore part currently assumes that it's sorted.
> 
> Why do you think so? Obviously, it was no the intention :)

Because you first write all holes below the limit, them zeroes below the
limit, but if some zeroes below the limit go before holes (below the limit),
zeroes will anyway hit the image after holes.

> The dump_hole anyway only dumps holes that are below limit, which is
> in turn the beginning of the next IOV with pages...
> 
>>> +}
>>> +
>>>  int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp,
>>>               unsigned long off)
>>>  {
>>
> 
> 
>