[07/11] vma: Move cow decision earlier

Submitted by Pavel Emelianov on May 5, 2017, 4:03 p.m.

Details

Message ID 2877eb6c-492d-44dc-ce2c-ee269b253ea3@virtuozzo.com
State New
Series "Do not remap vmas when not needed"
Headers show

Commit Message

Pavel Emelianov May 5, 2017, 4:03 p.m.
Collect VMAs into COW-groups. This is done by checking each pstree_item's
VMA list in parallel with the parent one and finding VMAs that have
chances to get COW pages. The vma->parent pointer is used to tie such
areas together.

As a nice side effect -- tasks with different exe files are no longer
even considered for COW-ing after this patch.

Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
---
 criu/cr-restore.c  |   2 +
 criu/include/mem.h |   1 +
 criu/include/vma.h |  11 ++++
 criu/mem.c         | 162 ++++++++++++++++++++++++++++++++++-------------------
 4 files changed, 117 insertions(+), 59 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index f3f5f67..bbe0dad 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -367,6 +367,8 @@  static int root_prepare_shared(void)
 	if (ret < 0)
 		goto err;
 
+	prepare_cow_vmas();
+
 	ret = prepare_restorer_blob();
 	if (ret)
 		goto err;
diff --git a/criu/include/mem.h b/criu/include/mem.h
index 220a301..6791bfd 100644
--- a/criu/include/mem.h
+++ b/criu/include/mem.h
@@ -18,6 +18,7 @@  struct mem_dump_ctl {
 extern bool page_is_zero(u64 pme);
 extern bool page_in_parent(bool dirty);
 extern int prepare_mm_pid(struct pstree_item *i);
+extern void prepare_cow_vmas(void);
 extern int do_task_reset_dirty_track(int pid);
 extern unsigned long dump_pages_args_size(struct vm_area_list *vmas);
 extern int parasite_dump_pages_seized(struct pstree_item *item,
diff --git a/criu/include/vma.h b/criu/include/vma.h
index 12f03fb..dcce080 100644
--- a/criu/include/vma.h
+++ b/criu/include/vma.h
@@ -57,6 +57,17 @@  struct vma_area {
 			struct vma_area	*pvma;		/* parent for inherited VMAs */
 			unsigned long	*page_bitmap;	/* existent pages */
 			unsigned long	premmaped_addr;	/* restore only */
+
+			/*
+			 * Some notes about pvma, page_bitmap and premmaped_addr bits
+			 * above.
+			 *
+			 * The pvma is set in prepare_cow_vmas() when we resolve which
+			 * VMAs _may_ inherit pages from each other. The other two are
+			 * set in prepare_mappings() when the respective VMAs get
+			 * mmap-ed or mremap-ed; these are then inherited during the
+			 * fork_with_pid()-s called from create_children_and_session().
+			 */
 		};
 	};
 };
diff --git a/criu/mem.c b/criu/mem.c
index 1b805cd..a9e6e08 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -552,15 +552,99 @@  int prepare_mm_pid(struct pstree_item *i)
 	return ret;
 }
 
+static inline bool check_cow_vmas(struct vma_area *vma, struct vma_area *pvma)
+{
+	/*
+	 * VMAs that _may_[1] have COW-ed pages should ...
+	 *
+	 * [1] I say "may" because whether or not particular pages are
+	 * COW-ed is determined later in restore_priv_vma_content() by
+	 * memcmp-ing the contents.
+	 */
+
+	/* ... coincide by start/stop pair (start is checked by caller) */
+	if (vma->e->end != pvma->e->end)
+		return false;
+	/* ... both be private (and thus have space in premmaped area) */
+	if (!vma_area_is_private(vma, kdat.task_size))
+		return false;
+	if (!vma_area_is_private(pvma, kdat.task_size))
+		return false;
+	/* ... have growsdown and anon flags coincide */
+	if ((vma->e->flags ^ pvma->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS))
+		return false;
+	/* ... belong to the same file if being filemap */
+	if (!(vma->e->flags & MAP_ANONYMOUS) && vma->e->shmid != pvma->e->shmid)
+		return false;
+
+	pr_debug("Found two COW VMAs @%#016"PRIx64"-%#016"PRIx64"\n", vma->e->start, pvma->e->end);
+	return true;
+}
+
+static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list *pvmas)
+{
+	struct vma_area *vma, *pvma;
+
+	vma = list_first_entry(&vmas->h, struct vma_area, list);
+	pvma = list_first_entry(&pvmas->h, struct vma_area, list);
+
+	while (1) {
+		if ((vma->e->start == pvma->e->start) && check_cow_vmas(vma, pvma))
+			vma->pvma = pvma;
+
+		/* <= here so we step past an already-compared matching pair ... */
+		while (vma->e->start <= pvma->e->start) {
+			vma = list_entry(vma->list.next, struct vma_area, list);
+			if (&vma->list == &vmas->h)
+				return;
+		}
+
+		/* ... but strict < here so we stop ON a potentially matching pair */
+		while (pvma->e->start < vma->e->start) {
+			pvma = list_entry(pvma->list.next, struct vma_area, list);
+			if (&pvma->list == &pvmas->h)
+				return;
+		}
+	}
+}
+
+void prepare_cow_vmas(void)
+{
+	struct pstree_item *pi;
+
+	for_each_pstree_item(pi) {
+		struct pstree_item *ppi;
+		struct vm_area_list *vmas, *pvmas;
+
+		ppi = pi->parent;
+		if (!ppi)
+			continue;
+
+		vmas = &rsti(pi)->vmas;
+		if (vmas->nr == 0) /* Zombies have no mappings to inherit */
+			continue;
+
+		if (rsti(pi)->mm->exe_file_id != rsti(ppi)->mm->exe_file_id)
+			/*
+			 * Tasks running different executables have
+			 * close to zero chance of sharing COW-ed areas;
+			 * in practice the kernel never creates such.
+			 */
+			continue;
+
+		pvmas = &rsti(ppi)->vmas;
+		BUG_ON(pvmas->nr == 0); /* Zombies cannot have kids */
+
+		prepare_cow_vmas_for(vmas, pvmas);
+	}
+}
+
 /* Map a private vma, if it is not mapped by a parent yet */
-static int premap_private_vma(struct pstree_item *t,
-		struct vma_area *vma, void **tgt_addr,
-		struct vma_area **pvma, struct list_head *pvma_list)
+static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void **tgt_addr)
 {
 	int ret;
-	void *addr, *paddr = NULL;
+	void *addr;
 	unsigned long nr_pages, size;
-	struct vma_area *p = *pvma;
 
 	if (vma_area_is(vma, VMA_FILE_PRIVATE)) {
 		ret = vma->vm_open(vpid(t), vma);
@@ -577,51 +661,20 @@  static int premap_private_vma(struct pstree_item *t,
 	if (vma->page_bitmap == NULL)
 		return -1;
 
-	list_for_each_entry_from(p, pvma_list, list) {
-		if (p->e->start > vma->e->start)
-			 break;
-
-		if (!vma_area_is(p, VMA_PREMMAPED))
-			continue;
-
-		 if (p->e->end != vma->e->end ||
-		     p->e->start != vma->e->start)
-			continue;
-
-		/* Check flags, which must be identical for both vma-s */
-		if ((vma->e->flags ^ p->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS))
-			break;
-
-		if (!(vma->e->flags & MAP_ANONYMOUS) &&
-		    vma->e->shmid != p->e->shmid)
-			break;
-
-		pr_info("COW %#016"PRIx64"-%#016"PRIx64" %#016"PRIx64" vma\n",
-			vma->e->start, vma->e->end, vma->e->pgoff);
-		paddr = decode_pointer(p->premmaped_addr);
-
-		break;
-	}
-
 	/*
 	 * A grow-down VMA has a guard page, which protect a VMA below it.
 	 * So one more page is mapped here to restore content of the first page
 	 */
-	if (vma->e->flags & MAP_GROWSDOWN) {
+	if (vma->e->flags & MAP_GROWSDOWN)
 		vma->e->start -= PAGE_SIZE;
-		if (paddr)
-			paddr -= PAGE_SIZE;
-	}
 
 	size = vma_entry_len(vma->e);
-	if (paddr == NULL) {
+	if (vma->pvma == NULL) {
 		int flag = 0;
 		/*
 		 * The respective memory area was NOT found in the parent.
 		 * Map a new one.
 		 */
-		pr_info("Map %#016"PRIx64"-%#016"PRIx64" %#016"PRIx64" vma\n",
-			vma->e->start, vma->e->end, vma->e->pgoff);
 
 		/*
 		 * Restore AIO ring buffer content to temporary anonymous area.
@@ -639,14 +692,19 @@  static int premap_private_vma(struct pstree_item *t,
 			pr_perror("Unable to map ANON_VMA");
 			return -1;
 		}
-
-		*pvma = p;
 	} else {
+		void *paddr;
+
 		/*
-		 * This region was found in parent -- remap it to inherit physical
-		 * pages (if any) from it (and COW them later if required).
+		 * The area in question can be COWed with the parent. Remap the
+		 * parent area. Note, that it has already being passed through 
+		 * the restore_priv_vma_content() call and thus may have some 
+		 * pages in it.
 		 */
-		vma->pvma = p;
+
+		paddr = decode_pointer(vma->pvma->premmaped_addr);
+		if (vma->e->flags & MAP_GROWSDOWN)
+			paddr -= PAGE_SIZE;
 
 		addr = mremap(paddr, size, size,
 				MREMAP_FIXED | MREMAP_MAYMOVE, *tgt_addr);
@@ -654,8 +712,6 @@  static int premap_private_vma(struct pstree_item *t,
 			pr_perror("Unable to remap a private vma");
 			return -1;
 		}
-
-		*pvma = list_entry(p->list.next, struct vma_area, list);
 	}
 
 	vma->e->status |= VMA_PREMMAPED;
@@ -678,23 +734,11 @@  static int premap_private_vma(struct pstree_item *t,
 static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
 		void *at, struct page_read *pr)
 {
-	struct list_head *parent_vmas;
-	struct vma_area *pvma, *vma;
+	struct vma_area *vma;
 	unsigned long pstart = 0;
 	int ret = 0;
 	LIST_HEAD(empty);
 
-	/*
-	 * Keep parent vmas at hands to check whether we can "inherit" them.
-	 * See comments in premap_private_vma.
-	 */
-	if (t->parent)
-		parent_vmas = &rsti(t->parent)->vmas.h;
-	else
-		parent_vmas = &empty;
-
-	pvma = list_first_entry(parent_vmas, struct vma_area, list);
-
 	list_for_each_entry(vma, &vmas->h, list) {
 		if (pstart > vma->e->start) {
 			ret = -1;
@@ -706,7 +750,7 @@  static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
 		if (!vma_area_is_private(vma, kdat.task_size))
 			continue;
 
-		ret = premap_private_vma(t, vma, &at, &pvma, parent_vmas);
+		ret = premap_private_vma(t, vma, &at);
 		if (ret < 0)
 			break;
 	}

Comments

Mike Rapoport May 7, 2017, 10:49 a.m.
On Fri, May 05, 2017 at 07:03:16PM +0300, Pavel Emelyanov wrote:
> Collect VMAs into COW-groups. This is done by checking each pstree_item's
> VMA list in parallel with the parent one and finding VMAs that have
> chances to get COW pages. The vma->parent pointer is used to tie such
> areas together.
> 
> As a nice side effect -- tasks with different exe files are not even
> tried to cow-ed after this patch.
> 
> Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
> ---
>  criu/cr-restore.c  |   2 +
>  criu/include/mem.h |   1 +
>  criu/include/vma.h |  11 ++++
>  criu/mem.c         | 162 ++++++++++++++++++++++++++++++++++-------------------
>  4 files changed, 117 insertions(+), 59 deletions(-)
> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index f3f5f67..bbe0dad 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -367,6 +367,8 @@ static int root_prepare_shared(void)
>  	if (ret < 0)
>  		goto err;
> 
> +	prepare_cow_vmas();
> +
>  	ret = prepare_restorer_blob();
>  	if (ret)
>  		goto err;
> diff --git a/criu/include/mem.h b/criu/include/mem.h
> index 220a301..6791bfd 100644
> --- a/criu/include/mem.h
> +++ b/criu/include/mem.h
> @@ -18,6 +18,7 @@ struct mem_dump_ctl {
>  extern bool page_is_zero(u64 pme);
>  extern bool page_in_parent(bool dirty);
>  extern int prepare_mm_pid(struct pstree_item *i);
> +extern void prepare_cow_vmas(void);
>  extern int do_task_reset_dirty_track(int pid);
>  extern unsigned long dump_pages_args_size(struct vm_area_list *vmas);
>  extern int parasite_dump_pages_seized(struct pstree_item *item,
> diff --git a/criu/include/vma.h b/criu/include/vma.h
> index 12f03fb..dcce080 100644
> --- a/criu/include/vma.h
> +++ b/criu/include/vma.h
> @@ -57,6 +57,17 @@ struct vma_area {
>  			struct vma_area	*pvma;		/* parent for inherited VMAs */
>  			unsigned long	*page_bitmap;	/* existent pages */
>  			unsigned long	premmaped_addr;	/* restore only */
> +
> +			/*
> +			 * Some notes about pvma, page_bitmap and premmaped_addr bits
> +			 * above.
> +			 *
> +			 * The pvma is set on prepare_cow_vmas() when resolving which
> +			 * VMAs _may_ inherit pages from which. The other two are set
> +			 * in prepare_mappings() when the respective VMAs get mmap-ed
> +			 * or mremap-ed, and are then inherited on fork_with_pid()-s
> +			 * called from create_children_and_session().
> +			 */

* The pvma is set in prepare_cow_vmas() when we resolve which
* VMAs _may_ inherit pages from each other.
* The page_bitmap and premmaped_addr are set in prepare_mappings()
* when the respective VMAs get mmap-ed or mremap-ed.
* These VMAs are then inherited during fork_with_pid()-s
* called from create_children_and_session().

>  		};
>  	};
>  };
> diff --git a/criu/mem.c b/criu/mem.c
> index 1b805cd..a9e6e08 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -552,15 +552,99 @@ int prepare_mm_pid(struct pstree_item *i)
>  	return ret;
>  }
> 
> +static inline bool check_cow_vmas(struct vma_area *vma, struct vma_area *pvma)
> +{
> +	/*
> +	 * VMAs that _may_[1] have COW-ed pages should ...
> +	 *
> +	 * [1] I say "may" because whether or not particular pages are
> +	 * COW-ed is determined later in restore_priv_vma_content() by
> +	 * memcpy-ing the contents.

memcmp'aring?

> +	 */
> +
> +	/* ... coinside by start/stop pair (start is checked by caller) */
> +	if (vma->e->end != pvma->e->end)
> +		return false;
> +	/* ... both be private (and thus have space in premmaped area) */
> +	if (!vma_area_is_private(vma, kdat.task_size))
> +		return false;
> +	if (!vma_area_is_private(vma, kdat.task_size))
> +		return false;

The check for the same VMA is repeated here, should be pvma in one of the
case above, IMHO.

> +	/* ... have growsdown and anon flags coinside */
> +	if ((vma->e->flags ^ pvma->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS))
> +		return false;
> +	/* ... belong to the same file if being filemap */
> +	if (!(vma->e->flags & MAP_ANONYMOUS) && vma->e->shmid != pvma->e->shmid)
> +		return false;
> +
> +	pr_debug("Found two COW VMAs @%#lx-%#lx\n", vma->e->start, pvma->e->end);
> +	return true;
> +}
> +
> +static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list *pvmas)
> +{
> +	struct vma_area *vma, *pvma;
> +
> +	vma = list_first_entry(&vmas->h, struct vma_area, list);
> +	pvma = list_first_entry(&pvmas->h, struct vma_area, list);
> +
> +	while (1) {
> +		if ((vma->e->start == pvma->e->start) && check_cow_vmas(vma, pvma))
> +			vma->pvma = pvma;
> +
> +		/* <= here to shift from matching VMAs and ... */
> +		while (vma->e->start <= pvma->e->start) {
> +			vma = list_entry(vma->list.next, struct vma_area, list);
> +			if (&vma->list == &vmas->h)
> +				return;
> +		}
> +
> +		/* ... no == here since we must stop on matching pair */
> +		while (pvma->e->start < vma->e->start) {
> +			pvma = list_entry(pvma->list.next, struct vma_area, list);

It's worth having vma_next already by this point ;-)

> +			if (&pvma->list == &pvmas->h)
> +				return;
> +		}
> +	}
> +}
> +
> +void prepare_cow_vmas(void)
> +{
> +	struct pstree_item *pi;
> +
> +	for_each_pstree_item(pi) {
> +		struct pstree_item *ppi;
> +		struct vm_area_list *vmas, *pvmas;
> +
> +		ppi = pi->parent;
> +		if (!ppi)
> +			continue;
> +
> +		vmas = &rsti(pi)->vmas;
> +		if (vmas->nr == 0) /* Zombie */
> +			continue;
> +
> +		if (rsti(pi)->mm->exe_file_id != rsti(ppi)->mm->exe_file_id)
> +			/*
> +			 * Tasks running different executables have
> +			 * close to zero chance of having cow-ed areas
> +			 * and actually kernel never creates such.
> +			 */
> +			continue;
> +
> +		pvmas = &rsti(ppi)->vmas;
> +		BUG_ON(pvmas->nr == 0); /* zombies cannot have kids */
> +
> +		prepare_cow_vmas_for(vmas, pvmas);
> +	}
> +}
> +
>  /* Map a private vma, if it is not mapped by a parent yet */
> -static int premap_private_vma(struct pstree_item *t,
> -		struct vma_area *vma, void **tgt_addr,
> -		struct vma_area **pvma, struct list_head *pvma_list)
> +static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void **tgt_addr)
>  {
>  	int ret;
> -	void *addr, *paddr = NULL;
> +	void *addr;
>  	unsigned long nr_pages, size;
> -	struct vma_area *p = *pvma;
> 
>  	if (vma_area_is(vma, VMA_FILE_PRIVATE)) {
>  		ret = vma->vm_open(vpid(t), vma);
> @@ -577,51 +661,20 @@ static int premap_private_vma(struct pstree_item *t,
>  	if (vma->page_bitmap == NULL)
>  		return -1;
> 
> -	list_for_each_entry_from(p, pvma_list, list) {
> -		if (p->e->start > vma->e->start)
> -			 break;
> -
> -		if (!vma_area_is(p, VMA_PREMMAPED))
> -			continue;
> -
> -		 if (p->e->end != vma->e->end ||
> -		     p->e->start != vma->e->start)
> -			continue;
> -
> -		/* Check flags, which must be identical for both vma-s */
> -		if ((vma->e->flags ^ p->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS))
> -			break;
> -
> -		if (!(vma->e->flags & MAP_ANONYMOUS) &&
> -		    vma->e->shmid != p->e->shmid)
> -			break;
> -
> -		pr_info("COW %#016"PRIx64"-%#016"PRIx64" %#016"PRIx64" vma\n",
> -			vma->e->start, vma->e->end, vma->e->pgoff);
> -		paddr = decode_pointer(p->premmaped_addr);
> -
> -		break;
> -	}
> -
>  	/*
>  	 * A grow-down VMA has a guard page, which protect a VMA below it.
>  	 * So one more page is mapped here to restore content of the first page
>  	 */
> -	if (vma->e->flags & MAP_GROWSDOWN) {
> +	if (vma->e->flags & MAP_GROWSDOWN)
>  		vma->e->start -= PAGE_SIZE;
> -		if (paddr)
> -			paddr -= PAGE_SIZE;
> -	}
> 
>  	size = vma_entry_len(vma->e);
> -	if (paddr == NULL) {
> +	if (vma->pvma == NULL) {
>  		int flag = 0;
>  		/*
>  		 * The respective memory area was NOT found in the parent.
>  		 * Map a new one.
>  		 */
> -		pr_info("Map %#016"PRIx64"-%#016"PRIx64" %#016"PRIx64" vma\n",
> -			vma->e->start, vma->e->end, vma->e->pgoff);
> 
>  		/*
>  		 * Restore AIO ring buffer content to temporary anonymous area.
> @@ -639,14 +692,19 @@ static int premap_private_vma(struct pstree_item *t,
>  			pr_perror("Unable to map ANON_VMA");
>  			return -1;
>  		}
> -
> -		*pvma = p;
>  	} else {
> +		void *paddr;
> +
>  		/*
> -		 * This region was found in parent -- remap it to inherit physical
> -		 * pages (if any) from it (and COW them later if required).
> +		 * The area in question can be COWed with the parent. Remap the
> +		 * parent area. Note, that it has already being passed through 
> +		 * the restore_priv_vma_content() call and thus may have some 
> +		 * pages in it.
>  		 */
> -		vma->pvma = p;
> +
> +		paddr = decode_pointer(vma->pvma->premmaped_addr);
> +		if (vma->e->flags & MAP_GROWSDOWN)
> +			paddr -= PAGE_SIZE;
> 
>  		addr = mremap(paddr, size, size,
>  				MREMAP_FIXED | MREMAP_MAYMOVE, *tgt_addr);
> @@ -654,8 +712,6 @@ static int premap_private_vma(struct pstree_item *t,
>  			pr_perror("Unable to remap a private vma");
>  			return -1;
>  		}
> -
> -		*pvma = list_entry(p->list.next, struct vma_area, list);
>  	}
> 
>  	vma->e->status |= VMA_PREMMAPED;
> @@ -678,23 +734,11 @@ static int premap_private_vma(struct pstree_item *t,
>  static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
>  		void *at, struct page_read *pr)
>  {
> -	struct list_head *parent_vmas;
> -	struct vma_area *pvma, *vma;
> +	struct vma_area *vma;
>  	unsigned long pstart = 0;
>  	int ret = 0;
>  	LIST_HEAD(empty);
> 
> -	/*
> -	 * Keep parent vmas at hands to check whether we can "inherit" them.
> -	 * See comments in premap_private_vma.
> -	 */
> -	if (t->parent)
> -		parent_vmas = &rsti(t->parent)->vmas.h;
> -	else
> -		parent_vmas = &empty;
> -
> -	pvma = list_first_entry(parent_vmas, struct vma_area, list);
> -
>  	list_for_each_entry(vma, &vmas->h, list) {
>  		if (pstart > vma->e->start) {
>  			ret = -1;
> @@ -706,7 +750,7 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
>  		if (!vma_area_is_private(vma, kdat.task_size))
>  			continue;
> 
> -		ret = premap_private_vma(t, vma, &at, &pvma, parent_vmas);
> +		ret = premap_private_vma(t, vma, &at);
>  		if (ret < 0)
>  			break;
>  	}
> -- 
> 2.5.5
> 
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
>
Pavel Emelianov May 10, 2017, 8:49 a.m.
On 05/07/2017 01:49 PM, Mike Rapoport wrote:
> On Fri, May 05, 2017 at 07:03:16PM +0300, Pavel Emelyanov wrote:
>> Collect VMAs into COW-groups. This is done by checking each pstree_item's
>> VMA list in parallel with the parent one and finding VMAs that have
>> chances to get COW pages. The vma->parent pointer is used to tie such
>> areas together.
>>
>> As a nice side effect -- tasks with different exe files are not even
>> tried to cow-ed after this patch.
>>
>> Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
>> ---
>>  criu/cr-restore.c  |   2 +
>>  criu/include/mem.h |   1 +
>>  criu/include/vma.h |  11 ++++
>>  criu/mem.c         | 162 ++++++++++++++++++++++++++++++++++-------------------
>>  4 files changed, 117 insertions(+), 59 deletions(-)
>>
>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>> index f3f5f67..bbe0dad 100644
>> --- a/criu/cr-restore.c
>> +++ b/criu/cr-restore.c
>> @@ -367,6 +367,8 @@ static int root_prepare_shared(void)
>>  	if (ret < 0)
>>  		goto err;
>>
>> +	prepare_cow_vmas();
>> +
>>  	ret = prepare_restorer_blob();
>>  	if (ret)
>>  		goto err;
>> diff --git a/criu/include/mem.h b/criu/include/mem.h
>> index 220a301..6791bfd 100644
>> --- a/criu/include/mem.h
>> +++ b/criu/include/mem.h
>> @@ -18,6 +18,7 @@ struct mem_dump_ctl {
>>  extern bool page_is_zero(u64 pme);
>>  extern bool page_in_parent(bool dirty);
>>  extern int prepare_mm_pid(struct pstree_item *i);
>> +extern void prepare_cow_vmas(void);
>>  extern int do_task_reset_dirty_track(int pid);
>>  extern unsigned long dump_pages_args_size(struct vm_area_list *vmas);
>>  extern int parasite_dump_pages_seized(struct pstree_item *item,
>> diff --git a/criu/include/vma.h b/criu/include/vma.h
>> index 12f03fb..dcce080 100644
>> --- a/criu/include/vma.h
>> +++ b/criu/include/vma.h
>> @@ -57,6 +57,17 @@ struct vma_area {
>>  			struct vma_area	*pvma;		/* parent for inherited VMAs */
>>  			unsigned long	*page_bitmap;	/* existent pages */
>>  			unsigned long	premmaped_addr;	/* restore only */
>> +
>> +			/*
>> +			 * Some notes about pvma, page_bitmap and premmaped_addr bits
>> +			 * above.
>> +			 *
>> +			 * The pvma is set on prepare_cow_vmas() when resolving which
>> +			 * VMAs _may_ inherit pages from which. The other two are set
>> +			 * in prepare_mappings() when the respective VMAs get mmap-ed
>> +			 * or mremap-ed, and are then inherited on fork_with_pid()-s
>> +			 * called from create_children_and_session().
>> +			 */
> 
> * The pvma is set in prepare_cow_vmas() when we resolve which
> * VMAs _may_ inherit pages from each other.
> * The page_bitmap and premmaped_addr are set in prepare_mappings()
> * when the respective VMAs get mmap-ed or mremap-ed.
> * These VMAs are then inherited during fork_with_pid()-s
> * called from create_children_and_session().

Thanks!

>>  		};
>>  	};
>>  };
>> diff --git a/criu/mem.c b/criu/mem.c
>> index 1b805cd..a9e6e08 100644
>> --- a/criu/mem.c
>> +++ b/criu/mem.c
>> @@ -552,15 +552,99 @@ int prepare_mm_pid(struct pstree_item *i)
>>  	return ret;
>>  }
>>
>> +static inline bool check_cow_vmas(struct vma_area *vma, struct vma_area *pvma)
>> +{
>> +	/*
>> +	 * VMAs that _may_[1] have COW-ed pages should ...
>> +	 *
>> +	 * [1] I say "may" because whether or not particular pages are
>> +	 * COW-ed is determined later in restore_priv_vma_content() by
>> +	 * memcpy-ing the contents.
> 
> memcmp'aring?

Indeed :\

>> +	 */
>> +
>> +	/* ... coinside by start/stop pair (start is checked by caller) */
>> +	if (vma->e->end != pvma->e->end)
>> +		return false;
>> +	/* ... both be private (and thus have space in premmaped area) */
>> +	if (!vma_area_is_private(vma, kdat.task_size))
>> +		return false;
>> +	if (!vma_area_is_private(vma, kdat.task_size))
>> +		return false;
> 
> The check for the same VMA is repeated here, should be pvma in one of the
> case above, IMHO.

Yup, nice catch.

>> +	/* ... have growsdown and anon flags coinside */
>> +	if ((vma->e->flags ^ pvma->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS))
>> +		return false;
>> +	/* ... belong to the same file if being filemap */
>> +	if (!(vma->e->flags & MAP_ANONYMOUS) && vma->e->shmid != pvma->e->shmid)
>> +		return false;
>> +
>> +	pr_debug("Found two COW VMAs @%#lx-%#lx\n", vma->e->start, pvma->e->end);
>> +	return true;
>> +}
>> +
>> +static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list *pvmas)
>> +{
>> +	struct vma_area *vma, *pvma;
>> +
>> +	vma = list_first_entry(&vmas->h, struct vma_area, list);
>> +	pvma = list_first_entry(&pvmas->h, struct vma_area, list);
>> +
>> +	while (1) {
>> +		if ((vma->e->start == pvma->e->start) && check_cow_vmas(vma, pvma))
>> +			vma->pvma = pvma;
>> +
>> +		/* <= here to shift from matching VMAs and ... */
>> +		while (vma->e->start <= pvma->e->start) {
>> +			vma = list_entry(vma->list.next, struct vma_area, list);
>> +			if (&vma->list == &vmas->h)
>> +				return;
>> +		}
>> +
>> +		/* ... no == here since we must stop on matching pair */
>> +		while (pvma->e->start < vma->e->start) {
>> +			pvma = list_entry(pvma->list.next, struct vma_area, list);
> 
> It's worth having vma_next already by this point ;-)

OK, will try to resort the patches.

>> +			if (&pvma->list == &pvmas->h)
>> +				return;
>> +		}
>> +	}
>> +}
>> +

-- Pavel