vmsplice user-buffer(grabbed by process_vm_readv) to page-pipe

Submitted by Abhishek Dubey on July 5, 2019, 12:04 p.m.

Details

Message ID 1562328266-25661-1-git-send-email-dubeyabhishek777@gmail.com
State New
Series "vmsplice user-buffer(grabbed by process_vm_readv) to page-pipe"
Headers show

Commit Message

Abhishek Dubey July 5, 2019, 12:04 p.m.
This patch implements usage of process_vm_readv
syscall to collect memory pages from target process during
pre-dump. process_vm_readv collects pages in user-buffer,
which is later vmspliced to page-pipes.

Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
---
 criu/cr-dump.c           |  3 +-
 criu/include/page-xfer.h |  2 +-
 criu/mem.c               | 88 ++++++++++++++++++------------------------------
 criu/page-pipe.c         |  4 +--
 criu/page-xfer.c         | 62 ++++++++++++++++++++++++++++++++--
 criu/shmem.c             | 19 +++--------
 6 files changed, 102 insertions(+), 76 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 7f2e5ed..ee5f4f3 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -1501,6 +1501,7 @@  static int cr_pre_dump_finish(int status)
 		struct parasite_ctl *ctl = dmpi(item)->parasite_ctl;
 		struct page_pipe *mem_pp;
 		struct page_xfer xfer;
+		size_t off = 0;
 
 		if (!ctl)
 			continue;
@@ -1512,7 +1513,7 @@  static int cr_pre_dump_finish(int status)
 			goto err;
 
 		mem_pp = dmpi(item)->mem_pp;
-		ret = page_xfer_dump_pages(&xfer, mem_pp);
+		ret = page_xfer_dump_pages(item->pid->real, &xfer, mem_pp, &off);
 
 		xfer.close(&xfer);
 
diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
index fa72273..74d42f2 100644
--- a/criu/include/page-xfer.h
+++ b/criu/include/page-xfer.h
@@ -47,7 +47,7 @@  struct page_xfer {
 
 extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
 struct page_pipe;
-extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
+extern int page_xfer_dump_pages(int pid, struct page_xfer *, struct page_pipe *, size_t *poff);
 extern int connect_to_page_server_to_send(void);
 extern int connect_to_page_server_to_recv(int epfd);
 extern int disconnect_from_page_server(void);
diff --git a/criu/mem.c b/criu/mem.c
index 6a1a87a..844d726 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -260,39 +260,7 @@  static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
 	return args;
 }
 
-static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl,
-		      struct parasite_dump_pages_args *args)
-{
-	struct page_pipe_buf *ppb;
-	int ret = 0;
-
-	debug_show_page_pipe(pp);
-
-	/* Step 2 -- grab pages into page-pipe */
-	list_for_each_entry(ppb, &pp->bufs, l) {
-		args->nr_segs = ppb->nr_segs;
-		args->nr_pages = ppb->pages_in;
-		pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
-				args->nr_pages, args->nr_segs, ppb->pipe_size, args->off);
-
-		ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl);
-		if (ret < 0)
-			return -1;
-		ret = compel_util_send_fd(ctl, ppb->p[1]);
-		if (ret)
-			return -1;
-
-		ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl);
-		if (ret < 0)
-			return -1;
-
-		args->off += args->nr_segs;
-	}
-
-	return 0;
-}
-
-static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
+static int xfer_pages( int pid, struct page_pipe *pp, struct page_xfer *xfer, size_t *poff)
 {
 	int ret;
 
@@ -301,7 +269,7 @@  static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
 	 *           pre-dump action (see pre_dump_one_task)
 	 */
 	timing_start(TIME_MEMWRITE);
-	ret = page_xfer_dump_pages(xfer, pp);
+	ret = page_xfer_dump_pages(pid, xfer, pp, poff);
 	timing_stop(TIME_MEMWRITE);
 
 	return ret;
@@ -351,7 +319,7 @@  static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
 			     struct page_pipe *pp, struct page_xfer *xfer,
 			     struct parasite_dump_pages_args *args,
 			     struct parasite_ctl *ctl, pmc_t *pmc,
-			     bool has_parent, bool pre_dump)
+			     bool has_parent, bool pre_dump, size_t *poff)
 {
 	u64 off = 0;
 	u64 *map;
@@ -361,6 +329,12 @@  static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
 				!vma_area_is(vma, VMA_ANON_SHARED))
 		return 0;
 
+	if (!(vma->e->prot & PROT_READ)){
+		if(pre_dump)
+			return 0;
+		has_parent = false;
+	}
+
 	if (vma_entry_is(vma->e, VMA_AREA_AIORING)) {
 		if (pre_dump)
 			return 0;
@@ -379,9 +353,7 @@  again:
 	if (ret == -EAGAIN) {
 		BUG_ON(!(pp->flags & PP_CHUNK_MODE));
 
-		ret = drain_pages(pp, ctl, args);
-		if (!ret)
-			ret = xfer_pages(pp, xfer);
+		ret = xfer_pages(item->pid->real, pp, xfer, poff);
 		if (!ret) {
 			page_pipe_reinit(pp);
 			goto again;
@@ -406,6 +378,7 @@  static int __parasite_dump_pages_seized(struct pstree_item *item,
 	unsigned long pmc_size;
 	int possible_pid_reuse = 0;
 	bool has_parent;
+	size_t poff = 0;
 
 	pr_info("\n");
 	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real);
@@ -470,11 +443,12 @@  static int __parasite_dump_pages_seized(struct pstree_item *item,
 	/*
 	 * Step 1 -- generate the pagemap
 	 */
+	poff = 0;
 	args->off = 0;
 	has_parent = !!xfer.parent && !possible_pid_reuse;
 	list_for_each_entry(vma_area, &vma_area_list->h, list) {
 		ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
-					&pmc, has_parent, mdc->pre_dump);
+					&pmc, has_parent, mdc->pre_dump, &poff);
 		if (ret < 0)
 			goto out_xfer;
 	}
@@ -482,9 +456,8 @@  static int __parasite_dump_pages_seized(struct pstree_item *item,
 	if (mdc->lazy)
 		memcpy(pargs_iovs(args), pp->iovs,
 		       sizeof(struct iovec) * pp->nr_iovs);
-	ret = drain_pages(pp, ctl, args);
-	if (!ret && !mdc->pre_dump)
-		ret = xfer_pages(pp, &xfer);
+	if(!mdc->pre_dump)
+		ret = xfer_pages(item->pid->real, pp, &xfer, &poff);
 	if (ret)
 		goto out_xfer;
 
@@ -529,17 +502,18 @@  int parasite_dump_pages_seized(struct pstree_item *item,
 	 *
 	 * Afterwards -- reprotect memory back.
 	 */
+	if(!mdc->pre_dump){
+		pargs->add_prot = PROT_READ;
+		ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
+		if (ret) {
+			pr_err("Can't dump unprotect vmas with parasite\n");
+			return ret;
+		}
 
-	pargs->add_prot = PROT_READ;
-	ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
-	if (ret) {
-		pr_err("Can't dump unprotect vmas with parasite\n");
-		return ret;
-	}
-
-	if (fault_injected(FI_DUMP_PAGES)) {
-		pr_err("fault: Dump VMA pages failure!\n");
-		return -1;
+		if (fault_injected(FI_DUMP_PAGES)) {
+			pr_err("fault: Dump VMA pages failure!\n");
+			return -1;
+		}
 	}
 
 	ret = __parasite_dump_pages_seized(item, pargs, vma_area_list, mdc, ctl);
@@ -549,10 +523,12 @@  int parasite_dump_pages_seized(struct pstree_item *item,
 		return ret;
 	}
 
-	pargs->add_prot = 0;
-	if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
-		pr_err("Can't rollback unprotected vmas with parasite\n");
-		ret = -1;
+	if(!mdc->pre_dump){
+		pargs->add_prot = 0;
+		if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
+			pr_err("Can't rollback unprotected vmas with parasite\n");
+			ret = -1;
+		}
 	}
 
 	return ret;
diff --git a/criu/page-pipe.c b/criu/page-pipe.c
index c32b893..c70ba70 100644
--- a/criu/page-pipe.c
+++ b/criu/page-pipe.c
@@ -33,7 +33,7 @@  static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
 {
 	int ret;
 
-	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE);
+	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, (new_size * PAGE_SIZE) + 1);
 	if (ret < 0)
 		return -1;
 
@@ -41,7 +41,7 @@  static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
 	BUG_ON(ret < ppb->pipe_size);
 
 	pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret);
-	ppb->pipe_size = ret;
+	ppb->pipe_size = ret - 1;
 
 	return 0;
 }
diff --git a/criu/page-xfer.c b/criu/page-xfer.c
index 9cdffd8..a5396b1 100644
--- a/criu/page-xfer.c
+++ b/criu/page-xfer.c
@@ -496,19 +496,75 @@  static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
 		return PE_PRESENT;
 }
 
-int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
+static char userbuf[4 << 20];
+
+int page_xfer_dump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp, size_t *poff)
 {
 	struct page_pipe_buf *ppb;
 	unsigned int cur_hole = 0;
-	int ret;
+	unsigned int ret, ret2;
+	size_t off;
+	struct iovec *remoteiovs = pp->iovs;
 
 	pr_debug("Transferring pages:\n");
 
+	off = *poff;
+
 	list_for_each_entry(ppb, &pp->bufs, l) {
 		unsigned int i;
 
 		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
 
+		size_t bufsize = sizeof(userbuf);
+		struct iovec bufvec = {.iov_len = bufsize};
+		bufvec.iov_base = userbuf;
+
+		ret = syscall(__NR_process_vm_readv, pid, &bufvec, 1, \
+						&remoteiovs[off], ppb->nr_segs, 0);
+		if (ret == -1) {
+			switch (errno) {
+				case EINVAL:
+					pr_debug("process_vm_readv: Invalid arguments\n");
+				break;
+				case EFAULT:
+					pr_debug("process_vm_readv: Unable to access remote iov\n");
+				break;
+				case ENOMEM:
+					pr_debug("process_vm_readv: Unable to allocate memory\n");
+				break;
+				case EPERM:
+					pr_debug("process_vm_readv: Insufficient privileges\n");
+				break;
+				case ESRCH:
+					pr_debug("process_vm_readv: Target process doesn't exist\n");
+				break;
+				default:
+					pr_debug("process_vm_readv: Uncategorised error\n");
+			}
+			return 0;
+		}
+
+		/* Handling partial reads due to modified mappings*/
+
+		if(ret != ppb->pages_in * PAGE_SIZE){
+			pr_debug("Can't read remote iovs (%d(%lu)/%d)\n", ret,\
+				(unsigned long int)PAGE_SIZE * ppb->pages_in, ppb->pages_in);
+			continue;
+		}
+
+		bufvec.iov_len = ret;
+		ret2 = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
+
+		if(ret2 == -1){
+			pr_debug("vmsplice: Failed to splice user buffer to pipe\n");
+			continue;
+		}
+
+		if(ret != ret2){
+			pr_debug("Partial splice from user buffer to pipe (%d)\n", ret2);
+			continue;
+		}
+
 		for (i = 0; i < ppb->nr_segs; i++) {
 			struct iovec iov = ppb->iov[i];
 			u32 flags;
@@ -530,8 +586,10 @@  int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
 						ppb->p[0], iov.iov_len))
 				return -1;
 		}
+		off += ppb->nr_segs;
 	}
 
+	*poff = off;
 	return dump_holes(xfer, pp, &cur_hole, NULL);
 }
 
diff --git a/criu/shmem.c b/criu/shmem.c
index 03b088f..a797bde 100644
--- a/criu/shmem.c
+++ b/criu/shmem.c
@@ -629,19 +629,9 @@  int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map)
 	return 0;
 }
 
-static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer)
+static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer, size_t *off)
 {
-	struct page_pipe_buf *ppb;
-
-	list_for_each_entry(ppb, &pp->bufs, l)
-		if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs,
-					SPLICE_F_GIFT | SPLICE_F_NONBLOCK) !=
-				ppb->pages_in * PAGE_SIZE) {
-			pr_perror("Can't get shmem into page-pipe");
-			return -1;
-		}
-
-	return page_xfer_dump_pages(xfer, pp);
+	return page_xfer_dump_pages(getpid(), xfer, pp, off);
 }
 
 static int next_data_segment(int fd, unsigned long pfn,
@@ -678,6 +668,7 @@  static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si)
 	int err, ret = -1;
 	unsigned long pfn, nrpages, next_data_pnf = 0, next_hole_pfn = 0;
 	unsigned long pages[2] = {};
+	size_t off = 0;
 
 	nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE;
 
@@ -726,7 +717,7 @@  again:
 		}
 
 		if (ret == -EAGAIN) {
-			ret = dump_pages(pp, &xfer);
+			ret = dump_pages(pp, &xfer, &off);
 			if (ret)
 				goto err_xfer;
 			page_pipe_reinit(pp);
@@ -742,7 +733,7 @@  again:
 	cnt_add(CNT_SHPAGES_SKIPPED_PARENT, pages[0]);
 	cnt_add(CNT_SHPAGES_WRITTEN, pages[1]);
 
-	ret = dump_pages(pp, &xfer);
+	ret = dump_pages(pp, &xfer, &off);
 
 err_xfer:
 	xfer.close(&xfer);

Comments

Radostin Stoyanov July 6, 2019, 9:05 p.m.
Hi Abhishek,


On 05/07/2019 13:04, Abhishek Dubey wrote:
> This patch implements usage of process_vm_readv
> syscall to collect memory pages from target process during
> pre-dump. process_vm_readv collects pages in user-buffer,
> which is later vmspliced to page-pipes.
It might be useful to include some performance information about the 
changes in this patch (e.g. you can use "-v0 --display-stats").

>
> Signed-off-by: Abhishek Dubey<dubeyabhishek777@gmail.com>
> ---
>   criu/cr-dump.c           |  3 +-
>   criu/include/page-xfer.h |  2 +-
>   criu/mem.c               | 88 ++++++++++++++++++------------------------------
>   criu/page-pipe.c         |  4 +--
>   criu/page-xfer.c         | 62 ++++++++++++++++++++++++++++++++--
>   criu/shmem.c             | 19 +++--------
>   6 files changed, 102 insertions(+), 76 deletions(-)
>
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 7f2e5ed..ee5f4f3 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1501,6 +1501,7 @@ static int cr_pre_dump_finish(int status)
>   		struct parasite_ctl *ctl = dmpi(item)->parasite_ctl;
>   		struct page_pipe *mem_pp;
>   		struct page_xfer xfer;
> +		size_t off = 0;
>   
>   		if (!ctl)
>   			continue;
> @@ -1512,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>   			goto err;
>   
>   		mem_pp = dmpi(item)->mem_pp;
> -		ret = page_xfer_dump_pages(&xfer, mem_pp);
> +		ret = page_xfer_dump_pages(item->pid->real, &xfer, mem_pp, &off);
>   
>   		xfer.close(&xfer);
>   
> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
> index fa72273..74d42f2 100644
> --- a/criu/include/page-xfer.h
> +++ b/criu/include/page-xfer.h
> @@ -47,7 +47,7 @@ struct page_xfer {
>   
>   extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
>   struct page_pipe;
> -extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
> +extern int page_xfer_dump_pages(int pid, struct page_xfer *, struct page_pipe *, size_t *poff);
>   extern int connect_to_page_server_to_send(void);
>   extern int connect_to_page_server_to_recv(int epfd);
>   extern int disconnect_from_page_server(void);
> diff --git a/criu/mem.c b/criu/mem.c
> index 6a1a87a..844d726 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -260,39 +260,7 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
>   	return args;
>   }
>   
> -static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl,
> -		      struct parasite_dump_pages_args *args)
> -{
> -	struct page_pipe_buf *ppb;
> -	int ret = 0;
> -
> -	debug_show_page_pipe(pp);
> -
> -	/* Step 2 -- grab pages into page-pipe */
> -	list_for_each_entry(ppb, &pp->bufs, l) {
> -		args->nr_segs = ppb->nr_segs;
> -		args->nr_pages = ppb->pages_in;
> -		pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
> -				args->nr_pages, args->nr_segs, ppb->pipe_size, args->off);
> -
> -		ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl);
> -		if (ret < 0)
> -			return -1;
> -		ret = compel_util_send_fd(ctl, ppb->p[1]);
> -		if (ret)
> -			return -1;
> -
> -		ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl);
> -		if (ret < 0)
> -			return -1;
> -
> -		args->off += args->nr_segs;
> -	}
> -
> -	return 0;
> -}
> -
> -static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
> +static int xfer_pages( int pid, struct page_pipe *pp, struct page_xfer *xfer, size_t *poff)
please remove the space before "int pid"
>   {
>   	int ret;
could you please update the comment in this function ("Step 3" should be 
"Step 2")?

>   
> @@ -301,7 +269,7 @@ static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
>   	 *           pre-dump action (see pre_dump_one_task)
>   	 */
>   	timing_start(TIME_MEMWRITE);
> -	ret = page_xfer_dump_pages(xfer, pp);
> +	ret = page_xfer_dump_pages(pid, xfer, pp, poff);
>   	timing_stop(TIME_MEMWRITE);
>   
>   	return ret;
> @@ -351,7 +319,7 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>   			     struct page_pipe *pp, struct page_xfer *xfer,
>   			     struct parasite_dump_pages_args *args,
>   			     struct parasite_ctl *ctl, pmc_t *pmc,
> -			     bool has_parent, bool pre_dump)
> +			     bool has_parent, bool pre_dump, size_t *poff)
>   {
>   	u64 off = 0;
>   	u64 *map;
> @@ -361,6 +329,12 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>   				!vma_area_is(vma, VMA_ANON_SHARED))
>   		return 0;
>   
> +	if (!(vma->e->prot & PROT_READ)){
> +		if(pre_dump)
> +			return 0;
> +		has_parent = false;
> +	}
> +
>   	if (vma_entry_is(vma->e, VMA_AREA_AIORING)) {
>   		if (pre_dump)
>   			return 0;
> @@ -379,9 +353,7 @@ again:
>   	if (ret == -EAGAIN) {
>   		BUG_ON(!(pp->flags & PP_CHUNK_MODE));
>   
> -		ret = drain_pages(pp, ctl, args);
> -		if (!ret)
> -			ret = xfer_pages(pp, xfer);
> +		ret = xfer_pages(item->pid->real, pp, xfer, poff);
>   		if (!ret) {
>   			page_pipe_reinit(pp);
>   			goto again;
> @@ -406,6 +378,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>   	unsigned long pmc_size;
>   	int possible_pid_reuse = 0;
>   	bool has_parent;
> +	size_t poff = 0;
>   
>   	pr_info("\n");
>   	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real);
> @@ -470,11 +443,12 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>   	/*
>   	 * Step 1 -- generate the pagemap
>   	 */
> +	poff = 0;
poff has been initialised to 0, it seems unnecessary to set this value 
again?
>   	args->off = 0;
>   	has_parent = !!xfer.parent && !possible_pid_reuse;
>   	list_for_each_entry(vma_area, &vma_area_list->h, list) {
>   		ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
> -					&pmc, has_parent, mdc->pre_dump);
> +					&pmc, has_parent, mdc->pre_dump, &poff);
>   		if (ret < 0)
>   			goto out_xfer;
>   	}
> @@ -482,9 +456,8 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>   	if (mdc->lazy)
>   		memcpy(pargs_iovs(args), pp->iovs,
>   		       sizeof(struct iovec) * pp->nr_iovs);
> -	ret = drain_pages(pp, ctl, args);
> -	if (!ret && !mdc->pre_dump)
> -		ret = xfer_pages(pp, &xfer);
> +	if(!mdc->pre_dump)
please add a space after 'if'
> +		ret = xfer_pages(item->pid->real, pp, &xfer, &poff);
>   	if (ret)
>   		goto out_xfer;
>   
> @@ -529,17 +502,18 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>   	 *
>   	 * Afterwards -- reprotect memory back.
>   	 */
> +	if(!mdc->pre_dump){
again here, and a space before the curly brace
> +		pargs->add_prot = PROT_READ;
> +		ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
> +		if (ret) {
> +			pr_err("Can't dump unprotect vmas with parasite\n");
> +			return ret;
> +		}
>   
> -	pargs->add_prot = PROT_READ;
> -	ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
> -	if (ret) {
> -		pr_err("Can't dump unprotect vmas with parasite\n");
> -		return ret;
> -	}
> -
> -	if (fault_injected(FI_DUMP_PAGES)) {
> -		pr_err("fault: Dump VMA pages failure!\n");
> -		return -1;
> +		if (fault_injected(FI_DUMP_PAGES)) {
> +			pr_err("fault: Dump VMA pages failure!\n");
> +			return -1;
> +		}
>   	}
>   
>   	ret = __parasite_dump_pages_seized(item, pargs, vma_area_list, mdc, ctl);
> @@ -549,10 +523,12 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>   		return ret;
>   	}
>   
> -	pargs->add_prot = 0;
> -	if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
> -		pr_err("Can't rollback unprotected vmas with parasite\n");
> -		ret = -1;
> +	if(!mdc->pre_dump){
ditto
> +		pargs->add_prot = 0;
> +		if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
> +			pr_err("Can't rollback unprotected vmas with parasite\n");
> +			ret = -1;
> +		}
>   	}
>   
>   	return ret;
> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
> index c32b893..c70ba70 100644
> --- a/criu/page-pipe.c
> +++ b/criu/page-pipe.c
> @@ -33,7 +33,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>   {
>   	int ret;
>   
> -	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE);
> +	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, (new_size * PAGE_SIZE) + 1);
>   	if (ret < 0)
>   		return -1;
>   
> @@ -41,7 +41,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>   	BUG_ON(ret < ppb->pipe_size);
>   
>   	pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret);
> -	ppb->pipe_size = ret;
> +	ppb->pipe_size = ret - 1;
>   
>   	return 0;
>   }
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index 9cdffd8..a5396b1 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -496,19 +496,75 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
>   		return PE_PRESENT;
>   }
>   
> -int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
> +static char userbuf[4 << 20];
maybe the buffer size should be (PIPE_MAX_SIZE * PAGE_SIZE)?
> +
> +int page_xfer_dump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp, size_t *poff)
>   {
>   	struct page_pipe_buf *ppb;
>   	unsigned int cur_hole = 0;
> -	int ret;
> +	unsigned int ret, ret2;
> +	size_t off;
> +	struct iovec *remoteiovs = pp->iovs;
>   
>   	pr_debug("Transferring pages:\n");
>   
> +	off = *poff;
> +
>   	list_for_each_entry(ppb, &pp->bufs, l) {
>   		unsigned int i;
>   
>   		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>   
> +		size_t bufsize = sizeof(userbuf);
> +		struct iovec bufvec = {.iov_len = bufsize};
> +		bufvec.iov_base = userbuf;
> +
> +		ret = syscall(__NR_process_vm_readv, pid, &bufvec, 1, \
> +						&remoteiovs[off], ppb->nr_segs, 0);
> +		if (ret == -1) {
> +			switch (errno) {
you can use pr_perror() to print a string representation of errno
> +				case EINVAL:
> +					pr_debug("process_vm_readv: Invalid arguments\n");
> +				break;
> +				case EFAULT:
> +					pr_debug("process_vm_readv: Unable to access remote iov\n");
> +				break;
> +				case ENOMEM:
> +					pr_debug("process_vm_readv: Unable to allocate memory\n");
> +				break;
> +				case EPERM:
> +					pr_debug("process_vm_readv: Insufficient privileges\n");
> +				break;
> +				case ESRCH:
> +					pr_debug("process_vm_readv: Target process doesn't exist\n");
> +				break;
> +				default:
> +					pr_debug("process_vm_readv: Uncategorised error\n");
> +			}
> +			return 0;
why return 0 here? maybe we should exit with an error if 
process_vm_readv() fails?

> +		}
> +
> +		/* Handling partial reads due to modified mappings*/
> +
> +		if(ret != ppb->pages_in * PAGE_SIZE){
please add a space after 'if' and before '{'
> +			pr_debug("Can't read remote iovs (%d(%lu)/%d)\n", ret,\
> +				(unsigned long int)PAGE_SIZE * ppb->pages_in, ppb->pages_in);
> +			continue;
> +		}
> +
> +		bufvec.iov_len = ret;
> +		ret2 = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
> +
> +		if(ret2 == -1){
ditto
> +			pr_debug("vmsplice: Failed to splice user buffer to pipe\n");
> +			continue;
> +		}
> +
> +		if(ret != ret2){
ditto
> +			pr_debug("Partial splice from user buffer to pipe (%d)\n", ret2);
> +			continue;
> +		}
> +
>   		for (i = 0; i < ppb->nr_segs; i++) {
>   			struct iovec iov = ppb->iov[i];
>   			u32 flags;
> @@ -530,8 +586,10 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>   						ppb->p[0], iov.iov_len))
>   				return -1;
>   		}
> +		off += ppb->nr_segs;
>   	}
>   
> +	*poff = off;
>   	return dump_holes(xfer, pp, &cur_hole, NULL);
>   }
>   
> diff --git a/criu/shmem.c b/criu/shmem.c
> index 03b088f..a797bde 100644
> --- a/criu/shmem.c
> +++ b/criu/shmem.c
> @@ -629,19 +629,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map)
>   	return 0;
>   }
>   
> -static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer)
> +static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer, size_t *off)
>   {
> -	struct page_pipe_buf *ppb;
> -
> -	list_for_each_entry(ppb, &pp->bufs, l)
> -		if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs,
> -					SPLICE_F_GIFT | SPLICE_F_NONBLOCK) !=
> -				ppb->pages_in * PAGE_SIZE) {
> -			pr_perror("Can't get shmem into page-pipe");
> -			return -1;
> -		}
> -
> -	return page_xfer_dump_pages(xfer, pp);
> +	return page_xfer_dump_pages(getpid(), xfer, pp, off);
>   }
>   
>   static int next_data_segment(int fd, unsigned long pfn,
> @@ -678,6 +668,7 @@ static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si)
>   	int err, ret = -1;
>   	unsigned long pfn, nrpages, next_data_pnf = 0, next_hole_pfn = 0;
>   	unsigned long pages[2] = {};
> +	size_t off = 0;
>   
>   	nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE;
>   
> @@ -726,7 +717,7 @@ again:
>   		}
>   
>   		if (ret == -EAGAIN) {
> -			ret = dump_pages(pp, &xfer);
> +			ret = dump_pages(pp, &xfer, &off);
>   			if (ret)
>   				goto err_xfer;
>   			page_pipe_reinit(pp);
> @@ -742,7 +733,7 @@ again:
>   	cnt_add(CNT_SHPAGES_SKIPPED_PARENT, pages[0]);
>   	cnt_add(CNT_SHPAGES_WRITTEN, pages[1]);
>   
> -	ret = dump_pages(pp, &xfer);
> +	ret = dump_pages(pp, &xfer, &off);
>   
>   err_xfer:
>   	xfer.close(&xfer);
Andrey Vagin July 8, 2019, 4:05 a.m.
On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
> This patch implements usage of process_vm_readv
> syscall to collect memory pages from target process during
> pre-dump. process_vm_readv collects pages in user-buffer,
> which is later vmspliced to page-pipes.

You have to be more detailed in the commit message.

First of all, you have to explain why we need this patch.

If you improve performance, you need to give some numbers.

If you implement a new feature, you need to provide tests for it.

If I understand this patch right, criu freezes all processes, collects
page maps, unfreezes the processes, and then dumps pages? If this is right,
you need to explain how it handles this case:

addr = mmap(NULL, 4 * PAGE_SIZE)	|
addr[0] = 1			|
addr[PAGE_SIZE]  = 2		|
addr[2 * PAGE_SIZE] = 3		|
addr[3 * PAGE_SIZE] = 4		|
				| criu freezes the process
				| criu collects page maps
				| criu unfreezes the process
munmap(addr, PAGE_SIZE * 2)	|
				| criu dumps pages

here are a few inline comments
> 
> Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
> ---
>  criu/cr-dump.c           |  3 +-
>  criu/include/page-xfer.h |  2 +-
>  criu/mem.c               | 88 ++++++++++++++++++------------------------------
>  criu/page-pipe.c         |  4 +--
>  criu/page-xfer.c         | 62 ++++++++++++++++++++++++++++++++--
>  criu/shmem.c             | 19 +++--------
>  6 files changed, 102 insertions(+), 76 deletions(-)
> 
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 7f2e5ed..ee5f4f3 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1501,6 +1501,7 @@ static int cr_pre_dump_finish(int status)
>  		struct parasite_ctl *ctl = dmpi(item)->parasite_ctl;
>  		struct page_pipe *mem_pp;
>  		struct page_xfer xfer;
> +		size_t off = 0;
>  
>  		if (!ctl)
>  			continue;
> @@ -1512,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>  			goto err;
>  
>  		mem_pp = dmpi(item)->mem_pp;
> -		ret = page_xfer_dump_pages(&xfer, mem_pp);
> +		ret = page_xfer_dump_pages(item->pid->real, &xfer, mem_pp, &off);
>  
>  		xfer.close(&xfer);
>  
> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
> index fa72273..74d42f2 100644
> --- a/criu/include/page-xfer.h
> +++ b/criu/include/page-xfer.h
> @@ -47,7 +47,7 @@ struct page_xfer {
>  
>  extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
>  struct page_pipe;
> -extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
> +extern int page_xfer_dump_pages(int pid, struct page_xfer *, struct page_pipe *, size_t *poff);
>  extern int connect_to_page_server_to_send(void);
>  extern int connect_to_page_server_to_recv(int epfd);
>  extern int disconnect_from_page_server(void);
> diff --git a/criu/mem.c b/criu/mem.c
> index 6a1a87a..844d726 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -260,39 +260,7 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
>  	return args;
>  }
>  
> -static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl,
> -		      struct parasite_dump_pages_args *args)
> -{
> -	struct page_pipe_buf *ppb;
> -	int ret = 0;
> -
> -	debug_show_page_pipe(pp);
> -
> -	/* Step 2 -- grab pages into page-pipe */
> -	list_for_each_entry(ppb, &pp->bufs, l) {
> -		args->nr_segs = ppb->nr_segs;
> -		args->nr_pages = ppb->pages_in;
> -		pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
> -				args->nr_pages, args->nr_segs, ppb->pipe_size, args->off);
> -
> -		ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl);
> -		if (ret < 0)
> -			return -1;
> -		ret = compel_util_send_fd(ctl, ppb->p[1]);
> -		if (ret)
> -			return -1;
> -
> -		ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl);
> -		if (ret < 0)
> -			return -1;
> -
> -		args->off += args->nr_segs;
> -	}
> -
> -	return 0;
> -}
> -
> -static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
> +static int xfer_pages( int pid, struct page_pipe *pp, struct page_xfer *xfer, size_t *poff)
>  {
>  	int ret;
>  
> @@ -301,7 +269,7 @@ static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
>  	 *           pre-dump action (see pre_dump_one_task)
>  	 */
>  	timing_start(TIME_MEMWRITE);
> -	ret = page_xfer_dump_pages(xfer, pp);
> +	ret = page_xfer_dump_pages(pid, xfer, pp, poff);
>  	timing_stop(TIME_MEMWRITE);
>  
>  	return ret;
> @@ -351,7 +319,7 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>  			     struct page_pipe *pp, struct page_xfer *xfer,
>  			     struct parasite_dump_pages_args *args,
>  			     struct parasite_ctl *ctl, pmc_t *pmc,
> -			     bool has_parent, bool pre_dump)
> +			     bool has_parent, bool pre_dump, size_t *poff)
>  {
>  	u64 off = 0;
>  	u64 *map;
> @@ -361,6 +329,12 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>  				!vma_area_is(vma, VMA_ANON_SHARED))
>  		return 0;
>  
> +	if (!(vma->e->prot & PROT_READ)){
> +		if(pre_dump)
> +			return 0;
> +		has_parent = false;
> +	}
> +
>  	if (vma_entry_is(vma->e, VMA_AREA_AIORING)) {
>  		if (pre_dump)
>  			return 0;
> @@ -379,9 +353,7 @@ again:
>  	if (ret == -EAGAIN) {
>  		BUG_ON(!(pp->flags & PP_CHUNK_MODE));
>  
> -		ret = drain_pages(pp, ctl, args);
> -		if (!ret)
> -			ret = xfer_pages(pp, xfer);
> +		ret = xfer_pages(item->pid->real, pp, xfer, poff);
>  		if (!ret) {
>  			page_pipe_reinit(pp);
>  			goto again;
> @@ -406,6 +378,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	unsigned long pmc_size;
>  	int possible_pid_reuse = 0;
>  	bool has_parent;
> +	size_t poff = 0;
>  
>  	pr_info("\n");
>  	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real);
> @@ -470,11 +443,12 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	/*
>  	 * Step 1 -- generate the pagemap
>  	 */
> +	poff = 0;
>  	args->off = 0;
>  	has_parent = !!xfer.parent && !possible_pid_reuse;
>  	list_for_each_entry(vma_area, &vma_area_list->h, list) {
>  		ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
> -					&pmc, has_parent, mdc->pre_dump);
> +					&pmc, has_parent, mdc->pre_dump, &poff);
>  		if (ret < 0)
>  			goto out_xfer;
>  	}
> @@ -482,9 +456,8 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	if (mdc->lazy)
>  		memcpy(pargs_iovs(args), pp->iovs,
>  		       sizeof(struct iovec) * pp->nr_iovs);
> -	ret = drain_pages(pp, ctl, args);
> -	if (!ret && !mdc->pre_dump)
> -		ret = xfer_pages(pp, &xfer);
> +	if(!mdc->pre_dump)
> +		ret = xfer_pages(item->pid->real, pp, &xfer, &poff);
>  	if (ret)
>  		goto out_xfer;
>  
> @@ -529,17 +502,18 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>  	 *
>  	 * Afterwards -- reprotect memory back.
>  	 */
> +	if(!mdc->pre_dump){
> +		pargs->add_prot = PROT_READ;
> +		ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);

This should be in a separate patch with explanation why we don't need
mprotect in this case.

> +		if (ret) {
> +			pr_err("Can't dump unprotect vmas with parasite\n");
> +			return ret;
> +		}
>  
> -	pargs->add_prot = PROT_READ;
> -	ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
> -	if (ret) {
> -		pr_err("Can't dump unprotect vmas with parasite\n");
> -		return ret;
> -	}
> -
> -	if (fault_injected(FI_DUMP_PAGES)) {
> -		pr_err("fault: Dump VMA pages failure!\n");
> -		return -1;
> +		if (fault_injected(FI_DUMP_PAGES)) {
> +			pr_err("fault: Dump VMA pages failure!\n");
> +			return -1;
> +		}
>  	}
>  
>  	ret = __parasite_dump_pages_seized(item, pargs, vma_area_list, mdc, ctl);
> @@ -549,10 +523,12 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>  		return ret;
>  	}
>  
> -	pargs->add_prot = 0;
> -	if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
> -		pr_err("Can't rollback unprotected vmas with parasite\n");
> -		ret = -1;
> +	if(!mdc->pre_dump){
> +		pargs->add_prot = 0;
> +		if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
> +			pr_err("Can't rollback unprotected vmas with parasite\n");
> +			ret = -1;
> +		}
>  	}
>  
>  	return ret;
> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
> index c32b893..c70ba70 100644
> --- a/criu/page-pipe.c
> +++ b/criu/page-pipe.c
> @@ -33,7 +33,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>  {
>  	int ret;
>  
> -	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE);
> +	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, (new_size * PAGE_SIZE) + 1);
>  	if (ret < 0)
>  		return -1;
>  
> @@ -41,7 +41,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>  	BUG_ON(ret < ppb->pipe_size);
>  
>  	pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret);
> -	ppb->pipe_size = ret;
> +	ppb->pipe_size = ret - 1;
>  
>  	return 0;
>  }
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index 9cdffd8..a5396b1 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -496,19 +496,75 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
>  		return PE_PRESENT;
>  }
>  
> -int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
> +static char userbuf[4 << 20];
> +
> +int page_xfer_dump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp, size_t *poff)
>  {
>  	struct page_pipe_buf *ppb;
>  	unsigned int cur_hole = 0;
> -	int ret;
> +	unsigned int ret, ret2;
> +	size_t off;
> +	struct iovec *remoteiovs = pp->iovs;
>  
>  	pr_debug("Transferring pages:\n");
>  
> +	off = *poff;
> +
>  	list_for_each_entry(ppb, &pp->bufs, l) {
>  		unsigned int i;
>  
>  		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>  
> +		size_t bufsize = sizeof(userbuf);
> +		struct iovec bufvec = {.iov_len = bufsize};
> +		bufvec.iov_base = userbuf;
> +
> +		ret = syscall(__NR_process_vm_readv, pid, &bufvec, 1, \
> +						&remoteiovs[off], ppb->nr_segs, 0);
> +		if (ret == -1) {
> +			switch (errno) {

you don't need this switch. You can use pr_perror or pr_debug("....: %s", strerror(errno));

> +				case EINVAL:
> +					pr_debug("process_vm_readv: Invalid arguments\n");
> +				break;
> +				case EFAULT:
> +					pr_debug("process_vm_readv: Unable to access remote iov\n");
> +				break;
> +				case ENOMEM:
> +					pr_debug("process_vm_readv: Unable to allocate memory\n");
> +				break;
> +				case EPERM:
> +					pr_debug("process_vm_readv: Insufficient privileges\n");
> +				break;
> +				case ESRCH:
> +					pr_debug("process_vm_readv: Target process doesn't exist\n");
> +				break;
> +				default:
> +					pr_debug("process_vm_readv: Uncategorised error\n");
> +			}
> +			return 0;

Why do we return 0 in error cases? You need to add a comment here.

> +		}
> +
> +		/* Handling partial reads due to modified mappings*/
> +
> +		if(ret != ppb->pages_in * PAGE_SIZE){
> +			pr_debug("Can't read remote iovs (%d(%lu)/%d)\n", ret,\
> +				(unsigned long int)PAGE_SIZE * ppb->pages_in, ppb->pages_in);
> +			continue;
> +		}
> +
> +		bufvec.iov_len = ret;
> +		ret2 = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
> +
> +		if(ret2 == -1){
> +			pr_debug("vmsplice: Failed to splice user buffer to pipe\n");
> +			continue;
> +		}
> +
> +		if(ret != ret2){
> +			pr_debug("Partial splice from user buffer to pipe (%d)\n", ret2);
> +			continue;
> +		}
> +
>  		for (i = 0; i < ppb->nr_segs; i++) {
>  			struct iovec iov = ppb->iov[i];
>  			u32 flags;
> @@ -530,8 +586,10 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>  						ppb->p[0], iov.iov_len))
>  				return -1;
>  		}
> +		off += ppb->nr_segs;
>  	}
>  
> +	*poff = off;
>  	return dump_holes(xfer, pp, &cur_hole, NULL);
>  }
>  
> diff --git a/criu/shmem.c b/criu/shmem.c
> index 03b088f..a797bde 100644
> --- a/criu/shmem.c
> +++ b/criu/shmem.c
> @@ -629,19 +629,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map)
>  	return 0;
>  }
>  
> -static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer)
> +static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer, size_t *off)
>  {
> -	struct page_pipe_buf *ppb;
> -
> -	list_for_each_entry(ppb, &pp->bufs, l)
> -		if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs,
> -					SPLICE_F_GIFT | SPLICE_F_NONBLOCK) !=
> -				ppb->pages_in * PAGE_SIZE) {
> -			pr_perror("Can't get shmem into page-pipe");
> -			return -1;
> -		}
> -
> -	return page_xfer_dump_pages(xfer, pp);
> +	return page_xfer_dump_pages(getpid(), xfer, pp, off);
>  }
>  
>  static int next_data_segment(int fd, unsigned long pfn,
> @@ -678,6 +668,7 @@ static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si)
>  	int err, ret = -1;
>  	unsigned long pfn, nrpages, next_data_pnf = 0, next_hole_pfn = 0;
>  	unsigned long pages[2] = {};
> +	size_t off = 0;
>  
>  	nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE;
>  
> @@ -726,7 +717,7 @@ again:
>  		}
>  
>  		if (ret == -EAGAIN) {
> -			ret = dump_pages(pp, &xfer);
> +			ret = dump_pages(pp, &xfer, &off);
>  			if (ret)
>  				goto err_xfer;
>  			page_pipe_reinit(pp);
> @@ -742,7 +733,7 @@ again:
>  	cnt_add(CNT_SHPAGES_SKIPPED_PARENT, pages[0]);
>  	cnt_add(CNT_SHPAGES_WRITTEN, pages[1]);
>  
> -	ret = dump_pages(pp, &xfer);
> +	ret = dump_pages(pp, &xfer, &off);
>  
>  err_xfer:
>  	xfer.close(&xfer);
> -- 
> 2.7.4
>
Abhishek Dubey July 8, 2019, 7:09 a.m.
Hi Radostin,

Please find inline replies.

On 07/07/19 2:35 AM, Radostin Stoyanov wrote:
> Hi Abhishek,
>
>
> On 05/07/2019 13:04, Abhishek Dubey wrote:
>> This patch implements usage of process_vm_readv
>> syscall to collect memory pages from target process during
>> pre-dump. process_vm_readv collects pages in user-buffer,
>> which is later vmspliced to page-pipes.
> It might be useful to include some performance information about the 
> changes in this patch (e.g. you can use "-v0 --display-stats").
Sure I will do that.
>
>>
>> Signed-off-by: Abhishek Dubey<dubeyabhishek777@gmail.com>
>> ---
>>   criu/cr-dump.c           |  3 +-
>>   criu/include/page-xfer.h |  2 +-
>>   criu/mem.c               | 88 
>> ++++++++++++++++++------------------------------
>>   criu/page-pipe.c         |  4 +--
>>   criu/page-xfer.c         | 62 ++++++++++++++++++++++++++++++++--
>>   criu/shmem.c             | 19 +++--------
>>   6 files changed, 102 insertions(+), 76 deletions(-)
>>
>> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
>> index 7f2e5ed..ee5f4f3 100644
>> --- a/criu/cr-dump.c
>> +++ b/criu/cr-dump.c
>> @@ -1501,6 +1501,7 @@ static int cr_pre_dump_finish(int status)
>>           struct parasite_ctl *ctl = dmpi(item)->parasite_ctl;
>>           struct page_pipe *mem_pp;
>>           struct page_xfer xfer;
>> +        size_t off = 0;
>>             if (!ctl)
>>               continue;
>> @@ -1512,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>>               goto err;
>>             mem_pp = dmpi(item)->mem_pp;
>> -        ret = page_xfer_dump_pages(&xfer, mem_pp);
>> +        ret = page_xfer_dump_pages(item->pid->real, &xfer, mem_pp, 
>> &off);
>>             xfer.close(&xfer);
>>   diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
>> index fa72273..74d42f2 100644
>> --- a/criu/include/page-xfer.h
>> +++ b/criu/include/page-xfer.h
>> @@ -47,7 +47,7 @@ struct page_xfer {
>>     extern int open_page_xfer(struct page_xfer *xfer, int fd_type, 
>> unsigned long id);
>>   struct page_pipe;
>> -extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe 
>> *);
>> +extern int page_xfer_dump_pages(int pid, struct page_xfer *, struct 
>> page_pipe *, size_t *poff);
>>   extern int connect_to_page_server_to_send(void);
>>   extern int connect_to_page_server_to_recv(int epfd);
>>   extern int disconnect_from_page_server(void);
>> diff --git a/criu/mem.c b/criu/mem.c
>> index 6a1a87a..844d726 100644
>> --- a/criu/mem.c
>> +++ b/criu/mem.c
>> @@ -260,39 +260,7 @@ static struct parasite_dump_pages_args 
>> *prep_dump_pages_args(struct parasite_ctl
>>       return args;
>>   }
>>   -static int drain_pages(struct page_pipe *pp, struct parasite_ctl 
>> *ctl,
>> -              struct parasite_dump_pages_args *args)
>> -{
>> -    struct page_pipe_buf *ppb;
>> -    int ret = 0;
>> -
>> -    debug_show_page_pipe(pp);
>> -
>> -    /* Step 2 -- grab pages into page-pipe */
>> -    list_for_each_entry(ppb, &pp->bufs, l) {
>> -        args->nr_segs = ppb->nr_segs;
>> -        args->nr_pages = ppb->pages_in;
>> -        pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
>> -                args->nr_pages, args->nr_segs, ppb->pipe_size, 
>> args->off);
>> -
>> -        ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl);
>> -        if (ret < 0)
>> -            return -1;
>> -        ret = compel_util_send_fd(ctl, ppb->p[1]);
>> -        if (ret)
>> -            return -1;
>> -
>> -        ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl);
>> -        if (ret < 0)
>> -            return -1;
>> -
>> -        args->off += args->nr_segs;
>> -    }
>> -
>> -    return 0;
>> -}
>> -
>> -static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
>> +static int xfer_pages( int pid, struct page_pipe *pp, struct 
>> page_xfer *xfer, size_t *poff)
> please remove the space before "int pid"
OK.
>>   {
>>       int ret;
> could you please update the comment in this function ("Step 3" should 
> be "Step 2")?
Yes, I will update it.
>
>>   @@ -301,7 +269,7 @@ static int xfer_pages(struct page_pipe *pp, 
>> struct page_xfer *xfer)
>>        *           pre-dump action (see pre_dump_one_task)
>>        */
>>       timing_start(TIME_MEMWRITE);
>> -    ret = page_xfer_dump_pages(xfer, pp);
>> +    ret = page_xfer_dump_pages(pid, xfer, pp, poff);
>>       timing_stop(TIME_MEMWRITE);
>>         return ret;
>> @@ -351,7 +319,7 @@ static int generate_vma_iovs(struct pstree_item 
>> *item, struct vma_area *vma,
>>                    struct page_pipe *pp, struct page_xfer *xfer,
>>                    struct parasite_dump_pages_args *args,
>>                    struct parasite_ctl *ctl, pmc_t *pmc,
>> -                 bool has_parent, bool pre_dump)
>> +                 bool has_parent, bool pre_dump, size_t *poff)
>>   {
>>       u64 off = 0;
>>       u64 *map;
>> @@ -361,6 +329,12 @@ static int generate_vma_iovs(struct pstree_item 
>> *item, struct vma_area *vma,
>>                   !vma_area_is(vma, VMA_ANON_SHARED))
>>           return 0;
>>   +    if (!(vma->e->prot & PROT_READ)){
>> +        if(pre_dump)
>> +            return 0;
>> +        has_parent = false;
>> +    }
>> +
>>       if (vma_entry_is(vma->e, VMA_AREA_AIORING)) {
>>           if (pre_dump)
>>               return 0;
>> @@ -379,9 +353,7 @@ again:
>>       if (ret == -EAGAIN) {
>>           BUG_ON(!(pp->flags & PP_CHUNK_MODE));
>>   -        ret = drain_pages(pp, ctl, args);
>> -        if (!ret)
>> -            ret = xfer_pages(pp, xfer);
>> +        ret = xfer_pages(item->pid->real, pp, xfer, poff);
>>           if (!ret) {
>>               page_pipe_reinit(pp);
>>               goto again;
>> @@ -406,6 +378,7 @@ static int __parasite_dump_pages_seized(struct 
>> pstree_item *item,
>>       unsigned long pmc_size;
>>       int possible_pid_reuse = 0;
>>       bool has_parent;
>> +    size_t poff = 0;
>>         pr_info("\n");
>>       pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, 
>> item->pid->real);
>> @@ -470,11 +443,12 @@ static int __parasite_dump_pages_seized(struct 
>> pstree_item *item,
>>       /*
>>        * Step 1 -- generate the pagemap
>>        */
>> +    poff = 0;
> poff has been initialised to 0, it seems unnecessary to set this value 
> again?
I will remove it.
>>       args->off = 0;
>>       has_parent = !!xfer.parent && !possible_pid_reuse;
>>       list_for_each_entry(vma_area, &vma_area_list->h, list) {
>>           ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
>> -                    &pmc, has_parent, mdc->pre_dump);
>> +                    &pmc, has_parent, mdc->pre_dump, &poff);
>>           if (ret < 0)
>>               goto out_xfer;
>>       }
>> @@ -482,9 +456,8 @@ static int __parasite_dump_pages_seized(struct 
>> pstree_item *item,
>>       if (mdc->lazy)
>>           memcpy(pargs_iovs(args), pp->iovs,
>>                  sizeof(struct iovec) * pp->nr_iovs);
>> -    ret = drain_pages(pp, ctl, args);
>> -    if (!ret && !mdc->pre_dump)
>> -        ret = xfer_pages(pp, &xfer);
>> +    if(!mdc->pre_dump)
> please add a space after 'if'
I have missed the same thing in many places. I will handle it.
>> +        ret = xfer_pages(item->pid->real, pp, &xfer, &poff);
>>       if (ret)
>>           goto out_xfer;
>>   @@ -529,17 +502,18 @@ int parasite_dump_pages_seized(struct 
>> pstree_item *item,
>>        *
>>        * Afterwards -- reprotect memory back.
>>        */
>> +    if(!mdc->pre_dump){
> again here, and a space before the curly brace
Sure!
>> +        pargs->add_prot = PROT_READ;
>> +        ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
>> +        if (ret) {
>> +            pr_err("Can't dump unprotect vmas with parasite\n");
>> +            return ret;
>> +        }
>>   -    pargs->add_prot = PROT_READ;
>> -    ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
>> -    if (ret) {
>> -        pr_err("Can't dump unprotect vmas with parasite\n");
>> -        return ret;
>> -    }
>> -
>> -    if (fault_injected(FI_DUMP_PAGES)) {
>> -        pr_err("fault: Dump VMA pages failure!\n");
>> -        return -1;
>> +        if (fault_injected(FI_DUMP_PAGES)) {
>> +            pr_err("fault: Dump VMA pages failure!\n");
>> +            return -1;
>> +        }
>>       }
>>         ret = __parasite_dump_pages_seized(item, pargs, 
>> vma_area_list, mdc, ctl);
>> @@ -549,10 +523,12 @@ int parasite_dump_pages_seized(struct 
>> pstree_item *item,
>>           return ret;
>>       }
>>   -    pargs->add_prot = 0;
>> -    if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
>> -        pr_err("Can't rollback unprotected vmas with parasite\n");
>> -        ret = -1;
>> +    if(!mdc->pre_dump){
> ditto
ACK.
>> +        pargs->add_prot = 0;
>> +        if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
>> +            pr_err("Can't rollback unprotected vmas with parasite\n");
>> +            ret = -1;
>> +        }
>>       }
>>         return ret;
>> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
>> index c32b893..c70ba70 100644
>> --- a/criu/page-pipe.c
>> +++ b/criu/page-pipe.c
>> @@ -33,7 +33,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf 
>> *ppb, unsigned long new_size)
>>   {
>>       int ret;
>>   -    ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE);
>> +    ret = fcntl(ppb->p[0], F_SETPIPE_SZ, (new_size * PAGE_SIZE) + 1);
>>       if (ret < 0)
>>           return -1;
>>   @@ -41,7 +41,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf 
>> *ppb, unsigned long new_size)
>>       BUG_ON(ret < ppb->pipe_size);
>>         pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret);
>> -    ppb->pipe_size = ret;
>> +    ppb->pipe_size = ret - 1;
>>         return 0;
>>   }
>> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
>> index 9cdffd8..a5396b1 100644
>> --- a/criu/page-xfer.c
>> +++ b/criu/page-xfer.c
>> @@ -496,19 +496,75 @@ static inline u32 ppb_xfer_flags(struct 
>> page_xfer *xfer, struct page_pipe_buf *p
>>           return PE_PRESENT;
>>   }
>>   -int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe 
>> *pp)
>> +static char userbuf[4 << 20];
> maybe the buffer size should be (PIPE_MAX_SIZE * PAGE_SIZE)?

In the initial discussion, to keep things simple, we decided on a fixed-size
buffer of 4MB, since PIPE_MAX_SIZE is defined in terms of multiple other constants.

>> +
>> +int page_xfer_dump_pages(int pid, struct page_xfer *xfer, struct 
>> page_pipe *pp, size_t *poff)
>>   {
>>       struct page_pipe_buf *ppb;
>>       unsigned int cur_hole = 0;
>> -    int ret;
>> +    unsigned int ret, ret2;
>> +    size_t off;
>> +    struct iovec *remoteiovs = pp->iovs;
>>         pr_debug("Transferring pages:\n");
>>   +    off = *poff;
>> +
>>       list_for_each_entry(ppb, &pp->bufs, l) {
>>           unsigned int i;
>>             pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>>   +        size_t bufsize = sizeof(userbuf);
>> +        struct iovec bufvec = {.iov_len = bufsize};
>> +        bufvec.iov_base = userbuf;
>> +
>> +        ret = syscall(__NR_process_vm_readv, pid, &bufvec, 1, \
>> +                        &remoteiovs[off], ppb->nr_segs, 0);
>> +        if (ret == -1) {
>> +            switch (errno) {
> you can use pr_perror() to print a string representation of errno

I need to handle process_vm_readv() errors gracefully, avoiding
immediate termination.

If pr_debug is not suitable, perhaps pr_info would be a better substitute.

>> +                case EINVAL:
>> +                    pr_debug("process_vm_readv: Invalid arguments\n");
>> +                break;
>> +                case EFAULT:
>> +                    pr_debug("process_vm_readv: Unable to access 
>> remote iov\n");
>> +                break;
>> +                case ENOMEM:
>> +                    pr_debug("process_vm_readv: Unable to allocate 
>> memory\n");
>> +                break;
>> +                case EPERM:
>> +                    pr_debug("process_vm_readv: Insufficient 
>> privileges\n");
>> +                break;
>> +                case ESRCH:
>> +                    pr_debug("process_vm_readv: Target process 
>> doesn't exist\n");
>> +                break;
>> +                default:
>> +                    pr_debug("process_vm_readv: Uncategorised 
>> error\n");
>> +            }
>> +            return 0;
> why return 0 here? maybe we should exit with an error if 
> process_vm_readv() fails?
The target process, once unfrozen, could modify its mappings relative to
what was collected initially in collect_mappings. For transition test
cases, the syscall is therefore expected to fail, and that is where the
graceful handling comes in. In the graceful handling, for now, we simply
skip processing of the whole iovec and move on to the next one, or, if the
process is not found, move on to the next process in the pstree.
>
>> +        }
>> +
>> +        /* Handling partial reads due to modified mappings*/
>> +
>> +        if(ret != ppb->pages_in * PAGE_SIZE){
> please add a space after 'if' and before '{'
Sure.
>> +            pr_debug("Can't read remote iovs (%d(%lu)/%d)\n", ret,\
>> +                (unsigned long int)PAGE_SIZE * ppb->pages_in, 
>> ppb->pages_in);
>> +            continue;
>> +        }
>> +
>> +        bufvec.iov_len = ret;
>> +        ret2 = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
>> +
>> +        if(ret2 == -1){
> ditto
OK
>> +            pr_debug("vmsplice: Failed to splice user buffer to 
>> pipe\n");
>> +            continue;
>> +        }
>> +
>> +        if(ret != ret2){
> OK
>> +            pr_debug("Partial splice from user buffer to pipe 
>> (%d)\n", ret2);
>> +            continue;
>> +        }
>> +
>>           for (i = 0; i < ppb->nr_segs; i++) {
>>               struct iovec iov = ppb->iov[i];
>>               u32 flags;
>> @@ -530,8 +586,10 @@ int page_xfer_dump_pages(struct page_xfer *xfer, 
>> struct page_pipe *pp)
>>                           ppb->p[0], iov.iov_len))
>>                   return -1;
>>           }
>> +        off += ppb->nr_segs;
>>       }
>>   +    *poff = off;
>>       return dump_holes(xfer, pp, &cur_hole, NULL);
>>   }
>>   diff --git a/criu/shmem.c b/criu/shmem.c
>> index 03b088f..a797bde 100644
>> --- a/criu/shmem.c
>> +++ b/criu/shmem.c
>> @@ -629,19 +629,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 
>> *map)
>>       return 0;
>>   }
>>   -static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer)
>> +static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer, 
>> size_t *off)
>>   {
>> -    struct page_pipe_buf *ppb;
>> -
>> -    list_for_each_entry(ppb, &pp->bufs, l)
>> -        if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs,
>> -                    SPLICE_F_GIFT | SPLICE_F_NONBLOCK) !=
>> -                ppb->pages_in * PAGE_SIZE) {
>> -            pr_perror("Can't get shmem into page-pipe");
>> -            return -1;
>> -        }
>> -
>> -    return page_xfer_dump_pages(xfer, pp);
>> +    return page_xfer_dump_pages(getpid(), xfer, pp, off);
>>   }
>>     static int next_data_segment(int fd, unsigned long pfn,
>> @@ -678,6 +668,7 @@ static int do_dump_one_shmem(int fd, void *addr, 
>> struct shmem_info *si)
>>       int err, ret = -1;
>>       unsigned long pfn, nrpages, next_data_pnf = 0, next_hole_pfn = 0;
>>       unsigned long pages[2] = {};
>> +    size_t off = 0;
>>         nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE;
>>   @@ -726,7 +717,7 @@ again:
>>           }
>>             if (ret == -EAGAIN) {
>> -            ret = dump_pages(pp, &xfer);
>> +            ret = dump_pages(pp, &xfer, &off);
>>               if (ret)
>>                   goto err_xfer;
>>               page_pipe_reinit(pp);
>> @@ -742,7 +733,7 @@ again:
>>       cnt_add(CNT_SHPAGES_SKIPPED_PARENT, pages[0]);
>>       cnt_add(CNT_SHPAGES_WRITTEN, pages[1]);
>>   -    ret = dump_pages(pp, &xfer);
>> +    ret = dump_pages(pp, &xfer, &off);
>>     err_xfer:
>>       xfer.close(&xfer);
-Abhishek
Abhishek Dubey July 8, 2019, 8:20 p.m.
Hi Andrei,

Please find inline replies.

On 08/07/19 9:35 AM, Andrei Vagin wrote:
> On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
>> This patch implements usage of process_vm_readv
>> syscall to collect memory pages from target process during
>> pre-dump. process_vm_readv collects pages in user-buffer,
>> which is later vmspliced to page-pipes.
> You have to be more detailed in the commit message.
>
> Frist of all, you have to explain why we need this patch.
I will take care of same from next time.
>
> If you improve performance, you need to give some numbers.
I am still handling the issue of a "missing entry in parent pagemap". The
next patch will include comparative figures.
>
> If you implement a new feature, you need to provide tests for it.
>
> If I understand this patch right, criu freezes all processes, collects
> page maps, unfreezes process, dump pages? If this is right,
Yes, this is right.
> you need to
> explain how it handles in this case:
>
> addr = map(NULL, 4 * PAGE_SIZE)	|
> addr[0] = 1			|
> addr[PAGE_SIZE]  = 2		|
> addr[2 * PAGE_SIZE] = 3		|
> addr[3 * PAGE_SIZE] = 4		|
> 				| criu freezes the process
> 				| criu collects page maps
> 				| criu unfreezes the process
> unmap(addr, PAGE_SIZE * 2)	|
> 				| criu dumps pages
Should I explain this example somewhere alongside the code (as in
page-pipe.h), or in some other place?
>
> here are a few inline comments
>> Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
>> ---
>>   criu/cr-dump.c           |  3 +-
>>   criu/include/page-xfer.h |  2 +-
>>   criu/mem.c               | 88 ++++++++++++++++++------------------------------
>>   criu/page-pipe.c         |  4 +--
>>   criu/page-xfer.c         | 62 ++++++++++++++++++++++++++++++++--
>>   criu/shmem.c             | 19 +++--------
>>   6 files changed, 102 insertions(+), 76 deletions(-)
>>
>> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
>> index 7f2e5ed..ee5f4f3 100644
>> --- a/criu/cr-dump.c
>> +++ b/criu/cr-dump.c
>> @@ -1501,6 +1501,7 @@ static int cr_pre_dump_finish(int status)
>>   		struct parasite_ctl *ctl = dmpi(item)->parasite_ctl;
>>   		struct page_pipe *mem_pp;
>>   		struct page_xfer xfer;
>> +		size_t off = 0;
>>   
>>   		if (!ctl)
>>   			continue;
>> @@ -1512,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>>   			goto err;
>>   
>>   		mem_pp = dmpi(item)->mem_pp;
>> -		ret = page_xfer_dump_pages(&xfer, mem_pp);
>> +		ret = page_xfer_dump_pages(item->pid->real, &xfer, mem_pp, &off);
>>   
>>   		xfer.close(&xfer);
>>   
>> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
>> index fa72273..74d42f2 100644
>> --- a/criu/include/page-xfer.h
>> +++ b/criu/include/page-xfer.h
>> @@ -47,7 +47,7 @@ struct page_xfer {
>>   
>>   extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
>>   struct page_pipe;
>> -extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
>> +extern int page_xfer_dump_pages(int pid, struct page_xfer *, struct page_pipe *, size_t *poff);
>>   extern int connect_to_page_server_to_send(void);
>>   extern int connect_to_page_server_to_recv(int epfd);
>>   extern int disconnect_from_page_server(void);
>> diff --git a/criu/mem.c b/criu/mem.c
>> index 6a1a87a..844d726 100644
>> --- a/criu/mem.c
>> +++ b/criu/mem.c
>> @@ -260,39 +260,7 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
>>   	return args;
>>   }
>>   
>> -static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl,
>> -		      struct parasite_dump_pages_args *args)
>> -{
>> -	struct page_pipe_buf *ppb;
>> -	int ret = 0;
>> -
>> -	debug_show_page_pipe(pp);
>> -
>> -	/* Step 2 -- grab pages into page-pipe */
>> -	list_for_each_entry(ppb, &pp->bufs, l) {
>> -		args->nr_segs = ppb->nr_segs;
>> -		args->nr_pages = ppb->pages_in;
>> -		pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
>> -				args->nr_pages, args->nr_segs, ppb->pipe_size, args->off);
>> -
>> -		ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl);
>> -		if (ret < 0)
>> -			return -1;
>> -		ret = compel_util_send_fd(ctl, ppb->p[1]);
>> -		if (ret)
>> -			return -1;
>> -
>> -		ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl);
>> -		if (ret < 0)
>> -			return -1;
>> -
>> -		args->off += args->nr_segs;
>> -	}
>> -
>> -	return 0;
>> -}
>> -
>> -static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
>> +static int xfer_pages( int pid, struct page_pipe *pp, struct page_xfer *xfer, size_t *poff)
>>   {
>>   	int ret;
>>   
>> @@ -301,7 +269,7 @@ static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
>>   	 *           pre-dump action (see pre_dump_one_task)
>>   	 */
>>   	timing_start(TIME_MEMWRITE);
>> -	ret = page_xfer_dump_pages(xfer, pp);
>> +	ret = page_xfer_dump_pages(pid, xfer, pp, poff);
>>   	timing_stop(TIME_MEMWRITE);
>>   
>>   	return ret;
>> @@ -351,7 +319,7 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>>   			     struct page_pipe *pp, struct page_xfer *xfer,
>>   			     struct parasite_dump_pages_args *args,
>>   			     struct parasite_ctl *ctl, pmc_t *pmc,
>> -			     bool has_parent, bool pre_dump)
>> +			     bool has_parent, bool pre_dump, size_t *poff)
>>   {
>>   	u64 off = 0;
>>   	u64 *map;
>> @@ -361,6 +329,12 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>>   				!vma_area_is(vma, VMA_ANON_SHARED))
>>   		return 0;
>>   
>> +	if (!(vma->e->prot & PROT_READ)){
>> +		if(pre_dump)
>> +			return 0;
>> +		has_parent = false;
>> +	}
>> +
>>   	if (vma_entry_is(vma->e, VMA_AREA_AIORING)) {
>>   		if (pre_dump)
>>   			return 0;
>> @@ -379,9 +353,7 @@ again:
>>   	if (ret == -EAGAIN) {
>>   		BUG_ON(!(pp->flags & PP_CHUNK_MODE));
>>   
>> -		ret = drain_pages(pp, ctl, args);
>> -		if (!ret)
>> -			ret = xfer_pages(pp, xfer);
>> +		ret = xfer_pages(item->pid->real, pp, xfer, poff);
>>   		if (!ret) {
>>   			page_pipe_reinit(pp);
>>   			goto again;
>> @@ -406,6 +378,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>>   	unsigned long pmc_size;
>>   	int possible_pid_reuse = 0;
>>   	bool has_parent;
>> +	size_t poff = 0;
>>   
>>   	pr_info("\n");
>>   	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real);
>> @@ -470,11 +443,12 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>>   	/*
>>   	 * Step 1 -- generate the pagemap
>>   	 */
>> +	poff = 0;
>>   	args->off = 0;
>>   	has_parent = !!xfer.parent && !possible_pid_reuse;
>>   	list_for_each_entry(vma_area, &vma_area_list->h, list) {
>>   		ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
>> -					&pmc, has_parent, mdc->pre_dump);
>> +					&pmc, has_parent, mdc->pre_dump, &poff);
>>   		if (ret < 0)
>>   			goto out_xfer;
>>   	}
>> @@ -482,9 +456,8 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>>   	if (mdc->lazy)
>>   		memcpy(pargs_iovs(args), pp->iovs,
>>   		       sizeof(struct iovec) * pp->nr_iovs);
>> -	ret = drain_pages(pp, ctl, args);
>> -	if (!ret && !mdc->pre_dump)
>> -		ret = xfer_pages(pp, &xfer);
>> +	if(!mdc->pre_dump)
>> +		ret = xfer_pages(item->pid->real, pp, &xfer, &poff);
>>   	if (ret)
>>   		goto out_xfer;
>>   
>> @@ -529,17 +502,18 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>>   	 *
>>   	 * Afterwards -- reprotect memory back.
>>   	 */
>> +	if(!mdc->pre_dump){
>> +		pargs->add_prot = PROT_READ;
>> +		ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
> This should be in a separate patch with explanation why we don't need
> mprotect in this case.
Sure! I will get it in separate patch with proper reasoning.
>
>> +		if (ret) {
>> +			pr_err("Can't dump unprotect vmas with parasite\n");
>> +			return ret;
>> +		}
>>   
>> -	pargs->add_prot = PROT_READ;
>> -	ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
>> -	if (ret) {
>> -		pr_err("Can't dump unprotect vmas with parasite\n");
>> -		return ret;
>> -	}
>> -
>> -	if (fault_injected(FI_DUMP_PAGES)) {
>> -		pr_err("fault: Dump VMA pages failure!\n");
>> -		return -1;
>> +		if (fault_injected(FI_DUMP_PAGES)) {
>> +			pr_err("fault: Dump VMA pages failure!\n");
>> +			return -1;
>> +		}
>>   	}
>>   
>>   	ret = __parasite_dump_pages_seized(item, pargs, vma_area_list, mdc, ctl);
>> @@ -549,10 +523,12 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>>   		return ret;
>>   	}
>>   
>> -	pargs->add_prot = 0;
>> -	if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
>> -		pr_err("Can't rollback unprotected vmas with parasite\n");
>> -		ret = -1;
>> +	if(!mdc->pre_dump){
>> +		pargs->add_prot = 0;
>> +		if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
>> +			pr_err("Can't rollback unprotected vmas with parasite\n");
>> +			ret = -1;
>> +		}
>>   	}
>>   
>>   	return ret;
>> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
>> index c32b893..c70ba70 100644
>> --- a/criu/page-pipe.c
>> +++ b/criu/page-pipe.c
>> @@ -33,7 +33,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>>   {
>>   	int ret;
>>   
>> -	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE);
>> +	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, (new_size * PAGE_SIZE) + 1);
>>   	if (ret < 0)
>>   		return -1;
>>   
>> @@ -41,7 +41,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>>   	BUG_ON(ret < ppb->pipe_size);
>>   
>>   	pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret);
>> -	ppb->pipe_size = ret;
>> +	ppb->pipe_size = ret - 1;
>>   
>>   	return 0;
>>   }
>> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
>> index 9cdffd8..a5396b1 100644
>> --- a/criu/page-xfer.c
>> +++ b/criu/page-xfer.c
>> @@ -496,19 +496,75 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
>>   		return PE_PRESENT;
>>   }
>>   
>> -int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>> +static char userbuf[4 << 20];
>> +
>> +int page_xfer_dump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp, size_t *poff)
>>   {
>>   	struct page_pipe_buf *ppb;
>>   	unsigned int cur_hole = 0;
>> -	int ret;
>> +	unsigned int ret, ret2;
>> +	size_t off;
>> +	struct iovec *remoteiovs = pp->iovs;
>>   
>>   	pr_debug("Transferring pages:\n");
>>   
>> +	off = *poff;
>> +
>>   	list_for_each_entry(ppb, &pp->bufs, l) {
>>   		unsigned int i;
>>   
>>   		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>>   
>> +		size_t bufsize = sizeof(userbuf);
>> +		struct iovec bufvec = {.iov_len = bufsize};
>> +		bufvec.iov_base = userbuf;
>> +
>> +		ret = syscall(__NR_process_vm_readv, pid, &bufvec, 1, \
>> +						&remoteiovs[off], ppb->nr_segs, 0);
>> +		if (ret == -1) {
>> +			switch (errno) {
> you don't need this switch. You can use pr_perror or pr_debug("....: %s", strerror(errno));
Ok :)
>
>> +				case EINVAL:
>> +					pr_debug("process_vm_readv: Invalid arguments\n");
>> +				break;
>> +				case EFAULT:
>> +					pr_debug("process_vm_readv: Unable to access remote iov\n");
>> +				break;
>> +				case ENOMEM:
>> +					pr_debug("process_vm_readv: Unable to allocate memory\n");
>> +				break;
>> +				case EPERM:
>> +					pr_debug("process_vm_readv: Insufficient privileges\n");
>> +				break;
>> +				case ESRCH:
>> +					pr_debug("process_vm_readv: Target process doesn't exist\n");
>> +				break;
>> +				default:
>> +					pr_debug("process_vm_readv: Uncategorised error\n");
>> +			}
>> +			return 0;
> Why do we return 0 in error cases? You need to add a comment here.
Will add brief description.
>
>> +		}
>> +
>> +		/* Handling partial reads due to modified mappings*/
>> +
>> +		if(ret != ppb->pages_in * PAGE_SIZE){
>> +			pr_debug("Can't read remote iovs (%d(%lu)/%d)\n", ret,\
>> +				(unsigned long int)PAGE_SIZE * ppb->pages_in, ppb->pages_in);
>> +			continue;
>> +		}
>> +
>> +		bufvec.iov_len = ret;
>> +		ret2 = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
>> +
>> +		if(ret2 == -1){
>> +			pr_debug("vmsplice: Failed to splice user buffer to pipe\n");
>> +			continue;
>> +		}
>> +
>> +		if(ret != ret2){
>> +			pr_debug("Partial splice from user buffer to pipe (%d)\n", ret2);
>> +			continue;
>> +		}
>> +
>>   		for (i = 0; i < ppb->nr_segs; i++) {
>>   			struct iovec iov = ppb->iov[i];
>>   			u32 flags;
>> @@ -530,8 +586,10 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>>   						ppb->p[0], iov.iov_len))
>>   				return -1;
>>   		}
>> +		off += ppb->nr_segs;
>>   	}
>>   
>> +	*poff = off;
>>   	return dump_holes(xfer, pp, &cur_hole, NULL);
>>   }
>>   
>> diff --git a/criu/shmem.c b/criu/shmem.c
>> index 03b088f..a797bde 100644
>> --- a/criu/shmem.c
>> +++ b/criu/shmem.c
>> @@ -629,19 +629,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map)
>>   	return 0;
>>   }
>>   
>> -static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer)
>> +static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer, size_t *off)
>>   {
>> -	struct page_pipe_buf *ppb;
>> -
>> -	list_for_each_entry(ppb, &pp->bufs, l)
>> -		if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs,
>> -					SPLICE_F_GIFT | SPLICE_F_NONBLOCK) !=
>> -				ppb->pages_in * PAGE_SIZE) {
>> -			pr_perror("Can't get shmem into page-pipe");
>> -			return -1;
>> -		}
>> -
>> -	return page_xfer_dump_pages(xfer, pp);
>> +	return page_xfer_dump_pages(getpid(), xfer, pp, off);
>>   }
>>   
>>   static int next_data_segment(int fd, unsigned long pfn,
>> @@ -678,6 +668,7 @@ static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si)
>>   	int err, ret = -1;
>>   	unsigned long pfn, nrpages, next_data_pnf = 0, next_hole_pfn = 0;
>>   	unsigned long pages[2] = {};
>> +	size_t off = 0;
>>   
>>   	nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE;
>>   
>> @@ -726,7 +717,7 @@ again:
>>   		}
>>   
>>   		if (ret == -EAGAIN) {
>> -			ret = dump_pages(pp, &xfer);
>> +			ret = dump_pages(pp, &xfer, &off);
>>   			if (ret)
>>   				goto err_xfer;
>>   			page_pipe_reinit(pp);
>> @@ -742,7 +733,7 @@ again:
>>   	cnt_add(CNT_SHPAGES_SKIPPED_PARENT, pages[0]);
>>   	cnt_add(CNT_SHPAGES_WRITTEN, pages[1]);
>>   
>> -	ret = dump_pages(pp, &xfer);
>> +	ret = dump_pages(pp, &xfer, &off);
>>   
>>   err_xfer:
>>   	xfer.close(&xfer);
>> -- 
>> 2.7.4
>>
-Abhishek
Mike Rapoport July 9, 2019, 8:04 a.m.
Hi Abhishek,

On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
> This patch implements usage of process_vm_readv
> syscall to collect memory pages from target process during
> pre-dump. process_vm_readv collects pages in user-buffer,
> which is later vmspliced to page-pipes.

From what I can tell, the patch completely replaces splicing of the memory
in the parasite code with usage of process_vm_readv() for both pre-dump and
dump.

I don't think that for dump that approach will be better than what we have
now, and surely it'll be more expensive.

> Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
> ---
>  criu/cr-dump.c           |  3 +-
>  criu/include/page-xfer.h |  2 +-
>  criu/mem.c               | 88 ++++++++++++++++++------------------------------
>  criu/page-pipe.c         |  4 +--
>  criu/page-xfer.c         | 62 ++++++++++++++++++++++++++++++++--
>  criu/shmem.c             | 19 +++--------
>  6 files changed, 102 insertions(+), 76 deletions(-)
> 
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 7f2e5ed..ee5f4f3 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1501,6 +1501,7 @@ static int cr_pre_dump_finish(int status)
>  		struct parasite_ctl *ctl = dmpi(item)->parasite_ctl;
>  		struct page_pipe *mem_pp;
>  		struct page_xfer xfer;
> +		size_t off = 0;
>  
>  		if (!ctl)
>  			continue;
> @@ -1512,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>  			goto err;
>  
>  		mem_pp = dmpi(item)->mem_pp;
> -		ret = page_xfer_dump_pages(&xfer, mem_pp);
> +		ret = page_xfer_dump_pages(item->pid->real, &xfer, mem_pp, &off);
>  
>  		xfer.close(&xfer);
>  
> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
> index fa72273..74d42f2 100644
> --- a/criu/include/page-xfer.h
> +++ b/criu/include/page-xfer.h
> @@ -47,7 +47,7 @@ struct page_xfer {
>  
>  extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
>  struct page_pipe;
> -extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
> +extern int page_xfer_dump_pages(int pid, struct page_xfer *, struct page_pipe *, size_t *poff);
>  extern int connect_to_page_server_to_send(void);
>  extern int connect_to_page_server_to_recv(int epfd);
>  extern int disconnect_from_page_server(void);
> diff --git a/criu/mem.c b/criu/mem.c
> index 6a1a87a..844d726 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -260,39 +260,7 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
>  	return args;
>  }
>  
> -static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl,
> -		      struct parasite_dump_pages_args *args)
> -{
> -	struct page_pipe_buf *ppb;
> -	int ret = 0;
> -
> -	debug_show_page_pipe(pp);
> -
> -	/* Step 2 -- grab pages into page-pipe */
> -	list_for_each_entry(ppb, &pp->bufs, l) {
> -		args->nr_segs = ppb->nr_segs;
> -		args->nr_pages = ppb->pages_in;
> -		pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
> -				args->nr_pages, args->nr_segs, ppb->pipe_size, args->off);
> -
> -		ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl);
> -		if (ret < 0)
> -			return -1;
> -		ret = compel_util_send_fd(ctl, ppb->p[1]);
> -		if (ret)
> -			return -1;
> -
> -		ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl);
> -		if (ret < 0)
> -			return -1;
> -
> -		args->off += args->nr_segs;
> -	}
> -
> -	return 0;
> -}

The drain_pages() should remain as is for the dump case, the target
processes are anyway frozen and we have the parasite code there. I see no
point in replacing vmsplice() in the parasite with process_vm_readv() +
vmsplice() in this case.

Actually, I think that to use process_vm_readv() for the pre-dump case it
would be enough to override drain_pages() so we'll have something like
drain_pages_dump() that does vmsplice() via parasite and
drain_pages_pre_dump() that performs process_vm_readv() to a buffer and
then splices that buffer to the pipes.

> -static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
> +static int xfer_pages( int pid, struct page_pipe *pp, struct page_xfer *xfer, size_t *poff)

Radostin already mentioned a couple of coding style issues, I'd suggest to
use checkpatch.pl from Linux sources to verify the next patches.

>  {
>  	int ret;
>  
> @@ -301,7 +269,7 @@ static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
>  	 *           pre-dump action (see pre_dump_one_task)
>  	 */
>  	timing_start(TIME_MEMWRITE);
> -	ret = page_xfer_dump_pages(xfer, pp);
> +	ret = page_xfer_dump_pages(pid, xfer, pp, poff);
>  	timing_stop(TIME_MEMWRITE);
>  
>  	return ret;
> @@ -351,7 +319,7 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>  			     struct page_pipe *pp, struct page_xfer *xfer,
>  			     struct parasite_dump_pages_args *args,
>  			     struct parasite_ctl *ctl, pmc_t *pmc,
> -			     bool has_parent, bool pre_dump)
> +			     bool has_parent, bool pre_dump, size_t *poff)
>  {
>  	u64 off = 0;
>  	u64 *map;
> @@ -361,6 +329,12 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>  				!vma_area_is(vma, VMA_ANON_SHARED))
>  		return 0;
>  
> +	if (!(vma->e->prot & PROT_READ)){
> +		if(pre_dump)
> +			return 0;
> +		has_parent = false;
> +	}
> +
>  	if (vma_entry_is(vma->e, VMA_AREA_AIORING)) {
>  		if (pre_dump)
>  			return 0;
> @@ -379,9 +353,7 @@ again:
>  	if (ret == -EAGAIN) {
>  		BUG_ON(!(pp->flags & PP_CHUNK_MODE));
>  
> -		ret = drain_pages(pp, ctl, args);
> -		if (!ret)
> -			ret = xfer_pages(pp, xfer);
> +		ret = xfer_pages(item->pid->real, pp, xfer, poff);
>  		if (!ret) {
>  			page_pipe_reinit(pp);
>  			goto again;
> @@ -406,6 +378,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	unsigned long pmc_size;
>  	int possible_pid_reuse = 0;
>  	bool has_parent;
> +	size_t poff = 0;
>  
>  	pr_info("\n");
>  	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real);
> @@ -470,11 +443,12 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	/*
>  	 * Step 1 -- generate the pagemap
>  	 */
> +	poff = 0;
>  	args->off = 0;
>  	has_parent = !!xfer.parent && !possible_pid_reuse;
>  	list_for_each_entry(vma_area, &vma_area_list->h, list) {
>  		ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
> -					&pmc, has_parent, mdc->pre_dump);
> +					&pmc, has_parent, mdc->pre_dump, &poff);
>  		if (ret < 0)
>  			goto out_xfer;
>  	}
> @@ -482,9 +456,8 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	if (mdc->lazy)
>  		memcpy(pargs_iovs(args), pp->iovs,
>  		       sizeof(struct iovec) * pp->nr_iovs);
> -	ret = drain_pages(pp, ctl, args);
> -	if (!ret && !mdc->pre_dump)
> -		ret = xfer_pages(pp, &xfer);
> +	if(!mdc->pre_dump)
> +		ret = xfer_pages(item->pid->real, pp, &xfer, &poff);
>  	if (ret)
>  		goto out_xfer;
>  
> @@ -529,17 +502,18 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>  	 *
>  	 * Afterwards -- reprotect memory back.
>  	 */
> +	if(!mdc->pre_dump){
> +		pargs->add_prot = PROT_READ;
> +		ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
> +		if (ret) {
> +			pr_err("Can't dump unprotect vmas with parasite\n");
> +			return ret;
> +		}
>  
> -	pargs->add_prot = PROT_READ;
> -	ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
> -	if (ret) {
> -		pr_err("Can't dump unprotect vmas with parasite\n");
> -		return ret;
> -	}
> -
> -	if (fault_injected(FI_DUMP_PAGES)) {
> -		pr_err("fault: Dump VMA pages failure!\n");
> -		return -1;
> +		if (fault_injected(FI_DUMP_PAGES)) {
> +			pr_err("fault: Dump VMA pages failure!\n");
> +			return -1;
> +		}
>  	}
>  
>  	ret = __parasite_dump_pages_seized(item, pargs, vma_area_list, mdc, ctl);
> @@ -549,10 +523,12 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>  		return ret;
>  	}
>  
> -	pargs->add_prot = 0;
> -	if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
> -		pr_err("Can't rollback unprotected vmas with parasite\n");
> -		ret = -1;
> +	if(!mdc->pre_dump){
> +		pargs->add_prot = 0;
> +		if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
> +			pr_err("Can't rollback unprotected vmas with parasite\n");
> +			ret = -1;
> +		}
>  	}
>  
>  	return ret;
> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
> index c32b893..c70ba70 100644
> --- a/criu/page-pipe.c
> +++ b/criu/page-pipe.c
> @@ -33,7 +33,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>  {
>  	int ret;
>  
> -	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE);
> +	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, (new_size * PAGE_SIZE) + 1);

Why the size should be changed here?
In any case, this change should be a separate patch.

>  	if (ret < 0)
>  		return -1;
>  
> @@ -41,7 +41,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>  	BUG_ON(ret < ppb->pipe_size);
>  
>  	pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret);
> -	ppb->pipe_size = ret;
> +	ppb->pipe_size = ret - 1;
>  
>  	return 0;
>  }
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index 9cdffd8..a5396b1 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -496,19 +496,75 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
>  		return PE_PRESENT;
>  }
>  
> -int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
> +static char userbuf[4 << 20];
> +
> +int page_xfer_dump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp, size_t *poff)
>  {
>  	struct page_pipe_buf *ppb;
>  	unsigned int cur_hole = 0;
> -	int ret;
> +	unsigned int ret, ret2;
> +	size_t off;
> +	struct iovec *remoteiovs = pp->iovs;
>  
>  	pr_debug("Transferring pages:\n");
>  
> +	off = *poff;
> +
>  	list_for_each_entry(ppb, &pp->bufs, l) {
>  		unsigned int i;
>  
>  		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>  
> +		size_t bufsize = sizeof(userbuf);
> +		struct iovec bufvec = {.iov_len = bufsize};
> +		bufvec.iov_base = userbuf;
> +
> +		ret = syscall(__NR_process_vm_readv, pid, &bufvec, 1, \
> +						&remoteiovs[off], ppb->nr_segs, 0);

I think glibc has a wrapper for __NR_process_vm_readv, why do you need to
use syscall(__NR_process_vm_readv, ...) here?

> +		if (ret == -1) {
> +			switch (errno) {
> +				case EINVAL:
> +					pr_debug("process_vm_readv: Invalid arguments\n");
> +				break;
> +				case EFAULT:
> +					pr_debug("process_vm_readv: Unable to access remote iov\n");
> +				break;
> +				case ENOMEM:
> +					pr_debug("process_vm_readv: Unable to allocate memory\n");
> +				break;
> +				case EPERM:
> +					pr_debug("process_vm_readv: Insufficient privileges\n");
> +				break;
> +				case ESRCH:
> +					pr_debug("process_vm_readv: Target process doesn't exist\n");
> +				break;
> +				default:
> +					pr_debug("process_vm_readv: Uncategorised error\n");
> +			}
> +			return 0;
> +		}
> +
> +		/* Handling partial reads due to modified mappings*/
> +
> +		if(ret != ppb->pages_in * PAGE_SIZE){
> +			pr_debug("Can't read remote iovs (%d(%lu)/%d)\n", ret,\
> +				(unsigned long int)PAGE_SIZE * ppb->pages_in, ppb->pages_in);
> +			continue;
> +		}
> +
> +		bufvec.iov_len = ret;
> +		ret2 = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
> +
> +		if(ret2 == -1){
> +			pr_debug("vmsplice: Failed to splice user buffer to pipe\n");
> +			continue;
> +		}
> +
> +		if(ret != ret2){
> +			pr_debug("Partial splice from user buffer to pipe (%d)\n", ret2);
> +			continue;
> +		}
> +

As I've said, I believe the whole block around process_vm_readv() +
vmsplice() should be in pre-dump version of drain_pages().

>  		for (i = 0; i < ppb->nr_segs; i++) {
>  			struct iovec iov = ppb->iov[i];
>  			u32 flags;
> @@ -530,8 +586,10 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>  						ppb->p[0], iov.iov_len))
>  				return -1;
>  		}
> +		off += ppb->nr_segs;
>  	}
>  
> +	*poff = off;
>  	return dump_holes(xfer, pp, &cur_hole, NULL);
>  }
>  
> diff --git a/criu/shmem.c b/criu/shmem.c
> index 03b088f..a797bde 100644
> --- a/criu/shmem.c
> +++ b/criu/shmem.c
> @@ -629,19 +629,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map)
>  	return 0;
>  }

I don't think there is much value in doing process_vm_splice() for shared
memory. We anyway attach to the same shared memory the target processes
use and simply fill the pipes with its contents.
  
> -static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer)
> +static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer, size_t *off)
>  {
> -	struct page_pipe_buf *ppb;
> -
> -	list_for_each_entry(ppb, &pp->bufs, l)
> -		if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs,
> -					SPLICE_F_GIFT | SPLICE_F_NONBLOCK) !=
> -				ppb->pages_in * PAGE_SIZE) {
> -			pr_perror("Can't get shmem into page-pipe");
> -			return -1;
> -		}
> -
> -	return page_xfer_dump_pages(xfer, pp);
> +	return page_xfer_dump_pages(getpid(), xfer, pp, off);
>  }
>  
>  static int next_data_segment(int fd, unsigned long pfn,
> @@ -678,6 +668,7 @@ static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si)
>  	int err, ret = -1;
>  	unsigned long pfn, nrpages, next_data_pnf = 0, next_hole_pfn = 0;
>  	unsigned long pages[2] = {};
> +	size_t off = 0;
>  
>  	nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE;
>  
> @@ -726,7 +717,7 @@ again:
>  		}
>  
>  		if (ret == -EAGAIN) {
> -			ret = dump_pages(pp, &xfer);
> +			ret = dump_pages(pp, &xfer, &off);
>  			if (ret)
>  				goto err_xfer;
>  			page_pipe_reinit(pp);
> @@ -742,7 +733,7 @@ again:
>  	cnt_add(CNT_SHPAGES_SKIPPED_PARENT, pages[0]);
>  	cnt_add(CNT_SHPAGES_WRITTEN, pages[1]);
>  
> -	ret = dump_pages(pp, &xfer);
> +	ret = dump_pages(pp, &xfer, &off);
>  
>  err_xfer:
>  	xfer.close(&xfer);
> -- 
> 2.7.4
> 
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
>
Andrey Vagin July 9, 2019, 4:08 p.m.
On Tue, Jul 09, 2019 at 01:50:57AM +0530, abhishek dubey wrote:
> Hi Andrei,
> 
> Please find inline replies.
> 
> On 08/07/19 9:35 AM, Andrei Vagin wrote:
> > On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
> > > This patch implements usage of process_vm_readv
> > > syscall to collect memory pages from target process during
> > > pre-dump. process_vm_readv collects pages in user-buffer,
> > > which is later vmspliced to page-pipes.
> > You have to be more detailed in the commit message.
> > 
> > Frist of all, you have to explain why we need this patch.
> I will take care of same from next time.
> > 
> > If you improve performance, you need to give some numbers.
> Still I am handling issue of "missing entry in parent pagemap". Next patch
> will have comparative figures included.
> > 
> > If you implement a new feature, you need to provide tests for it.
> > 
> > If I understand this patch right, criu freezes all processes, collects
> > page maps, unfreezes process, dump pages? If this is right,
> Yes, this is right.
> > you need to
> > explain how it handles in this case:
> > 
> > addr = map(NULL, 4 * PAGE_SIZE)	|
> > addr[0] = 1			|
> > addr[PAGE_SIZE]  = 2		|
> > addr[2 * PAGE_SIZE] = 3		|
> > addr[3 * PAGE_SIZE] = 4		|
> > 				| criu freezes the process
> > 				| criu collects page maps
> > 				| criu unfreezes the process
> > unmap(addr, PAGE_SIZE * 2)	|
> > 				| criu dumps pages
> Need to explain this example somewhere along with the code (as in page-pipe.h) or
> in some other place?

here :)

I think this can be a non-trivial case. process_vm_readv will return
ENOMEM for this segment, but a part of it still exists and we need to
find what part is here and dump it.
Abhishek Dubey July 11, 2019, 9:27 a.m.
Hi Mike,

On 09/07/19 1:34 PM, Mike Rapoport wrote:
> Hi Abishek,
>
> On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
>> This patch implements usage of process_vm_readv
>> syscall to collect memory pages from target process during
>> pre-dump. process_vm_readv collects pages in user-buffer,
>> which is later vmspliced to page-pipes.
>  From what I can tell, the patch completely replaces splicing of the memory
> in the parasite code with usage of process_vm_readv() for both pre-dump and
> dump.
>
> I don't think that for dump that approach will be better then what we have
> now and surely it'll be more expensive.

Yes, for now it's the same for dump and pre-dump. process_vm_readv can't dump
mappings without PROT_READ protection,

so for such cases we will fall back to the parasite method in the dump stage.

>> Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
>> ---
>>   criu/cr-dump.c           |  3 +-
>>   criu/include/page-xfer.h |  2 +-
>>   criu/mem.c               | 88 ++++++++++++++++++------------------------------
>>   criu/page-pipe.c         |  4 +--
>>   criu/page-xfer.c         | 62 ++++++++++++++++++++++++++++++++--
>>   criu/shmem.c             | 19 +++--------
>>   6 files changed, 102 insertions(+), 76 deletions(-)
>>
>> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
>> index 7f2e5ed..ee5f4f3 100644
>> --- a/criu/cr-dump.c
>> +++ b/criu/cr-dump.c
>> @@ -1501,6 +1501,7 @@ static int cr_pre_dump_finish(int status)
>>   		struct parasite_ctl *ctl = dmpi(item)->parasite_ctl;
>>   		struct page_pipe *mem_pp;
>>   		struct page_xfer xfer;
>> +		size_t off = 0;
>>   
>>   		if (!ctl)
>>   			continue;
>> @@ -1512,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>>   			goto err;
>>   
>>   		mem_pp = dmpi(item)->mem_pp;
>> -		ret = page_xfer_dump_pages(&xfer, mem_pp);
>> +		ret = page_xfer_dump_pages(item->pid->real, &xfer, mem_pp, &off);
>>   
>>   		xfer.close(&xfer);
>>   
>> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
>> index fa72273..74d42f2 100644
>> --- a/criu/include/page-xfer.h
>> +++ b/criu/include/page-xfer.h
>> @@ -47,7 +47,7 @@ struct page_xfer {
>>   
>>   extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
>>   struct page_pipe;
>> -extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
>> +extern int page_xfer_dump_pages(int pid, struct page_xfer *, struct page_pipe *, size_t *poff);
>>   extern int connect_to_page_server_to_send(void);
>>   extern int connect_to_page_server_to_recv(int epfd);
>>   extern int disconnect_from_page_server(void);
>> diff --git a/criu/mem.c b/criu/mem.c
>> index 6a1a87a..844d726 100644
>> --- a/criu/mem.c
>> +++ b/criu/mem.c
>> @@ -260,39 +260,7 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
>>   	return args;
>>   }
>>   
>> -static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl,
>> -		      struct parasite_dump_pages_args *args)
>> -{
>> -	struct page_pipe_buf *ppb;
>> -	int ret = 0;
>> -
>> -	debug_show_page_pipe(pp);
>> -
>> -	/* Step 2 -- grab pages into page-pipe */
>> -	list_for_each_entry(ppb, &pp->bufs, l) {
>> -		args->nr_segs = ppb->nr_segs;
>> -		args->nr_pages = ppb->pages_in;
>> -		pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
>> -				args->nr_pages, args->nr_segs, ppb->pipe_size, args->off);
>> -
>> -		ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl);
>> -		if (ret < 0)
>> -			return -1;
>> -		ret = compel_util_send_fd(ctl, ppb->p[1]);
>> -		if (ret)
>> -			return -1;
>> -
>> -		ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl);
>> -		if (ret < 0)
>> -			return -1;
>> -
>> -		args->off += args->nr_segs;
>> -	}
>> -
>> -	return 0;
>> -}
> The drain_pages() should remain as is for the dump case, the target
> processes are anyway frozen and we have the parasite code there. I see no
> point in replacing vmsplice() in the parasite with process_vm_readv() +
> vmsplice() in this case.
>
> Actually, I think that to use process_vm_readv() for the pre-dump case it
> would be enough to override drain_pages() so we'll have something like
> drain_pages_dump() that does vmsplice() via parasite and
> drain_pages_pre_dump() that performs process_vm_readv() to a buffer and
> then splices that buffer to the pipes.
Sure. I will handle pre-dump independent of dump as mentioned.
>> -static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
>> +static int xfer_pages( int pid, struct page_pipe *pp, struct page_xfer *xfer, size_t *poff)
> Radostin already mentioned a couple of coding style issues, I'd suggest to
> use checkpatch.pl from Linux sources to verify the next patches.
Sure :)
>
>>   {
>>   	int ret;
>>   
>> @@ -301,7 +269,7 @@ static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
>>   	 *           pre-dump action (see pre_dump_one_task)
>>   	 */
>>   	timing_start(TIME_MEMWRITE);
>> -	ret = page_xfer_dump_pages(xfer, pp);
>> +	ret = page_xfer_dump_pages(pid, xfer, pp, poff);
>>   	timing_stop(TIME_MEMWRITE);
>>   
>>   	return ret;
>> @@ -351,7 +319,7 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>>   			     struct page_pipe *pp, struct page_xfer *xfer,
>>   			     struct parasite_dump_pages_args *args,
>>   			     struct parasite_ctl *ctl, pmc_t *pmc,
>> -			     bool has_parent, bool pre_dump)
>> +			     bool has_parent, bool pre_dump, size_t *poff)
>>   {
>>   	u64 off = 0;
>>   	u64 *map;
>> @@ -361,6 +329,12 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>>   				!vma_area_is(vma, VMA_ANON_SHARED))
>>   		return 0;
>>   
>> +	if (!(vma->e->prot & PROT_READ)){
>> +		if(pre_dump)
>> +			return 0;
>> +		has_parent = false;
>> +	}
>> +
>>   	if (vma_entry_is(vma->e, VMA_AREA_AIORING)) {
>>   		if (pre_dump)
>>   			return 0;
>> @@ -379,9 +353,7 @@ again:
>>   	if (ret == -EAGAIN) {
>>   		BUG_ON(!(pp->flags & PP_CHUNK_MODE));
>>   
>> -		ret = drain_pages(pp, ctl, args);
>> -		if (!ret)
>> -			ret = xfer_pages(pp, xfer);
>> +		ret = xfer_pages(item->pid->real, pp, xfer, poff);
>>   		if (!ret) {
>>   			page_pipe_reinit(pp);
>>   			goto again;
>> @@ -406,6 +378,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>>   	unsigned long pmc_size;
>>   	int possible_pid_reuse = 0;
>>   	bool has_parent;
>> +	size_t poff = 0;
>>   
>>   	pr_info("\n");
>>   	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real);
>> @@ -470,11 +443,12 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>>   	/*
>>   	 * Step 1 -- generate the pagemap
>>   	 */
>> +	poff = 0;
>>   	args->off = 0;
>>   	has_parent = !!xfer.parent && !possible_pid_reuse;
>>   	list_for_each_entry(vma_area, &vma_area_list->h, list) {
>>   		ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
>> -					&pmc, has_parent, mdc->pre_dump);
>> +					&pmc, has_parent, mdc->pre_dump, &poff);
>>   		if (ret < 0)
>>   			goto out_xfer;
>>   	}
>> @@ -482,9 +456,8 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>>   	if (mdc->lazy)
>>   		memcpy(pargs_iovs(args), pp->iovs,
>>   		       sizeof(struct iovec) * pp->nr_iovs);
>> -	ret = drain_pages(pp, ctl, args);
>> -	if (!ret && !mdc->pre_dump)
>> -		ret = xfer_pages(pp, &xfer);
>> +	if(!mdc->pre_dump)
>> +		ret = xfer_pages(item->pid->real, pp, &xfer, &poff);
>>   	if (ret)
>>   		goto out_xfer;
>>   
>> @@ -529,17 +502,18 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>>   	 *
>>   	 * Afterwards -- reprotect memory back.
>>   	 */
>> +	if(!mdc->pre_dump){
>> +		pargs->add_prot = PROT_READ;
>> +		ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
>> +		if (ret) {
>> +			pr_err("Can't dump unprotect vmas with parasite\n");
>> +			return ret;
>> +		}
>>   
>> -	pargs->add_prot = PROT_READ;
>> -	ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);
>> -	if (ret) {
>> -		pr_err("Can't dump unprotect vmas with parasite\n");
>> -		return ret;
>> -	}
>> -
>> -	if (fault_injected(FI_DUMP_PAGES)) {
>> -		pr_err("fault: Dump VMA pages failure!\n");
>> -		return -1;
>> +		if (fault_injected(FI_DUMP_PAGES)) {
>> +			pr_err("fault: Dump VMA pages failure!\n");
>> +			return -1;
>> +		}
>>   	}
>>   
>>   	ret = __parasite_dump_pages_seized(item, pargs, vma_area_list, mdc, ctl);
>> @@ -549,10 +523,12 @@ int parasite_dump_pages_seized(struct pstree_item *item,
>>   		return ret;
>>   	}
>>   
>> -	pargs->add_prot = 0;
>> -	if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
>> -		pr_err("Can't rollback unprotected vmas with parasite\n");
>> -		ret = -1;
>> +	if(!mdc->pre_dump){
>> +		pargs->add_prot = 0;
>> +		if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
>> +			pr_err("Can't rollback unprotected vmas with parasite\n");
>> +			ret = -1;
>> +		}
>>   	}
>>   
>>   	return ret;
>> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
>> index c32b893..c70ba70 100644
>> --- a/criu/page-pipe.c
>> +++ b/criu/page-pipe.c
>> @@ -33,7 +33,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>>   {
>>   	int ret;
>>   
>> -	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE);
>> +	ret = fcntl(ppb->p[0], F_SETPIPE_SZ, (new_size * PAGE_SIZE) + 1);
> Why the size should be changed here?

This is a minor hack that I have discussed with Pavel. The situation that
required changing the pipe size is like this:

Consider process_vm_readv() reading 512 pages into the user buffer. All 512
pages are then vmspliced to the pipe-fd. But vmsplice is only able to
transfer 511 pages completely, plus a 0.86 fraction of the last page, every
time, for any test case. Cross-checking the pipe size reveals it is 512
pages. So, for the time being, I manipulated the actual pipe size and the
stored pipe size separately. It would be helpful if you could assist me
with this.

This is in my #TODO list to debug.

> In any case, this change should be a separate patch.
>
>>   	if (ret < 0)
>>   		return -1;
>>   
>> @@ -41,7 +41,7 @@ static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size)
>>   	BUG_ON(ret < ppb->pipe_size);
>>   
>>   	pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret);
>> -	ppb->pipe_size = ret;
>> +	ppb->pipe_size = ret - 1;
>>   
>>   	return 0;
>>   }
>> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
>> index 9cdffd8..a5396b1 100644
>> --- a/criu/page-xfer.c
>> +++ b/criu/page-xfer.c
>> @@ -496,19 +496,75 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
>>   		return PE_PRESENT;
>>   }
>>   
>> -int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>> +static char userbuf[4 << 20];
>> +
>> +int page_xfer_dump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp, size_t *poff)
>>   {
>>   	struct page_pipe_buf *ppb;
>>   	unsigned int cur_hole = 0;
>> -	int ret;
>> +	unsigned int ret, ret2;
>> +	size_t off;
>> +	struct iovec *remoteiovs = pp->iovs;
>>   
>>   	pr_debug("Transferring pages:\n");
>>   
>> +	off = *poff;
>> +
>>   	list_for_each_entry(ppb, &pp->bufs, l) {
>>   		unsigned int i;
>>   
>>   		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>>   
>> +		size_t bufsize = sizeof(userbuf);
>> +		struct iovec bufvec = {.iov_len = bufsize};
>> +		bufvec.iov_base = userbuf;
>> +
>> +		ret = syscall(__NR_process_vm_readv, pid, &bufvec, 1, \
>> +						&remoteiovs[off], ppb->nr_segs, 0);
> I think glibc has a wrapper for __NR_process_vm_readv, why do you need to
> use syscall(__NR_process_vm_readv, ...) here?
Sure. Will change it.
>> +		if (ret == -1) {
>> +			switch (errno) {
>> +				case EINVAL:
>> +					pr_debug("process_vm_readv: Invalid arguments\n");
>> +				break;
>> +				case EFAULT:
>> +					pr_debug("process_vm_readv: Unable to access remote iov\n");
>> +				break;
>> +				case ENOMEM:
>> +					pr_debug("process_vm_readv: Unable to allocate memory\n");
>> +				break;
>> +				case EPERM:
>> +					pr_debug("process_vm_readv: Insufficient privileges\n");
>> +				break;
>> +				case ESRCH:
>> +					pr_debug("process_vm_readv: Target process doesn't exist\n");
>> +				break;
>> +				default:
>> +					pr_debug("process_vm_readv: Uncategorised error\n");
>> +			}
>> +			return 0;
>> +		}
>> +
>> +		/* Handling partial reads due to modified mappings*/
>> +
>> +		if(ret != ppb->pages_in * PAGE_SIZE){
>> +			pr_debug("Can't read remote iovs (%d(%lu)/%d)\n", ret,\
>> +				(unsigned long int)PAGE_SIZE * ppb->pages_in, ppb->pages_in);
>> +			continue;
>> +		}
>> +
>> +		bufvec.iov_len = ret;
>> +		ret2 = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
>> +
>> +		if(ret2 == -1){
>> +			pr_debug("vmsplice: Failed to splice user buffer to pipe\n");
>> +			continue;
>> +		}
>> +
>> +		if(ret != ret2){
>> +			pr_debug("Partial splice from user buffer to pipe (%d)\n", ret2);
>> +			continue;
>> +		}
>> +
> As I've said, I believe the whole block around process_vm_readv() +
> vmsplice() should be in pre-dump version of drain_pages().
ACK.
>
>>   		for (i = 0; i < ppb->nr_segs; i++) {
>>   			struct iovec iov = ppb->iov[i];
>>   			u32 flags;
>> @@ -530,8 +586,10 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>>   						ppb->p[0], iov.iov_len))
>>   				return -1;
>>   		}
>> +		off += ppb->nr_segs;
>>   	}
>>   
>> +	*poff = off;
>>   	return dump_holes(xfer, pp, &cur_hole, NULL);
>>   }
>>   
>> diff --git a/criu/shmem.c b/criu/shmem.c
>> index 03b088f..a797bde 100644
>> --- a/criu/shmem.c
>> +++ b/criu/shmem.c
>> @@ -629,19 +629,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map)
>>   	return 0;
>>   }
> I don't think there is much value in doing process_vm_splice() for shared
> memory. We anyway attach to the same shared memory the target processes
> use and simply fill the pipes with it's contents.
>    

So pre-dump handling will vary for non-shared and shared memory.

As noted above, having separate drain_pages_predump() will take care of 
this.

>> -static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer)
>> +static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer, size_t *off)
>>   {
>> -	struct page_pipe_buf *ppb;
>> -
>> -	list_for_each_entry(ppb, &pp->bufs, l)
>> -		if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs,
>> -					SPLICE_F_GIFT | SPLICE_F_NONBLOCK) !=
>> -				ppb->pages_in * PAGE_SIZE) {
>> -			pr_perror("Can't get shmem into page-pipe");
>> -			return -1;
>> -		}
>> -
>> -	return page_xfer_dump_pages(xfer, pp);
>> +	return page_xfer_dump_pages(getpid(), xfer, pp, off);
>>   }
>>   
>>   static int next_data_segment(int fd, unsigned long pfn,
>> @@ -678,6 +668,7 @@ static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si)
>>   	int err, ret = -1;
>>   	unsigned long pfn, nrpages, next_data_pnf = 0, next_hole_pfn = 0;
>>   	unsigned long pages[2] = {};
>> +	size_t off = 0;
>>   
>>   	nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE;
>>   
>> @@ -726,7 +717,7 @@ again:
>>   		}
>>   
>>   		if (ret == -EAGAIN) {
>> -			ret = dump_pages(pp, &xfer);
>> +			ret = dump_pages(pp, &xfer, &off);
>>   			if (ret)
>>   				goto err_xfer;
>>   			page_pipe_reinit(pp);
>> @@ -742,7 +733,7 @@ again:
>>   	cnt_add(CNT_SHPAGES_SKIPPED_PARENT, pages[0]);
>>   	cnt_add(CNT_SHPAGES_WRITTEN, pages[1]);
>>   
>> -	ret = dump_pages(pp, &xfer);
>> +	ret = dump_pages(pp, &xfer, &off);
>>   
>>   err_xfer:
>>   	xfer.close(&xfer);
>> -- 
>> 2.7.4
>>
>> _______________________________________________
>> CRIU mailing list
>> CRIU@openvz.org
>> https://lists.openvz.org/mailman/listinfo/criu
>>
> -Abhishek
Abhishek Dubey July 11, 2019, 9:31 p.m.
Hi Andrei,

On 09/07/19 9:38 PM, Andrei Vagin wrote:
> On Tue, Jul 09, 2019 at 01:50:57AM +0530, abhishek dubey wrote:
>> Hi Andrei,
>>
>> Please find inline replies.
>>
>> On 08/07/19 9:35 AM, Andrei Vagin wrote:
>>> On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
>>>> This patch implements usage of process_vm_readv
>>>> syscall to collect memory pages from target process during
>>>> pre-dump. process_vm_readv collects pages in user-buffer,
>>>> which is later vmspliced to page-pipes.
>>> You have to be more detailed in the commit message.
>>>
>>> Frist of all, you have to explain why we need this patch.
>> I will take care of same from next time.
>>> If you improve performance, you need to give some numbers.
>> Still I am handling issue of "missing entry in parent pagemap". Next patch
>> will have comparative figures included.
>>> If you implement a new feature, you need to provide tests for it.
>>>
>>> If I understand this patch right, criu freezes all processes, collects
>>> page maps, unfreezes process, dump pages? If this is right,
>> Yes, this is right.
>>> you need to
>>> explain how it handles in this case:
>>>
>>> addr = map(NULL, 4 * PAGE_SIZE)	|
>>> addr[0] = 1			|
>>> addr[PAGE_SIZE]  = 2		|
>>> addr[2 * PAGE_SIZE] = 3		|
>>> addr[3 * PAGE_SIZE] = 4		|
>>> 				| criu freezes the process
>>> 				| criu collects page maps
>>> 				| criu unfreezes the process
>>> unmap(addr, PAGE_SIZE * 2)	|
>>> 				| criu dumps pages
>> Need to explain this example somewhere along with code(as in page-pipe.h) or
>> some other place?
> here :)
>
> I think this can be non-trivial case. process_vm_readv will return
> ENOMEM for this segment, but a part of it still exists and we need to
> find what part is here and dump it.

Pagemap cache won't be useful here.

For now, I can only think of setting a threshold. If the size of the mapping is 
below the threshold, we can process it page-by-page (which is costly).

Beyond the threshold, we can leave such a mapping to be handled by the dump stage.


Implementation query:

During "pre-dump", some of the iovs in the iovec fail to be processed by 
process_vm_readv(). Hence the pagemap entry is skipped for them.

Now, the memory belonging to these failed iovs is not dirtied. During the "dump" 
stage, when generating iovs, such non-dirty memory areas are marked as

holes, and the check for pages (in the parent) backing these holes is done in 
page-xfer. Ultimately, the holes point to pages that were not pre-dumped.

This check must be shifted to the "dump stage", so that pages (iovs) skipped 
by process_vm_readv can't be marked as holes in incremental steps.

Can I add a new data structure to record all the iovs that failed in 
process_vm_readv? This list would be checked while generating holes in the 
dump stage.

Is that a right approach?


Please suggest if something more efficient or direct solution is possible.


-Abhishek
Abhishek Dubey July 12, 2019, 4:40 p.m.
Hi Andrei,

On 12/07/19 3:01 AM, abhishek dubey wrote:
> Hi Andrei,
>
> On 09/07/19 9:38 PM, Andrei Vagin wrote:
>> On Tue, Jul 09, 2019 at 01:50:57AM +0530, abhishek dubey wrote:
>>> Hi Andrei,
>>>
>>> Please find inline replies.
>>>
>>> On 08/07/19 9:35 AM, Andrei Vagin wrote:
>>>> On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
>>>>> This patch implements usage of process_vm_readv
>>>>> syscall to collect memory pages from target process during
>>>>> pre-dump. process_vm_readv collects pages in user-buffer,
>>>>> which is later vmspliced to page-pipes.
>>>> You have to be more detailed in the commit message.
>>>>
>>>> Frist of all, you have to explain why we need this patch.
>>> I will take care of same from next time.
>>>> If you improve performance, you need to give some numbers.
>>> Still I am handling issue of "missing entry in parent pagemap". Next 
>>> patch
>>> will have comparative figures included.
>>>> If you implement a new feature, you need to provide tests for it.
>>>>
>>>> If I understand this patch right, criu freezes all processes, collects
>>>> page maps, unfreezes process, dump pages? If this is right,
>>> Yes, this is right.
>>>> you need to
>>>> explain how it handles in this case:
>>>>
>>>> addr = map(NULL, 4 * PAGE_SIZE)    |
>>>> addr[0] = 1            |
>>>> addr[PAGE_SIZE]  = 2        |
>>>> addr[2 * PAGE_SIZE] = 3        |
>>>> addr[3 * PAGE_SIZE] = 4        |
>>>>                 | criu freezes the process
>>>>                 | criu collects page maps
>>>>                 | criu unfreezes the process
>>>> unmap(addr, PAGE_SIZE * 2)    |
>>>>                 | criu dumps pages
>>> Need to explain this example somewhere along with code(as in 
>>> page-pipe.h) or
>>> some other place?
>> here :)
>>
>> I think this can be non-trivial case. process_vm_readv will return
>> ENOMEM for this segment, but a part of it still exists and we need to
>> find what part is here and dump it.
>
> Pagemap cache won't be useful here.
>
> For now, I can only think of setting a threshold. If size of mapping 
> is below threshold, we can process page-by-page(which is costly).
>
> Beyond threshold, we can leave such mapping to be handled by dump stage.
>
>
Just ignore the following query. I thought there might be other regions, like 
regions lacking PROT_READ, but my issue was due to an implementation error.

It got resolved.

> Implementation query:
>
> During "pre-dump", some of the iovs in iovec, fail to be processed by 
> process_vm_readv(). Hence pagemap entry is skipped for same.
>
> Now, memory belonging to these failed iovs is not dirtied. During 
> "dump" on generating iovs, such non-dirty memory areas are marked as
>
> holes and check for pages(in parent) backing these holes is done in 
> page-xfer. Ultimately, holes are pointing to pages that are not 
> pre-dumped.
>
> This check must be shifted to "dump stage", so that pages(iovs) 
> skipped by process_vm_readv can't be marked as holes in incremental 
> steps.
>
> Can I add new data structure to list down all the iovs failed by 
> process_vm_readv. This list will be checked while generating holes in 
> dump stage.
>
> is that a right approach?
>
>
> Please suggest if something more efficient or direct solution is 
> possible.
>
>
> -Abhishek
>
-Abhishek
Andrey Vagin July 17, 2019, 8:27 p.m.
On Fri, Jul 12, 2019 at 03:01:39AM +0530, abhishek dubey wrote:
> Hi Andrei,
> 
> On 09/07/19 9:38 PM, Andrei Vagin wrote:
> > On Tue, Jul 09, 2019 at 01:50:57AM +0530, abhishek dubey wrote:
> > > Hi Andrei,
> > > 
> > > Please find inline replies.
> > > 
> > > On 08/07/19 9:35 AM, Andrei Vagin wrote:
> > > > On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
> > > > > This patch implements usage of process_vm_readv
> > > > > syscall to collect memory pages from target process during
> > > > > pre-dump. process_vm_readv collects pages in user-buffer,
> > > > > which is later vmspliced to page-pipes.
> > > > You have to be more detailed in the commit message.
> > > > 
> > > > Frist of all, you have to explain why we need this patch.
> > > I will take care of same from next time.
> > > > If you improve performance, you need to give some numbers.
> > > Still I am handling issue of "missing entry in parent pagemap". Next patch
> > > will have comparative figures included.
> > > > If you implement a new feature, you need to provide tests for it.
> > > > 
> > > > If I understand this patch right, criu freezes all processes, collects
> > > > page maps, unfreezes process, dump pages? If this is right,
> > > Yes, this is right.
> > > > you need to
> > > > explain how it handles in this case:
> > > > 
> > > > addr = map(NULL, 4 * PAGE_SIZE)	|
> > > > addr[0] = 1			|
> > > > addr[PAGE_SIZE]  = 2		|
> > > > addr[2 * PAGE_SIZE] = 3		|
> > > > addr[3 * PAGE_SIZE] = 4		|
> > > > 				| criu freezes the process
> > > > 				| criu collects page maps
> > > > 				| criu unfreezes the process
> > > > unmap(addr, PAGE_SIZE * 2)	|
> > > > 				| criu dumps pages
> > > Need to explain this example somewhere along with code(as in page-pipe.h) or
> > > some other place?
> > here :)
> > 
> > I think this can be non-trivial case. process_vm_readv will return
> > ENOMEM for this segment, but a part of it still exists and we need to
> > find what part is here and dump it.
> 
> Pagemap cache won't be useful here.
> 
> For now, I can only think of setting a threshold. If size of mapping is
> below threshold, we can process page-by-page(which is costly).
> 
> Beyond threshold, we can leave such mapping to be handled by dump stage.

How do you detect that these pages have not been dumped on the dump
stage? Do we have a test for this case?

> 
> 
> Implementation query:
> 
> During "pre-dump", some of the iovs in iovec, fail to be processed by
> process_vm_readv(). Hence pagemap entry is skipped for same.
> 
> Now, memory belonging to these failed iovs is not dirtied. During "dump" on
> generating iovs, such non-dirty memory areas are marked as
> 
> holes and check for pages(in parent) backing these holes is done in
> page-xfer. Ultimately, holes are pointing to pages that are not pre-dumped.
> 
> This check must be shifted to "dump stage", so that pages(iovs) skipped by
> process_vm_readv can't be marked as holes in incremental steps.
> 
> Can I add new data structure to list down all the iovs failed by
> process_vm_readv. This list will be checked while generating holes in dump
> stage.
> 
> is that a right approach?
> 
> 
> Please suggest if something more efficient or direct solution is possible.
> 
> 
> -Abhishek
>
Abhishek Dubey July 19, 2019, 4:31 p.m.
Hi Andrei,

On 18/07/19 1:57 AM, Andrei Vagin wrote:
> On Fri, Jul 12, 2019 at 03:01:39AM +0530, abhishek dubey wrote:
>> Hi Andrei,
>>
>> On 09/07/19 9:38 PM, Andrei Vagin wrote:
>>> On Tue, Jul 09, 2019 at 01:50:57AM +0530, abhishek dubey wrote:
>>>> Hi Andrei,
>>>>
>>>> Please find inline replies.
>>>>
>>>> On 08/07/19 9:35 AM, Andrei Vagin wrote:
>>>>> On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
>>>>>> This patch implements usage of process_vm_readv
>>>>>> syscall to collect memory pages from target process during
>>>>>> pre-dump. process_vm_readv collects pages in user-buffer,
>>>>>> which is later vmspliced to page-pipes.
>>>>> You have to be more detailed in the commit message.
>>>>>
>>>>> Frist of all, you have to explain why we need this patch.
>>>> I will take care of same from next time.
>>>>> If you improve performance, you need to give some numbers.
>>>> Still I am handling issue of "missing entry in parent pagemap". Next patch
>>>> will have comparative figures included.
>>>>> If you implement a new feature, you need to provide tests for it.
>>>>>
>>>>> If I understand this patch right, criu freezes all processes, collects
>>>>> page maps, unfreezes process, dump pages? If this is right,
>>>> Yes, this is right.
>>>>> you need to
>>>>> explain how it handles in this case:
>>>>>
>>>>> addr = map(NULL, 4 * PAGE_SIZE)	|
>>>>> addr[0] = 1			|
>>>>> addr[PAGE_SIZE]  = 2		|
>>>>> addr[2 * PAGE_SIZE] = 3		|
>>>>> addr[3 * PAGE_SIZE] = 4		|
>>>>> 				| criu freezes the process
>>>>> 				| criu collects page maps
>>>>> 				| criu unfreezes the process
>>>>> unmap(addr, PAGE_SIZE * 2)	|
>>>>> 				| criu dumps pages
>>>> Need to explain this example somewhere along with code(as in page-pipe.h) or
>>>> some other place?
>>> here :)
>>>
>>> I think this can be non-trivial case. process_vm_readv will return
>>> ENOMEM for this segment, but a part of it still exists and we need to
>>> find what part is here and dump it.
>> Pagemap cache won't be useful here.
>>
>> For now, I can only think of setting a threshold. If size of mapping is
>> below threshold, we can process page-by-page(which is costly).
>>
>> Beyond threshold, we can leave such mapping to be handled by dump stage.
> How do you detect that these pages have not been dumped on the dump
> stage? Do we have a test for this case?
>
I will get back on this. I am not sure about any test case checking this.
>>
>> Implementation query:
>>
>> During "pre-dump", some of the iovs in iovec, fail to be processed by
>> process_vm_readv(). Hence pagemap entry is skipped for same.
>>
>> Now, memory belonging to these failed iovs is not dirtied. During "dump" on
>> generating iovs, such non-dirty memory areas are marked as
>>
>> holes and check for pages(in parent) backing these holes is done in
>> page-xfer. Ultimately, holes are pointing to pages that are not pre-dumped.
>>
>> This check must be shifted to "dump stage", so that pages(iovs) skipped by
>> process_vm_readv can't be marked as holes in incremental steps.
>>
>> Can I add new data structure to list down all the iovs failed by
>> process_vm_readv. This list will be checked while generating holes in dump
>> stage.
>>
>> is that a right approach?
>>
>>
>> Please suggest if something more efficient or direct solution is possible.
>>
>>
>> -Abhishek
>>
-Abhishek
Abhishek Dubey July 21, 2019, 12:24 a.m.
Hi Andrei,

On 18/07/19 1:57 AM, Andrei Vagin wrote:
> On Fri, Jul 12, 2019 at 03:01:39AM +0530, abhishek dubey wrote:
>> Hi Andrei,
>>
>> On 09/07/19 9:38 PM, Andrei Vagin wrote:
>>> On Tue, Jul 09, 2019 at 01:50:57AM +0530, abhishek dubey wrote:
>>>> Hi Andrei,
>>>>
>>>> Please find inline replies.
>>>>
>>>> On 08/07/19 9:35 AM, Andrei Vagin wrote:
>>>>> On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
>>>>>> This patch implements usage of process_vm_readv
>>>>>> syscall to collect memory pages from target process during
>>>>>> pre-dump. process_vm_readv collects pages in user-buffer,
>>>>>> which is later vmspliced to page-pipes.
>>>>> You have to be more detailed in the commit message.
>>>>>
>>>>> Frist of all, you have to explain why we need this patch.
>>>> I will take care of same from next time.
>>>>> If you improve performance, you need to give some numbers.
>>>> Still I am handling issue of "missing entry in parent pagemap". Next patch
>>>> will have comparative figures included.
>>>>> If you implement a new feature, you need to provide tests for it.
>>>>>
>>>>> If I understand this patch right, criu freezes all processes, collects
>>>>> page maps, unfreezes process, dump pages? If this is right,
>>>> Yes, this is right.
>>>>> you need to
>>>>> explain how it handles in this case:
>>>>>
>>>>> addr = map(NULL, 4 * PAGE_SIZE)	|
>>>>> addr[0] = 1			|
>>>>> addr[PAGE_SIZE]  = 2		|
>>>>> addr[2 * PAGE_SIZE] = 3		|
>>>>> addr[3 * PAGE_SIZE] = 4		|
>>>>> 				| criu freezes the process
>>>>> 				| criu collects page maps
>>>>> 				| criu unfreezes the process
>>>>> unmap(addr, PAGE_SIZE * 2)	|
>>>>> 				| criu dumps pages
>>>> Need to explain this example somewhere along with code(as in page-pipe.h) or
>>>> some other place?
>>> here :)

testcase/maps007 does something like this. But instead of unmapping the 
initial 2 pages (as in the example above), it always unmaps

some intermediate page, like page 3 in this example. In the current 
implementation, I have handled it by pre-dumping just the first 2 pages; the 3rd 
is unmapped, so we skip it,

and the 4th page is not pre-dumped. I also adjust the length of the iovec as per 
the pre-dumped page count. This approach is not correct and passes the test case

only very few times; maps007 fails with a page-count mismatch error.

>>> I think this can be non-trivial case. process_vm_readv will return
>>> ENOMEM for this segment, but a part of it still exists and we need to
>>> find what part is here and dump it.

Consider the following iovec:

[A: 1 page] [B: 4 page] [C: 1 page] [D: 2 page]

iovec_cnt: 4, PIPE_size =16

Before the pre-dump could start, the process unmapped the complete 'B' region.

process_vm_readv() will fail at 'B' and will only process region 'A' by 
default.

The current implementation supports processing regions 'C' and 
'D' and generating the page-map accordingly.

So here the bad iov (B) is skipped and the remaining iovs (C, D) are processed.

----------------------------------------------------------------------------------------------------------

Considering the same example above, the status of the 4 pages of region B is:

{B1: unmapped} {B2: unmapped} {B3: mapped} {B4: mapped}

The regions B3 and B4, although mapped, were skipped by the pre-dump.


For such a case, can I implement something like the following:

Make a "temporary iovec" for region B, like:

[B1: 1 page] [B2: 1 page] [B3: 1 page] [B4: 1 page]


With the existing implementation (explained above the dotted line), 
process_vm_readv can skip iovs B1 and B2

and move ahead processing B3 onward — handling the culprit iov at finer 
granularity.


During pagemap generation, instead of the single iov for B, the temporary 
iovec will be supplied.

Can I go ahead with this approach?

>> Pagemap cache won't be useful here.
>>
>> For now, I can only think of setting a threshold. If size of mapping is
>> below threshold, we can process page-by-page(which is costly).
>>
>> Beyond threshold, we can leave such mapping to be handled by dump stage.
Incorrect approach. Can't leave pages.
> How do you detect that these pages have not been dumped on the dump
> stage? Do we have a test for this case?
>
>>
>> Implementation query:
>>
>> During "pre-dump", some of the iovs in iovec, fail to be processed by
>> process_vm_readv(). Hence pagemap entry is skipped for same.
>>
>> Now, memory belonging to these failed iovs is not dirtied. During "dump" on
>> generating iovs, such non-dirty memory areas are marked as
>>
>> holes and check for pages(in parent) backing these holes is done in
>> page-xfer. Ultimately, holes are pointing to pages that are not pre-dumped.
>>
>> This check must be shifted to "dump stage", so that pages(iovs) skipped by
>> process_vm_readv can't be marked as holes in incremental steps.
>>
>> Can I add new data structure to list down all the iovs failed by
>> process_vm_readv. This list will be checked while generating holes in dump
>> stage.
>>
>> is that a right approach?
>>
>>
>> Please suggest if something more efficient or direct solution is possible.
>>
>>
>> -Abhishek
>>
-Abhishek
Andrey Vagin July 24, 2019, 7:25 a.m.
On Sun, Jul 21, 2019 at 05:54:34AM +0530, abhishek dubey wrote:
> Hi Andrei,
> 
> On 18/07/19 1:57 AM, Andrei Vagin wrote:
> > On Fri, Jul 12, 2019 at 03:01:39AM +0530, abhishek dubey wrote:
> > > Hi Andrei,
> > > 
> > > On 09/07/19 9:38 PM, Andrei Vagin wrote:
> > > > On Tue, Jul 09, 2019 at 01:50:57AM +0530, abhishek dubey wrote:
> > > > > Hi Andrei,
> > > > > 
> > > > > Please find inline replies.
> > > > > 
> > > > > On 08/07/19 9:35 AM, Andrei Vagin wrote:
> > > > > > On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
> > > > > > > This patch implements usage of process_vm_readv
> > > > > > > syscall to collect memory pages from target process during
> > > > > > > pre-dump. process_vm_readv collects pages in user-buffer,
> > > > > > > which is later vmspliced to page-pipes.
> > > > > > You have to be more detailed in the commit message.
> > > > > > 
> > > > > > Frist of all, you have to explain why we need this patch.
> > > > > I will take care of same from next time.
> > > > > > If you improve performance, you need to give some numbers.
> > > > > Still I am handling issue of "missing entry in parent pagemap". Next patch
> > > > > will have comparative figures included.
> > > > > > If you implement a new feature, you need to provide tests for it.
> > > > > > 
> > > > > > If I understand this patch right, criu freezes all processes, collects
> > > > > > page maps, unfreezes process, dump pages? If this is right,
> > > > > Yes, this is right.
> > > > > > you need to
> > > > > > explain how it handles in this case:
> > > > > > 
> > > > > > addr = map(NULL, 4 * PAGE_SIZE)	|
> > > > > > addr[0] = 1			|
> > > > > > addr[PAGE_SIZE]  = 2		|
> > > > > > addr[2 * PAGE_SIZE] = 3		|
> > > > > > addr[3 * PAGE_SIZE] = 4		|
> > > > > > 				| criu freezes the process
> > > > > > 				| criu collects page maps
> > > > > > 				| criu unfreezes the process
> > > > > > unmap(addr, PAGE_SIZE * 2)	|
> > > > > > 				| criu dumps pages
> > > > > Need to explain this example somewhere along with code(as in page-pipe.h) or
> > > > > some other place?
> > > > here :)
> 
> testcase/maps007 does something like this. But instead of unmapping initial
> 2 pages(as in example above), it always unmaps
> 
> some intermediate page, like page 3 in this example. In current
> implementation, I have handled it by pre-dumping just first 2 pages, 3rd is
> unmapped so skip it
> 
> and 4th page is not pre-dumped. I also adjust the length of iovec as per
> pre-dumped page count. This approach is not correct and passes the test case
> 
> very few times. maps007 fails with page-count mis-match error.
> 
> > > > I think this can be non-trivial case. process_vm_readv will return
> > > > ENOMEM for this segment, but a part of it still exists and we need to
> > > > find what part is here and dump it.
> 
> Consider the following iovec:
> 
> [A: 1 page] [B: 4 page] [C: 1 page] [D: 2 page]
> 
> iovec_cnt: 4, PIPE_size =16
> 
> Before pre-dump could start, process has unmapped complete 'B' region.
> 
> process_vm_readv() will fail at 'B' and will only process region 'A' by
> default.
> 
> Current implementation done till now supports processing region 'C' and 'D'
> and generate page-map accordingly.
> 
> So here bad iovec (B) is skipped and remaining iovec(C,D) are processed.

If we want to skip iovec, we need to remove these pages from a pagemap,
don't we?

> 
> ----------------------------------------------------------------------------------------------------------
> 
> Considering the same above example, status of 4 pages of region B :
> 
> {B1: unmapped} {B2: unmapped} {B3: mapped} {B4: mapped}
> 
> The regions B3,B4 although mapped were skipped by pre-dumping.
> 
> 
> For such case, can I implement something like:
> 
> Make "temporary iovec" for region B, like:
> 
> [B1: 1 page] [B2: 1 page] [B3: 1 page] [B4: 1 page]

How and when are you going to generate this "temporary iovec"?

> 
> 
> With existing implementation(explained above dotted line), process_vm_readv
> can skip iovec B1 and B2
> 
> and move ahead processing B3 onward. Handling culprit iovec at finer
> granularity.
> 
> 
> During pagemap generation, instead of iovec of B, temporary iovec will be
> supplied.
> 
> Can I go ahead with this approach?

I don't know. I need more details about it...

> 
> > > Pagemap cache won't be useful here.
> > > 
> > > For now, I can only think of setting a threshold. If size of mapping is
> > > below threshold, we can process page-by-page(which is costly).
> > > 
> > > Beyond threshold, we can leave such mapping to be handled by dump stage.
> Incorrect approach. Can't leave pages.
> > How do you detect that these pages have not been dumped on the dump
> > stage? Do we have a test for this case?
> > 
> > > 
> > > Implementation query:
> > > 
> > > During "pre-dump", some of the iovs in iovec, fail to be processed by
> > > process_vm_readv(). Hence pagemap entry is skipped for same.
> > > 
> > > Now, memory belonging to these failed iovs is not dirtied. During "dump" on
> > > generating iovs, such non-dirty memory areas are marked as
> > > 
> > > holes and check for pages(in parent) backing these holes is done in
> > > page-xfer. Ultimately, holes are pointing to pages that are not pre-dumped.
> > > 
> > > This check must be shifted to "dump stage", so that pages(iovs) skipped by
> > > process_vm_readv can't be marked as holes in incremental steps.
> > > 
> > > Can I add new data structure to list down all the iovs failed by
> > > process_vm_readv. This list will be checked while generating holes in dump
> > > stage.
> > > 
> > > is that a right approach?
> > > 
> > > 
> > > Please suggest if something more efficient or direct solution is possible.
> > > 
> > > 
> > > -Abhishek
> > > 
> -Abhishek
Abhishek Dubey July 24, 2019, 8:13 p.m.
Hi Andrei,

Please find inline reply.

On 24/07/19 12:55 PM, Andrei Vagin wrote:
> On Sun, Jul 21, 2019 at 05:54:34AM +0530, abhishek dubey wrote:
>> Hi Andrei,
>>
>> On 18/07/19 1:57 AM, Andrei Vagin wrote:
>>> On Fri, Jul 12, 2019 at 03:01:39AM +0530, abhishek dubey wrote:
>>>> Hi Andrei,
>>>>
>>>> On 09/07/19 9:38 PM, Andrei Vagin wrote:
>>>>> On Tue, Jul 09, 2019 at 01:50:57AM +0530, abhishek dubey wrote:
>>>>>> Hi Andrei,
>>>>>>
>>>>>> Please find inline replies.
>>>>>>
>>>>>> On 08/07/19 9:35 AM, Andrei Vagin wrote:
>>>>>>> On Fri, Jul 05, 2019 at 05:34:26PM +0530, Abhishek Dubey wrote:
>>>>>>>> This patch implements usage of process_vm_readv
>>>>>>>> syscall to collect memory pages from target process during
>>>>>>>> pre-dump. process_vm_readv collects pages in user-buffer,
>>>>>>>> which is later vmspliced to page-pipes.
>>>>>>> You have to be more detailed in the commit message.
>>>>>>>
>>>>>>> Frist of all, you have to explain why we need this patch.
>>>>>> I will take care of same from next time.
>>>>>>> If you improve performance, you need to give some numbers.
>>>>>> Still I am handling issue of "missing entry in parent pagemap". Next patch
>>>>>> will have comparative figures included.
>>>>>>> If you implement a new feature, you need to provide tests for it.
>>>>>>>
>>>>>>> If I understand this patch right, criu freezes all processes, collects
>>>>>>> page maps, unfreezes process, dump pages? If this is right,
>>>>>> Yes, this is right.
>>>>>>> you need to
>>>>>>> explain how it handles in this case:
>>>>>>>
>>>>>>> addr = map(NULL, 4 * PAGE_SIZE)	|
>>>>>>> addr[0] = 1			|
>>>>>>> addr[PAGE_SIZE]  = 2		|
>>>>>>> addr[2 * PAGE_SIZE] = 3		|
>>>>>>> addr[3 * PAGE_SIZE] = 4		|
>>>>>>> 				| criu freezes the process
>>>>>>> 				| criu collects page maps
>>>>>>> 				| criu unfreezes the process
>>>>>>> unmap(addr, PAGE_SIZE * 2)	|
>>>>>>> 				| criu dumps pages
>>>>>> Need to explain this example somewhere along with code(as in page-pipe.h) or
>>>>>> some other place?
>>>>> here :)
>> testcase/maps007 does something like this. But instead of unmapping initial
>> 2 pages(as in example above), it always unmaps
>>
>> some intermediate page, like page 3 in this example. In current
>> implementation, I have handled it by pre-dumping just first 2 pages, 3rd is
>> unmapped so skip it
>>
>> and 4th page is not pre-dumped. I also adjust the length of iovec as per
>> pre-dumped page count. This approach is not correct and passes the test case
>>
>> very few times. maps007 fails with page-count mis-match error.
>>
>>>>> I think this can be non-trivial case. process_vm_readv will return
>>>>> ENOMEM for this segment, but a part of it still exists and we need to
>>>>> find what part is here and dump it.
>> Consider the following iovec:
>>
>> [A: 1 page] [B: 4 page] [C: 1 page] [D: 2 page]
>>
>> iovec_cnt: 4, PIPE_size =16
>>
>> Before pre-dump could start, process has unmapped complete 'B' region.
>>
>> process_vm_readv() will fail at 'B' and will only process region 'A' by
>> default.
>>
>> Current implementation done till now supports processing region 'C' and 'D'
>> and generate page-map accordingly.
>>
>> So here bad iovec (B) is skipped and remaining iovec(C,D) are processed.
> If we want to skip iovec, we need to remove these pages from a pagemap,
> don't we?

Yes, the pagemap entry corresponding to the skipped iov is bypassed.

The special handling of memory regions is reflected in the pagemap as well.

>> ----------------------------------------------------------------------------------------------------------
>>
>> Considering the same above example, status of 4 pages of region B :
>>
>> {B1: unmapped} {B2: unmapped} {B3: mapped} {B4: mapped}
>>
>> The regions B3,B4 although mapped were skipped by pre-dumping.
>>
>>
>> For such case, can I implement something like:
>>
>> Make "temporary iovec" for region B, like:
>>
>> [B1: 1 page] [B2: 1 page] [B3: 1 page] [B4: 1 page]
> How and when are you going to generate this "temporary iovec"?

When:

Whenever process_vm_readv returns fewer bytes than 
expected. At this point,

find the failing iov using the byte count returned by the syscall.

Two cases arise here:

1) The failing iov is of 1 PAGE_SIZE: it means the page has been unmapped 
in the meantime. Skip the pagemap entry and move to the next iov in the supplied iovec.

2) The failing iov is of "n" PAGE_SIZE: it means the unmapped region can be 
anywhere from the [0..n-1] PAGE within the iov range.

The temporary iovec will be generated only in the second case.


How:

Take the failing iov of length 'n' and form a temporary iovec of n entries, 
each sized 1 PAGE_SIZE. Since each entry in the iovec is of 1 PAGE_SIZE,

either it will be processed completely or discarded completely (no 
partial iov processing).

Mark down those iovs in the temporary iovec which fail, and skip pagemap 
generation for them.

Once the temporary iovec processing is complete, resume processing the 
remaining iovs of the supplied iovec.

>
>>
>> With existing implementation(explained above dotted line), process_vm_readv
>> can skip iovec B1 and B2
>>
>> and move ahead processing B3 onward. Handling culprit iovec at finer
>> granularity.
>>
>>
>> During pagemap generation, instead of iovec of B, temporary iovec will be
>> supplied.
>>
>> Can I go ahead with this approach?
> I don't know. I need more details about it...

Please let me know if I failed to deliver a clear idea (through the explanation 
above).

I will come up with a pictorial representation.


-Abhishek