[3/6] Drain memory using process_vm_readv syscall

Submitted by abhishek dubey on July 25, 2019, 1:13 a.m.

Details

Message ID 1564017225-8045-1-git-send-email-dubeyabhishek777@gmail.com
State New
Series "Series without cover letter"

Commit Message

abhishek dubey July 25, 2019, 1:13 a.m.
Move page draining out to cr_pre_dump_finish() for the
pre-dump stage. While the task is frozen, only the iovecs
are generated; the pages themselves are drained after the
task has been unfrozen. Shared memory pre-dumping is not
modified.

Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
---
 criu/cr-dump.c           |   2 +-
 criu/include/page-xfer.h |   4 ++
 criu/mem.c               |  13 +++-
 criu/page-xfer.c         | 176 ++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 192 insertions(+), 3 deletions(-)


diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index e070b8b..4569372 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -1513,7 +1513,7 @@  static int cr_pre_dump_finish(int status)
 			goto err;
 
 		mem_pp = dmpi(item)->mem_pp;
-		ret = page_xfer_dump_pages(&xfer, mem_pp);
+		ret = page_xfer_predump_pages(item->pid->real, &xfer, mem_pp);
 
 		xfer.close(&xfer);
 
diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
index fa72273..c3ad877 100644
--- a/criu/include/page-xfer.h
+++ b/criu/include/page-xfer.h
@@ -9,6 +9,9 @@  struct ps_info {
 
 extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd);
 
+/* to skip pagemap generation for skipped iovs */
+#define SKIP_PAGEMAP (void*)0xdeadbeef
+
 /*
  * page_xfer -- transfer pages into image file.
  * Two images backends are implemented -- local image file
@@ -48,6 +51,7 @@  struct page_xfer {
 extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
 struct page_pipe;
 extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
+extern int page_xfer_predump_pages(int pid, struct page_xfer *, struct page_pipe *);
 extern int connect_to_page_server_to_send(void);
 extern int connect_to_page_server_to_recv(int epfd);
 extern int disconnect_from_page_server(void);
diff --git a/criu/mem.c b/criu/mem.c
index 5c13690..00c7951 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -488,7 +488,18 @@  static int __parasite_dump_pages_seized(struct pstree_item *item,
 	if (mdc->lazy)
 		memcpy(pargs_iovs(args), pp->iovs,
 		       sizeof(struct iovec) * pp->nr_iovs);
-	ret = drain_pages(pp, ctl, args);
+
+	/*
+	 * Faking drain_pages for pre-dump here. Actual drain_pages for pre-dump
+	 * will happen after task unfreezing in cr_pre_dump_finish(). This is
+	 * actual optimization which reduces time for which process was frozen
+	 * during pre-dump.
+	 */
+	if (mdc->pre_dump)
+		ret = 0;
+	else
+		ret = drain_pages(pp, ctl, args);
+
 	if (!ret && !mdc->pre_dump)
 		ret = xfer_pages(pp, &xfer);
 	if (ret)
diff --git a/criu/page-xfer.c b/criu/page-xfer.c
index fe457d2..40ec29e 100644
--- a/criu/page-xfer.c
+++ b/criu/page-xfer.c
@@ -499,16 +499,190 @@  static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
 		return PE_PRESENT;
 }
 
+static char userbuf[4 << 20];
+
+ssize_t copy_to_userbuf(int pid, struct iovec* liov, struct iovec* riov, unsigned long riov_cnt)
+{
+	ssize_t ret;
+
+	ret = process_vm_readv(pid, liov, 1, riov, riov_cnt, 0);
+
+	/* Target process doesn't exist */
+	if (ret == -1 && errno == ESRCH) {
+		pr_err("Target process with PID %d not found\n", pid);
+		return -2;
+	}
+
+	return ret;
+}
+
+/*
+ * This function returns the index of that iov in an iovec, which failed to get
+ * processed by process_vm_readv or may be partially processed
+ */
+unsigned int faulty_iov_index(ssize_t bytes_read, struct iovec* riov, size_t index,
+		unsigned int riovcnt, unsigned long *iov_cntr, unsigned long *page_diff)
+{
+	ssize_t processed_bytes = 0;
+	*iov_cntr = 0;
+	*page_diff = 0;
+
+	while (processed_bytes < bytes_read && index < riovcnt) {
+
+		processed_bytes += riov[index].iov_len;
+		(*iov_cntr) += 1;
+		index += 1;
+	}
+
+	/* one iov is partially read */
+	if (bytes_read < processed_bytes) {
+		*page_diff = processed_bytes - bytes_read;
+		index -= 1;
+	}
+
+	return index;
+}
+
+long processing_ppb_userbuf(int pid, struct page_pipe_buf *ppb, struct iovec *bufvec)
+{
+	struct iovec *riov = ppb->iov;
+	ssize_t bytes_read = 0;
+	unsigned long pro_iovs = 0, start = 0, iov_cntr = 0, end;
+	unsigned long pages_read = 0, pages_skipped = 0;
+	unsigned long page_diff = 0;
+
+	bufvec->iov_len = sizeof(userbuf);
+	bufvec->iov_base = userbuf;
+
+	while (pro_iovs < ppb->nr_segs)
+	{
+		bytes_read = copy_to_userbuf(pid, bufvec, &riov[start],
+							ppb->nr_segs - pro_iovs);
+
+		if (bytes_read == -2)
+			return -2;
+
+		if (bytes_read == -1)
+		{
+			/*
+			 *  In other errors, adjust page count and mark the page
+			 *  to be skipped by pagemap generation
+			 */
+
+			cnt_sub(CNT_PAGES_WRITTEN, riov[start].iov_len/PAGE_SIZE);
+			pages_skipped += riov[start].iov_len/PAGE_SIZE;
+			riov[start].iov_base = SKIP_PAGEMAP;
+
+			pro_iovs += 1;
+			start += 1;
+			continue;
+		}
+
+		pages_read += bytes_read/PAGE_SIZE;
+
+		if (pages_read + pages_skipped == ppb->pages_in)
+			break;
+
+		end = faulty_iov_index(bytes_read, riov,
+					start, ppb->nr_segs, &iov_cntr, &page_diff);
+
+		/*
+		 * One single iov could be partially read, unless unmapped page in
+		 * iov range is not hit by process_vm_readv, need to handle this
+		 * special case
+		 */
+
+		if (!page_diff)
+		{
+			cnt_sub(CNT_PAGES_WRITTEN, riov[end].iov_len/PAGE_SIZE);
+			pages_skipped += riov[end].iov_len/PAGE_SIZE;
+			riov[end].iov_base = SKIP_PAGEMAP;
+			start = end + 1;
+			pro_iovs += iov_cntr + 1;
+		}
+		else
+		{
+			riov[end].iov_len -= page_diff;
+			cnt_sub(CNT_PAGES_WRITTEN, page_diff/PAGE_SIZE);
+			pages_skipped += page_diff/PAGE_SIZE;
+			start = end + 1;
+			pro_iovs += iov_cntr;
+		}
+
+		bufvec->iov_base = userbuf + pages_read * PAGE_SIZE;
+		bufvec->iov_len -= bytes_read;
+	}
+
+	return pages_read;
+}
+
+
+int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp)
+{
+	struct page_pipe_buf *ppb;
+	unsigned int cur_hole = 0, i;
+	unsigned long ret, pages_read;
+	struct iovec bufvec;
+
+	list_for_each_entry(ppb, &pp->bufs, l) {
+
+		pages_read = processing_ppb_userbuf(pid, ppb, &bufvec);
+
+		if (pages_read == -1)
+			return -1;
+
+		bufvec.iov_base = userbuf;
+		bufvec.iov_len = pages_read * PAGE_SIZE;
+		ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
+
+		if (ret == -1 || ret != pages_read * PAGE_SIZE) {
+			pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret);
+			return -1;
+		}
+
+		/* generating pagemap */
+		for (i = 0; i < ppb->nr_segs; i++) {
+
+			if (ppb->iov[i].iov_base == SKIP_PAGEMAP)
+				continue;
+
+			struct iovec iov = ppb->iov[i];
+			u32 flags;
+
+			ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base);
+			if (ret)
+				return ret;
+
+			BUG_ON(iov.iov_base < (void *)xfer->offset);
+			iov.iov_base -= xfer->offset;
+			pr_debug("\tp %p [%u]\n", iov.iov_base,
+					(unsigned int)(iov.iov_len / PAGE_SIZE));
+
+			flags = ppb_xfer_flags(xfer, ppb);
+
+			if (xfer->write_pagemap(xfer, &iov, flags))
+				return -1;
+			if ((flags & PE_PRESENT) && xfer->write_pages(xfer,
+						ppb->p[0], iov.iov_len))
+				return -1;
+
+		}
+
+	}
+
+	return dump_holes(xfer, pp, &cur_hole, NULL);
+}
+
 int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
 {
 	struct page_pipe_buf *ppb;
 	unsigned int cur_hole = 0;
+	unsigned int i;
 	int ret;
 
 	pr_debug("Transferring pages:\n");
 
 	list_for_each_entry(ppb, &pp->bufs, l) {
-		unsigned int i;
 
 		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
 

Comments

Andrei Vagin July 30, 2019, 4:58 a.m.
On Thu, Jul 25, 2019 at 06:43:45AM +0530, Abhishek Dubey wrote:
> moving out page_drain to cr_pre_dump_finish for
> pre-dump stage. During frozen state, only iovecs
> will be generated and draining of page will happen
> after the task has been unfrozen. Shared memory
> pre-dumping is not modified.
> 
> Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
> ---
>  criu/cr-dump.c           |   2 +-
>  criu/include/page-xfer.h |   4 ++
>  criu/mem.c               |  13 +++-
>  criu/page-xfer.c         | 176 ++++++++++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 192 insertions(+), 3 deletions(-)
> 
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index e070b8b..4569372 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1513,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>  			goto err;
>  
>  		mem_pp = dmpi(item)->mem_pp;
> -		ret = page_xfer_dump_pages(&xfer, mem_pp);
> +		ret = page_xfer_predump_pages(item->pid->real, &xfer, mem_pp);
>  
>  		xfer.close(&xfer);
>  
> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
> index fa72273..c3ad877 100644
> --- a/criu/include/page-xfer.h
> +++ b/criu/include/page-xfer.h
> @@ -9,6 +9,9 @@ struct ps_info {
>  
>  extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd);
>  
> +/* to skip pagemap generation for skipped iovs */
> +#define SKIP_PAGEMAP (void*)0xdeadbeef
> +
>  /*
>   * page_xfer -- transfer pages into image file.
>   * Two images backends are implemented -- local image file
> @@ -48,6 +51,7 @@ struct page_xfer {
>  extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
>  struct page_pipe;
>  extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
> +extern int page_xfer_predump_pages(int pid, struct page_xfer *, struct page_pipe *);
>  extern int connect_to_page_server_to_send(void);
>  extern int connect_to_page_server_to_recv(int epfd);
>  extern int disconnect_from_page_server(void);
> diff --git a/criu/mem.c b/criu/mem.c
> index 5c13690..00c7951 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -488,7 +488,18 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	if (mdc->lazy)
>  		memcpy(pargs_iovs(args), pp->iovs,
>  		       sizeof(struct iovec) * pp->nr_iovs);
> -	ret = drain_pages(pp, ctl, args);
> +
> +	/*
> +	 * Faking drain_pages for pre-dump here. Actual drain_pages for pre-dump
> +	 * will happen after task unfreezing in cr_pre_dump_finish(). This is
> +	 * actual optimization which reduces time for which process was frozen
> +	 * during pre-dump.
> +	 */
> +	if (mdc->pre_dump)
> +		ret = 0;
> +	else
> +		ret = drain_pages(pp, ctl, args);
> +
>  	if (!ret && !mdc->pre_dump)
>  		ret = xfer_pages(pp, &xfer);
>  	if (ret)
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index fe457d2..40ec29e 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -499,16 +499,190 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
>  		return PE_PRESENT;
>  }
>  
> +static char userbuf[4 << 20];
> +
> +ssize_t copy_to_userbuf(int pid, struct iovec* liov, struct iovec* riov, unsigned long riov_cnt)
> +{
> +	ssize_t ret;
> +
> +	ret = process_vm_readv(pid, liov, 1, riov, riov_cnt, 0);
> +
> +	/* Target process doesn't exist */
> +	if (ret == -1 && errno == ESRCH) {
> +		pr_err("Target process with PID %d not found\n", pid);

In case of pre-dump, it is a normal situation, so it should be pr_debug.

> +		return -2;
> +	}
> +
> +	return ret;
> +}
> +
> +/*
> + * This function returns the index of that iov in an iovec, which failed to get
> + * processed by process_vm_readv or may be partially processed
> + */
> +unsigned int faulty_iov_index(ssize_t bytes_read, struct iovec* riov, size_t index,
> +		unsigned int riovcnt, unsigned long *iov_cntr, unsigned long *page_diff)
> +{
> +	ssize_t processed_bytes = 0;
> +	*iov_cntr = 0;
> +	*page_diff = 0;
> +
> +	while (processed_bytes < bytes_read && index < riovcnt) {
> +
> +		processed_bytes += riov[index].iov_len;
> +		(*iov_cntr) += 1;
> +		index += 1;
> +	}
> +
> +	/* one iov is partially read */
> +	if (bytes_read < processed_bytes) {
> +		*page_diff = processed_bytes - bytes_read;
> +		index -= 1;
> +	}
> +
> +	return index;
> +}
> +
> +long processing_ppb_userbuf(int pid, struct page_pipe_buf *ppb, struct iovec *bufvec)

this function is used only in this file, so it has to be marked static.
Pls check the other functions too.

> +{
> +	struct iovec *riov = ppb->iov;
> +	ssize_t bytes_read = 0;
> +	unsigned long pro_iovs = 0, start = 0, iov_cntr = 0, end;
> +	unsigned long pages_read = 0, pages_skipped = 0;
> +	unsigned long page_diff = 0;
> +
> +	bufvec->iov_len = sizeof(userbuf);
> +	bufvec->iov_base = userbuf;
> +
> +	while (pro_iovs < ppb->nr_segs)
> +	{
> +		bytes_read = copy_to_userbuf(pid, bufvec, &riov[start],
> +							ppb->nr_segs - pro_iovs);
> +
> +		if (bytes_read == -2)
> +			return -2;

need a comment here explaining what this -2 means, or maybe define a
constant with a readable name.
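
For example, just as a sketch (the constant name PR_TASK_GONE here is
made up):

#define PR_TASK_GONE	(-2)	/* target task already exited */

	/* in copy_to_userbuf(): */
	if (ret == -1 && errno == ESRCH) {
		pr_debug("Target process with PID %d not found\n", pid);
		return PR_TASK_GONE;
	}

	/* and in the caller: */
	if (bytes_read == PR_TASK_GONE)
		return PR_TASK_GONE;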

> +
> +		if (bytes_read == -1)
> +		{
> +			/*
> +			 *  In other errors, adjust page count and mark the page
> +			 *  to be skipped by pagemap generation
> +			 */
> +
> +			cnt_sub(CNT_PAGES_WRITTEN, riov[start].iov_len/PAGE_SIZE);
> +			pages_skipped += riov[start].iov_len/PAGE_SIZE;
> +			riov[start].iov_base = SKIP_PAGEMAP;
> +
> +			pro_iovs += 1;
> +			start += 1;
> +			continue;
> +		}
> +
> +		pages_read += bytes_read/PAGE_SIZE;
> +
> +		if (pages_read + pages_skipped == ppb->pages_in)
> +			break;
> +
> +		end = faulty_iov_index(bytes_read, riov,
> +					start, ppb->nr_segs, &iov_cntr, &page_diff);
> +
> +		/*
> +		 * One single iov could be partially read, unless unmapped page in
> +		 * iov range is not hit by process_vm_readv, need to handle this
> +		 * special case
> +		 */
> +
> +		if (!page_diff)
> +		{
> +			cnt_sub(CNT_PAGES_WRITTEN, riov[end].iov_len/PAGE_SIZE);
> +			pages_skipped += riov[end].iov_len/PAGE_SIZE;
> +			riov[end].iov_base = SKIP_PAGEMAP;
> +			start = end + 1;

this looks wrong, you already incremented index in faulty_iov_index

> +			pro_iovs += iov_cntr + 1;
> +		}
> +		else
> +		{

pls follow the coding style that is used in the existing code:

		} else {

> +			riov[end].iov_len -= page_diff;
> +			cnt_sub(CNT_PAGES_WRITTEN, page_diff/PAGE_SIZE);
> +			pages_skipped += page_diff/PAGE_SIZE;
> +			start = end + 1;
> +			pro_iovs += iov_cntr;
> +		}
> +
> +		bufvec->iov_base = userbuf + pages_read * PAGE_SIZE;
> +		bufvec->iov_len -= bytes_read;
> +	}
> +
> +	return pages_read;
> +}
> +
> +
> +int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp)
> +{

how do we generate pagemap before these changes and why do we need a new
function?

> +	struct page_pipe_buf *ppb;
> +	unsigned int cur_hole = 0, i;
> +	unsigned long ret, pages_read;
> +	struct iovec bufvec;
> +
> +	list_for_each_entry(ppb, &pp->bufs, l) {
> +
> +		pages_read = processing_ppb_userbuf(pid, ppb, &bufvec);
> +
> +		if (pages_read == -1)
> +			return -1;
> +
> +		bufvec.iov_base = userbuf;
> +		bufvec.iov_len = pages_read * PAGE_SIZE;
> +		ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
> +
> +		if (ret == -1 || ret != pages_read * PAGE_SIZE) {
> +			pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret);
> +			return -1;
> +		}
> +
> +		/* generating pagemap */
> +		for (i = 0; i < ppb->nr_segs; i++) {
> +
> +			if (ppb->iov[i].iov_base == SKIP_PAGEMAP)
> +				continue;
> +
> +			struct iovec iov = ppb->iov[i];
> +			u32 flags;
> +
> +			ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base);
> +			if (ret)
> +				return ret;
> +
> +			BUG_ON(iov.iov_base < (void *)xfer->offset);
> +			iov.iov_base -= xfer->offset;
> +			pr_debug("\tp %p [%u]\n", iov.iov_base,
> +					(unsigned int)(iov.iov_len / PAGE_SIZE));
> +
> +			flags = ppb_xfer_flags(xfer, ppb);
> +
> +			if (xfer->write_pagemap(xfer, &iov, flags))
> +				return -1;
> +			if ((flags & PE_PRESENT) && xfer->write_pages(xfer,
> +						ppb->p[0], iov.iov_len))
> +				return -1;
> +
> +		}
> +
> +	}
> +
> +	return dump_holes(xfer, pp, &cur_hole, NULL);
> +}
> +
>  int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>  {
>  	struct page_pipe_buf *ppb;
>  	unsigned int cur_hole = 0;
> +	unsigned int i;

I don't understand why we need to move i out of the loop.
>  	int ret;
>  
>  	pr_debug("Transferring pages:\n");
>  
>  	list_for_each_entry(ppb, &pp->bufs, l) {
> -		unsigned int i;
>  
>  		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>  
> -- 
> 2.7.4
>
Pavel Emelianov July 30, 2019, 1:03 p.m.
On 7/25/19 4:13 AM, Abhishek Dubey wrote:
> moving out page_drain to cr_pre_dump_finish for
> pre-dump stage. During frozen state, only iovecs
> will be generated and draining of page will happen
> after the task has been unfrozen. Shared memory
> pre-dumping is not modified.
> 
> Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
> ---
>  criu/cr-dump.c           |   2 +-
>  criu/include/page-xfer.h |   4 ++
>  criu/mem.c               |  13 +++-
>  criu/page-xfer.c         | 176 ++++++++++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 192 insertions(+), 3 deletions(-)
> 
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index e070b8b..4569372 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1513,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>  			goto err;
>  
>  		mem_pp = dmpi(item)->mem_pp;
> -		ret = page_xfer_dump_pages(&xfer, mem_pp);
> +		ret = page_xfer_predump_pages(item->pid->real, &xfer, mem_pp);
>  
>  		xfer.close(&xfer);
>  
> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
> index fa72273..c3ad877 100644
> --- a/criu/include/page-xfer.h
> +++ b/criu/include/page-xfer.h
> @@ -9,6 +9,9 @@ struct ps_info {
>  
>  extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd);
>  
> +/* to skip pagemap generation for skipped iovs */
> +#define SKIP_PAGEMAP (void*)0xdeadbeef
> +
>  /*
>   * page_xfer -- transfer pages into image file.
>   * Two images backends are implemented -- local image file
> @@ -48,6 +51,7 @@ struct page_xfer {
>  extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
>  struct page_pipe;
>  extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
> +extern int page_xfer_predump_pages(int pid, struct page_xfer *, struct page_pipe *);
>  extern int connect_to_page_server_to_send(void);
>  extern int connect_to_page_server_to_recv(int epfd);
>  extern int disconnect_from_page_server(void);
> diff --git a/criu/mem.c b/criu/mem.c
> index 5c13690..00c7951 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -488,7 +488,18 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	if (mdc->lazy)
>  		memcpy(pargs_iovs(args), pp->iovs,
>  		       sizeof(struct iovec) * pp->nr_iovs);
> -	ret = drain_pages(pp, ctl, args);
> +
> +	/*
> +	 * Faking drain_pages for pre-dump here. Actual drain_pages for pre-dump
> +	 * will happen after task unfreezing in cr_pre_dump_finish(). This is
> +	 * actual optimization which reduces time for which process was frozen
> +	 * during pre-dump.
> +	 */
> +	if (mdc->pre_dump)
> +		ret = 0;
> +	else
> +		ret = drain_pages(pp, ctl, args);
> +
>  	if (!ret && !mdc->pre_dump)
>  		ret = xfer_pages(pp, &xfer);
>  	if (ret)
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index fe457d2..40ec29e 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -499,16 +499,190 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
>  		return PE_PRESENT;
>  }
>  
> +static char userbuf[4 << 20];
> +
> +ssize_t copy_to_userbuf(int pid, struct iovec* liov, struct iovec* riov, unsigned long riov_cnt)
> +{
> +	ssize_t ret;
> +
> +	ret = process_vm_readv(pid, liov, 1, riov, riov_cnt, 0);
> +
> +	/* Target process doesn't exist */
> +	if (ret == -1 && errno == ESRCH) {
> +		pr_err("Target process with PID %d not found\n", pid);
> +		return -2;
> +	}
> +
> +	return ret;
> +}
> +
> +/*
> + * This function returns the index of that iov in an iovec, which failed to get
> + * processed by process_vm_readv or may be partially processed
> + */
> +unsigned int faulty_iov_index(ssize_t bytes_read, struct iovec* riov, size_t index,
> +		unsigned int riovcnt, unsigned long *iov_cntr, unsigned long *page_diff)
> +{
> +	ssize_t processed_bytes = 0;
> +	*iov_cntr = 0;
> +	*page_diff = 0;
> +
> +	while (processed_bytes < bytes_read && index < riovcnt) {
> +
> +		processed_bytes += riov[index].iov_len;
> +		(*iov_cntr) += 1;
> +		index += 1;
> +	}
> +
> +	/* one iov is partially read */
> +	if (bytes_read < processed_bytes) {
> +		*page_diff = processed_bytes - bytes_read;
> +		index -= 1;
> +	}
> +
> +	return index;
> +}
> +
> +long processing_ppb_userbuf(int pid, struct page_pipe_buf *ppb, struct iovec *bufvec)

The return value and the bufvec argument seem to duplicate each other, as bufvec.iov_len
is the amount of data read, so is the return value.

> +{
> +	struct iovec *riov = ppb->iov;
> +	ssize_t bytes_read = 0;
> +	unsigned long pro_iovs = 0, start = 0, iov_cntr = 0, end;

I have a subtle feeling that pro_iovs, start and end are somewhat duplicating each other.
Please explain them, and especially the case where start can be not equal to pro_iovs.

> +	unsigned long pages_read = 0, pages_skipped = 0;
> +	unsigned long page_diff = 0;
> +
> +	bufvec->iov_len = sizeof(userbuf);
> +	bufvec->iov_base = userbuf;
> +
> +	while (pro_iovs < ppb->nr_segs)
> +	{
> +		bytes_read = copy_to_userbuf(pid, bufvec, &riov[start],
> +							ppb->nr_segs - pro_iovs);
> +
> +		if (bytes_read == -2)
> +			return -2;

This -2 is not handled by the caller.

> +
> +		if (bytes_read == -1)
> +		{
> +			/*
> +			 *  In other errors, adjust page count and mark the page
> +			 *  to be skipped by pagemap generation
> +			 */
> +
> +			cnt_sub(CNT_PAGES_WRITTEN, riov[start].iov_len/PAGE_SIZE);
> +			pages_skipped += riov[start].iov_len/PAGE_SIZE;
> +			riov[start].iov_base = SKIP_PAGEMAP;
> +
> +			pro_iovs += 1;
> +			start += 1;
> +			continue;
> +		}
> +
> +		pages_read += bytes_read/PAGE_SIZE;
> +
> +		if (pages_read + pages_skipped == ppb->pages_in)
> +			break;
> +
> +		end = faulty_iov_index(bytes_read, riov,
> +					start, ppb->nr_segs, &iov_cntr, &page_diff);
> +
> +		/*
> +		 * One single iov could be partially read, unless unmapped page in
> +		 * iov range is not hit by process_vm_readv, need to handle this
> +		 * special case
> +		 */

Please explain this if-else: what does it do? Especially the two things below:

> +		if (!page_diff)
> +		{
> +			cnt_sub(CNT_PAGES_WRITTEN, riov[end].iov_len/PAGE_SIZE);
> +			pages_skipped += riov[end].iov_len/PAGE_SIZE;
> +			riov[end].iov_base = SKIP_PAGEMAP;

... why do we skip the last riov and why do we mess with cnt_sub at all?

> +			start = end + 1;
> +			pro_iovs += iov_cntr + 1;
> +		}
> +		else
> +		{
> +			riov[end].iov_len -= page_diff;
> +			cnt_sub(CNT_PAGES_WRITTEN, page_diff/PAGE_SIZE);
> +			pages_skipped += page_diff/PAGE_SIZE;
> +			start = end + 1;
> +			pro_iovs += iov_cntr;
> +		}
> +
> +		bufvec->iov_base = userbuf + pages_read * PAGE_SIZE;
> +		bufvec->iov_len -= bytes_read;
> +	}
> +
> +	return pages_read;
> +}
> +
> +
> +int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp)
> +{
> +	struct page_pipe_buf *ppb;
> +	unsigned int cur_hole = 0, i;
> +	unsigned long ret, pages_read;
> +	struct iovec bufvec;
> +
> +	list_for_each_entry(ppb, &pp->bufs, l) {
> +
> +		pages_read = processing_ppb_userbuf(pid, ppb, &bufvec);
> +
> +		if (pages_read == -1)
> +			return -1;
> +
> +		bufvec.iov_base = userbuf;
> +		bufvec.iov_len = pages_read * PAGE_SIZE;
> +		ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
> +
> +		if (ret == -1 || ret != pages_read * PAGE_SIZE) {
> +			pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret);
> +			return -1;
> +		}
> +
> +		/* generating pagemap */
> +		for (i = 0; i < ppb->nr_segs; i++) {
> +
> +			if (ppb->iov[i].iov_base == SKIP_PAGEMAP)
> +				continue;
> +
> +			struct iovec iov = ppb->iov[i];
> +			u32 flags;
> +
> +			ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base);
> +			if (ret)
> +				return ret;
> +
> +			BUG_ON(iov.iov_base < (void *)xfer->offset);
> +			iov.iov_base -= xfer->offset;
> +			pr_debug("\tp %p [%u]\n", iov.iov_base,
> +					(unsigned int)(iov.iov_len / PAGE_SIZE));
> +
> +			flags = ppb_xfer_flags(xfer, ppb);
> +
> +			if (xfer->write_pagemap(xfer, &iov, flags))
> +				return -1;
> +			if ((flags & PE_PRESENT) && xfer->write_pages(xfer,

Wow, if the flags do not have the PE_PRESENT bit set, why do we even get to
reading the page contents?

> +						ppb->p[0], iov.iov_len))
> +				return -1;
> +
> +		}
> +
> +	}
> +
> +	return dump_holes(xfer, pp, &cur_hole, NULL);
> +}
> +
>  int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>  {
>  	struct page_pipe_buf *ppb;
>  	unsigned int cur_hole = 0;
> +	unsigned int i;
>  	int ret;
>  
>  	pr_debug("Transferring pages:\n");
>  
>  	list_for_each_entry(ppb, &pp->bufs, l) {
> -		unsigned int i;

This change is cosmetic, please, do not merge it with the rest of the code.

>  
>  		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>  
>
abhishek dubey Aug. 1, 2019, 10:21 a.m.
On 30/07/19 10:28 AM, Andrei Vagin wrote:
> On Thu, Jul 25, 2019 at 06:43:45AM +0530, Abhishek Dubey wrote:
>> moving out page_drain to cr_pre_dump_finish for
>> pre-dump stage. During frozen state, only iovecs
>> will be generated and draining of page will happen
>> after the task has been unfrozen. Shared memory
>> pre-dumping is not modified.
>>
>> Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
>> ---
>>   criu/cr-dump.c           |   2 +-
>>   criu/include/page-xfer.h |   4 ++
>>   criu/mem.c               |  13 +++-
>>   criu/page-xfer.c         | 176 ++++++++++++++++++++++++++++++++++++++++++++++-
>>   4 files changed, 192 insertions(+), 3 deletions(-)
>>
>> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
>> index e070b8b..4569372 100644
>> --- a/criu/cr-dump.c
>> +++ b/criu/cr-dump.c
>> @@ -1513,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>>   			goto err;
>>   
>>   		mem_pp = dmpi(item)->mem_pp;
>> -		ret = page_xfer_dump_pages(&xfer, mem_pp);
>> +		ret = page_xfer_predump_pages(item->pid->real, &xfer, mem_pp);
>>   
>>   		xfer.close(&xfer);
>>   
>> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
>> index fa72273..c3ad877 100644
>> --- a/criu/include/page-xfer.h
>> +++ b/criu/include/page-xfer.h
>> @@ -9,6 +9,9 @@ struct ps_info {
>>   
>>   extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd);
>>   
>> +/* to skip pagemap generation for skipped iovs */
>> +#define SKIP_PAGEMAP (void*)0xdeadbeef
>> +
>>   /*
>>    * page_xfer -- transfer pages into image file.
>>    * Two images backends are implemented -- local image file
>> @@ -48,6 +51,7 @@ struct page_xfer {
>>   extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
>>   struct page_pipe;
>>   extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
>> +extern int page_xfer_predump_pages(int pid, struct page_xfer *, struct page_pipe *);
>>   extern int connect_to_page_server_to_send(void);
>>   extern int connect_to_page_server_to_recv(int epfd);
>>   extern int disconnect_from_page_server(void);
>> diff --git a/criu/mem.c b/criu/mem.c
>> index 5c13690..00c7951 100644
>> --- a/criu/mem.c
>> +++ b/criu/mem.c
>> @@ -488,7 +488,18 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>>   	if (mdc->lazy)
>>   		memcpy(pargs_iovs(args), pp->iovs,
>>   		       sizeof(struct iovec) * pp->nr_iovs);
>> -	ret = drain_pages(pp, ctl, args);
>> +
>> +	/*
>> +	 * Faking drain_pages for pre-dump here. Actual drain_pages for pre-dump
>> +	 * will happen after task unfreezing in cr_pre_dump_finish(). This is
>> +	 * actual optimization which reduces time for which process was frozen
>> +	 * during pre-dump.
>> +	 */
>> +	if (mdc->pre_dump)
>> +		ret = 0;
>> +	else
>> +		ret = drain_pages(pp, ctl, args);
>> +
>>   	if (!ret && !mdc->pre_dump)
>>   		ret = xfer_pages(pp, &xfer);
>>   	if (ret)
>> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
>> index fe457d2..40ec29e 100644
>> --- a/criu/page-xfer.c
>> +++ b/criu/page-xfer.c
>> @@ -499,16 +499,190 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
>>   		return PE_PRESENT;
>>   }
>>   
>> +static char userbuf[4 << 20];
>> +
>> +ssize_t copy_to_userbuf(int pid, struct iovec* liov, struct iovec* riov, unsigned long riov_cnt)
>> +{
>> +	ssize_t ret;
>> +
>> +	ret = process_vm_readv(pid, liov, 1, riov, riov_cnt, 0);
>> +
>> +	/* Target process doesn't exist */
>> +	if (ret == -1 && errno == ESRCH) {
>> +		pr_err("Target process with PID %d not found\n", pid);
> In case of pre-dump, it is a normal situation, so it should be pr_debug.
OK.
>
>> +		return -2;
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +/*
>> + * This function returns the index of that iov in an iovec, which failed to get
>> + * processed by process_vm_readv or may be partially processed
>> + */
>> +unsigned int faulty_iov_index(ssize_t bytes_read, struct iovec* riov, size_t index,
>> +		unsigned int riovcnt, unsigned long *iov_cntr, unsigned long *page_diff)
>> +{
>> +	ssize_t processed_bytes = 0;
>> +	*iov_cntr = 0;
>> +	*page_diff = 0;
>> +
>> +	while (processed_bytes < bytes_read && index < riovcnt) {
>> +
>> +		processed_bytes += riov[index].iov_len;
>> +		(*iov_cntr) += 1;
>> +		index += 1;
>> +	}
>> +
>> +	/* one iov is partially read */
>> +	if (bytes_read < processed_bytes) {
>> +		*page_diff = processed_bytes - bytes_read;
>> +		index -= 1;
>> +	}
>> +
>> +	return index;
>> +}
>> +
>> +long processing_ppb_userbuf(int pid, struct page_pipe_buf *ppb, struct iovec *bufvec)
> this function is used only in this file, so it has to be mark as static.
> Pls, check other functions too.
Done.
>
>> +{
>> +	struct iovec *riov = ppb->iov;
>> +	ssize_t bytes_read = 0;
>> +	unsigned long pro_iovs = 0, start = 0, iov_cntr = 0, end;
>> +	unsigned long pages_read = 0, pages_skipped = 0;
>> +	unsigned long page_diff = 0;
>> +
>> +	bufvec->iov_len = sizeof(userbuf);
>> +	bufvec->iov_base = userbuf;
>> +
>> +	while (pro_iovs < ppb->nr_segs)
>> +	{
>> +		bytes_read = copy_to_userbuf(pid, bufvec, &riov[start],
>> +							ppb->nr_segs - pro_iovs);
>> +
>> +		if (bytes_read == -2)
>> +			return -2;
> need a comment here, what this -1 means or maybe define a constant with
> a readable name.
OK.
>> +
>> +		if (bytes_read == -1)
>> +		{
>> +			/*
>> +			 *  In other errors, adjust page count and mark the page
>> +			 *  to be skipped by pagemap generation
>> +			 */
>> +
>> +			cnt_sub(CNT_PAGES_WRITTEN, riov[start].iov_len/PAGE_SIZE);
>> +			pages_skipped += riov[start].iov_len/PAGE_SIZE;
>> +			riov[start].iov_base = SKIP_PAGEMAP;
>> +
>> +			pro_iovs += 1;
>> +			start += 1;
>> +			continue;
>> +		}
>> +
>> +		pages_read += bytes_read/PAGE_SIZE;
>> +
>> +		if (pages_read + pages_skipped == ppb->pages_in)
>> +			break;
>> +
>> +		end = faulty_iov_index(bytes_read, riov,
>> +					start, ppb->nr_segs, &iov_cntr, &page_diff);
>> +
>> +		/*
>> +		 * One single iov could be partially read, unless unmapped page in
>> +		 * iov range is not hit by process_vm_readv, need to handle this
>> +		 * special case
>> +		 */
>> +
>> +		if (!page_diff)
>> +		{
>> +			cnt_sub(CNT_PAGES_WRITTEN, riov[end].iov_len/PAGE_SIZE);
>> +			pages_skipped += riov[end].iov_len/PAGE_SIZE;
>> +			riov[end].iov_base = SKIP_PAGEMAP;
>> +			start = end + 1;
> this looks wrong, you already incremented index in faulty_iov_index
Let me rethink this.
>
>> +			pro_iovs += iov_cntr + 1;
>> +		}
>> +		else
>> +		{
> pls follow the coding style what is used in the existing code:
Sure.
>
> 		} else {
>
>> +			riov[end].iov_len -= page_diff;
>> +			cnt_sub(CNT_PAGES_WRITTEN, page_diff/PAGE_SIZE);
>> +			pages_skipped += page_diff/PAGE_SIZE;
>> +			start = end + 1;
>> +			pro_iovs += iov_cntr;
>> +		}
>> +
>> +		bufvec->iov_base = userbuf + pages_read * PAGE_SIZE;
>> +		bufvec->iov_len -= bytes_read;
>> +	}
>> +
>> +	return pages_read;
>> +}
>> +
>> +
>> +int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp)
>> +{
> how do we generate pagemap before these changes and why do we need a new
> function?

1) Previously, for both pre-dump and dump, memory draining was done for
the complete page-pipe as the first step, and the corresponding pagemap
generation followed as the second step. The page_xfer_dump_pages()
function fits this design, generating pagemaps for the complete
page-pipe at once.

2) The objective was to have a different page-drain mechanism for
pre-dump than for dump. Pre-dump is now modified, while the dump stage
still works the traditional way, so it still needs the original
page_xfer_dump_pages() function.

In the modified pre-dump, I am working at the page-pipe-buffer level
instead of the page-pipe level, so memory draining and pagemap
generation happen together for each page-pipe buffer.

I preferred to add a new function, with a bit of code repetition, just
to keep a clean segregation between dump and the new pre-dump.
>> +	struct page_pipe_buf *ppb;
>> +	unsigned int cur_hole = 0, i;
>> +	unsigned long ret, pages_read;
>> +	struct iovec bufvec;
>> +
>> +	list_for_each_entry(ppb, &pp->bufs, l) {
>> +
>> +		pages_read = processing_ppb_userbuf(pid, ppb, &bufvec);
>> +
>> +		if (pages_read == -1)
>> +			return -1;
>> +
>> +		bufvec.iov_base = userbuf;
>> +		bufvec.iov_len = pages_read * PAGE_SIZE;
>> +		ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
>> +
>> +		if (ret == -1 || ret != pages_read * PAGE_SIZE) {
>> +			pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret);
>> +			return -1;
>> +		}
>> +
>> +		/* generating pagemap */
>> +		for (i = 0; i < ppb->nr_segs; i++) {
>> +
>> +			if (ppb->iov[i].iov_base == SKIP_PAGEMAP)
>> +				continue;
>> +
>> +			struct iovec iov = ppb->iov[i];
>> +			u32 flags;
>> +
>> +			ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base);
>> +			if (ret)
>> +				return ret;
>> +
>> +			BUG_ON(iov.iov_base < (void *)xfer->offset);
>> +			iov.iov_base -= xfer->offset;
>> +			pr_debug("\tp %p [%u]\n", iov.iov_base,
>> +					(unsigned int)(iov.iov_len / PAGE_SIZE));
>> +
>> +			flags = ppb_xfer_flags(xfer, ppb);
>> +
>> +			if (xfer->write_pagemap(xfer, &iov, flags))
>> +				return -1;
>> +			if ((flags & PE_PRESENT) && xfer->write_pages(xfer,
>> +						ppb->p[0], iov.iov_len))
>> +				return -1;
>> +
>> +		}
>> +
>> +	}
>> +
>> +	return dump_holes(xfer, pp, &cur_hole, NULL);
>> +}
>> +
>>   int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>>   {
>>   	struct page_pipe_buf *ppb;
>>   	unsigned int cur_hole = 0;
>> +	unsigned int i;
> I don't understand why we need to move i out of the loop.
This was carried over from the previous patch. It's not necessary; please ignore.
>>   	int ret;
>>   
>>   	pr_debug("Transferring pages:\n");
>>   
>>   	list_for_each_entry(ppb, &pp->bufs, l) {
>> -		unsigned int i;
>>   
>>   		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>>   
>> -- 
>> 2.7.4
>>
abhishek dubey Aug. 2, 2019, 7:47 a.m.
On 30/07/19 6:33 PM, Pavel Emelianov wrote:
> On 7/25/19 4:13 AM, Abhishek Dubey wrote:
>> moving out page_drain to cr_pre_dump_finish for
>> pre-dump stage. During frozen state, only iovecs
>> will be generated and draining of page will happen
>> after the task has been unfrozen. Shared memory
>> pre-dumping is not modified.
>>
>> Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
>> ---
>>   criu/cr-dump.c           |   2 +-
>>   criu/include/page-xfer.h |   4 ++
>>   criu/mem.c               |  13 +++-
>>   criu/page-xfer.c         | 176 ++++++++++++++++++++++++++++++++++++++++++++++-
>>   4 files changed, 192 insertions(+), 3 deletions(-)
>>
>> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
>> index e070b8b..4569372 100644
>> --- a/criu/cr-dump.c
>> +++ b/criu/cr-dump.c
>> @@ -1513,7 +1513,7 @@ static int cr_pre_dump_finish(int status)
>>   			goto err;
>>   
>>   		mem_pp = dmpi(item)->mem_pp;
>> -		ret = page_xfer_dump_pages(&xfer, mem_pp);
>> +		ret = page_xfer_predump_pages(item->pid->real, &xfer, mem_pp);
>>   
>>   		xfer.close(&xfer);
>>   
>> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
>> index fa72273..c3ad877 100644
>> --- a/criu/include/page-xfer.h
>> +++ b/criu/include/page-xfer.h
>> @@ -9,6 +9,9 @@ struct ps_info {
>>   
>>   extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd);
>>   
>> +/* to skip pagemap generation for skipped iovs */
>> +#define SKIP_PAGEMAP (void*)0xdeadbeef
>> +
>>   /*
>>    * page_xfer -- transfer pages into image file.
>>    * Two images backends are implemented -- local image file
>> @@ -48,6 +51,7 @@ struct page_xfer {
>>   extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
>>   struct page_pipe;
>>   extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
>> +extern int page_xfer_predump_pages(int pid, struct page_xfer *, struct page_pipe *);
>>   extern int connect_to_page_server_to_send(void);
>>   extern int connect_to_page_server_to_recv(int epfd);
>>   extern int disconnect_from_page_server(void);
>> diff --git a/criu/mem.c b/criu/mem.c
>> index 5c13690..00c7951 100644
>> --- a/criu/mem.c
>> +++ b/criu/mem.c
>> @@ -488,7 +488,18 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>>   	if (mdc->lazy)
>>   		memcpy(pargs_iovs(args), pp->iovs,
>>   		       sizeof(struct iovec) * pp->nr_iovs);
>> -	ret = drain_pages(pp, ctl, args);
>> +
>> +	/*
>> +	 * Faking drain_pages for pre-dump here. Actual drain_pages for pre-dump
>> +	 * will happen after task unfreezing in cr_pre_dump_finish(). This is
>> +	 * actual optimization which reduces time for which process was frozen
>> +	 * during pre-dump.
>> +	 */
>> +	if (mdc->pre_dump)
>> +		ret = 0;
>> +	else
>> +		ret = drain_pages(pp, ctl, args);
>> +
>>   	if (!ret && !mdc->pre_dump)
>>   		ret = xfer_pages(pp, &xfer);
>>   	if (ret)
>> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
>> index fe457d2..40ec29e 100644
>> --- a/criu/page-xfer.c
>> +++ b/criu/page-xfer.c
>> @@ -499,16 +499,190 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
>>   		return PE_PRESENT;
>>   }
>>   
>> +static char userbuf[4 << 20];
>> +
>> +ssize_t copy_to_userbuf(int pid, struct iovec* liov, struct iovec* riov, unsigned long riov_cnt)
>> +{
>> +	ssize_t ret;
>> +
>> +	ret = process_vm_readv(pid, liov, 1, riov, riov_cnt, 0);
>> +
>> +	/* Target process doesn't exist */
>> +	if (ret == -1 && errno == ESRCH) {
>> +		pr_err("Target process with PID %d not found\n", pid);
>> +		return -2;
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +/*
>> + * This function returns the index of that iov in an iovec, which failed to get
>> + * processed by process_vm_readv or may be partially processed
>> + */
>> +unsigned int faulty_iov_index(ssize_t bytes_read, struct iovec* riov, size_t index,
>> +		unsigned int riovcnt, unsigned long *iov_cntr, unsigned long *page_diff)
>> +{
>> +	ssize_t processed_bytes = 0;
>> +	*iov_cntr = 0;
>> +	*page_diff = 0;
>> +
>> +	while (processed_bytes < bytes_read && index < riovcnt) {
>> +
>> +		processed_bytes += riov[index].iov_len;
>> +		(*iov_cntr) += 1;
>> +		index += 1;
>> +	}
>> +
>> +	/* one iov is partially read */
>> +	if (bytes_read < processed_bytes) {
>> +		*page_diff = processed_bytes - bytes_read;
>> +		index -= 1;
>> +	}
>> +
>> +	return index;
>> +}
>> +
>> +long processing_ppb_userbuf(int pid, struct page_pipe_buf *ppb, struct iovec *bufvec)
> The return value and the bufvec argument seem to duplicate each other, as bufvec.iov_len
> is the amount of data read, so is the return value.

Yes, both are complements of each other.

More explanation:
-----------------------

bufvec.iov_len denotes how much buffer space is still available for the
next data to be read while processing the remaining iovecs, whereas the
return value denotes how many bytes have actually been read.

E.g.: the user buffer is 512 pages.

IOVEC: [{addr A, 8192}, {addr B, 4096}]

Processing IOVEC[0]:

bufvec.iov_len = 512 * 4096
bytes read = 8192

---------------------------------------
bufvec length available for further processing = 512 * 4096 - 8192
= 510 * 4096
---------------------------------------

Processing IOVEC[1]:

bufvec.iov_len = 510 * 4096
bytes read = 4096 (12288 in total)

---------------------------------------
bufvec length available for further processing = 510 * 4096 - 4096
= 509 * 4096
---------------------------------------

So, finally, bufvec.iov_len = 509 * 4096 (509 pages) and the total
bytes read = 12288 (3 pages). The two always sum to the 512-page
buffer size; I did it this way just to keep things intuitive.
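
A tiny standalone sketch of the same bookkeeping (my own illustration,
assuming 4 KB pages; not part of the patch):

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define BUF_PAGES 512

int main(void)
{
	size_t iov_len = BUF_PAGES * PAGE_SIZE;	/* bufvec.iov_len */
	size_t total_read = 0;
	size_t reads[] = { 8192, 4096 };	/* IOVEC[0], IOVEC[1] */

	for (int i = 0; i < 2; i++) {
		total_read += reads[i];
		iov_len -= reads[i];
		/* return value and bufvec.iov_len mirror each other */
		assert(iov_len + total_read == (size_t)BUF_PAGES * PAGE_SIZE);
	}

	/* prints: 509 pages free, 3 pages read */
	printf("%zu pages free, %zu pages read\n",
	       iov_len / PAGE_SIZE, total_read / PAGE_SIZE);
	return 0;
}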

>
>> +{
>> +	struct iovec *riov = ppb->iov;
>> +	ssize_t bytes_read = 0;
>> +	unsigned long pro_iovs = 0, start = 0, iov_cntr = 0, end;
> I have a subtle feeling that pro_iovs, start and end are somewhat duplicating each other.
> Please, explain them and especially the case why start can be not equal to pro_iovs.
I will try to eliminate the redundancy; if I can't, I will explain all
the cases and why each variable is necessary.
>
>> +	unsigned long pages_read = 0, pages_skipped = 0;
>> +	unsigned long page_diff = 0;
>> +
>> +	bufvec->iov_len = sizeof(userbuf);
>> +	bufvec->iov_base = userbuf;
>> +
>> +	while (pro_iovs < ppb->nr_segs)
>> +	{
>> +		bytes_read = copy_to_userbuf(pid, bufvec, &riov[start],
>> +							ppb->nr_segs - pro_iovs);
>> +
>> +		if (bytes_read == -2)
>> +			return -2;
> This -2 is not handled by the caller.

Handled now. Thanks :)

>
>> +
>> +		if (bytes_read == -1)
>> +		{
>> +			/*
>> +			 *  In other errors, adjust page count and mark the page
>> +			 *  to be skipped by pagemap generation
>> +			 */
>> +
>> +			cnt_sub(CNT_PAGES_WRITTEN, riov[start].iov_len/PAGE_SIZE);
>> +			pages_skipped += riov[start].iov_len/PAGE_SIZE;
>> +			riov[start].iov_base = SKIP_PAGEMAP;
>> +
>> +			pro_iovs += 1;
>> +			start += 1;
>> +			continue;
>> +		}
>> +
>> +		pages_read += bytes_read/PAGE_SIZE;
>> +
>> +		if (pages_read + pages_skipped == ppb->pages_in)
>> +			break;
>> +
>> +		end = faulty_iov_index(bytes_read, riov,
>> +					start, ppb->nr_segs, &iov_cntr, &page_diff);
>> +
>> +		/*
>> +		 * One single iov could be partially read, unless unmapped page in
>> +		 * iov range is not hit by process_vm_readv, need to handle this
>> +		 * special case
>> +		 */
> Please, explain this if-else, what does it do, especially two things below:

I will explain this with an example. Consider a simple IOVEC:
[{addrA, 2 pages}, {addrB, 4 pages}], whose memory layout is:

<addrA> <addrA + 1*PAGE_SIZE>
<addrB> <addrB + 1*PAGE_SIZE> <addrB + 2*PAGE_SIZE> <addrB + 3*PAGE_SIZE>

While processing this IOVEC through process_vm_readv, the following
cases arise:

Case 1: #pages read = 2
---------------------------------

This implies that the first page of IOVEC[1] is not accessible, i.e.
<addrB> is unmapped or its permissions were modified, so
process_vm_readv cannot process anything at or beyond that address and
returns after processing only IOVEC[0].

Case 2: #pages read = 3
---------------------------------

This implies that the second page of IOVEC[1] is not accessible, i.e.
<addrB + 1*PAGE_SIZE> can't be processed, but here the first page of
IOVEC[1] is read along with the 2 pages of IOVEC[0]. This is the case
of partially processing an iov.

The IF part handles Case 1 (a wholly unreadable iov) and the ELSE part
handles Case 2 (a partially read iov).
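
For illustration, here is a standalone sketch (my own, not part of the
patch) that reproduces Case 1 by reading our own address space with the
first page of <addrB> unmapped:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *buf = malloc(6 * psz);

	/* addrA: 2 readable pages; addrB: 4 pages, first one unmapped */
	char *addrA = mmap(NULL, 2 * psz, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *addrB = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(addrA, 'a', 2 * psz);
	memset(addrB, 'b', 4 * psz);
	munmap(addrB, psz);		/* punch the Case 1 hole at <addrB> */

	struct iovec liov = { .iov_base = buf, .iov_len = 6 * psz };
	struct iovec riov[2] = {
		{ .iov_base = addrA, .iov_len = 2 * psz },
		{ .iov_base = addrB, .iov_len = 4 * psz },
	};

	/* reading our own pid stands in for reading the dumpee's memory */
	ssize_t n = process_vm_readv(getpid(), &liov, 1, riov, 2, 0);
	printf("read %zd bytes = %ld pages\n", n, n / psz); /* 2 pages */
	return 0;
}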

>> +		if (!page_diff)
>> +		{
>> +			cnt_sub(CNT_PAGES_WRITTEN, riov[end].iov_len/PAGE_SIZE);
>> +			pages_skipped += riov[end].iov_len/PAGE_SIZE;
>> +			riov[end].iov_base = SKIP_PAGEMAP;
> ... why do we skip the last riov and why do we mess with cnt_sub at all?

Why I skip the last riov:
--------------------------

In the current implementation, if an iov has its very first page
unreadable (as in Case 1), I just skip processing that iov entirely.
This is what Andrei was talking about: some mappings may be modified,
and we need to find the unmodified (readable) part and pre-dump it.

In Case 1, the readable pages may be the 2nd, 3rd and 4th pages of
IOVEC[1], which I didn't process at all for now; I just skip this iov
and move on. In Case 2, the readable pages may be the 3rd and 4th pages
of IOVEC[1], which I also skipped for now; since I can't skip a
partially read iov, I adjust the page count and move on.

This still needs to be handled, and I am thinking about the best way to
deal with the remaining pages of such culprit iovs.


Why I mess with cnt_sub:
---------------------------------

CNT_PAGES_WRITTEN is calculated at iovec generation time, but the
iovecs get modified after their generation. Pages which can't be
pre-dumped (the first page of IOVEC[1] in Case 1) were counted then, so
the count needs to be adjusted. In the current implementation, since I
skip the unprocessed culprit iov, I subtract the corresponding number
of pages from CNT_PAGES_WRITTEN. This avoids failing on a page-count
mismatch in the test-suite run for pre-dump.

This logic will change with the upcoming implementation.

>
>> +			start = end + 1;
>> +			pro_iovs += iov_cntr + 1;
>> +		}
>> +		else
>> +		{
>> +			riov[end].iov_len -= page_diff;
>> +			cnt_sub(CNT_PAGES_WRITTEN, page_diff/PAGE_SIZE);
>> +			pages_skipped += page_diff/PAGE_SIZE;
>> +			start = end + 1;
>> +			pro_iovs += iov_cntr;
>> +		}
>> +
>> +		bufvec->iov_base = userbuf + pages_read * PAGE_SIZE;
>> +		bufvec->iov_len -= bytes_read;
>> +	}
>> +
>> +	return pages_read;
>> +}
>> +
>> +
>> +int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp)
>> +{
>> +	struct page_pipe_buf *ppb;
>> +	unsigned int cur_hole = 0, i;
>> +	unsigned long ret, pages_read;
>> +	struct iovec bufvec;
>> +
>> +	list_for_each_entry(ppb, &pp->bufs, l) {
>> +
>> +		pages_read = processing_ppb_userbuf(pid, ppb, &bufvec);
>> +
>> +		if (pages_read == -1)
>> +			return -1;
>> +
>> +		bufvec.iov_base = userbuf;
>> +		bufvec.iov_len = pages_read * PAGE_SIZE;
>> +		ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
>> +
>> +		if (ret == -1 || ret != pages_read * PAGE_SIZE) {
>> +			pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret);
>> +			return -1;
>> +		}
>> +
>> +		/* generating pagemap */
>> +		for (i = 0; i < ppb->nr_segs; i++) {
>> +
>> +			if (ppb->iov[i].iov_base == SKIP_PAGEMAP)
>> +				continue;
>> +
>> +			struct iovec iov = ppb->iov[i];
>> +			u32 flags;
>> +
>> +			ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base);
>> +			if (ret)
>> +				return ret;
>> +
>> +			BUG_ON(iov.iov_base < (void *)xfer->offset);
>> +			iov.iov_base -= xfer->offset;
>> +			pr_debug("\tp %p [%u]\n", iov.iov_base,
>> +					(unsigned int)(iov.iov_len / PAGE_SIZE));
>> +
>> +			flags = ppb_xfer_flags(xfer, ppb);
>> +
>> +			if (xfer->write_pagemap(xfer, &iov, flags))
>> +				return -1;
>> +			if ((flags & PE_PRESENT) && xfer->write_pages(xfer,
> Wow, if the flags does not have the PE_PRESENT bit set, why do we even get to the
> reading the pages contents?
This check was redundant and was carried over from the original function.
>
>> +						ppb->p[0], iov.iov_len))
>> +				return -1;
>> +
>> +		}
>> +
>> +	}
>> +
>> +	return dump_holes(xfer, pp, &cur_hole, NULL);
>> +}
>> +
>>   int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
>>   {
>>   	struct page_pipe_buf *ppb;
>>   	unsigned int cur_hole = 0;
>> +	unsigned int i;
>>   	int ret;
>>   
>>   	pr_debug("Transferring pages:\n");
>>   
>>   	list_for_each_entry(ppb, &pp->bufs, l) {
>> -		unsigned int i;
> This change is cosmetic, please, do not merge it with the rest of the code.
Sure.
>
>>   
>>   		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
>>   
>>
-Abhishek