[4/7] Drain memory using process_vm_readv syscall

Submitted by Abhishek Dubey on Aug. 20, 2019, 11:06 p.m.

Details

Message ID 1566342422-18459-5-git-send-email-dubeyabhishek777@gmail.com
State New
Series "GSoC 19: Optimizing the Pre-dump Algorithm"
Headers show

Commit Message

Abhishek Dubey Aug. 20, 2019, 11:06 p.m.
Move page draining out to cr_pre_dump_finish()
for the "read" mode pre-dump stage. While the
task is frozen, only iovecs are generated;
the actual draining of pages happens after the
task has been unfrozen. Shared memory
pre-dumping remains unchanged.

Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
---
 criu/include/page-xfer.h |   4 +
 criu/mem.c               |  14 +-
 criu/page-xfer.c         | 380 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 397 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
index fa72273..852601d 100644
--- a/criu/include/page-xfer.h
+++ b/criu/include/page-xfer.h
@@ -9,6 +9,9 @@  struct ps_info {
 
 extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd);
 
+/* Process not available for process_vm_readv syscall */
+#define PR_UNAVIL -2
+
 /*
  * page_xfer -- transfer pages into image file.
  * Two images backends are implemented -- local image file
@@ -48,6 +51,7 @@  struct page_xfer {
 extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
 struct page_pipe;
 extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
+extern int page_xfer_predump_pages(int pid, struct page_xfer *, struct page_pipe *);
 extern int connect_to_page_server_to_send(void);
 extern int connect_to_page_server_to_recv(int epfd);
 extern int disconnect_from_page_server(void);
diff --git a/criu/mem.c b/criu/mem.c
index 740992d..47a5dc5 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -489,7 +489,19 @@  static int __parasite_dump_pages_seized(struct pstree_item *item,
 	if (mdc->lazy)
 		memcpy(pargs_iovs(args), pp->iovs,
 		       sizeof(struct iovec) * pp->nr_iovs);
-	ret = drain_pages(pp, ctl, args);
+
+	/*
+	 * Fake drain_pages() for "read" mode pre-dump here.
+	 * The actual drain_pages() happens after the task is
+	 * unfrozen, in cr_pre_dump_finish(). This is the desired
+	 * optimization of "read" mode: it reduces the time for
+	 * which the process stays frozen during pre-dump.
+	 */
+	if (mdc->pre_dump == PRE_DUMP_READ)
+		ret = 0;
+	else
+		ret = drain_pages(pp, ctl, args);
+
 	if (!ret && !mdc->pre_dump)
 		ret = xfer_pages(pp, &xfer);
 	if (ret)
diff --git a/criu/page-xfer.c b/criu/page-xfer.c
index fe457d2..58c3e86 100644
--- a/criu/page-xfer.c
+++ b/criu/page-xfer.c
@@ -499,6 +499,386 @@  static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
 		return PE_PRESENT;
 }
 
+/*
+ * Optimizing pre-dump algorithm
+ * ==============================
+ *
+ * Note: Please refer man(2) page of process_vm_readv syscall.
+ *
+ * Following discussion covers the possible faulty-iov locations
+ * in an iovec, which hinders process_vm_readv to process
+ * complete iovec in one go.
+ *
+ * Memory layout of target process:
+ *
+ * Pages: A        B        C
+ *        +--------+--------+--------+--------+--------+--------+
+ *        |||||||||||||||||||||||||||||||||||||||||||||||||||||||
+ *        +--------+--------+--------+--------+--------+--------+
+ *
+ * Single "iov" representation: {starting_address, length_in_bytes}
+ * An iovec is array of iov-s.
+ *
+ * NOTE: For ease of representation and discussion, we carry
+ *       out the discussion below at "page granularity".
+ *       The length field signifies the page count of that iov.
+ *       The same assumption applies to the syscall return
+ *       value: instead of a count of bytes read, it denotes
+ *       the corresponding count of pages returned.
+ *
+ * For above memory mapping, generated iovec: {A,1}{B,1}{C,4}
+ *
+ * This iovec remains unmodified once generated. At the same
+ * time some of memory regions listed in iovec may get modified
+ * (unmap/change protection) by the target process while syscall
+ * is trying to dump iovec regions.
+ *
+ * Case 1:
+ *      A is unmapped, {A,1} become faulty iov
+ *
+ *      A        B        C
+ *      +--------+--------+--------+--------+--------+--------+
+ *      |        ||||||||||||||||||||||||||||||||||||||||||||||
+ *      +--------+--------+--------+--------+--------+--------+
+ *      ^        ^
+ *      |        |
+ *      start    |
+ *      (1)      |
+ *               start
+ *               (2)
+ *
+ *      process_vm_readv will return -1. Increment start pointer(2),
+ *      syscall will process {B,1}{C,4} in one go and copy 5 pages
+ *      to userbuf from iov-B and iov-C.
+ *
+ * Case 2:
+ *      B is unmapped, {B,1} become faulty iov
+ *
+ *      A        B        C
+ *      +--------+--------+--------+--------+--------+--------+
+ *      |||||||||         |||||||||||||||||||||||||||||||||||||
+ *      +--------+--------+--------+--------+--------+--------+
+ *      ^                 ^
+ *      |                 |
+ *      start             |
+ *      (1)               |
+ *                        start
+ *                        (2)
+ *
+ *      process_vm_readv will return 1, i.e. page A copied to
+ *      userbuf successfully and syscall stopped, since B got
+ *      unmapped.
+ *
+ *      Increment the start pointer to C(2) and invoke syscall.
+ *      Userbuf contains 5 pages overall from iov-A and iov-C.
+ *
+ * Case 3:
+ *      This case deals with partial unmapping of iov representing
+ *      more than one pagesize region.
+ *
+ *      Syscall can't process such faulty iov as whole. So we
+ *      process such regions part-by-part and form new sub-iovs
+ *      in aux_iov from successfully processed pages.
+ *
+ *
+ *      Part 3.1:
+ *              First page of C is unmapped
+ *
+ *      A        B        C
+ *      +--------+--------+--------+--------+--------+--------+
+ *      ||||||||||||||||||         ||||||||||||||||||||||||||||
+ *      +--------+--------+--------+--------+--------+--------+
+ *      ^                          ^
+ *      |                          |
+ *      start                      |
+ *      (1)                        |
+ *                                 dummy
+ *                                 (2)
+ *
+ *      process_vm_readv will return 2, i.e. pages A and B copied.
+ *      We identify length of iov-C is more than 1 page, that is
+ *      where this case differs from Case 2.
+ *
+ *      dummy-iov is introduced(2) as: {C+1,3}. dummy-iov can be
+ *      directly placed at next page to failing page. This will copy
+ *      remaining 3 pages from iov-C to userbuf. Finally create
+ *      modified iov entry in aux_iov. Complete aux_iov look like:
+ *
+ *      aux_iov: {A,1}{B,1}{C+1,3}*
+ *
+ *      Part 3.2:
+ *              In between page of C is unmapped, let's say third
+ *
+ *
+ *      A        B        C
+ *      +--------+--------+--------+--------+--------+--------+
+ *      ||||||||||||||||||||||||||||||||||||         ||||||||||
+ *      +--------+--------+--------+--------+--------+--------+
+ *      ^                                            ^
+ *      |                 |-----------------|        |
+ *      start              partial_read_bytes        |
+ *      (1)                                          |
+ *                                                   dummy
+ *                                                   (2)
+ *
+ *      process_vm_readv will return 4, i.e. pages A and B copied
+ *      completely and first two pages of C are also copied.
+ *
+ *      Since iov-C is not processed completely, we need to find
+ *      the "partial_read_bytes" count to place our dummy-iov for
+ *      the remaining processing of iov-C. This task is performed
+ *      by the analyze_iov function.
+ *
+ *      dummy-iov will be(2): {C+3,1}. dummy-iov will be placed
+ *      next to first failing address to process remaining iov-C.
+ *      New entries in aux_iov will look like:
+ *
+ *      aux_iov: {A,1}{B,1}{C,2}*{C+3,1}*
+ */
+
+static char userbuf[512 << 12];
+
+unsigned long handle_faulty_iov(int pid, struct iovec* riov, unsigned long faulty_index,
+				struct iovec *bufvec, struct iovec* aux_iov,
+				unsigned long* aux_len, unsigned long partial_read_bytes)
+{
+	/* Handling Case 2 */
+	if (riov[faulty_index].iov_len == PAGE_SIZE) {
+		cnt_sub(CNT_PAGES_WRITTEN, 1);
+		return 0;
+	}
+
+	struct iovec dummy;
+	ssize_t bytes_read;
+	unsigned long offset = 0;
+	unsigned long final_read_cnt = 0;
+
+	/* Handling Case 3 - Part 3.2 */
+	offset = (partial_read_bytes)? partial_read_bytes : PAGE_SIZE;
+
+	dummy.iov_base = riov[faulty_index].iov_base + offset;
+	dummy.iov_len = riov[faulty_index].iov_len - offset;
+
+	if (!partial_read_bytes)
+		cnt_sub(CNT_PAGES_WRITTEN, 1);
+
+	while (dummy.iov_len) {
+
+		bytes_read = process_vm_readv(pid, bufvec, 1, &dummy, 1, 0);
+
+		/*
+		 * TODO: Optimize following snippet, as it incurs 1M syscalls
+		 *       if 4G memory is unmapped
+		 */
+		if(bytes_read == -1) {
+			/* Handling faulty page read in faulty iov */
+			cnt_sub(CNT_PAGES_WRITTEN, 1);
+			dummy.iov_base += PAGE_SIZE;
+			dummy.iov_len -= PAGE_SIZE;
+			continue;
+		}
+
+		/* If aux-iov can merge and expand or new entry required */
+		if (aux_iov[(*aux_len)-1].iov_base +
+			aux_iov[(*aux_len)-1].iov_len == dummy.iov_base)
+			aux_iov[(*aux_len)-1].iov_len += bytes_read;
+		else {
+			aux_iov[*aux_len].iov_base = dummy.iov_base;
+			aux_iov[*aux_len].iov_len = bytes_read;
+			(*aux_len) += 1;
+		}
+
+		dummy.iov_base += bytes_read;
+		dummy.iov_len -= bytes_read;
+		bufvec->iov_base += bytes_read;
+		bufvec->iov_len -= bytes_read;
+		final_read_cnt += bytes_read;
+	}
+
+	return final_read_cnt;
+}
+
+
+/*
+ * This function will position start pointer to the latest
+ * successfully read iov in iovec. In case of partial read it
+ * returns partial_read_bytes, otherwise 0.
+ */
+static unsigned long analyze_iov(ssize_t bytes_read, struct iovec* riov,
+				unsigned long *index, struct iovec *aux_iov,
+				unsigned long *aux_len)
+{
+	ssize_t processed_bytes = 0;
+	unsigned long partial_read_bytes = 0;
+
+	/* correlating iovs with read bytes */
+	while (processed_bytes < bytes_read) {
+
+		processed_bytes += riov[*index].iov_len;
+		aux_iov[*aux_len].iov_base = riov[*index].iov_base;
+		aux_iov[*aux_len].iov_len = riov[*index].iov_len;
+
+		(*aux_len) += 1;
+		(*index) += 1;
+	}
+
+	/* Handling a partially processed faulty iov */
+	if (processed_bytes - bytes_read) {
+
+		(*index) -= 1;
+
+		partial_read_bytes = riov[*index].iov_len
+					- (processed_bytes - bytes_read);
+		aux_iov[*aux_len-1].iov_len = partial_read_bytes;
+	}
+
+	return partial_read_bytes;
+}
+
+
+/*
+ * This function iterates over complete ppb->iov entries and passes them to
+ * process_vm_readv syscall.
+ *
+ * Since process_vm_readv returns only the count of successfully read
+ * bytes, it does not point to the iovec entry associated with the last
+ * successful byte read. We set up this correlation between bytes read
+ * and the corresponding iovecs through the analyze_iov function.
+ *
+ * If all iovecs are not processed in one go, there exists some faulty
+ * iov entry (a memory mapping modified since it was grabbed) in the iovec.
+ * The process_vm_readv syscall stops at such a faulty iov and does not
+ * process any further entries. This is handled by handle_faulty_iov.
+ */
+static long fill_userbuf(int pid, struct page_pipe_buf *ppb, struct iovec *bufvec,
+					struct iovec* aux_iov, unsigned long *aux_len)
+{
+	struct iovec *riov = ppb->iov;
+	ssize_t bytes_read;
+	unsigned long total_read = 0;
+	unsigned long start = 0;
+	unsigned long partial_read_bytes = 0;
+
+	while (start < ppb->nr_segs) {
+
+		bytes_read = process_vm_readv(pid, bufvec, 1, &riov[start],
+							ppb->nr_segs - start, 0);
+
+		if (bytes_read == -1) {
+			/* Handling Case 1 */
+			if (riov[start].iov_len == PAGE_SIZE) {
+				cnt_sub(CNT_PAGES_WRITTEN, 1);
+				start += 1;
+				continue;
+
+			} else if (errno == ESRCH) {
+				pr_debug("Target process PID:%d not found\n", pid);
+				return PR_UNAVIL;
+			}
+		}
+
+		partial_read_bytes = 0;
+
+		if (bytes_read > 0) {
+
+			partial_read_bytes = analyze_iov(bytes_read, riov, &start,
+								aux_iov, aux_len);
+			bufvec->iov_base += bytes_read;
+			bufvec->iov_len -= bytes_read;
+			total_read += bytes_read;
+		}
+
+		/* If all iovs are not processed in one go,
+		   it means some iov in between has failed */
+		if (start < ppb->nr_segs)
+			total_read += handle_faulty_iov(pid, riov, start, bufvec,
+						aux_iov, aux_len, partial_read_bytes);
+		start += 1;
+
+	}
+
+	return total_read;
+}
+
+
+/*
+ * This function is similar to page_xfer_dump_pages, instead it uses
+ * auxiliary_iov array for pagemap generation.
+ *
+ * The entries of ppb->iov may mismatch the actual process mappings
+ * present at the time of the memory dump. Such entries need adjusting
+ * as per the pages read by the process_vm_readv syscall. The adjusted
+ * entries, along with the unmodified ones, are placed in the aux_iov array.
+ */
+int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *pp)
+{
+	struct page_pipe_buf *ppb;
+	unsigned int cur_hole = 0, i;
+	unsigned long ret, bytes_read;
+	struct iovec bufvec;
+
+	struct iovec aux_iov[PIPE_MAX_SIZE];
+	unsigned long aux_len;
+
+	list_for_each_entry(ppb, &pp->bufs, l) {
+
+		timing_start(TIME_MEMDUMP);
+
+		aux_len = 0;
+		bufvec.iov_len = sizeof(userbuf);
+		bufvec.iov_base = userbuf;
+
+		bytes_read = fill_userbuf(pid, ppb, &bufvec, aux_iov, &aux_len);
+
+		if (bytes_read == PR_UNAVIL)
+			return -1;
+
+		bufvec.iov_base = userbuf;
+		bufvec.iov_len = bytes_read;
+		ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
+
+		if (ret == -1 || ret != bytes_read) {
+			pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret);
+			return -1;
+		}
+
+		timing_stop(TIME_MEMDUMP);
+
+		timing_start(TIME_MEMWRITE);
+
+		/* generating pagemap */
+		for (i = 0; i < aux_len; i++) {
+
+			struct iovec iov = aux_iov[i];
+			u32 flags;
+
+			ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base);
+			if (ret)
+				return ret;
+
+			BUG_ON(iov.iov_base < (void *)xfer->offset);
+			iov.iov_base -= xfer->offset;
+			pr_debug("\tp %p [%u]\n", iov.iov_base,
+					(unsigned int)(iov.iov_len / PAGE_SIZE));
+
+			flags = ppb_xfer_flags(xfer, ppb);
+
+			if (xfer->write_pagemap(xfer, &iov, flags))
+				return -1;
+			if (xfer->write_pages(xfer, ppb->p[0], iov.iov_len))
+				return -1;
+
+		}
+
+		timing_stop(TIME_MEMWRITE);
+	}
+
+	timing_start(TIME_MEMWRITE);
+
+	return dump_holes(xfer, pp, &cur_hole, NULL);
+}
+
+
 int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
 {
 	struct page_pipe_buf *ppb;