[6/9] read mode pre-dump implementation

Submitted by Abhishek Dubey on Sept. 22, 2019, 4:24 a.m.

Details

Message ID 1569126298-23300-7-git-send-email-dubeyabhishek777@gmail.com
State New
Series "Series without cover letter"
Headers show

Commit Message

Abhishek Dubey Sept. 22, 2019, 4:24 a.m.
Pre-dump using the process_vm_readv syscall.
During frozen state, only iovecs will be
generated and draining of memory happens
after the task is unfrozen. Pre-dumping of
shared memory remains unmodified.

Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
---
 criu/cr-dump.c           |   3 +-
 criu/include/page-xfer.h |   4 +
 criu/page-xfer.c         | 382 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 388 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index e7618d2..23399a9 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -1516,7 +1516,8 @@  static int cr_pre_dump_finish(int status)
 		mem_pp = dmpi(item)->mem_pp;
 
 		if (opts.pre_dump_mode == PRE_DUMP_READ)
-			ret = 0;  /* Replace with call to optimized pre-dump */
+			ret = page_xfer_predump_pages(item->pid->real,
+							&xfer, mem_pp);
 		else
 			ret = page_xfer_dump_pages(&xfer, mem_pp);
 
diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
index fa72273..98061e2 100644
--- a/criu/include/page-xfer.h
+++ b/criu/include/page-xfer.h
@@ -9,6 +9,9 @@  struct ps_info {
 
 extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd);
 
+/* User buffer for read-mode pre-dump*/
+#define BUFFER_SIZE (PIPE_MAX_SIZE << PAGE_SHIFT)
+
 /*
  * page_xfer -- transfer pages into image file.
  * Two images backends are implemented -- local image file
@@ -48,6 +51,7 @@  struct page_xfer {
 extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id);
 struct page_pipe;
 extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *);
+extern int page_xfer_predump_pages(int pid, struct page_xfer *, struct page_pipe *);
 extern int connect_to_page_server_to_send(void);
 extern int connect_to_page_server_to_recv(int epfd);
 extern int disconnect_from_page_server(void);
diff --git a/criu/page-xfer.c b/criu/page-xfer.c
index fe457d2..1eeee5a 100644
--- a/criu/page-xfer.c
+++ b/criu/page-xfer.c
@@ -499,6 +499,388 @@  static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p
 		return PE_PRESENT;
 }
 
+/*
+ * Optimized pre-dump algorithm
+ * ==============================
+ *
+ * Note: Please refer man(2) page of process_vm_readv syscall.
+ *
+ * The following discussion covers the possibly faulty-iov
+ * locations in an iovec, which hinders process_vm_readv from
+ * dumping the entire iovec in a single invocation.
+ *
+ * Memory layout of target process:
+ *
+ * Pages: A        B        C
+ *	  +--------+--------+--------+--------+--------+--------+
+ *	  |||||||||||||||||||||||||||||||||||||||||||||||||||||||
+ *	  +--------+--------+--------+--------+--------+--------+
+ *
+ * Single "iov" representation: {starting_address, length_in_bytes}
+ * An iovec is array of iov-s.
+ *
+ * NOTE: For easy representation and discussion purpose, we carry
+ *	 out further discussion at "page granularity".
+ *	 length_in_bytes will represent page count in iov instead
+ *	 of byte count. Same assumption applies for the syscall's
+ *	 return value. Instead of returning the number of bytes
+ *	 read, it returns a page count.
+ *
+ * For above memory mapping, generated iovec: {A,1}{B,1}{C,4}
+ *
+ * This iovec remains unmodified once generated. At the same
+ * time some of memory regions listed in iovec may get modified
+ * (unmap/change protection) by the target process while syscall
+ * is trying to dump iovec regions.
+ *
+ * Case 1:
+ *	A is unmapped, {A,1} become faulty iov
+ *
+ *      A        B        C
+ *      +--------+--------+--------+--------+--------+--------+
+ *      |        ||||||||||||||||||||||||||||||||||||||||||||||
+ *      +--------+--------+--------+--------+--------+--------+
+ *      ^        ^
+ *      |        |
+ *      start    |
+ *      (1)      |
+ *               start
+ *               (2)
+ *
+ *	process_vm_readv will return -1. Increment start pointer(2),
+ *	syscall will process {B,1}{C,4} in one go and copy 5 pages
+ *	to userbuf from iov-B and iov-C.
+ *
+ * Case 2:
+ *	B is unmapped, {B,1} become faulty iov
+ *
+ *      A        B        C
+ *      +--------+--------+--------+--------+--------+--------+
+ *      |||||||||         |||||||||||||||||||||||||||||||||||||
+ *      +--------+--------+--------+--------+--------+--------+
+ *      ^                 ^
+ *      |                 |
+ *      start             |
+ *      (1)               |
+ *                        start
+ *                        (2)
+ *
+ *	process_vm_readv will return 1, i.e. page A copied to
+ *	userbuf successfully and syscall stopped, since B got
+ *	unmapped.
+ *
+ *	Increment the start pointer to C(2) and invoke syscall.
+ *	Userbuf contains 5 pages overall from iov-A and iov-C.
+ *
+ * Case 3:
+ *	This case deals with partial unmapping of iov representing
+ *	more than one pagesize region.
+ *
+ *	Syscall can't process such faulty iov as whole. So we
+ *	process such regions part-by-part and form new sub-iovs
+ *	in aux_iov from successfully processed pages.
+ *
+ *
+ *	Part 3.1:
+ *		First page of C is unmapped
+ *
+ *      A        B        C
+ *      +--------+--------+--------+--------+--------+--------+
+ *      ||||||||||||||||||         ||||||||||||||||||||||||||||
+ *      +--------+--------+--------+--------+--------+--------+
+ *      ^                          ^
+ *      |                          |
+ *      start                      |
+ *      (1)                        |
+ *                                 dummy
+ *                                 (2)
+ *
+ *	process_vm_readv will return 2, i.e. pages A and B copied.
+ *	We identify length of iov-C is more than 1 page, that is
+ *	where this case differs from Case 2.
+ *
+ *	dummy-iov is introduced(2) as: {C+1,3}. dummy-iov can be
+ *	directly placed at next page to failing page. This will copy
+ *	remaining 3 pages from iov-C to userbuf. Finally create
+ *	modified iov entry in aux_iov. Complete aux_iov look like:
+ *
+ *	aux_iov: {A,1}{B,1}{C+1,3}*
+ *
+ *
+ *	Part 3.2:
+ *		In between page of C is unmapped, let's say third
+ *
+ *      A        B        C
+ *      +--------+--------+--------+--------+--------+--------+
+ *      ||||||||||||||||||||||||||||||||||||         ||||||||||
+ *      +--------+--------+--------+--------+--------+--------+
+ *      ^                                            ^
+ *      |                 |-----------------|        |
+ *      start              partial_read_bytes        |
+ *      (1)                                          |
+ *                                                   dummy
+ *                                                   (2)
+ *
+ *	process_vm_readv will return 4, i.e. pages A and B copied
+ *	completely and first two pages of C are also copied.
+ *
+ *	Since, iov-C is not processed completely, we need to find
+ *	"partial_read_byte" count to place out dummy-iov for
+ *	remainig processing of iov-C. This function is performed by
+ *	analyze_iov function.
+ *
+ *	dummy-iov will be(2): {C+3,1}. dummy-iov will be placed
+ *	next to first failing address to process remaining iov-C.
+ *	New entries in aux_iov will look like:
+ *
+ *	aux_iov: {A,1}{B,1}{C,2}*{C+3,1}*
+ */
+
+unsigned long handle_faulty_iov(int pid, struct iovec* riov,
+				unsigned long faulty_index,
+				struct iovec *bufvec, struct iovec* aux_iov,
+				unsigned long* aux_len,
+				unsigned long partial_read_bytes)
+{
+	/* Handling Case 2*/
+	if (riov[faulty_index].iov_len == PAGE_SIZE) {
+		cnt_sub(CNT_PAGES_WRITTEN, 1);
+		return 0;
+	}
+
+	struct iovec dummy;
+	ssize_t bytes_read;
+	unsigned long offset = 0;
+	unsigned long final_read_cnt = 0;
+
+	/* Handling Case 3-Part 3.2*/
+	offset = (partial_read_bytes)? partial_read_bytes : PAGE_SIZE;
+
+	dummy.iov_base = riov[faulty_index].iov_base + offset;
+	dummy.iov_len = riov[faulty_index].iov_len - offset;
+
+	if (!partial_read_bytes)
+		cnt_sub(CNT_PAGES_WRITTEN, 1);
+
+	while (dummy.iov_len) {
+
+		bytes_read = process_vm_readv(pid, bufvec, 1, &dummy, 1, 0);
+
+		if(bytes_read == -1) {
+			/* Handling faulty page read in faulty iov */
+			cnt_sub(CNT_PAGES_WRITTEN, 1);
+			dummy.iov_base += PAGE_SIZE;
+			dummy.iov_len -= PAGE_SIZE;
+			continue;
+		}
+
+		/* If aux-iov can merge and expand or new entry required */
+		if (aux_iov[(*aux_len)-1].iov_base +
+			aux_iov[(*aux_len)-1].iov_len == dummy.iov_base)
+			aux_iov[(*aux_len)-1].iov_len += bytes_read;
+		else {
+			aux_iov[*aux_len].iov_base = dummy.iov_base;
+			aux_iov[*aux_len].iov_len = bytes_read;
+			(*aux_len) += 1;
+		}
+
+		dummy.iov_base += bytes_read;
+		dummy.iov_len -= bytes_read;
+		bufvec->iov_base += bytes_read;
+		bufvec->iov_len -= bytes_read;
+		final_read_cnt += bytes_read;
+	}
+
+	return final_read_cnt;
+}
+
+/*
+ * This function will position start pointer to the latest
+ * successfully read iov in iovec. In case of partial read it
+ * returns partial_read_bytes, otherwise 0.
+ */
+static unsigned long analyze_iov(ssize_t bytes_read, struct iovec* riov,
+				 unsigned long *index, struct iovec *aux_iov,
+				 unsigned long *aux_len)
+{
+	ssize_t processed_bytes = 0;
+	unsigned long partial_read_bytes = 0;
+
+	/* correlating iovs with read bytes */
+	while (processed_bytes < bytes_read) {
+
+		processed_bytes += riov[*index].iov_len;
+		aux_iov[*aux_len].iov_base = riov[*index].iov_base;
+		aux_iov[*aux_len].iov_len = riov[*index].iov_len;
+
+		(*aux_len) += 1;
+		(*index) += 1;
+	}
+
+	/* handling partially processed faulty iov*/
+	if (processed_bytes - bytes_read) {
+
+		(*index) -= 1;
+
+		partial_read_bytes = riov[*index].iov_len
+					- (processed_bytes - bytes_read);
+		aux_iov[*aux_len-1].iov_len = partial_read_bytes;
+	}
+
+	return partial_read_bytes;
+}
+
+/*
+ * This function iterates over complete ppb->iov entries and pass
+ * them to process_vm_readv syscall.
+ *
+ * Since process_vm_readv returns count of successfully read bytes.
+ * It does not point to iovec entry associated to last successful
+ * byte read. The correlation between bytes read and corresponding
+ * iovec is setup through analyze_iov function.
+ *
+ * If all iovecs are not processed in one go, it means there exists
+ * some faulty iov entry(memory mapping modified after it was grabbed)
+ * in iovec. process_vm_readv syscall stops at such faulty iov and
+ * skip processing further any entry in iovec. This is handled by
+ * handle_faulty_iov function.
+ */
+static long fill_userbuf(int pid, struct page_pipe_buf *ppb,
+				  struct iovec *bufvec,
+				  struct iovec* aux_iov,
+				  unsigned long *aux_len)
+{
+	struct iovec *riov = ppb->iov;
+	ssize_t bytes_read;
+	unsigned long total_read = 0;
+	unsigned long start = 0;
+	unsigned long partial_read_bytes = 0;
+
+	while (start < ppb->nr_segs) {
+
+		bytes_read = process_vm_readv(pid, bufvec, 1, &riov[start],
+						   ppb->nr_segs - start, 0);
+
+		if (bytes_read == -1) {
+			/* Handling Case 1*/
+			if (riov[start].iov_len == PAGE_SIZE) {
+				cnt_sub(CNT_PAGES_WRITTEN, 1);
+				start += 1;
+				continue;
+			} else if (errno == ESRCH) {
+				pr_debug("Target process PID:%d not found\n", pid);
+				return ESRCH;
+			}
+		}
+
+		partial_read_bytes = 0;
+
+		if (bytes_read > 0) {
+			partial_read_bytes = analyze_iov(bytes_read, riov,
+							 &start, aux_iov,
+							 aux_len);
+			bufvec->iov_base += bytes_read;
+			bufvec->iov_len -= bytes_read;
+			total_read += bytes_read;
+		}
+
+		/*
+		 * If all iovs not processed in one go,
+		 * it means some iov in between has failed.
+		 */
+		if (start < ppb->nr_segs)
+			total_read += handle_faulty_iov(pid, riov, start, bufvec,
+							aux_iov, aux_len,
+							partial_read_bytes);
+
+		start += 1;
+	}
+
+	return total_read;
+}
+
+/*
+ * This function is similar to page_xfer_dump_pages, instead it uses
+ * auxiliary_iov array for pagemap generation.
+ *
+ * The entries of ppb->iov may mismatch with actual process mappings
+ * present at time of pre-dump. Such entries need to be adjusted as per
+ * the pages read by process_vm_readv syscall. These adjusted entries
+ * along with unmodified entries are present in aux_iov array.
+ */
+
+int page_xfer_predump_pages(int pid, struct page_xfer *xfer,
+				     struct page_pipe *pp)
+{
+	struct page_pipe_buf *ppb;
+	unsigned int cur_hole = 0, i;
+	unsigned long ret, bytes_read;
+	struct iovec bufvec;
+
+	struct iovec aux_iov[PIPE_MAX_SIZE];
+	unsigned long aux_len;
+
+	char *userbuf = xzalloc(BUFFER_SIZE);
+
+	list_for_each_entry(ppb, &pp->bufs, l) {
+
+		aux_len = 0;
+		bufvec.iov_len = BUFFER_SIZE;
+		bufvec.iov_base = userbuf;
+
+		bytes_read = fill_userbuf(pid, ppb, &bufvec, aux_iov, &aux_len);
+
+		if (bytes_read == ESRCH) {
+			xfree(userbuf);
+			return -1;
+		}
+
+		bufvec.iov_base = userbuf;
+		bufvec.iov_len = bytes_read;
+		ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK);
+
+		if (ret == -1 || ret != bytes_read) {
+			pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret);
+			xfree(userbuf);
+			return -1;
+		}
+
+		/* generating pagemap */
+		for (i = 0; i < aux_len; i++) {
+
+			struct iovec iov = aux_iov[i];
+			u32 flags;
+
+			ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base);
+			if (ret) {
+				xfree(userbuf);
+				return ret;
+			}
+
+			BUG_ON(iov.iov_base < (void *)xfer->offset);
+			iov.iov_base -= xfer->offset;
+			pr_debug("\t p %p [%u]\n", iov.iov_base,
+				(unsigned int)(iov.iov_len / PAGE_SIZE));
+
+			flags = ppb_xfer_flags(xfer, ppb);
+
+			if (xfer->write_pagemap(xfer, &iov, flags)) {
+				xfree(userbuf);
+				return -1;
+			}
+
+			if (xfer->write_pages(xfer, ppb->p[0], iov.iov_len)) {
+				xfree(userbuf);
+				return -1;
+			}
+		}
+
+	}
+
+	xfree(userbuf);
+	return dump_holes(xfer, pp, &cur_hole, NULL);
+}
+
 int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp)
 {
 	struct page_pipe_buf *ppb;