[1/2,UGLY] allow fetching pages from remote page-server

Submitted by Adrian Reber on April 26, 2016, 9:38 a.m.

Details

Message ID 1461663490-5425-2-git-send-email-adrian@lisas.de
State Rejected
Series "Series without cover letter"
Headers show

Commit Message

Adrian Reber April 26, 2016, 9:38 a.m.
From: Mike Rapoport <rppt@linux.vnet.ibm.com>

---
 criu/cr-restore.c         |   5 +-
 criu/crtools.c            |   4 ++
 criu/include/cr_options.h |   1 +
 criu/include/page-read.h  |   6 ++
 criu/include/page-xfer.h  |   7 ++
 criu/page-read.c          |  49 ++++++++++++--
 criu/page-xfer.c          | 163 +++++++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 227 insertions(+), 8 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 34db7f7..5ae1d28 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -453,10 +453,13 @@  static int restore_priv_vma_content(void)
 	unsigned int nr_lazy = 0;
 	unsigned long va;
 	struct page_read pr;
+	int pr_flags = PR_TASK;
 
 	vma = list_first_entry(vmas, struct vma_area, list);
 
-	ret = open_page_read(current->pid.virt, &pr, PR_TASK);
+	if (opts.use_page_client)
+		pr_flags |= PR_REMOTE;
+	ret = open_page_read(current->pid.virt, &pr, pr_flags);
 	if (ret <= 0)
 		return -1;
 
diff --git a/criu/crtools.c b/criu/crtools.c
index 93ca90b..211d883 100644
--- a/criu/crtools.c
+++ b/criu/crtools.c
@@ -321,6 +321,7 @@  int main(int argc, char *argv[], char *envp[])
 		{ "extra",			no_argument,		0, 1077	},
 		{ "experimental",		no_argument,		0, 1078	},
 		{ "all",			no_argument,		0, 1079	},
+		{ "page-client",		no_argument,		0, 1080	},
 		{ },
 	};
 
@@ -623,6 +624,9 @@  int main(int argc, char *argv[], char *envp[])
 			opts.check_extra_features = true;
 			opts.check_experimental_features = true;
 			break;
+		case 1080:
+			opts.use_page_client = true;
+			break;
 		case 'V':
 			pr_msg("Version: %s\n", CRIU_VERSION);
 			if (strcmp(CRIU_GITID, "0"))
diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h
index b6ae3a1..0c8b122 100644
--- a/criu/include/cr_options.h
+++ b/criu/include/cr_options.h
@@ -86,6 +86,7 @@  struct cr_options {
 	struct list_head	join_ns;
 	char			*libdir;
 	bool			use_page_server;
+	bool			use_page_client;
 	unsigned short		port;
 	char			*addr;
 	int			ps_socket;
diff --git a/criu/include/page-read.h b/criu/include/page-read.h
index 3ba1ee9..abeb4c6 100644
--- a/criu/include/page-read.h
+++ b/criu/include/page-read.h
@@ -40,6 +40,8 @@ 
  * All this is implemented in read_pagemap_page.
  */
 
+struct page_xfer;
+
 struct page_read {
 	/*
 	 * gets next vaddr:len pair to work on.
@@ -57,6 +59,8 @@  struct page_read {
 	struct cr_img *pmi;
 	struct cr_img *pi;
 
+	struct page_xfer *xfer;		/* for remote page reader */
+
 	PagemapEntry *pe;		/* current pagemap we are on */
 	struct page_read *parent;	/* parent pagemap (if ->in_parent
 					   pagemap is met in image, then
@@ -75,6 +79,8 @@  struct page_read {
 #define PR_TYPE_MASK	0x3
 #define PR_MOD		0x4	/* Will need to modify */
 
+#define PR_REMOTE	0x8	/* will read pages from remote host */
+
 /*
  * -1 -- error
  *  0 -- no images
diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
index 8492daa..86a7c21 100644
--- a/criu/include/page-xfer.h
+++ b/criu/include/page-xfer.h
@@ -17,6 +17,10 @@  struct page_xfer {
 	int (*write_pages)(struct page_xfer *self, int pipe, unsigned long len);
 	/* transfers one hole -- vaddr:len entry w/o pages */
 	int (*write_hole)(struct page_xfer *self, struct iovec *iov);
+
+	int (*read_pages)(struct page_xfer *self, unsigned long vaddr,
+			  int nr, void *buf);
+
 	void (*close)(struct page_xfer *self);
 
 	/* private data for every page-xfer engine */
@@ -44,4 +48,7 @@  extern int disconnect_from_page_server(void);
 
 extern int check_parent_page_xfer(int fd_type, long id);
 
+extern int page_xfer_read_pages(struct page_xfer *xfer, unsigned long vaddr,
+				int nr, void *buf);
+
 #endif /* __CR_PAGE_XFER__H__ */
diff --git a/criu/page-read.c b/criu/page-read.c
index e5ec76a..8e23dc5 100644
--- a/criu/page-read.c
+++ b/criu/page-read.c
@@ -6,6 +6,7 @@ 
 #include "cr_options.h"
 #include "servicefd.h"
 #include "page-read.h"
+#include "page-xfer.h"
 
 #include "protobuf.h"
 #include "images/pagemap.pb-c.h"
@@ -193,6 +194,13 @@  static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr,
 			vaddr += p_nr * PAGE_SIZE;
 			buf += p_nr * PAGE_SIZE;
 		} while (nr);
+	} else if (pr->xfer) {
+		pr_debug("\tpr%u Read %d remote pages %lx\n", pr->id, nr, vaddr);
+		ret = page_xfer_read_pages(pr->xfer, vaddr, nr, buf);
+		if (ret) {
+			pr_err("cannot get remote pages\n");
+			return -1;
+		}
 	} else {
 		int fd = img_raw_fd(pr->pi);
 		off_t current_vaddr = lseek(fd, 0, SEEK_CUR);
@@ -237,6 +245,11 @@  static void close_page_read(struct page_read *pr)
 	close_image(pr->pmi);
 	if (pr->pi)
 		close_image(pr->pi);
+
+	if (pr->xfer) {
+		pr->xfer->close(pr->xfer);
+		free(pr->xfer);
+	}
 }
 
 static int try_open_parent(int dfd, int pid, struct page_read *pr, int pr_flags)
@@ -301,9 +314,23 @@  int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
 
 	pr->pe = NULL;
 	pr->parent = NULL;
+	pr->xfer = NULL;
 	pr->bunch.iov_len = 0;
 	pr->bunch.iov_base = NULL;
 
+	if (pr_flags & PR_REMOTE) {
+		pr->xfer = malloc(sizeof(*pr->xfer));
+		if (!pr->xfer) {
+			pr_err("failed to reseve memory for page-xfer\n");
+			return -1;
+		}
+
+		if (open_page_xfer(pr->xfer, CR_FD_PAGEMAP, pid)) {
+			pr_err("failed to open page-xfer\n");
+			return -1;
+		}
+	}
+
 	pr->pmi = open_image_at(dfd, i_typ, O_RSTR, (long)pid);
 	if (!pr->pmi)
 		return -1;
@@ -318,17 +345,29 @@  int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
 		return -1;
 	}
 
-	pr->pi = open_pages_image_at(dfd, flags, pr->pmi);
-	if (!pr->pi) {
-		close_page_read(pr);
-		return -1;
+	if (pr_flags & PR_REMOTE) {
+		PagemapHead *h;
+		if (pb_read_one(pr->pmi, &h, PB_PAGEMAP_HEAD) < 0) {
+			pr_err("%s: pb_read_one\n", __func__);
+			return -1;
+		}
+		pagemap_head__free_unpacked(h, NULL);
+
+		pr->skip_pages = NULL;
+	} else {
+		pr->pi = open_pages_image_at(dfd, flags, pr->pmi);
+		if (!pr->pi) {
+			close_page_read(pr);
+			return -1;
+		}
+
+		pr->skip_pages = skip_pagemap_pages;
 	}
 
 	pr->get_pagemap = get_pagemap;
 	pr->put_pagemap = put_pagemap;
 	pr->read_pages = read_pagemap_page;
 	pr->close = close_page_read;
-	pr->skip_pages = skip_pagemap_pages;
 	pr->id = ids++;
 
 	pr_debug("Opened page read %u (parent %u)\n",
diff --git a/criu/page-xfer.c b/criu/page-xfer.c
index 2ebe8cc..df85976 100644
--- a/criu/page-xfer.c
+++ b/criu/page-xfer.c
@@ -13,6 +13,7 @@ 
 #include "image.h"
 #include "page-xfer.h"
 #include "page-pipe.h"
+#include "page-read.h"
 #include "util.h"
 #include "protobuf.h"
 #include "images/pagemap.pb-c.h"
@@ -43,6 +44,8 @@  static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id);
 #define PS_IOV_OPEN	3
 #define PS_IOV_OPEN2	4
 #define PS_IOV_PARENT	5
+#define PS_IOV_OPEN3	6
+#define PS_IOV_GET	7
 
 #define PS_IOV_FLUSH		0x1023
 #define PS_IOV_FLUSH_N_CLOSE	0x1024
@@ -112,6 +115,24 @@  static int page_server_open(int sk, struct page_server_iov *pi)
 	return 0;
 }
 
+static int page_server_open3(int sk, struct page_server_iov *pi)
+{
+	int type;
+	long id;
+	char has_parent = 23;
+
+	type = decode_pm_type(pi->dst_id);
+	id = decode_pm_id(pi->dst_id);
+	pr_debug("Opening %d/%ld\n", type, id);
+
+	if (write(sk, &has_parent, 1) != 1) {
+		pr_perror("Unable to send reponse");
+		return -1;
+	}
+
+	return 0;
+}
+
 static int prep_loc_xfer(struct page_server_iov *pi)
 {
 	if (cxfer.dst_id != pi->dst_id) {
@@ -176,6 +197,57 @@  static int page_server_hole(int sk, struct page_server_iov *pi)
 	return 0;
 }
 
+static int page_server_get(int sk, struct page_server_iov *pi)
+{
+	struct page_read page_read;
+	struct iovec iov;
+	unsigned long len;
+	int type, id, ret;
+	char *buf;
+
+	type = decode_pm_type(pi->dst_id);
+	id = decode_pm_id(pi->dst_id);
+	pr_debug("Get %d/%d\n", type, id);
+
+	len = pi->nr_pages * PAGE_SIZE;
+	buf = malloc(len);
+	if (!buf) {
+		pr_err("allocation failed\n");
+		return -1;
+	}
+
+	open_page_read(id, &page_read, PR_TASK);
+
+	ret = page_read.get_pagemap(&page_read, &iov);
+	pr_debug("get_pagemap ret %d\n", ret);
+	if (ret <= 0)
+		return ret;
+
+	ret = seek_pagemap_page(&page_read, pi->vaddr, true);
+	pr_debug("seek_pagemap_page ret 0x%x\n", ret);
+	if (ret <= 0)
+		return ret;
+
+	ret = page_read.read_pages(&page_read, pi->vaddr, pi->nr_pages, buf);
+	if (ret < 0) {
+		pr_err("%s: read_pages: %d\n", __func__, ret);
+		goto out;
+	}
+
+	ret = write(sk, buf, len);
+	if (ret != len) {
+		pr_err("%s: Can't send the pages:%d\n", __func__, ret);
+		ret = -1;
+		goto out;
+	}
+
+	page_read.close(&page_read);
+	ret = 0;
+out:
+	free(buf);
+	return ret;
+}
+
 static int page_server_check_parent(int sk, struct page_server_iov *pi);
 
 static int page_server_serve(int sk)
@@ -221,6 +293,9 @@  static int page_server_serve(int sk)
 		case PS_IOV_OPEN2:
 			ret = page_server_open(sk, &pi);
 			break;
+		case PS_IOV_OPEN3:
+			ret = page_server_open3(sk, &pi);
+			break;
 		case PS_IOV_PARENT:
 			ret = page_server_check_parent(sk, &pi);
 			break;
@@ -230,6 +305,9 @@  static int page_server_serve(int sk)
 		case PS_IOV_HOLE:
 			ret = page_server_hole(sk, &pi);
 			break;
+		case PS_IOV_GET:
+			ret = page_server_get(sk, &pi);
+			break;
 		case PS_IOV_FLUSH:
 		case PS_IOV_FLUSH_N_CLOSE:
 		{
@@ -322,8 +400,10 @@  static int page_server_sk = -1;
 
 int connect_to_page_server(void)
 {
-	if (!opts.use_page_server)
+	if (!(opts.use_page_server || opts.use_page_client)) {
+		pr_err("mutually exclusive page-server and page-client options\n");
 		return 0;
+	}
 
 	if (opts.ps_socket != -1) {
 		page_server_sk = opts.ps_socket;
@@ -332,8 +412,11 @@  int connect_to_page_server(void)
 	}
 
 	page_server_sk = setup_tcp_client(opts.addr);
-	if (page_server_sk == -1)
+	if (page_server_sk == -1) {
+		pr_err("setup_tcp_client\n");
 		return -1;
+	}
+
 out:
 	/*
 	 * CORK the socket at the very beginning. As per ANK
@@ -431,6 +514,33 @@  static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov)
 	return 0;
 }
 
+static int read_pages_from_server(struct page_xfer *xfer, unsigned long vaddr,
+				  int nr, void *buf)
+{
+	struct page_server_iov pi;
+	unsigned long len;
+	int ret;
+
+	pi.cmd = PS_IOV_GET;
+	pi.dst_id = xfer->dst_id;
+	pi.nr_pages = nr;
+	pi.vaddr = vaddr;
+	len = nr * page_size();
+
+	if (write(xfer->sk, &pi, sizeof(pi)) != sizeof(pi)) {
+		pr_perror("Can't write GET cmd to server");
+		return -1;
+	}
+
+	ret = recv(xfer->sk, buf, len, MSG_WAITALL);
+	if (ret != len) {
+		pr_err("%s: recv failed: %d\n", __func__, ret);
+		return -1;
+	}
+
+	return 0;
+}
+
 static void close_server_xfer(struct page_xfer *xfer)
 {
 	xfer->sk = -1;
@@ -445,6 +555,7 @@  static int open_page_server_xfer(struct page_xfer *xfer, int fd_type, long id)
 	xfer->write_pagemap = write_pagemap_to_server;
 	xfer->write_pages = write_pages_to_server;
 	xfer->write_hole = write_hole_to_server;
+	xfer->read_pages = read_pages_from_server;
 	xfer->close = close_server_xfer;
 	xfer->dst_id = encode_pm_id(fd_type, id);
 	xfer->parent = NULL;
@@ -473,6 +584,46 @@  static int open_page_server_xfer(struct page_xfer *xfer, int fd_type, long id)
 	return 0;
 }
 
+static void close_client_xfer(struct page_xfer *xfer)
+{
+	close(xfer->sk);
+}
+
+static int open_page_client_xfer(struct page_xfer *xfer, int fd_type, long id)
+{
+	struct page_server_iov pi;
+	char has_parent;
+
+	connect_to_page_server();
+
+	xfer->sk = page_server_sk;
+	xfer->read_pages = read_pages_from_server;
+	xfer->close = close_client_xfer;
+	xfer->dst_id = encode_pm_id(fd_type, id);
+	xfer->parent = NULL;
+
+	pi.cmd = PS_IOV_OPEN3;
+	pi.dst_id = xfer->dst_id;
+	pi.vaddr = 0;
+	pi.nr_pages = 0;
+
+	if (write(xfer->sk, &pi, sizeof(pi)) != sizeof(pi)) {
+		pr_perror("Can't write to page server");
+		return -1;
+	}
+
+	/* Push the command NOW */
+	tcp_nodelay(xfer->sk, true);
+
+	if (read(xfer->sk, &has_parent, 1) != 1) {
+		pr_perror("The page server doesn't answer");
+		return -1;
+	}
+	pr_debug("has_parent=%d\n", has_parent);
+
+	return 0;
+}
+
 static int write_pagemap_loc(struct page_xfer *xfer,
 		struct iovec *iov)
 {
@@ -703,6 +854,8 @@  int open_page_xfer(struct page_xfer *xfer, int fd_type, long id)
 {
 	if (opts.use_page_server)
 		return open_page_server_xfer(xfer, fd_type, id);
+	else if (opts.use_page_client)
+		return open_page_client_xfer(xfer, fd_type, id);
 	else
 		return open_page_local_xfer(xfer, fd_type, id);
 }
@@ -785,3 +938,9 @@  int check_parent_page_xfer(int fd_type, long id)
 	else
 		return check_parent_local_xfer(fd_type, id);
 }
+
+int page_xfer_read_pages(struct page_xfer *xfer, unsigned long vaddr,
+			 int nr, void *buf)
+{
+	return xfer->read_pages ? xfer->read_pages(xfer, vaddr, nr, buf) : -1;
+}

Comments

Pavel Emelianov May 13, 2016, 2:08 p.m.
Adrian, Mike, sorry for such a long delay in review :( Here's my comments, inline.

On 04/26/2016 12:38 PM, Adrian Reber wrote:
> From: Mike Rapoport <rppt@linux.vnet.ibm.com>
> 
> ---
>  criu/cr-restore.c         |   5 +-
>  criu/crtools.c            |   4 ++
>  criu/include/cr_options.h |   1 +
>  criu/include/page-read.h  |   6 ++
>  criu/include/page-xfer.h  |   7 ++
>  criu/page-read.c          |  49 ++++++++++++--
>  criu/page-xfer.c          | 163 +++++++++++++++++++++++++++++++++++++++++++++-
>  7 files changed, 227 insertions(+), 8 deletions(-)
> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 34db7f7..5ae1d28 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -453,10 +453,13 @@ static int restore_priv_vma_content(void)
>  	unsigned int nr_lazy = 0;
>  	unsigned long va;
>  	struct page_read pr;
> +	int pr_flags = PR_TASK;
>  
>  	vma = list_first_entry(vmas, struct vma_area, list);
>  
> -	ret = open_page_read(current->pid.virt, &pr, PR_TASK);
> +	if (opts.use_page_client)
> +		pr_flags |= PR_REMOTE;
> +	ret = open_page_read(current->pid.virt, &pr, pr_flags);

Looking at implementation of PR_REMOTE bits I see that every call to
read_pagemap_page() would block in this case which doesn't fit what
we've discussed so far :) The uffd daemon shouldn't block in page fault
handling.

>  	if (ret <= 0)
>  		return -1;
>  

> @@ -785,3 +938,9 @@ int check_parent_page_xfer(int fd_type, long id)
>  	else
>  		return check_parent_local_xfer(fd_type, id);
>  }
> +
> +int page_xfer_read_pages(struct page_xfer *xfer, unsigned long vaddr,
> +			 int nr, void *buf)

Well, the page_xfer stands for pages transfer and it's not supposed to read any pages :)

Instead, the respective page_read should be created. Now we have legacy page read (for
images we no longer generate, will deprecate) and pagemap reader. So we need to create
the remote page_read reader.

The read_pagemap_page routine can be split in any pieces to facilitate that. E.g. -- the
checks for page-in-parent you seem to need in this patch, can be put into a separate helper.

> +{
> +	return xfer->read_pages ? xfer->read_pages(xfer, vaddr, nr, buf) : -1;
> +}
> 

-- Pavel