[8/7] page-xfer: Introduce fully asynchronous read

Submitted by Pavel Emelianov on Nov. 16, 2016, 1:19 p.m.

Details

Message ID 582C5CF5.50505@virtuozzo.com
State Superseded
Headers show

Commit Message

Pavel Emelianov Nov. 16, 2016, 1:19 p.m.
Add a queue of async-read jobs into page-xfer. When the 
page_server_sk gets a read event from epoll it reads as 
many bytes into page_server_iov + page buffer as recv
allows and returns.

Once the full iov+data is ready the requestor is notified
and the next async read is started.

This patch removes calls to recv(...MSG_WAITALL) from all
remote async paths.

Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
---
 criu/include/page-xfer.h |  4 +++
 criu/page-xfer.c         | 85 ++++++++++++++++++++++++++++++++++++++++++++++++
 criu/pagemap.c           | 19 ++++++++++-
 criu/uffd.c              | 68 ++------------------------------------
 4 files changed, 110 insertions(+), 66 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
index 5b7ec21..35425f1 100644
--- a/criu/include/page-xfer.h
+++ b/criu/include/page-xfer.h
@@ -57,5 +57,9 @@  extern int request_remote_pages(int pid, unsigned long addr, int nr_pages);
 extern int receive_remote_pages_info(int *nr_pages, unsigned long *addr, int *pid);
 extern int receive_remote_pages(int len, void *buf);
 
+typedef int (*ps_async_read_complete)(int pid, unsigned long vaddr, int nr_pages, void *);
+extern int page_server_start_async_read(void *buf, int nr_pages,
+		ps_async_read_complete complete, void *priv);
+extern int page_server_async_read(void);
 
 #endif /* __CR_PAGE_XFER__H__ */
diff --git a/criu/page-xfer.c b/criu/page-xfer.c
index 2d878e2..234c38d 100644
--- a/criu/page-xfer.c
+++ b/criu/page-xfer.c
@@ -887,6 +887,91 @@  out:
 	return ret ? : status;
 }
 
+struct ps_async_read {
+	unsigned long rb; /* read bytes */
+	unsigned long goal;
+
+	struct page_server_iov pi;
+	void *pages;
+
+	ps_async_read_complete complete;
+	void *priv;
+
+	struct list_head l;
+};
+
+static LIST_HEAD(async_reads);
+
+int page_server_start_async_read(void *buf, int nr_pages,
+		ps_async_read_complete complete, void *priv)
+{
+	struct ps_async_read *ar;
+
+	ar = xmalloc(sizeof(*ar));
+	if (ar == NULL)
+		return -1;
+
+	ar->pages = buf;
+	ar->rb = 0;
+	ar->goal = sizeof(ar->pi) + nr_pages * PAGE_SIZE;
+	ar->complete = complete;
+	ar->priv = priv;
+
+	list_add_tail(&ar->l, &async_reads);
+	return 0;
+}
+
+/*
+ * There are two possible event types we need to handle:
+ * - page info is available as a reply to request_remote_page
+ * - page data is available, and it follows page info we've just received
+ * Since the on dump side communications are completely synchronous,
+ * we can return to epoll right after the reception of page info and
+ * for sure the next time socket event will occur we'll get page data
+ * related to info we've just received
+ */
+int page_server_async_read(void)
+{
+	struct ps_async_read *ar;
+	int ret, need;
+	void *buf;
+
+	BUG_ON(list_empty(&async_reads));
+	ar = list_first_entry(&async_reads, struct ps_async_read, l);
+
+	if (ar->rb < sizeof(ar->pi)) {
+		/* Header */
+		buf = ((void *)&ar->pi) + ar->rb;
+		need = sizeof(ar->pi) - ar->rb;
+	} else {
+		/* Page(s) data itself */
+		buf = ar->pages + (ar->rb - sizeof(ar->pi));
+		need = ar->goal - ar->rb;
+	}
+
+	ret = recv(page_server_sk, buf, need, MSG_DONTWAIT);
+	if (ret < 0) {
+		pr_perror("Error reading async data from page server\n");
+		return -1;
+	}
+
+	ar->rb += ret;
+	if (ar->rb < ar->goal)
+		return 0;
+
+	/*
+	 * IO complete -- notify the caller and drop the request
+	 */
+	BUG_ON(ar->rb > ar->goal);
+	ret = ar->complete((int)ar->pi.dst_id, (unsigned long)ar->pi.vaddr,
+				(int)ar->pi.nr_pages, ar->priv);
+
+	list_del(&ar->l);
+	xfree(ar);
+
+	return ret;
+}
+
 int request_remote_pages(int pid, unsigned long addr, int nr_pages)
 {
 	struct page_server_iov pi = {
diff --git a/criu/pagemap.c b/criu/pagemap.c
index f4439ec..9abacb4 100644
--- a/criu/pagemap.c
+++ b/criu/pagemap.c
@@ -403,6 +403,23 @@  static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr,
 	return ret;
 }
 
+static int read_page_complete(int pid, unsigned long vaddr, int nr_pages, void *priv)
+{
+	int ret = 0;
+	struct page_read *pr = priv;
+
+	if (pr->pid != pid) {
+		pr_err("Out of order read completed (want %d have %d)\n",
+				pr->pid, pid);
+		return -1;
+	}
+
+	if (pr->io_complete)
+		ret = pr->io_complete(pr, vaddr, nr_pages);
+
+	return ret;
+}
+
 static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr,
 		int nr, void *buf, unsigned flags)
 {
@@ -411,7 +428,7 @@  static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr,
 	/* We always do PR_ASAP mode here (FIXME?) */
 	ret = request_remote_pages(pr->pid, vaddr, nr);
 	if ((ret < 0) || (flags & PR_ASYNC))
-		return ret;
+		return page_server_start_async_read(buf, nr, read_page_complete, pr);
 
 	/*
 	 * Note, that for async remote page_read, the actual
diff --git a/criu/uffd.c b/criu/uffd.c
index 09cb218..cd370aa 100644
--- a/criu/uffd.c
+++ b/criu/uffd.c
@@ -55,16 +55,9 @@  struct lazy_iovec {
 
 struct lazy_pages_info;
 
-struct sk_event_data {
-	int nr_pages;
-	unsigned long addr;
-	struct lazy_pages_info *lpi;
-};
-
 struct lazy_pages_fd {
 	int fd;
 	int (*event)(struct lazy_pages_fd *);
-	struct sk_event_data *ev_data;
 };
 
 struct lazy_pages_info {
@@ -120,18 +113,6 @@  static void lpi_fini(struct lazy_pages_info *lpi)
 	free(lpi);
 }
 
-static struct lazy_pages_info *pid2lpi(int pid)
-{
-	struct lazy_pages_info *lpi;
-
-	list_for_each_entry(lpi, &lpis, l) {
-		if (lpi->pid == pid)
-			return lpi;
-	}
-
-	return NULL;
-}
-
 static int epoll_nr_fds(int nr_tasks)
 {
        if (opts.use_page_server)
@@ -927,56 +908,12 @@  close_uffd:
 	return -1;
 }
 
-/*
- * There are two possible event types we need to handle:
- * - page info is available as a reply to request_remote_page
- * - page data is available, and it follows page info we've just received
- * Since the on dump side communications are completely synchronous,
- * we can return to epoll right after the reception of page info and
- * for sure the next time socket event will occur we'll get page data
- * related to info we've just received
- */
 static int page_server_event(struct lazy_pages_fd *lpfd)
 {
-	struct lazy_pages_info *lpi;
-	int pid, nr_pages;
-	unsigned long addr;
-
-	lpi = lpfd->ev_data->lpi;
-
-	if (!lpi) {
-		if (receive_remote_pages_info(&nr_pages, &addr, &pid))
-			return -1;
-
-		lpi = pid2lpi(pid);
-		if (!lpi)
-			return -1;
-
-		lpfd->ev_data->lpi = lpi;
-		lpfd->ev_data->nr_pages = nr_pages;
-		lpfd->ev_data->addr = addr;
-
-		return 0;
-	} else {
-		lpi = lpfd->ev_data->lpi;
-		nr_pages = lpfd->ev_data->nr_pages;
-		addr = lpfd->ev_data->addr;
-
-		memset(lpfd->ev_data, 0, sizeof(*lpfd->ev_data));
-
-		if (receive_remote_pages(nr_pages * PAGE_SIZE, lpi->buf))
-			return -1;
-
-		return complete_page_fault(lpi, addr, nr_pages);
-	}
+	return page_server_async_read();
 }
 
-static struct sk_event_data sk_event_data;
-
-static struct lazy_pages_fd page_server_sk_fd = {
-	.event = page_server_event,
-	.ev_data = &sk_event_data,
-};
+static struct lazy_pages_fd page_server_sk_fd;
 
 static int prepare_page_server_socket(int epollfd)
 {
@@ -986,6 +923,7 @@  static int prepare_page_server_socket(int epollfd)
 	if (sk < 0)
 		return -1;
 
+	page_server_sk_fd.event = page_server_event;
 	page_server_sk_fd.fd = sk;
 
 	return epoll_add_lpfd(epollfd, &page_server_sk_fd);

Comments

Mike Rapoport Nov. 16, 2016, 3:04 p.m.
On Wed, Nov 16, 2016 at 04:19:49PM +0300, Pavel Emelyanov wrote:
> Add a queue of async-read jobs into page-xfer. When the 
> page_server_sk gets a read event from epoll it reads as 
> many bytes into page_server_iov + page buffer as recv
> allows and returns.
> 
> Once the full iov+data is ready the requestor is notified
> and the next async read is started.
> 
> This patch removes calls to recv(...MSG_WAITALL) from all
> remote async paths.
> 
> Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
> ---
>  criu/include/page-xfer.h |  4 +++
>  criu/page-xfer.c         | 85 ++++++++++++++++++++++++++++++++++++++++++++++++
>  criu/pagemap.c           | 19 ++++++++++-
>  criu/uffd.c              | 68 ++------------------------------------
>  4 files changed, 110 insertions(+), 66 deletions(-)
> 
> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
> index 5b7ec21..35425f1 100644
> --- a/criu/include/page-xfer.h
> +++ b/criu/include/page-xfer.h
> @@ -57,5 +57,9 @@ extern int request_remote_pages(int pid, unsigned long addr, int nr_pages);
>  extern int receive_remote_pages_info(int *nr_pages, unsigned long *addr, int *pid);
>  extern int receive_remote_pages(int len, void *buf);
> 
> +typedef int (*ps_async_read_complete)(int pid, unsigned long vaddr, int nr_pages, void *);
> +extern int page_server_start_async_read(void *buf, int nr_pages,
> +		ps_async_read_complete complete, void *priv);
> +extern int page_server_async_read(void);
> 
>  #endif /* __CR_PAGE_XFER__H__ */
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index 2d878e2..234c38d 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -887,6 +887,91 @@ out:
>  	return ret ? : status;
>  }
> 
> +struct ps_async_read {
> +	unsigned long rb; /* read bytes */
> +	unsigned long goal;
> +
> +	struct page_server_iov pi;
> +	void *pages;
> +
> +	ps_async_read_complete complete;
> +	void *priv;
> +
> +	struct list_head l;
> +};
> +
> +static LIST_HEAD(async_reads);
> +
> +int page_server_start_async_read(void *buf, int nr_pages,
> +		ps_async_read_complete complete, void *priv)
> +{
> +	struct ps_async_read *ar;
> +
> +	ar = xmalloc(sizeof(*ar));
> +	if (ar == NULL)
> +		return -1;
> +
> +	ar->pages = buf;
> +	ar->rb = 0;
> +	ar->goal = sizeof(ar->pi) + nr_pages * PAGE_SIZE;
> +	ar->complete = complete;
> +	ar->priv = priv;
> +
> +	list_add_tail(&ar->l, &async_reads);
> +	return 0;
> +}
> +
> +/*
> + * There are two possible event types we need to handle:
> + * - page info is available as a reply to request_remote_page
> + * - page data is available, and it follows page info we've just received
> + * Since the on dump side communications are completely synchronous,
> + * we can return to epoll right after the reception of page info and
> + * for sure the next time socket event will occur we'll get page data
> + * related to info we've just received
> + */
> +int page_server_async_read(void)
> +{
> +	struct ps_async_read *ar;
> +	int ret, need;
> +	void *buf;
> +
> +	BUG_ON(list_empty(&async_reads));
> +	ar = list_first_entry(&async_reads, struct ps_async_read, l);
> +
> +	if (ar->rb < sizeof(ar->pi)) {
> +		/* Header */
> +		buf = ((void *)&ar->pi) + ar->rb;
> +		need = sizeof(ar->pi) - ar->rb;
> +	} else {
> +		/* Page(s) data itself */
> +		buf = ar->pages + (ar->rb - sizeof(ar->pi));
> +		need = ar->goal - ar->rb;
> +	}
> +
> +	ret = recv(page_server_sk, buf, need, MSG_DONTWAIT);
> +	if (ret < 0) {
> +		pr_perror("Error reading async data from page server\n");
> +		return -1;
> +	}
> +
> +	ar->rb += ret;
> +	if (ar->rb < ar->goal)
> +		return 0;
> +
> +	/*
> +	 * IO complete -- notify the caller and drop the request
> +	 */
> +	BUG_ON(ar->rb > ar->goal);
> +	ret = ar->complete((int)ar->pi.dst_id, (unsigned long)ar->pi.vaddr,
> +				(int)ar->pi.nr_pages, ar->priv);
> +
> +	list_del(&ar->l);
> +	xfree(ar);
> +
> +	return ret;
> +}
> +
>  int request_remote_pages(int pid, unsigned long addr, int nr_pages)
>  {
>  	struct page_server_iov pi = {
> diff --git a/criu/pagemap.c b/criu/pagemap.c
> index f4439ec..9abacb4 100644
> --- a/criu/pagemap.c
> +++ b/criu/pagemap.c
> @@ -403,6 +403,23 @@ static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr,
>  	return ret;
>  }
> 
> +static int read_page_complete(int pid, unsigned long vaddr, int nr_pages, void *priv)
> +{
> +	int ret = 0;
> +	struct page_read *pr = priv;
> +
> +	if (pr->pid != pid) {
> +		pr_err("Out of order read completed (want %d have %d)\n",
> +				pr->pid, pid);
> +		return -1;
> +	}
> +
> +	if (pr->io_complete)
> +		ret = pr->io_complete(pr, vaddr, nr_pages);
> +
> +	return ret;
> +}
> +
>  static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr,
>  		int nr, void *buf, unsigned flags)
>  {
> @@ -411,7 +428,7 @@ static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr,
>  	/* We always do PR_ASAP mode here (FIXME?) */
>  	ret = request_remote_pages(pr->pid, vaddr, nr);
>  	if ((ret < 0) || (flags & PR_ASYNC))
> -		return ret;
> +		return page_server_start_async_read(buf, nr, read_page_complete, pr);

I'm not sure I get this. If request_remote_pages returned error why would
we want to start async read?

> 
>  	/*
>  	 * Note, that for async remote page_read, the actual
> diff --git a/criu/uffd.c b/criu/uffd.c
> index 09cb218..cd370aa 100644
> --- a/criu/uffd.c
> +++ b/criu/uffd.c
> @@ -55,16 +55,9 @@ struct lazy_iovec {
> 
>  struct lazy_pages_info;
> 
> -struct sk_event_data {
> -	int nr_pages;
> -	unsigned long addr;
> -	struct lazy_pages_info *lpi;
> -};
> -
>  struct lazy_pages_fd {
>  	int fd;
>  	int (*event)(struct lazy_pages_fd *);
> -	struct sk_event_data *ev_data;
>  };
> 
>  struct lazy_pages_info {
> @@ -120,18 +113,6 @@ static void lpi_fini(struct lazy_pages_info *lpi)
>  	free(lpi);
>  }
> 
> -static struct lazy_pages_info *pid2lpi(int pid)
> -{
> -	struct lazy_pages_info *lpi;
> -
> -	list_for_each_entry(lpi, &lpis, l) {
> -		if (lpi->pid == pid)
> -			return lpi;
> -	}
> -
> -	return NULL;
> -}
> -
>  static int epoll_nr_fds(int nr_tasks)
>  {
>         if (opts.use_page_server)
> @@ -927,56 +908,12 @@ close_uffd:
>  	return -1;
>  }
> 
> -/*
> - * There are two possible event types we need to handle:
> - * - page info is available as a reply to request_remote_page
> - * - page data is available, and it follows page info we've just received
> - * Since the on dump side communications are completely synchronous,
> - * we can return to epoll right after the reception of page info and
> - * for sure the next time socket event will occur we'll get page data
> - * related to info we've just received
> - */
>  static int page_server_event(struct lazy_pages_fd *lpfd)
>  {
> -	struct lazy_pages_info *lpi;
> -	int pid, nr_pages;
> -	unsigned long addr;
> -
> -	lpi = lpfd->ev_data->lpi;
> -
> -	if (!lpi) {
> -		if (receive_remote_pages_info(&nr_pages, &addr, &pid))
> -			return -1;
> -
> -		lpi = pid2lpi(pid);
> -		if (!lpi)
> -			return -1;
> -
> -		lpfd->ev_data->lpi = lpi;
> -		lpfd->ev_data->nr_pages = nr_pages;
> -		lpfd->ev_data->addr = addr;
> -
> -		return 0;
> -	} else {
> -		lpi = lpfd->ev_data->lpi;
> -		nr_pages = lpfd->ev_data->nr_pages;
> -		addr = lpfd->ev_data->addr;
> -
> -		memset(lpfd->ev_data, 0, sizeof(*lpfd->ev_data));
> -
> -		if (receive_remote_pages(nr_pages * PAGE_SIZE, lpi->buf))
> -			return -1;
> -
> -		return complete_page_fault(lpi, addr, nr_pages);
> -	}
> +	return page_server_async_read();
>  }
> 
> -static struct sk_event_data sk_event_data;
> -
> -static struct lazy_pages_fd page_server_sk_fd = {
> -	.event = page_server_event,
> -	.ev_data = &sk_event_data,
> -};
> +static struct lazy_pages_fd page_server_sk_fd;
> 
>  static int prepare_page_server_socket(int epollfd)
>  {
> @@ -986,6 +923,7 @@ static int prepare_page_server_socket(int epollfd)
>  	if (sk < 0)
>  		return -1;
> 
> +	page_server_sk_fd.event = page_server_event;
>  	page_server_sk_fd.fd = sk;
> 
>  	return epoll_add_lpfd(epollfd, &page_server_sk_fd);
> -- 
> 2.5.0
> 
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
>
Mike Rapoport Nov. 16, 2016, 3:58 p.m.
On Wed, Nov 16, 2016 at 06:59:05PM +0300, Pavel Emelyanov wrote:
> 
> >> @@ -411,7 +428,7 @@ static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr,
> >>  	/* We always do PR_ASAP mode here (FIXME?) */
> >>  	ret = request_remote_pages(pr->pid, vaddr, nr);
> >>  	if ((ret < 0) || (flags & PR_ASYNC))
> >> -		return ret;
> >> +		return page_server_start_async_read(buf, nr, read_page_complete, pr);
> > 
> > I'm not sure I get this. If request_remote_pages returned error why would
> > we want to start async read?
> 
> Good catch :) It shouldn't look like this, but instead
> 
> if (ret < 0)
> 	return ret;
> if (flags & PR_ASYNC)
> 	return page_server_start_async_read(...);

Except that, I haven't found anything suspicious. With this fix

Acked-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
 
> ...
> 
> -- Pavel
 
--
Sincerely yours,
Mike.
Pavel Emelianov Nov. 16, 2016, 3:59 p.m.
>> @@ -411,7 +428,7 @@ static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr,
>>  	/* We always do PR_ASAP mode here (FIXME?) */
>>  	ret = request_remote_pages(pr->pid, vaddr, nr);
>>  	if ((ret < 0) || (flags & PR_ASYNC))
>> -		return ret;
>> +		return page_server_start_async_read(buf, nr, read_page_complete, pr);
> 
> I'm not sure I get this. If request_remote_pages returned error why would
> we want to start async read?

Good catch :) It shouldn't look like this, but instead

if (ret < 0)
	return ret;
if (flags & PR_ASYNC)
	return page_server_start_async_read(...);

...

-- Pavel