Put a cap on the size of single preadv in restore operation.

Submitted by Pawel Stradomski on Feb. 5, 2019, 7:13 p.m.

Details

Message ID 00000000000081987405813a15db@google.com
State New
Series "Put a cap on the size of single preadv in restore operation."
Headers show

Commit Message

Pawel Stradomski Feb. 5, 2019, 7:13 p.m.
When image files are stored on tmpfs, --auto-dedup can be used to use fallocate() to free
space used by image files after the data was copied to restored process.

Temporarily (after preadv but before fallocate) the same data is present in both places,
increasing memory usage. By default preadv() would read up to 2GiB in one go which is a significant
overhead.

This change caps the size of single read at 100MiB which is much more reasonable overhead.

Signed-off-by: Pawel Stradomski <pstradomski@google.com>
---
 criu/pie/restorer.c | 41 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index d3b459c6..60859294 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -1219,6 +1219,37 @@  static bool vdso_needs_parking(struct task_restore_args *args)
 	return !vdso_unmapped(args);
 }
 
+/* Return number of elements that can be read from iovs in one preadv
+ * without exceeding cap on read size. Possibly adjusts size of last element
+ * to make it fit. In that case, the original size is saved to saved_last_iov_len.
+ * Otherwise saved_last_iov_len is set to 0.
+ *
+ * We want to cap the size of one preadv because the code below does a preadv to
+ * read data from image files (possibly on tmpfs) and then calls fallocate() to
+ * free up space on that tmpfs. Thus temporarily the same data is on both tmpfs
+ * and in process memory, adding memory overhead to the restore process.
+ * */
+static int limit_iovec_size(struct iovec *iovs, int nr, size_t* saved_last_iov_len) {
+	size_t remaining_read_limit = 100 * (1 << 20);
+	int limited_nr = 0;
+	for (int i = 0; i < nr; ++i) {
+		if (iovs[i].iov_len > remaining_read_limit) {
+			break;
+		}
+		remaining_read_limit -= iovs[i].iov_len;
+		limited_nr++;
+	}
+
+	/* Try to do a partial read of the last iov. */
+	*saved_last_iov_len = 0;
+	if (limited_nr < nr && remaining_read_limit > 0) {
+		*saved_last_iov_len = iovs[limited_nr].iov_len;
+		iovs[limited_nr].iov_len = remaining_read_limit;
+		limited_nr++;
+	}
+	return limited_nr;
+}
+
 /*
  * The main routine to restore task via sigreturn.
  * This one is very special, we never return there
@@ -1389,10 +1420,16 @@  long __export_restore_task(struct task_restore_args *args)
 		ssize_t r;
 
 		while (nr) {
+			size_t saved_last_iov_len = 0;
+			int nr_in_one_pread = limit_iovec_size(iovs, nr, &saved_last_iov_len);
 			pr_debug("Preadv %lx:%d... (%d iovs)\n",
 					(unsigned long)iovs->iov_base,
-					(int)iovs->iov_len, nr);
-			r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off);
+					(int)iovs->iov_len, nr_in_one_pread);
+			r = sys_preadv(args->vma_ios_fd, iovs, nr_in_one_pread, rio->off);
+			/* Restore the iov_len we had overwritten */
+			if (saved_last_iov_len > 0) {
+				iovs[nr_in_one_pread-1].iov_len = saved_last_iov_len;
+			}
 			if (r < 0) {
 				pr_err("Can't read pages data (%d)\n", (int)r);
 				goto core_restore_end;

Comments

Andrei Vagin Feb. 7, 2019, 6:17 a.m.
On Tue, Feb 05, 2019 at 08:13:25PM +0100, Pawel Stradomski wrote:
> When image files are stored on tmpfs, --auto-dedup can be used to use fallocate() to free
> space used by image files after the data was copied to restored process.
> 
> Temporarily (after preadv but before fallocate) the same data is present in both places,
> increasing memory usage. By default preadv() would read up to 2GiB in one go which is a significant
> overhead.
> 
> This change caps the size of single read at 100MiB which is much more reasonable overhead.

Maybe it is better to add an option to specify this limit?

> 
> Signed-off-by: Pawel Stradomski <pstradomski@google.com>
> ---
>  criu/pie/restorer.c | 41 +++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 39 insertions(+), 2 deletions(-)
> 
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index d3b459c6..60859294 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -1219,6 +1219,37 @@ static bool vdso_needs_parking(struct task_restore_args *args)
>  	return !vdso_unmapped(args);
>  }
>  
> +/* Return number of elements that can be read from iovs in one preadv
> + * without exceeding cap on read size. Possibly adjusts size of last element
> + * to make it fit. In that case, the original size is saved to saved_last_iov_len.
> + * Otherwise saved_last_iov_len is set to 0.
> + *
> + * We want to cap the size of one preadv because the code below does a preadv to
> + * read data from image files (possibly on tmpfs) and then calls fallocate() to
> + * free up space on that tmpfs. Thus temporarily the same data is on both tmpfs
> + * and in process memory, adding memory overhead to the restore process.
> + * */
> +static int limit_iovec_size(struct iovec *iovs, int nr, size_t* saved_last_iov_len) {
> +	size_t remaining_read_limit = 100 * (1 << 20);
> +	int limited_nr = 0;
> +	for (int i = 0; i < nr; ++i) {
> +		if (iovs[i].iov_len > remaining_read_limit) {
> +			break;
> +		}
> +		remaining_read_limit -= iovs[i].iov_len;
> +		limited_nr++;
> +	}
> +
> +	/* Try to do a partial read of the last iov. */
> +	*saved_last_iov_len = 0;
> +	if (limited_nr < nr && remaining_read_limit > 0) {
> +		*saved_last_iov_len = iovs[limited_nr].iov_len;
> +		iovs[limited_nr].iov_len = remaining_read_limit;
> +		limited_nr++;
> +	}
> +	return limited_nr;
> +}
> +
>  /*
>   * The main routine to restore task via sigreturn.
>   * This one is very special, we never return there
> @@ -1389,10 +1420,16 @@ long __export_restore_task(struct task_restore_args *args)
>  		ssize_t r;
>  
>  		while (nr) {
> +			size_t saved_last_iov_len = 0;
> +			int nr_in_one_pread = limit_iovec_size(iovs, nr, &saved_last_iov_len);
>  			pr_debug("Preadv %lx:%d... (%d iovs)\n",
>  					(unsigned long)iovs->iov_base,
> -					(int)iovs->iov_len, nr);
> -			r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off);
> +					(int)iovs->iov_len, nr_in_one_pread);
> +			r = sys_preadv(args->vma_ios_fd, iovs, nr_in_one_pread, rio->off);
> +			/* Restore the iov_len we had overwritten */
> +			if (saved_last_iov_len > 0) {
> +				iovs[nr_in_one_pread-1].iov_len = saved_last_iov_len;
> +			}
>  			if (r < 0) {
>  				pr_err("Can't read pages data (%d)\n", (int)r);
>  				goto core_restore_end;
> -- 
> 2.20.1.611.gfbb209baf1-goog
> 
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
Mike Rapoport Feb. 7, 2019, 7:16 a.m.
On Wed, Feb 06, 2019 at 10:17:36PM -0800, Andrei Vagin wrote:
> On Tue, Feb 05, 2019 at 08:13:25PM +0100, Pawel Stradomski wrote:
> > When image files are stored on tmpfs, --auto-dedup can be used to use fallocate() to free
> > space used by image files after the data was copied to restored process.
> > 
> > Temporarily (after preadv but before fallocate) the same data is present in both places,
> > increasing memory usage. By default preadv() would read up to 2GiB in one go which is a significant
> > overhead.
> > 
> > This change caps the size of single read at 100MiB which is much more reasonable overhead.
> 
> Maybe it is better to add an option to specify this limit?

+1
 
> > 
> > Signed-off-by: Pawel Stradomski <pstradomski@google.com>
> > ---
> >  criu/pie/restorer.c | 41 +++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 39 insertions(+), 2 deletions(-)

I think it would be better to build the ta->vma_ios so that each preadv
will use no more than the limit and add the limit to 
'struct task_restore_args'


> > 
> > diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> > index d3b459c6..60859294 100644
> > --- a/criu/pie/restorer.c
> > +++ b/criu/pie/restorer.c
> > @@ -1219,6 +1219,37 @@ static bool vdso_needs_parking(struct task_restore_args *args)
> >  	return !vdso_unmapped(args);
> >  }
> >  
> > +/* Return number of elements that can be read from iovs in one preadv
> > + * without exceeding cap on read size. Possibly adjusts size of last element
> > + * to make it fit. In that case, the original size is saved to saved_last_iov_len.
> > + * Otherwise saved_last_iov_len is set to 0.
> > + *
> > + * We want to cap the size of one preadv because the code below does a preadv to
> > + * read data from image files (possibly on tmpfs) and then calls fallocate() to
> > + * free up space on that tmpfs. Thus temporarily the same data is on both tmpfs
> > + * and in process memory, adding memory overhead to the restore process.
> > + * */
> > +static int limit_iovec_size(struct iovec *iovs, int nr, size_t* saved_last_iov_len) {
> > +	size_t remaining_read_limit = 100 * (1 << 20);
> > +	int limited_nr = 0;
> > +	for (int i = 0; i < nr; ++i) {
> > +		if (iovs[i].iov_len > remaining_read_limit) {
> > +			break;
> > +		}
> > +		remaining_read_limit -= iovs[i].iov_len;
> > +		limited_nr++;
> > +	}
> > +
> > +	/* Try to do a partial read of the last iov. */
> > +	*saved_last_iov_len = 0;
> > +	if (limited_nr < nr && remaining_read_limit > 0) {
> > +		*saved_last_iov_len = iovs[limited_nr].iov_len;
> > +		iovs[limited_nr].iov_len = remaining_read_limit;
> > +		limited_nr++;
> > +	}
> > +	return limited_nr;
> > +}
> > +
> >  /*
> >   * The main routine to restore task via sigreturn.
> >   * This one is very special, we never return there
> > @@ -1389,10 +1420,16 @@ long __export_restore_task(struct task_restore_args *args)
> >  		ssize_t r;
> >  
> >  		while (nr) {
> > +			size_t saved_last_iov_len = 0;
> > +			int nr_in_one_pread = limit_iovec_size(iovs, nr, &saved_last_iov_len);
> >  			pr_debug("Preadv %lx:%d... (%d iovs)\n",
> >  					(unsigned long)iovs->iov_base,
> > -					(int)iovs->iov_len, nr);
> > -			r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off);
> > +					(int)iovs->iov_len, nr_in_one_pread);
> > +			r = sys_preadv(args->vma_ios_fd, iovs, nr_in_one_pread, rio->off);
> > +			/* Restore the iov_len we had overwritten */
> > +			if (saved_last_iov_len > 0) {
> > +				iovs[nr_in_one_pread-1].iov_len = saved_last_iov_len;
> > +			}
> >  			if (r < 0) {
> >  				pr_err("Can't read pages data (%d)\n", (int)r);
> >  				goto core_restore_end;
> > -- 
> > 2.20.1.611.gfbb209baf1-goog
> > 
> > _______________________________________________
> > CRIU mailing list
> > CRIU@openvz.org
> > https://lists.openvz.org/mailman/listinfo/criu
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
>