[2/9] Handling iov generation for non-PROT_READ regions

Submitted by Abhishek Dubey on Sept. 22, 2019, 4:24 a.m.

Details

Message ID 1569126298-23300-3-git-send-email-dubeyabhishek777@gmail.com
State Accepted
Series "Series without cover letter"
Commit f367db524ee564bf39f44ba439376d5bad4fb1a3
Headers show

Commit Message

Abhishek Dubey Sept. 22, 2019, 4:24 a.m.
Skip iov-generation for regions not having
PROT_READ, since process_vm_readv syscall
can't process them during "read" pre-dump.
Handle random order of "read" & "splice"
pre-dumps.

Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
---
 criu/cr-dump.c         |  5 +++++
 criu/mem.c             | 56 ++++++++++++++++++++++++++++++++++++++++++++++++--
 images/inventory.proto |  1 +
 3 files changed, 60 insertions(+), 2 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index d4adfad..e7618d2 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -1486,6 +1486,9 @@  static int cr_pre_dump_finish(int status)
 	if (ret)
 		goto err;
 
+	he.has_pre_dump_mode = true;
+	he.pre_dump_mode = opts.pre_dump_mode;
+
 	pstree_switch_state(root_item, TASK_ALIVE);
 
 	timing_stop(TIME_FROZEN);
@@ -1934,6 +1937,8 @@  int cr_dump_tasks(pid_t pid)
 	if (ret)
 		goto err;
 
+	he.has_pre_dump_mode = false;
+
 	ret = write_img_inventory(&he);
 	if (ret)
 		goto err;
diff --git a/criu/mem.c b/criu/mem.c
index 911b9d2..a5de237 100644
--- a/criu/mem.c
+++ b/criu/mem.c
@@ -351,7 +351,8 @@  static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
 			     struct page_pipe *pp, struct page_xfer *xfer,
 			     struct parasite_dump_pages_args *args,
 			     struct parasite_ctl *ctl, pmc_t *pmc,
-			     bool has_parent, bool pre_dump)
+			     bool has_parent, bool pre_dump,
+			     int parent_predump_mode)
 {
 	u64 off = 0;
 	u64 *map;
@@ -361,6 +362,52 @@  static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
 				!vma_area_is(vma, VMA_ANON_SHARED))
 		return 0;
 
+	/*
+	 * To facilitate any combination of pre-dump modes to run after
+	 * one another, we need to take extra care as discussed below.
+	 *
+	 * The SPLICE mode pre-dump, processes all type of memory regions,
+	 * whereas READ mode pre-dump skips processing those memory regions
+	 * which lacks PROT_READ flag.
+	 *
+	 * Now on mixing pre-dump modes:
+	 * 	If SPLICE mode follows SPLICE mode	: no issue
+	 *		-> everything dumped both the times
+	 *
+	 * 	If READ mode follows READ mode		: no issue
+	 *		-> non-PROT_READ skipped both the time
+	 *
+	 * 	If READ mode follows SPLICE mode   	: no issue
+	 *		-> everything dumped at first,
+	 *		   the non-PROT_READ skipped later
+	 *
+	 * 	If SPLICE mode follows READ mode   	: Need special care
+	 *
+	 * If READ pre-dump happens first, then it has skipped processing
+	 * non-PROT_READ regions. Following SPLICE pre-dump expects pagemap
+	 * entries for all mappings in parent pagemap, but last READ mode
+	 * pre-dump cycle has skipped processing & pagemap generation for
+	 * non-PROT_READ regions. So SPLICE mode throws error of missing
+	 * pagemap entry for encountered non-PROT_READ mapping.
+	 *
+	 * To resolve this, the pre-dump-mode is stored in current pre-dump's
+	 * inventoy file. This pre-dump mode is read back from this file
+	 * (present in parent pre-dump dir) as parent-pre-dump-mode during
+	 * next pre-dump.
+	 *
+	 * If parent-pre-dump-mode and next-pre-dump-mode are in READ-mode ->
+	 * SPLICE-mode order, then SPLICE mode doesn't expect mappings for
+	 * non-PROT_READ regions in parent-image and marks "has_parent=false".
+	 */
+
+	if (!(vma->e->prot & PROT_READ)) {
+		if (opts.pre_dump_mode == PRE_DUMP_READ && pre_dump)
+			return 0;
+		if ((parent_predump_mode == PRE_DUMP_READ &&
+			opts.pre_dump_mode == PRE_DUMP_SPLICE) || !pre_dump)
+			has_parent = false;
+	}
+
 	if (vma_entry_is(vma->e, VMA_AREA_AIORING)) {
 		if (pre_dump)
 			return 0;
@@ -406,6 +453,7 @@  static int __parasite_dump_pages_seized(struct pstree_item *item,
 	unsigned long pmc_size;
 	int possible_pid_reuse = 0;
 	bool has_parent;
+	int parent_predump_mode = -1;
 
 	pr_info("\n");
 	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real);
@@ -472,9 +520,13 @@  static int __parasite_dump_pages_seized(struct pstree_item *item,
 	 */
 	args->off = 0;
 	has_parent = !!xfer.parent && !possible_pid_reuse;
+	if(mdc->parent_ie)
+		parent_predump_mode = mdc->parent_ie->pre_dump_mode;
+
 	list_for_each_entry(vma_area, &vma_area_list->h, list) {
 		ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
-					&pmc, has_parent, mdc->pre_dump);
+					&pmc, has_parent, mdc->pre_dump,
+					parent_predump_mode);
 		if (ret < 0)
 			goto out_xfer;
 	}
diff --git a/images/inventory.proto b/images/inventory.proto
index 7bc2b0c..d1438e8 100644
--- a/images/inventory.proto
+++ b/images/inventory.proto
@@ -16,4 +16,5 @@  message inventory_entry {
 	optional uint32			root_cg_set	= 5;
 	optional lsmtype		lsmtype		= 6;
 	optional uint64			dump_uptime	= 8;
+	optional uint32			pre_dump_mode	= 9;
 }

Comments

Andrei Vagin Sept. 25, 2019, 4:23 p.m.
On Sun, Sep 22, 2019 at 09:54:51AM +0530, Abhishek Dubey wrote:
> Skip iov-generation for regions not having
> PROT_READ, since process_vm_readv syscall
> can't process them during "read" pre-dump.
> Handle random order of "read" & "splice"
> pre-dumps.
> 
> Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
> ---
>  criu/cr-dump.c         |  5 +++++
>  criu/mem.c             | 56 ++++++++++++++++++++++++++++++++++++++++++++++++--
>  images/inventory.proto |  1 +
>  3 files changed, 60 insertions(+), 2 deletions(-)
> 
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index d4adfad..e7618d2 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1486,6 +1486,9 @@ static int cr_pre_dump_finish(int status)
>  	if (ret)
>  		goto err;
>  
> +	he.has_pre_dump_mode = true;
> +	he.pre_dump_mode = opts.pre_dump_mode;
> +
>  	pstree_switch_state(root_item, TASK_ALIVE);
>  
>  	timing_stop(TIME_FROZEN);
> @@ -1934,6 +1937,8 @@ int cr_dump_tasks(pid_t pid)
>  	if (ret)
>  		goto err;
>  
> +	he.has_pre_dump_mode = false;
> +
>  	ret = write_img_inventory(&he);
>  	if (ret)
>  		goto err;
> diff --git a/criu/mem.c b/criu/mem.c
> index 911b9d2..a5de237 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -351,7 +351,8 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>  			     struct page_pipe *pp, struct page_xfer *xfer,
>  			     struct parasite_dump_pages_args *args,
>  			     struct parasite_ctl *ctl, pmc_t *pmc,
> -			     bool has_parent, bool pre_dump)
> +			     bool has_parent, bool pre_dump,
> +			     int parent_predump_mode)
>  {
>  	u64 off = 0;
>  	u64 *map;
> @@ -361,6 +362,52 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>  				!vma_area_is(vma, VMA_ANON_SHARED))
>  		return 0;
>  
> +	/*
> +	 * To facilitate any combination of pre-dump modes to run after
> +	 * one another, we need to take extra care as discussed below.
> +	 *
> +	 * The SPLICE mode pre-dump, processes all type of memory regions,
> +	 * whereas READ mode pre-dump skips processing those memory regions
> +	 * which lacks PROT_READ flag.
> +	 *
> +	 * Now on mixing pre-dump modes:
> +	 * 	If SPLICE mode follows SPLICE mode	: no issue
> +	 *		-> everything dumped both the times
> +	 *
> +	 * 	If READ mode follows READ mode		: no issue
> +	 *		-> non-PROT_READ skipped both the time
> +	 *
> +	 * 	If READ mode follows SPLICE mode   	: no issue
> +	 *		-> everything dumped at first,
> +	 *		   the non-PROT_READ skipped later
> +	 *
> +	 * 	If SPLICE mode follows READ mode   	: Need special care
> +	 *
> +	 * If READ pre-dump happens first, then it has skipped processing
> +	 * non-PROT_READ regions. Following SPLICE pre-dump expects pagemap
> +	 * entries for all mappings in parent pagemap, but last READ mode
> +	 * pre-dump cycle has skipped processing & pagemap generation for
> +	 * non-PROT_READ regions. So SPLICE mode throws error of missing
> +	 * pagemap entry for encountered non-PROT_READ mapping.
> +	 *
> +	 * To resolve this, the pre-dump-mode is stored in current pre-dump's
> +	 * inventoy file. This pre-dump mode is read back from this file
> +	 * (present in parent pre-dump dir) as parent-pre-dump-mode during
> +	 * next pre-dump.
> +	 *
> +	 * If parent-pre-dump-mode and next-pre-dump-mode are in READ-mode ->
> +	 * SPLICE-mode order, then SPLICE mode doesn't expect mappings for
> +	 * non-PROT_READ regions in parent-image and marks "has_parent=false".
> +	 */

Do you see any reasons to mix pre-dump modes? Maybe it would be easier
to disallow mixing them? Otherwise, we will need to think how to test
this... 

> +
> +	if (!(vma->e->prot & PROT_READ)) {
> +		if (opts.pre_dump_mode == PRE_DUMP_READ && pre_dump)
> +			return 0;
> +		if ((parent_predump_mode == PRE_DUMP_READ &&
> +			opts.pre_dump_mode == PRE_DUMP_SPLICE) || !pre_dump)
> +			has_parent = false;
> +	}
> +
>  	if (vma_entry_is(vma->e, VMA_AREA_AIORING)) {
>  		if (pre_dump)
>  			return 0;
> @@ -406,6 +453,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	unsigned long pmc_size;
>  	int possible_pid_reuse = 0;
>  	bool has_parent;
> +	int parent_predump_mode = -1;
>  
>  	pr_info("\n");
>  	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real);
> @@ -472,9 +520,13 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>  	 */
>  	args->off = 0;
>  	has_parent = !!xfer.parent && !possible_pid_reuse;
> +	if(mdc->parent_ie)
> +		parent_predump_mode = mdc->parent_ie->pre_dump_mode;
> +
>  	list_for_each_entry(vma_area, &vma_area_list->h, list) {
>  		ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
> -					&pmc, has_parent, mdc->pre_dump);
> +					&pmc, has_parent, mdc->pre_dump,
> +					parent_predump_mode);
>  		if (ret < 0)
>  			goto out_xfer;
>  	}
> diff --git a/images/inventory.proto b/images/inventory.proto
> index 7bc2b0c..d1438e8 100644
> --- a/images/inventory.proto
> +++ b/images/inventory.proto
> @@ -16,4 +16,5 @@ message inventory_entry {
>  	optional uint32			root_cg_set	= 5;
>  	optional lsmtype		lsmtype		= 6;
>  	optional uint64			dump_uptime	= 8;
> +	optional uint32			pre_dump_mode	= 9;
>  }
> -- 
> 2.7.4
>
Abhishek Dubey Sept. 27, 2019, 9:40 a.m.
On 25/09/19 9:53 PM, Andrei Vagin wrote:
> On Sun, Sep 22, 2019 at 09:54:51AM +0530, Abhishek Dubey wrote:
>> Skip iov-generation for regions not having
>> PROT_READ, since process_vm_readv syscall
>> can't process them during "read" pre-dump.
>> Handle random order of "read" & "splice"
>> pre-dumps.
>>
>> Signed-off-by: Abhishek Dubey <dubeyabhishek777@gmail.com>
>> ---
>>   criu/cr-dump.c         |  5 +++++
>>   criu/mem.c             | 56 ++++++++++++++++++++++++++++++++++++++++++++++++--
>>   images/inventory.proto |  1 +
>>   3 files changed, 60 insertions(+), 2 deletions(-)
>>
>> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
>> index d4adfad..e7618d2 100644
>> --- a/criu/cr-dump.c
>> +++ b/criu/cr-dump.c
>> @@ -1486,6 +1486,9 @@ static int cr_pre_dump_finish(int status)
>>   	if (ret)
>>   		goto err;
>>   
>> +	he.has_pre_dump_mode = true;
>> +	he.pre_dump_mode = opts.pre_dump_mode;
>> +
>>   	pstree_switch_state(root_item, TASK_ALIVE);
>>   
>>   	timing_stop(TIME_FROZEN);
>> @@ -1934,6 +1937,8 @@ int cr_dump_tasks(pid_t pid)
>>   	if (ret)
>>   		goto err;
>>   
>> +	he.has_pre_dump_mode = false;
>> +
>>   	ret = write_img_inventory(&he);
>>   	if (ret)
>>   		goto err;
>> diff --git a/criu/mem.c b/criu/mem.c
>> index 911b9d2..a5de237 100644
>> --- a/criu/mem.c
>> +++ b/criu/mem.c
>> @@ -351,7 +351,8 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>>   			     struct page_pipe *pp, struct page_xfer *xfer,
>>   			     struct parasite_dump_pages_args *args,
>>   			     struct parasite_ctl *ctl, pmc_t *pmc,
>> -			     bool has_parent, bool pre_dump)
>> +			     bool has_parent, bool pre_dump,
>> +			     int parent_predump_mode)
>>   {
>>   	u64 off = 0;
>>   	u64 *map;
>> @@ -361,6 +362,52 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma,
>>   				!vma_area_is(vma, VMA_ANON_SHARED))
>>   		return 0;
>>   
>> +	/*
>> +	 * To facilitate any combination of pre-dump modes to run after
>> +	 * one another, we need to take extra care as discussed below.
>> +	 *
>> +	 * The SPLICE mode pre-dump, processes all type of memory regions,
>> +	 * whereas READ mode pre-dump skips processing those memory regions
>> +	 * which lacks PROT_READ flag.
>> +	 *
>> +	 * Now on mixing pre-dump modes:
>> +	 * 	If SPLICE mode follows SPLICE mode	: no issue
>> +	 *		-> everything dumped both the times
>> +	 *
>> +	 * 	If READ mode follows READ mode		: no issue
>> +	 *		-> non-PROT_READ skiret = page_xfer_predump_pages(item->pid->real,
>> +							&xfer, mem_pp);pped both the time
>> +	 *
>> +	 * 	If READ mode follows SPLICE mode   	: no issue
>> +	 *		-> everything dumped at first,
>> +	 *		   the non-PROT_READ skipped later
>> +	 *
>> +	 * 	If SPLICE mode follows READ mode   	: Need special care
>> +	 *
>> +	 * If READ pre-dump happens first, then it has skipped processing
>> +	 * non-PROT_READ regions. Following SPLICE pre-dump expects pagemap
>> +	 * entries for all mappings in parent pagemap, but last READ mode
>> +	 * pre-dump cycle has skipped processing & pagemap generation for
>> +	 * non-PROT_READ regions. So SPLICE mode throws error of missingret = page_xfer_predump_pages(item->pid->real,
>> +							&xfer, mem_pp);
>> +	 * pagemap entry for encountered non-PROT_READ mapping.
>> +	 *
>> +	 * To resolve this, the pre-dump-mode is stored in current pre-dump's
>> +	 * inventoy file. This pre-dump mode is read back from this file
>> +	 * (present in parent pre-dump dir) as parent-pre-dump-mode during
>> +	 * next pre-dump.
>> +	 *
>> +	 * If parent-pre-dump-mode and next-pre-dump-mode are in READ-mode ->
>> +	 * SPLICE-mode order, then SPLICE mode doesn't expect mappings for
>> +	 * non-PROT_READ regions in parent-image and marks "has_parent=false".
>> +	 */
> Do you see any reasons to mix pre-dump modes? Maybe it would be easier
> to disallow mixing them? Otherwise, we will need to think how to test
> this...

Dmitry suggested to support mix of READ and SPLICE pre-dump modes, so I 
tried implementing it.

Initial implementation was based on SPLICE mode alone.

>> +
>> +	if (!(vma->e->prot & PROT_READ)) {
>> +		if (opts.pre_dump_mode == PRE_DUMP_READ && pre_dump)
>> +			return 0;
>> +		if ((parent_predump_mode == PRE_DUMP_READ &&
>> +			opts.pre_dump_mode == PRE_DUMP_SPLICE) || !pre_dump)
>> +			has_parent = false;
>> +	}
>> +
>>   	if (vma_entry_is(vma->e, VMA_AREA_AIORING)) {
>>   		if (pre_dump)
>>   			return 0;
>> @@ -406,6 +453,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>>   	unsigned long pmc_size;
>>   	int possible_pid_reuse = 0;
>>   	bool has_parent;
>> +	int parent_predump_mode = -1;
>>   
>>   	pr_info("\n");
>>   	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real);
>> @@ -472,9 +520,13 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
>>   	 */
>>   	args->off = 0;
>>   	has_parent = !!xfer.parent && !possible_pid_reuse;
>> +	if(mdc->parent_ie)
>> +		parent_predump_mode = mdc->parent_ie->pre_dump_mode;
>> +
>>   	list_for_each_entry(vma_area, &vma_area_list->h, list) {
>>   		ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl,
>> -					&pmc, has_parent, mdc->pre_dump);
>> +					&pmc, has_parent, mdc->pre_dump,
>> +					parent_predump_mode);
>>   		if (ret < 0)
>>   			goto out_xfer;
>>   	}
>> diff --git a/images/inventory.proto b/images/inventory.proto
>> index 7bc2b0c..d1438e8 100644
>> --- a/images/inventory.proto
>> +++ b/images/inventory.proto
>> @@ -16,4 +16,5 @@ message inventory_entry {
>>   	optional uint32			root_cg_set	= 5;
>>   	optional lsmtype		lsmtype		= 6;
>>   	optional uint64			dump_uptime	= 8;
>> +	optional uint32			pre_dump_mode	= 9;
>>   }
>> -- 
>> 2.7.4
>>