[Devel,2/2] parasite: Don't try to start executing syscall from guard page

Submitted by Cyrill Gorcunov on Nov. 24, 2016, 3:48 p.m.

Details

Message ID 1480002497-21671-3-git-send-email-gorcunov@virtuozzo.com
State New
Series "criu: Make sure we don't hit guard page when looking for syscall-ip"
Headers show

Commit Message

Cyrill Gorcunov Nov. 24, 2016, 3:48 p.m.
When looking for place where we can do syscall we should care
if there a guard page present. So check for it first.

https://jira.sw.ru/browse/PSBM-55989

Signed-off-by: Cyrill Gorcunov <gorcunov@virtuozzo.com>
---
 criu/parasite-syscall.c | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c
index 0fe7e83..07c4ef2 100644
--- a/criu/parasite-syscall.c
+++ b/criu/parasite-syscall.c
@@ -49,30 +49,36 @@  static int can_run_syscall(unsigned long ip, unsigned long start,
 	return ip >= start && ip < (end - BUILTIN_SYSCALL_SIZE - pad);
 }
 
-static int syscall_fits_vma_area(struct vma_area *vma_area, unsigned long pad)
+static int syscall_fits_vma_area(unsigned long start, unsigned long end, unsigned long pad)
 {
-	return can_run_syscall((unsigned long)vma_area->e->start,
-			       (unsigned long)vma_area->e->start,
-			       (unsigned long)vma_area->e->end,
-			       pad);
+	return can_run_syscall(start, start, end, pad);
 }
 
-static struct vma_area *get_vma_by_ip(struct list_head *vma_area_list,
-				      unsigned long ip,
-				      unsigned long pad)
+static unsigned long find_syscall_ip(struct list_head *vma_area_list,
+				     unsigned long ip, unsigned long pad)
 {
-	struct vma_area *vma_area;
+	struct vma_area *vma_area, *vma_prev = NULL;
 
 	list_for_each_entry(vma_area, vma_area_list, list) {
+		unsigned long start, end;
+
 		if (vma_area->e->start >= kdat.task_size)
 			continue;
 		if (!(vma_area->e->prot & PROT_EXEC))
 			continue;
-		if (syscall_fits_vma_area(vma_area, pad))
-			return vma_area;
+
+		start = vma_area->e->start;
+		end = vma_area->e->end;
+
+		if (stack_guard_page_start(vma_area, vma_prev, start))
+			start += PAGE_SIZE;
+
+		if (syscall_fits_vma_area(start, end, pad))
+			return start;
+		vma_prev = vma_area;
 	}
 
-	return NULL;
+	return 0;
 }
 
 static inline int ptrace_get_regs(int pid, user_regs_struct_t *regs)
@@ -1154,7 +1160,6 @@  err:
 struct parasite_ctl *parasite_prep_ctl(pid_t pid, struct vm_area_list *vma_area_list)
 {
 	struct parasite_ctl *ctl = NULL;
-	struct vma_area *vma_area;
 
 	if (!arch_can_dump_task(pid))
 		goto err;
@@ -1180,15 +1185,15 @@  struct parasite_ctl *parasite_prep_ctl(pid_t pid, struct vm_area_list *vma_area_
 		return ctl;
 
 	/* Search a place for injecting syscall */
-	vma_area = get_vma_by_ip(&vma_area_list->h, REG_IP(ctl->orig.regs),
-				 MEMFD_FNAME_SZ);
-	if (!vma_area) {
+	ctl->syscall_ip	= find_syscall_ip(&vma_area_list->h,
+					  REG_IP(ctl->orig.regs),
+					  MEMFD_FNAME_SZ);
+	if (!ctl->syscall_ip) {
 		pr_err("No suitable VMA found to run parasite "
 		       "bootstrap code (pid: %d)\n", pid);
 		goto err;
 	}
 
-	ctl->syscall_ip	= vma_area->e->start;
 	pr_debug("Parasite syscall_ip at %p\n", (void *)ctl->syscall_ip);
 
 	return ctl;

Comments

Pavel Emelianov Nov. 24, 2016, 4:02 p.m.
On 11/24/2016 06:48 PM, Cyrill Gorcunov wrote:
> When looking for place where we can do syscall we should care
> if there a guard page present. So check for it first.

Have we found a task with executable stack?

> https://jira.sw.ru/browse/PSBM-55989
> 
> Signed-off-by: Cyrill Gorcunov <gorcunov@virtuozzo.com>
> ---
>  criu/parasite-syscall.c | 39 ++++++++++++++++++++++-----------------
>  1 file changed, 22 insertions(+), 17 deletions(-)
> 
> diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c
> index 0fe7e83..07c4ef2 100644
> --- a/criu/parasite-syscall.c
> +++ b/criu/parasite-syscall.c
> @@ -49,30 +49,36 @@ static int can_run_syscall(unsigned long ip, unsigned long start,
>  	return ip >= start && ip < (end - BUILTIN_SYSCALL_SIZE - pad);
>  }
>  
> -static int syscall_fits_vma_area(struct vma_area *vma_area, unsigned long pad)
> +static int syscall_fits_vma_area(unsigned long start, unsigned long end, unsigned long pad)
>  {
> -	return can_run_syscall((unsigned long)vma_area->e->start,
> -			       (unsigned long)vma_area->e->start,
> -			       (unsigned long)vma_area->e->end,
> -			       pad);
> +	return can_run_syscall(start, start, end, pad);
>  }
>  
> -static struct vma_area *get_vma_by_ip(struct list_head *vma_area_list,
> -				      unsigned long ip,
> -				      unsigned long pad)
> +static unsigned long find_syscall_ip(struct list_head *vma_area_list,
> +				     unsigned long ip, unsigned long pad)
>  {
> -	struct vma_area *vma_area;
> +	struct vma_area *vma_area, *vma_prev = NULL;
>  
>  	list_for_each_entry(vma_area, vma_area_list, list) {
> +		unsigned long start, end;
> +
>  		if (vma_area->e->start >= kdat.task_size)
>  			continue;
>  		if (!(vma_area->e->prot & PROT_EXEC))
>  			continue;
> -		if (syscall_fits_vma_area(vma_area, pad))
> -			return vma_area;
> +
> +		start = vma_area->e->start;
> +		end = vma_area->e->end;
> +
> +		if (stack_guard_page_start(vma_area, vma_prev, start))
> +			start += PAGE_SIZE;
> +
> +		if (syscall_fits_vma_area(start, end, pad))
> +			return start;
> +		vma_prev = vma_area;
>  	}
>  
> -	return NULL;
> +	return 0;
>  }
>  
>  static inline int ptrace_get_regs(int pid, user_regs_struct_t *regs)
> @@ -1154,7 +1160,6 @@ err:
>  struct parasite_ctl *parasite_prep_ctl(pid_t pid, struct vm_area_list *vma_area_list)
>  {
>  	struct parasite_ctl *ctl = NULL;
> -	struct vma_area *vma_area;
>  
>  	if (!arch_can_dump_task(pid))
>  		goto err;
> @@ -1180,15 +1185,15 @@ struct parasite_ctl *parasite_prep_ctl(pid_t pid, struct vm_area_list *vma_area_
>  		return ctl;
>  
>  	/* Search a place for injecting syscall */
> -	vma_area = get_vma_by_ip(&vma_area_list->h, REG_IP(ctl->orig.regs),
> -				 MEMFD_FNAME_SZ);
> -	if (!vma_area) {
> +	ctl->syscall_ip	= find_syscall_ip(&vma_area_list->h,
> +					  REG_IP(ctl->orig.regs),
> +					  MEMFD_FNAME_SZ);
> +	if (!ctl->syscall_ip) {
>  		pr_err("No suitable VMA found to run parasite "
>  		       "bootstrap code (pid: %d)\n", pid);
>  		goto err;
>  	}
>  
> -	ctl->syscall_ip	= vma_area->e->start;
>  	pr_debug("Parasite syscall_ip at %p\n", (void *)ctl->syscall_ip);
>  
>  	return ctl;
>
Kirill Gorkunov Nov. 24, 2016, 4:16 p.m.
On Thu, Nov 24, 2016 at 07:02:10PM +0300, Pavel Emelyanov wrote:
> On 11/24/2016 06:48 PM, Cyrill Gorcunov wrote:
> > When looking for place where we can do syscall we should care
> > if there a guard page present. So check for it first.
> 
> Have we found a task with executable stack?

Not "stack" in classic meaning, but a special mmap which
kernels treats as a stack and adds guard page. It is our test maps05
Cyrill Gorcunov Nov. 24, 2016, 4:33 p.m.
On Thu, Nov 24, 2016 at 07:34:14PM +0300, Pavel Emelyanov wrote:
> On 11/24/2016 07:16 PM, Cyrill Gorcunov wrote:
> > On Thu, Nov 24, 2016 at 07:02:10PM +0300, Pavel Emelyanov wrote:
> >> On 11/24/2016 06:48 PM, Cyrill Gorcunov wrote:
> >>> When looking for place where we can do syscall we should care
> >>> if there a guard page present. So check for it first.
> >>
> >> Have we found a task with executable stack?
> > 
> > Not "stack" in classic meaning, but a special mmap which
> > kernels treats as a stack and adds guard page. It is our test maps05
> 
> Wait, kernel guards VMAs with MAP_GROWSDOWN, while CRIU searches for PROT_EXEC
> areas to work on. Have our test created such a mapping? I'd fix the test, no
> applications behave like that %)

Yes we do create such mapping and it's completely correct to applications
to create the same mappings. In reverse, I think it's good that we have
such test -- it is straight bug in criu, not test.
Pavel Emelianov Nov. 24, 2016, 4:34 p.m.
On 11/24/2016 07:16 PM, Cyrill Gorcunov wrote:
> On Thu, Nov 24, 2016 at 07:02:10PM +0300, Pavel Emelyanov wrote:
>> On 11/24/2016 06:48 PM, Cyrill Gorcunov wrote:
>>> When looking for place where we can do syscall we should care
>>> if there a guard page present. So check for it first.
>>
>> Have we found a task with executable stack?
> 
> Not "stack" in classic meaning, but a special mmap which
> kernels treats as a stack and adds guard page. It is our test maps05

Wait, kernel guards VMAs with MAP_GROWSDOWN, while CRIU searches for PROT_EXEC
areas to work on. Have our test created such a mapping? I'd fix the test, no
applications behave like that %)

-- Pavel
Pavel Emelianov Nov. 25, 2016, 10:51 a.m.
On 11/24/2016 07:33 PM, Cyrill Gorcunov wrote:
> On Thu, Nov 24, 2016 at 07:34:14PM +0300, Pavel Emelyanov wrote:
>> On 11/24/2016 07:16 PM, Cyrill Gorcunov wrote:
>>> On Thu, Nov 24, 2016 at 07:02:10PM +0300, Pavel Emelyanov wrote:
>>>> On 11/24/2016 06:48 PM, Cyrill Gorcunov wrote:
>>>>> When looking for place where we can do syscall we should care
>>>>> if there a guard page present. So check for it first.
>>>>
>>>> Have we found a task with executable stack?
>>>
>>> Not "stack" in classic meaning, but a special mmap which
>>> kernels treats as a stack and adds guard page. It is our test maps05
>>
>> Wait, kernel guards VMAs with MAP_GROWSDOWN, while CRIU searches for PROT_EXEC
>> areas to work on. Have our test created such a mapping? I'd fix the test, no
>> applications behave like that %)
> 
> Yes we do create such mapping and it's completely correct to applications
> to create the same mappings. In reverse, I think it's good that we have
> such test -- it is straight bug in criu, not test.

Just skip GROWSDOWN mappings when searching for syscall VMA and
don't mess with guard pages at all.

-- Pavel
Kirill Gorkunov Nov. 25, 2016, 10:54 a.m.
On Fri, Nov 25, 2016 at 01:51:44PM +0300, Pavel Emelyanov wrote:
> > Yes we do create such mapping and it's completely correct to applications
> > to create the same mappings. In reverse, I think it's good that we have
> > such test -- it is straight bug in criu, not test.
> 
> Just skip GROWSDOWN mappings when searching for syscall VMA and
> don't mess with guard pages at all.

I think so, this gonna be a way easier. Thanks.