Cleanup do_full_int80()

Submitted by Nicolas Viennot on Sept. 30, 2019, 8:57 p.m.

Details

Message ID bda6223e555f41078dc3a823e2d38464@EXMBDFT10.ad.twosigma.com
State Accepted
Series "Cleanup do_full_int80()"
Commit e7570f3c1ea23b12297ad1f514949e26c6d5a360
Headers show

Commit Message

Nicolas Viennot Sept. 30, 2019, 8:57 p.m.
1) Instead of tampering with the nr argument, do_full_int80() returns
the value of the system call. It also avoids copying all registers back
into the syscall_args32 argument after the syscall.

2) Additionally, the registers r12-r15 were added in the list of
clobbers as kernels older than v4.4 do not preserve these.

3) Further, GCC uses a 128-byte red-zone as defined in the x86_64 ABI
optimizing away the correct position of the %rsp register in
leaf-functions. We now avoid tampering with the red-zone, fixing a
SIGSEGV when running mmap_bug_test() in debug mode (DEBUG=1).

Signed-off-by: Nicolas Viennot <Nicolas.Viennot@twosigma.com>
---
 criu/arch/x86/crtools.c            |  6 ++--
 criu/arch/x86/include/asm/compat.h | 51 ++++++++++++++++++++----------
 criu/arch/x86/kerndat.c            |  4 +--
 criu/arch/x86/restorer.c           |  3 +-
 criu/arch/x86/sigaction_compat.c   |  6 +---
 5 files changed, 40 insertions(+), 30 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c
index efc23e5f..e4073c27 100644
--- a/criu/arch/x86/crtools.c
+++ b/criu/arch/x86/crtools.c
@@ -590,8 +590,7 @@  static int get_robust_list32(pid_t pid, uintptr_t head, uintptr_t len)
 		.arg2	= (uint32_t)len,
 	};
 
-	do_full_int80(&s);
-	return (int)s.nr;
+	return do_full_int80(&s);
 }
 
 static int set_robust_list32(uint32_t head, uint32_t len)
@@ -602,8 +601,7 @@  static int set_robust_list32(uint32_t head, uint32_t len)
 		.arg1	= len,
 	};
 
-	do_full_int80(&s);
-	return (int)s.nr;
+	return do_full_int80(&s);
 }
 
 int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info)
diff --git a/criu/arch/x86/include/asm/compat.h b/criu/arch/x86/include/asm/compat.h
index cd1ae472..acd552fb 100644
--- a/criu/arch/x86/include/asm/compat.h
+++ b/criu/arch/x86/include/asm/compat.h
@@ -38,26 +38,45 @@  struct syscall_args32 {
 	uint32_t nr, arg0, arg1, arg2, arg3, arg4, arg5;
 };
 
-static inline void do_full_int80(struct syscall_args32 *args)
+static inline uint32_t do_full_int80(struct syscall_args32 *args)
 {
 	/*
-	 * r8-r11 registers are cleared during returning to userspace
-	 * from syscall - that's x86_64 ABI to avoid leaking kernel
-	 * pointers.
+	 * Kernel older than v4.4 do not preserve r8-r15 registers when
+	 * invoking int80, so we need to preserve them.
 	 *
-	 * Other than that - we can't use %rbp in clobbers as GCC's inline
-	 * assembly doesn't allow to do so. So, here is explicitly saving
-	 * %rbp before syscall and restoring it's value afterward.
+	 * Additionally, %rbp is used as the 6th syscall argument, and we need
+	 * to preserve its value when returning from the syscall to avoid
+	 * upsetting GCC. However, we can't use %rbp in the GCC asm clobbers
+	 * due to a GCC limitation. Instead, we explicitly save %rbp on the
+	 * stack before invoking the syscall and restore its value afterward.
+	 *
+	 * Further, GCC may not adjust the %rsp pointer when allocating the
+	 * args and ret variables because 1) do_full_int80() is a leaf
+	 * function, and 2) the local variables (args and ret) are in the
+	 * 128-byte red-zone as defined in the x86_64 ABI. To use the stack
+	 * when preserving %rbp, we must either tell GCC to a) mark the
+	 * function as non-leaf, or b) move away from the red-zone when using
+	 * the stack. It seems that there is no easy way to do a), so we'll go
+	 * with b).
+	 * Note 1: Another workaround would have been to add %rsp in the list
+	 * of clobbers, but this was deprecated in GCC 9.
+	 * Note 2: This red-zone bug only manifests when compiling CRIU with
+	 * DEBUG=1.
 	 */
-	asm volatile ("pushq %%rbp\n\t"
-			"mov %6, %%ebp\n\t"
-			"int $0x80\n\t"
-			"mov %%ebp, %6\n\t"
-			"popq %%rbp\n\t"
-		      : "+a" (args->nr),
-			"+b" (args->arg0), "+c" (args->arg1), "+d" (args->arg2),
-			"+S" (args->arg3), "+D" (args->arg4), "+g" (args->arg5)
-			: : "r8", "r9", "r10", "r11");
+	uint32_t ret;
+
+	asm volatile ("sub $128, %%rsp\n\t"
+		      "pushq %%rbp\n\t"
+		      "mov %7, %%ebp\n\t"
+		      "int $0x80\n\t"
+		      "popq %%rbp\n\t"
+		      "add $128, %%rsp\n\t"
+		      : "=a" (ret)
+		      : "a" (args->nr),
+			"b" (args->arg0), "c" (args->arg1), "d" (args->arg2),
+			"S" (args->arg3), "D" (args->arg4), "g" (args->arg5)
+		      : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");
+	return ret;
 }
 
 #ifndef CR_NOGLIBC
diff --git a/criu/arch/x86/kerndat.c b/criu/arch/x86/kerndat.c
index f7593251..94c954e1 100644
--- a/criu/arch/x86/kerndat.c
+++ b/criu/arch/x86/kerndat.c
@@ -75,9 +75,7 @@  void *mmap_ia32(void *addr, size_t len, int prot,
 	s.arg4  = fildes;
 	s.arg5  = (uint32_t)off;
 
-	do_full_int80(&s);
-
-	return (void *)(uintptr_t)s.nr;
+	return (void *)(uintptr_t)do_full_int80(&s);
 }
 
 /*
diff --git a/criu/arch/x86/restorer.c b/criu/arch/x86/restorer.c
index 2d335d5e..b2c3b366 100644
--- a/criu/arch/x86/restorer.c
+++ b/criu/arch/x86/restorer.c
@@ -54,8 +54,7 @@  int set_compat_robust_list(uint32_t head_ptr, uint32_t len)
 		.arg1	= len,
 	};
 
-	do_full_int80(&s);
-	return (int)s.nr;
+	return do_full_int80(&s);
 }
 
 static int prepare_stack32(void **stack32)
diff --git a/criu/arch/x86/sigaction_compat.c b/criu/arch/x86/sigaction_compat.c
index b38ba801..f467da49 100644
--- a/criu/arch/x86/sigaction_compat.c
+++ b/criu/arch/x86/sigaction_compat.c
@@ -28,7 +28,6 @@  extern char restore_rt_sigaction;
  */
 int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act)
 {
-	int ret;
 	struct syscall_args32 arg = {};
 	unsigned long act_stack = (unsigned long)stack32;
 
@@ -49,8 +48,5 @@  int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act)
 	arg.arg2	= 0;					/* oldact */
 	arg.arg3	= (uint32_t)sizeof(act->rt_sa_mask);	/* sigsetsize */
 
-	do_full_int80(&arg);
-	asm volatile ("\t movl %%eax,%0\n" : "=r"(ret));
-	return ret;
+	return do_full_int80(&arg);
 }
-

Comments

Dmitry Safonov Sept. 30, 2019, 9:22 p.m.
Hi Nicolas,

On 9/30/19 9:57 PM, Nicolas Viennot wrote:
> 1) Instead of tampering with the nr argument, do_full_int80() returns
> the value of the system call. It also avoids copying all registers back
> into the syscall_args32 argument after the syscall.
> 
> 2) Additionally, the registers r12-r15 were added in the list of
> clobbers as kernels older than v4.4 do not preserve these.
> 
> 3) Further, GCC uses a 128-byte red-zone as defined in the x86_64 ABI
> optimizing away the correct position of the %rsp register in
> leaf-functions. We now avoid tampering with the red-zone, fixing a
> SIGSEGV when running mmap_bug_test() in debug mode (DEBUG=1).
> 
> Signed-off-by: Nicolas Viennot <Nicolas.Viennot@twosigma.com>

Thanks much for doing this and I feel sorry to keep you going through so
many iterations,

Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com>

> ---
>  criu/arch/x86/crtools.c            |  6 ++--
>  criu/arch/x86/include/asm/compat.h | 51 ++++++++++++++++++++----------
>  criu/arch/x86/kerndat.c            |  4 +--
>  criu/arch/x86/restorer.c           |  3 +-
>  criu/arch/x86/sigaction_compat.c   |  6 +---
>  5 files changed, 40 insertions(+), 30 deletions(-)
> 
> diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c
> index efc23e5f..e4073c27 100644
> --- a/criu/arch/x86/crtools.c
> +++ b/criu/arch/x86/crtools.c
> @@ -590,8 +590,7 @@ static int get_robust_list32(pid_t pid, uintptr_t head, uintptr_t len)
>  		.arg2	= (uint32_t)len,
>  	};
>  
> -	do_full_int80(&s);
> -	return (int)s.nr;
> +	return do_full_int80(&s);
>  }
>  
>  static int set_robust_list32(uint32_t head, uint32_t len)
> @@ -602,8 +601,7 @@ static int set_robust_list32(uint32_t head, uint32_t len)
>  		.arg1	= len,
>  	};
>  
> -	do_full_int80(&s);
> -	return (int)s.nr;
> +	return do_full_int80(&s);
>  }
>  
>  int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info)
> diff --git a/criu/arch/x86/include/asm/compat.h b/criu/arch/x86/include/asm/compat.h
> index cd1ae472..acd552fb 100644
> --- a/criu/arch/x86/include/asm/compat.h
> +++ b/criu/arch/x86/include/asm/compat.h
> @@ -38,26 +38,45 @@ struct syscall_args32 {
>  	uint32_t nr, arg0, arg1, arg2, arg3, arg4, arg5;
>  };
>  
> -static inline void do_full_int80(struct syscall_args32 *args)
> +static inline uint32_t do_full_int80(struct syscall_args32 *args)
>  {
>  	/*
> -	 * r8-r11 registers are cleared during returning to userspace
> -	 * from syscall - that's x86_64 ABI to avoid leaking kernel
> -	 * pointers.
> +	 * Kernel older than v4.4 do not preserve r8-r15 registers when
> +	 * invoking int80, so we need to preserve them.
>  	 *
> -	 * Other than that - we can't use %rbp in clobbers as GCC's inline
> -	 * assembly doesn't allow to do so. So, here is explicitly saving
> -	 * %rbp before syscall and restoring it's value afterward.
> +	 * Additionally, %rbp is used as the 6th syscall argument, and we need
> +	 * to preserve its value when returning from the syscall to avoid
> +	 * upsetting GCC. However, we can't use %rbp in the GCC asm clobbers
> +	 * due to a GCC limitation. Instead, we explicitly save %rbp on the
> +	 * stack before invoking the syscall and restore its value afterward.
> +	 *
> +	 * Further, GCC may not adjust the %rsp pointer when allocating the
> +	 * args and ret variables because 1) do_full_int80() is a leaf
> +	 * function, and 2) the local variables (args and ret) are in the
> +	 * 128-byte red-zone as defined in the x86_64 ABI. To use the stack
> +	 * when preserving %rbp, we must either tell GCC to a) mark the
> +	 * function as non-leaf, or b) move away from the red-zone when using
> +	 * the stack. It seems that there is no easy way to do a), so we'll go
> +	 * with b).
> +	 * Note 1: Another workaround would have been to add %rsp in the list
> +	 * of clobbers, but this was deprecated in GCC 9.
> +	 * Note 2: This red-zone bug only manifests when compiling CRIU with
> +	 * DEBUG=1.
>  	 */
> -	asm volatile ("pushq %%rbp\n\t"
> -			"mov %6, %%ebp\n\t"
> -			"int $0x80\n\t"
> -			"mov %%ebp, %6\n\t"
> -			"popq %%rbp\n\t"
> -		      : "+a" (args->nr),
> -			"+b" (args->arg0), "+c" (args->arg1), "+d" (args->arg2),
> -			"+S" (args->arg3), "+D" (args->arg4), "+g" (args->arg5)
> -			: : "r8", "r9", "r10", "r11");
> +	uint32_t ret;
> +
> +	asm volatile ("sub $128, %%rsp\n\t"
> +		      "pushq %%rbp\n\t"
> +		      "mov %7, %%ebp\n\t"
> +		      "int $0x80\n\t"
> +		      "popq %%rbp\n\t"
> +		      "add $128, %%rsp\n\t"
> +		      : "=a" (ret)
> +		      : "a" (args->nr),
> +			"b" (args->arg0), "c" (args->arg1), "d" (args->arg2),
> +			"S" (args->arg3), "D" (args->arg4), "g" (args->arg5)
> +		      : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");
> +	return ret;
>  }
>  
>  #ifndef CR_NOGLIBC
> diff --git a/criu/arch/x86/kerndat.c b/criu/arch/x86/kerndat.c
> index f7593251..94c954e1 100644
> --- a/criu/arch/x86/kerndat.c
> +++ b/criu/arch/x86/kerndat.c
> @@ -75,9 +75,7 @@ void *mmap_ia32(void *addr, size_t len, int prot,
>  	s.arg4  = fildes;
>  	s.arg5  = (uint32_t)off;
>  
> -	do_full_int80(&s);
> -
> -	return (void *)(uintptr_t)s.nr;
> +	return (void *)(uintptr_t)do_full_int80(&s);
>  }
>  
>  /*
> diff --git a/criu/arch/x86/restorer.c b/criu/arch/x86/restorer.c
> index 2d335d5e..b2c3b366 100644
> --- a/criu/arch/x86/restorer.c
> +++ b/criu/arch/x86/restorer.c
> @@ -54,8 +54,7 @@ int set_compat_robust_list(uint32_t head_ptr, uint32_t len)
>  		.arg1	= len,
>  	};
>  
> -	do_full_int80(&s);
> -	return (int)s.nr;
> +	return do_full_int80(&s);
>  }
>  
>  static int prepare_stack32(void **stack32)
> diff --git a/criu/arch/x86/sigaction_compat.c b/criu/arch/x86/sigaction_compat.c
> index b38ba801..f467da49 100644
> --- a/criu/arch/x86/sigaction_compat.c
> +++ b/criu/arch/x86/sigaction_compat.c
> @@ -28,7 +28,6 @@ extern char restore_rt_sigaction;
>   */
>  int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act)
>  {
> -	int ret;
>  	struct syscall_args32 arg = {};
>  	unsigned long act_stack = (unsigned long)stack32;
>  
> @@ -49,8 +48,5 @@ int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act)
>  	arg.arg2	= 0;					/* oldact */
>  	arg.arg3	= (uint32_t)sizeof(act->rt_sa_mask);	/* sigsetsize */
>  
> -	do_full_int80(&arg);
> -	asm volatile ("\t movl %%eax,%0\n" : "=r"(ret));
> -	return ret;
> +	return do_full_int80(&arg);
>  }
> -
> 

Thanks,
          Dmitry
Andrei Vagin Oct. 3, 2019, 1:19 a.m.
Applied, thanks!

On Mon, Sep 30, 2019 at 08:57:08PM +0000, Nicolas Viennot wrote:
> 1) Instead of tampering with the nr argument, do_full_int80() returns
> the value of the system call. It also avoids copying all registers back
> into the syscall_args32 argument after the syscall.
> 
> 2) Additionally, the registers r12-r15 were added in the list of
> clobbers as kernels older than v4.4 do not preserve these.
> 
> 3) Further, GCC uses a 128-byte red-zone as defined in the x86_64 ABI
> optimizing away the correct position of the %rsp register in
> leaf-functions. We now avoid tampering with the red-zone, fixing a
> SIGSEGV when running mmap_bug_test() in debug mode (DEBUG=1).
> 
> Signed-off-by: Nicolas Viennot <Nicolas.Viennot@twosigma.com>
> ---
>  criu/arch/x86/crtools.c            |  6 ++--
>  criu/arch/x86/include/asm/compat.h | 51 ++++++++++++++++++++----------
>  criu/arch/x86/kerndat.c            |  4 +--
>  criu/arch/x86/restorer.c           |  3 +-
>  criu/arch/x86/sigaction_compat.c   |  6 +---
>  5 files changed, 40 insertions(+), 30 deletions(-)
> 
> diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c
> index efc23e5f..e4073c27 100644
> --- a/criu/arch/x86/crtools.c
> +++ b/criu/arch/x86/crtools.c
> @@ -590,8 +590,7 @@ static int get_robust_list32(pid_t pid, uintptr_t head, uintptr_t len)
>  		.arg2	= (uint32_t)len,
>  	};
>  
> -	do_full_int80(&s);
> -	return (int)s.nr;
> +	return do_full_int80(&s);
>  }
>  
>  static int set_robust_list32(uint32_t head, uint32_t len)
> @@ -602,8 +601,7 @@ static int set_robust_list32(uint32_t head, uint32_t len)
>  		.arg1	= len,
>  	};
>  
> -	do_full_int80(&s);
> -	return (int)s.nr;
> +	return do_full_int80(&s);
>  }
>  
>  int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info)
> diff --git a/criu/arch/x86/include/asm/compat.h b/criu/arch/x86/include/asm/compat.h
> index cd1ae472..acd552fb 100644
> --- a/criu/arch/x86/include/asm/compat.h
> +++ b/criu/arch/x86/include/asm/compat.h
> @@ -38,26 +38,45 @@ struct syscall_args32 {
>  	uint32_t nr, arg0, arg1, arg2, arg3, arg4, arg5;
>  };
>  
> -static inline void do_full_int80(struct syscall_args32 *args)
> +static inline uint32_t do_full_int80(struct syscall_args32 *args)
>  {
>  	/*
> -	 * r8-r11 registers are cleared during returning to userspace
> -	 * from syscall - that's x86_64 ABI to avoid leaking kernel
> -	 * pointers.
> +	 * Kernel older than v4.4 do not preserve r8-r15 registers when
> +	 * invoking int80, so we need to preserve them.
>  	 *
> -	 * Other than that - we can't use %rbp in clobbers as GCC's inline
> -	 * assembly doesn't allow to do so. So, here is explicitly saving
> -	 * %rbp before syscall and restoring it's value afterward.
> +	 * Additionally, %rbp is used as the 6th syscall argument, and we need
> +	 * to preserve its value when returning from the syscall to avoid
> +	 * upsetting GCC. However, we can't use %rbp in the GCC asm clobbers
> +	 * due to a GCC limitation. Instead, we explicitly save %rbp on the
> +	 * stack before invoking the syscall and restore its value afterward.
> +	 *
> +	 * Further, GCC may not adjust the %rsp pointer when allocating the
> +	 * args and ret variables because 1) do_full_int80() is a leaf
> +	 * function, and 2) the local variables (args and ret) are in the
> +	 * 128-byte red-zone as defined in the x86_64 ABI. To use the stack
> +	 * when preserving %rbp, we must either tell GCC to a) mark the
> +	 * function as non-leaf, or b) move away from the red-zone when using
> +	 * the stack. It seems that there is no easy way to do a), so we'll go
> +	 * with b).
> +	 * Note 1: Another workaround would have been to add %rsp in the list
> +	 * of clobbers, but this was deprecated in GCC 9.
> +	 * Note 2: This red-zone bug only manifests when compiling CRIU with
> +	 * DEBUG=1.
>  	 */
> -	asm volatile ("pushq %%rbp\n\t"
> -			"mov %6, %%ebp\n\t"
> -			"int $0x80\n\t"
> -			"mov %%ebp, %6\n\t"
> -			"popq %%rbp\n\t"
> -		      : "+a" (args->nr),
> -			"+b" (args->arg0), "+c" (args->arg1), "+d" (args->arg2),
> -			"+S" (args->arg3), "+D" (args->arg4), "+g" (args->arg5)
> -			: : "r8", "r9", "r10", "r11");
> +	uint32_t ret;
> +
> +	asm volatile ("sub $128, %%rsp\n\t"
> +		      "pushq %%rbp\n\t"
> +		      "mov %7, %%ebp\n\t"
> +		      "int $0x80\n\t"
> +		      "popq %%rbp\n\t"
> +		      "add $128, %%rsp\n\t"
> +		      : "=a" (ret)
> +		      : "a" (args->nr),
> +			"b" (args->arg0), "c" (args->arg1), "d" (args->arg2),
> +			"S" (args->arg3), "D" (args->arg4), "g" (args->arg5)
> +		      : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");
> +	return ret;
>  }
>  
>  #ifndef CR_NOGLIBC
> diff --git a/criu/arch/x86/kerndat.c b/criu/arch/x86/kerndat.c
> index f7593251..94c954e1 100644
> --- a/criu/arch/x86/kerndat.c
> +++ b/criu/arch/x86/kerndat.c
> @@ -75,9 +75,7 @@ void *mmap_ia32(void *addr, size_t len, int prot,
>  	s.arg4  = fildes;
>  	s.arg5  = (uint32_t)off;
>  
> -	do_full_int80(&s);
> -
> -	return (void *)(uintptr_t)s.nr;
> +	return (void *)(uintptr_t)do_full_int80(&s);
>  }
>  
>  /*
> diff --git a/criu/arch/x86/restorer.c b/criu/arch/x86/restorer.c
> index 2d335d5e..b2c3b366 100644
> --- a/criu/arch/x86/restorer.c
> +++ b/criu/arch/x86/restorer.c
> @@ -54,8 +54,7 @@ int set_compat_robust_list(uint32_t head_ptr, uint32_t len)
>  		.arg1	= len,
>  	};
>  
> -	do_full_int80(&s);
> -	return (int)s.nr;
> +	return do_full_int80(&s);
>  }
>  
>  static int prepare_stack32(void **stack32)
> diff --git a/criu/arch/x86/sigaction_compat.c b/criu/arch/x86/sigaction_compat.c
> index b38ba801..f467da49 100644
> --- a/criu/arch/x86/sigaction_compat.c
> +++ b/criu/arch/x86/sigaction_compat.c
> @@ -28,7 +28,6 @@ extern char restore_rt_sigaction;
>   */
>  int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act)
>  {
> -	int ret;
>  	struct syscall_args32 arg = {};
>  	unsigned long act_stack = (unsigned long)stack32;
>  
> @@ -49,8 +48,5 @@ int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act)
>  	arg.arg2	= 0;					/* oldact */
>  	arg.arg3	= (uint32_t)sizeof(act->rt_sa_mask);	/* sigsetsize */
>  
> -	do_full_int80(&arg);
> -	asm volatile ("\t movl %%eax,%0\n" : "=r"(ret));
> -	return ret;
> +	return do_full_int80(&arg);
>  }
> -
> -- 
> 2.19.1
> 
> 
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu