[Devel] x86/vdso: Add 64-bit vdso map API

Submitted by Dmitry Safonov on July 11, 2017, 6:43 p.m.

Details

Message ID 20170711184328.29659-1-dsafonov@virtuozzo.com
State New
Series "x86/vdso: Add 64-bit vdso map API"
Headers show

Commit Message

Dmitry Safonov July 11, 2017, 6:43 p.m.
Mainstream already has arch_prctl(MAP_VDSO_64), but this was
omitted for simplicity and we only have arch_prctl(MAP_VDSO_32).
This was not a problem as previously we needed MAP_VDSO_32 only
for ia32 applications C/R.

But as we've made vdso pages per-UTS-ns, pages differ between
the host and the uts-ns. As CRIU restore starts from init-ns, the
vdso that's being preserved into the restored application belongs
to the host and thus has the host's ve_time_spec.

Using this API we can map a vdso in the restored CT and it'll
belong to the uts-ns of the CT.

https://jira.sw.ru/browse/PSBM-67017

Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Dmitry Safonov <dsafonov@virtuozzo.com>
---
 arch/x86/include/asm/elf.h   |  6 +++++-
 arch/x86/kernel/process_64.c | 14 ++++++-------
 arch/x86/vdso/vdso32-setup.c | 12 +----------
 arch/x86/vdso/vma.c          | 48 ++++++++++++++++++++++++++++++++++++--------
 4 files changed, 53 insertions(+), 27 deletions(-)

Patch hide | download patch | download mbox

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 8b0f63910b06..920690b3a5d5 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -353,7 +353,11 @@  extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
 #define compat_arch_setup_additional_pages	syscall32_setup_pages
 
 #ifdef CONFIG_X86_64
-extern int do_map_compat_vdso(unsigned long addr);
+extern bool vdso_or_vvar_present(struct mm_struct *mm);
+extern int do_map_vdso_64(unsigned long addr);
+# ifdef CONFIG_COMPAT
+extern int do_map_vdso_32(unsigned long addr);
+# endif
 #endif
 
 extern unsigned long arch_randomize_brk(struct mm_struct *mm);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index d2e444cb7209..252f9f0ecc0f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -580,16 +580,16 @@  long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 	}
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
+# ifdef CONFIG_COMPAT
 	case ARCH_MAP_VDSO_32:
-		return do_map_compat_vdso(addr);
+		return do_map_vdso_32(addr);
+# endif
 
-	/*
-	 * x32 and 64 vDSO remap API is omitted for simplicity.
-	 * We do need 32-bit vDSO blob mapping for compatible
-	 * applications Restore, but not x32/64 (at least, for now).
-	 */
-	case ARCH_MAP_VDSO_X32:
 	case ARCH_MAP_VDSO_64:
+		return do_map_vdso_64(addr);
+
+	/* x32 vDSO remap API is omitted for simplicity. */
+	case ARCH_MAP_VDSO_X32:
 #endif
 
 	default:
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 81a16c803f11..30b99959daed 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -512,17 +512,7 @@  up_fail:
 
 #ifdef CONFIG_X86_64
 
-static bool vdso_or_vvar_present(struct mm_struct *mm)
-{
-	struct vm_area_struct *vma;
-
-	for (vma = mm->mmap; vma; vma = vma->vm_next)
-		if (vma_is_vdso_or_vvar(vma, mm))
-			return true;
-	return false;
-}
-
-int do_map_compat_vdso(unsigned long req_addr)
+int do_map_vdso_32(unsigned long req_addr)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long vdso_addr;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index ad0e0ac14f83..accca8edc62b 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -171,28 +171,52 @@  static unsigned long vdso_addr(unsigned long start, unsigned len)
 	return addr;
 }
 
+bool vdso_or_vvar_present(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		if (vma_is_vdso_or_vvar(vma, mm))
+			return true;
+	return false;
+}
+
 /* Setup a VMA at program startup for the vsyscall page.
    Not called for compat tasks */
 static int setup_additional_pages(struct linux_binprm *bprm,
 				  int uses_interp,
 				  struct page **pages,
-				  unsigned size)
+				  unsigned size,
+				  unsigned long req_addr)
 {
 	struct mm_struct *mm = current->mm;
-	unsigned long addr;
+	unsigned long addr = req_addr;
 	int ret;
 
 	if (!vdso_enabled)
 		return 0;
 
 	down_write(&mm->mmap_sem);
-	addr = vdso_addr(mm->start_stack, size);
+
+	if (vdso_or_vvar_present(mm)) {
+		ret = -EEXIST;
+		goto up_fail;
+	}
+
+	if (!req_addr)
+		addr = vdso_addr(mm->start_stack, size);
+
 	addr = get_unmapped_area(NULL, addr, size, 0, 0);
 	if (IS_ERR_VALUE(addr)) {
 		ret = addr;
 		goto up_fail;
 	}
 
+	if (req_addr && req_addr != addr) {
+		ret = -EFAULT;
+		goto up_fail;
+	}
+
 	current->mm->context.vdso = (void *)addr;
 
 	ret = install_special_mapping(mm, addr, size,
@@ -211,7 +235,8 @@  up_fail:
 
 static DEFINE_MUTEX(vdso_mutex);
 
-static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+static int uts_arch_setup_additional_pages(struct linux_binprm *bprm,
+		int uses_interp, unsigned long addr)
 {
 	struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
 	struct ve_struct *ve = get_exec_env();
@@ -303,9 +328,11 @@  static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_i
 		 LINUX_VERSION_CODE, new_version, ve->veid);
 
 map_uts:
-	return setup_additional_pages(bprm, uses_interp, uts_ns->vdso.pages, uts_ns->vdso.size);
+	return setup_additional_pages(bprm, uses_interp, uts_ns->vdso.pages,
+		uts_ns->vdso.size, addr);
 map_init_uts:
-	return setup_additional_pages(bprm, uses_interp, init_uts_ns.vdso.pages, init_uts_ns.vdso.size);
+	return setup_additional_pages(bprm, uses_interp, init_uts_ns.vdso.pages,
+		init_uts_ns.vdso.size, addr);
 out_unlock:
 	mutex_unlock(&vdso_mutex);
 	return -ENOMEM;
@@ -313,14 +340,19 @@  out_unlock:
 
 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 {
-	return uts_arch_setup_additional_pages(bprm, uses_interp);
+	return uts_arch_setup_additional_pages(bprm, uses_interp, 0);
+}
+
+int do_map_vdso_64(unsigned long req_addr)
+{
+	return uts_arch_setup_additional_pages(0, 0, req_addr);
 }
 
 #ifdef CONFIG_X86_X32_ABI
 int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 {
 	return setup_additional_pages(bprm, uses_interp, vdsox32_pages,
-				      vdsox32_size);
+				      vdsox32_size, 0);
 }
 #endif
 

Comments

Konstantin Khorenko July 12, 2017, 9:21 a.m.
Kirill, please review the patch.

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 07/11/2017 09:43 PM, Dmitry Safonov wrote:
> Mainstream already has arch_prctl(MAP_VDSO_64), but this was
> ommited for simplicity and we only have arch_prctl(MAP_VDSO_32).
> This was not a problem as previously we needed MAP_VDSO_32 only
> for ia32 applications C/R.
>
> But as we've made vdso pages to be per-UTS-ns, pages differ between
> host and uts-ns. As CRIU restore starts from init-ns, vdso that's
> being preserved into restored application belongs to host and
> thou has host's ve_time_spec.
>
> Using this API we can map vdso in restored CT and it'll belong
> to uts-ns of CT.
>
> https://jira.sw.ru/browse/PSBM-67017
>
> Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
> Signed-off-by: Dmitry Safonov <dsafonov@virtuozzo.com>
> ---
>  arch/x86/include/asm/elf.h   |  6 +++++-
>  arch/x86/kernel/process_64.c | 14 ++++++-------
>  arch/x86/vdso/vdso32-setup.c | 12 +----------
>  arch/x86/vdso/vma.c          | 48 ++++++++++++++++++++++++++++++++++++--------
>  4 files changed, 53 insertions(+), 27 deletions(-)
>
> diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
> index 8b0f63910b06..920690b3a5d5 100644
> --- a/arch/x86/include/asm/elf.h
> +++ b/arch/x86/include/asm/elf.h
> @@ -353,7 +353,11 @@ extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
>  #define compat_arch_setup_additional_pages	syscall32_setup_pages
>
>  #ifdef CONFIG_X86_64
> -extern int do_map_compat_vdso(unsigned long addr);
> +extern bool vdso_or_vvar_present(struct mm_struct *mm);
> +extern int do_map_vdso_64(unsigned long addr);
> +# ifdef CONFIG_COMPAT
> +extern int do_map_vdso_32(unsigned long addr);
> +# endif
>  #endif
>
>  extern unsigned long arch_randomize_brk(struct mm_struct *mm);
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index d2e444cb7209..252f9f0ecc0f 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -580,16 +580,16 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
>  	}
>
>  #ifdef CONFIG_CHECKPOINT_RESTORE
> +# ifdef CONFIG_COMPAT
>  	case ARCH_MAP_VDSO_32:
> -		return do_map_compat_vdso(addr);
> +		return do_map_vdso_32(addr);
> +# endif
>
> -	/*
> -	 * x32 and 64 vDSO remap API is omitted for simplicity.
> -	 * We do need 32-bit vDSO blob mapping for compatible
> -	 * applications Restore, but not x32/64 (at least, for now).
> -	 */
> -	case ARCH_MAP_VDSO_X32:
>  	case ARCH_MAP_VDSO_64:
> +		return do_map_vdso_64(addr);
> +
> +	/* x32 vDSO remap API is omitted for simplicity. */
> +	case ARCH_MAP_VDSO_X32:
>  #endif
>
>  	default:
> diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
> index 81a16c803f11..30b99959daed 100644
> --- a/arch/x86/vdso/vdso32-setup.c
> +++ b/arch/x86/vdso/vdso32-setup.c
> @@ -512,17 +512,7 @@ up_fail:
>
>  #ifdef CONFIG_X86_64
>
> -static bool vdso_or_vvar_present(struct mm_struct *mm)
> -{
> -	struct vm_area_struct *vma;
> -
> -	for (vma = mm->mmap; vma; vma = vma->vm_next)
> -		if (vma_is_vdso_or_vvar(vma, mm))
> -			return true;
> -	return false;
> -}
> -
> -int do_map_compat_vdso(unsigned long req_addr)
> +int do_map_vdso_32(unsigned long req_addr)
>  {
>  	struct mm_struct *mm = current->mm;
>  	unsigned long vdso_addr;
> diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
> index ad0e0ac14f83..accca8edc62b 100644
> --- a/arch/x86/vdso/vma.c
> +++ b/arch/x86/vdso/vma.c
> @@ -171,28 +171,52 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
>  	return addr;
>  }
>
> +bool vdso_or_vvar_present(struct mm_struct *mm)
> +{
> +	struct vm_area_struct *vma;
> +
> +	for (vma = mm->mmap; vma; vma = vma->vm_next)
> +		if (vma_is_vdso_or_vvar(vma, mm))
> +			return true;
> +	return false;
> +}
> +
>  /* Setup a VMA at program startup for the vsyscall page.
>     Not called for compat tasks */
>  static int setup_additional_pages(struct linux_binprm *bprm,
>  				  int uses_interp,
>  				  struct page **pages,
> -				  unsigned size)
> +				  unsigned size,
> +				  unsigned long req_addr)
>  {
>  	struct mm_struct *mm = current->mm;
> -	unsigned long addr;
> +	unsigned long addr = req_addr;
>  	int ret;
>
>  	if (!vdso_enabled)
>  		return 0;
>
>  	down_write(&mm->mmap_sem);
> -	addr = vdso_addr(mm->start_stack, size);
> +
> +	if (vdso_or_vvar_present(mm)) {
> +		ret = -EEXIST;
> +		goto up_fail;
> +	}
> +
> +	if (!req_addr)
> +		addr = vdso_addr(mm->start_stack, size);
> +
>  	addr = get_unmapped_area(NULL, addr, size, 0, 0);
>  	if (IS_ERR_VALUE(addr)) {
>  		ret = addr;
>  		goto up_fail;
>  	}
>
> +	if (req_addr && req_addr != addr) {
> +		ret = -EFAULT;
> +		goto up_fail;
> +	}
> +
>  	current->mm->context.vdso = (void *)addr;
>
>  	ret = install_special_mapping(mm, addr, size,
> @@ -211,7 +235,8 @@ up_fail:
>
>  static DEFINE_MUTEX(vdso_mutex);
>
> -static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> +static int uts_arch_setup_additional_pages(struct linux_binprm *bprm,
> +		int uses_interp, unsigned long addr)
>  {
>  	struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
>  	struct ve_struct *ve = get_exec_env();
> @@ -303,9 +328,11 @@ static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_i
>  		 LINUX_VERSION_CODE, new_version, ve->veid);
>
>  map_uts:
> -	return setup_additional_pages(bprm, uses_interp, uts_ns->vdso.pages, uts_ns->vdso.size);
> +	return setup_additional_pages(bprm, uses_interp, uts_ns->vdso.pages,
> +		uts_ns->vdso.size, addr);
>  map_init_uts:
> -	return setup_additional_pages(bprm, uses_interp, init_uts_ns.vdso.pages, init_uts_ns.vdso.size);
> +	return setup_additional_pages(bprm, uses_interp, init_uts_ns.vdso.pages,
> +		init_uts_ns.vdso.size, addr);
>  out_unlock:
>  	mutex_unlock(&vdso_mutex);
>  	return -ENOMEM;
> @@ -313,14 +340,19 @@ out_unlock:
>
>  int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>  {
> -	return uts_arch_setup_additional_pages(bprm, uses_interp);
> +	return uts_arch_setup_additional_pages(bprm, uses_interp, 0);
> +}
> +
> +int do_map_vdso_64(unsigned long req_addr)
> +{
> +	return uts_arch_setup_additional_pages(0, 0, req_addr);
>  }
>
>  #ifdef CONFIG_X86_X32_ABI
>  int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>  {
>  	return setup_additional_pages(bprm, uses_interp, vdsox32_pages,
> -				      vdsox32_size);
> +				      vdsox32_size, 0);
>  }
>  #endif
>
>