[RH7] ve/kthread: fix race when work can be added to stopped kthread worker

Submitted by Pavel Tikhomirov on March 21, 2018, 7:19 a.m.

Details

Message ID 20180321071952.17745-1-ptikhomirov@virtuozzo.com
State New
Series "ve/kthread: fix race when work can be added to stopped kthread worker"

Commit Message

Pavel Tikhomirov March 21, 2018, 7:19 a.m.
note: resending because the devel list was missing from CC

The race can be reproduced with the steps below:

1) Apply this test patch to increase the race probability:

kernel/kmod.c
@@ -977,6 +977,8 @@ int call_usermodehelper_fns_ve(struct ve_struct *ve,
        khelper = ve_is_super(ve) ? &khelper_worker : &ve->ve_umh_worker;

        if (ve_is_super(ve) || (get_exec_env() == ve)) {
+               while(ve->is_running)
+                       cond_resched();
                err = call_usermodehelper_by(khelper, path, argv, envp, wait, init,
                                             cleanup, data);
                goto out_put;

2) Set the core pattern to the pipe type in the CT:

echo "|/bin/dd of=/vz/coredumps/core-%e-%t.%p" > /proc/sys/kernel/core_pattern

3) Produce a segfault inside the container and then try to stop the
container by killing init.
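
For reference, any trivial crashing program works for step 3; a hypothetical
minimal example (compile with gcc and run it inside the CT while core_pattern
points at the pipe helper):

/* crash.c - hypothetical reproducer for step 3: dereference NULL so the
 * kernel delivers SIGSEGV and enters the coredump (pipe helper) path. */
int main(void)
{
	int *p = (int *)0;	/* NULL pointer */
	return *p;		/* faults here -> do_coredump() -> "|/bin/dd ..." */
}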

The coredump creates a "dd" work item and adds it to ve_umh_worker, which is
already stopped and will never handle it, so the work hangs forever and the
container never stops:

[root@kuchy ~]# cat /proc/6064/stack
[<ffffffff810a86f9>] __call_usermodehelper_exec+0x179/0x1a0
[<ffffffff810a8eae>] call_usermodehelper_by+0x5e/0xa0
[<ffffffff810a901c>] call_usermodehelper_fns_ve+0xec/0x170
[<ffffffff8128600c>] do_coredump+0x4dc/0xaf0
[<ffffffff810a1cc5>] get_signal_to_deliver+0x1c5/0x5e0
[<ffffffff8102a387>] do_signal+0x57/0x6b0
[<ffffffff8102aa3f>] do_notify_resume+0x5f/0xb0
[<ffffffff816df6ec>] retint_signal+0x48/0x8c
[<ffffffffffffffff>] 0xffffffffffffffff
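
The mechanics behind the hang, as a simplified sketch (not the actual kernel
code; the umh_sketch_* names are illustrative): __call_usermodehelper_exec()
queues a kthread_work on the per-VE worker and then waits on a completion
that only the worker thread signals, so if that thread has already been
stopped the completion never fires:

#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/completion.h>

struct umh_sketch_work {
	struct kthread_work	work;
	struct completion	done;
};

static void umh_sketch_fn(struct kthread_work *work)
{
	struct umh_sketch_work *w =
		container_of(work, struct umh_sketch_work, work);

	/* ... spawn the usermode helper here ... */
	complete(&w->done);		/* only ever runs on the worker thread */
}

static void umh_sketch_queue_and_wait(struct kthread_worker *worker)
{
	struct umh_sketch_work w;

	init_kthread_work(&w.work, umh_sketch_fn);
	init_completion(&w.done);

	/* queue_kthread_work() happily adds the work even if the worker's
	 * thread was already stopped by ve_stop_ns() ... */
	queue_kthread_work(worker, &w.work);

	/* ... in which case nobody calls complete() and we sleep forever,
	 * which is exactly the stack above. */
	wait_for_completion(&w.done);
}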

Fix this by putting the is_running check and the usermodehelper queueing
into a single op_sem critical section. This works because ve_stop_ns
already takes this lock.

Commit a08505af8c51 ("ve/kthread: khelper kthread in a container
introduced") says that we can't hold the lock while waiting for userspace,
so drop the lock around the wait.
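
To make the ordering explicit, a hedged sketch of the stop side this patch
synchronizes against (ve_stop_ns() is not part of this diff, so the details
are an assumption; only the locking pattern matters):

/* Sketch only - the real ve_stop_ns() differs, but per the commit message it
 * already holds ve->op_sem (presumably for write) while the VE is stopped. */
static void ve_stop_ns_sketch(struct ve_struct *ve)
{
	down_write(&ve->op_sem);
	ve->is_running = 0;
	/* ... ve_umh_worker's thread is (or soon will be) stopped, so any
	 * work queued after this point would never run ... */
	up_write(&ve->op_sem);
}

/* With the fix, the queueing side holds the same semaphore for read across
 * the is_running check and the queueing: either it sees is_running == 0 and
 * returns -EPIPE, or the work is queued before the worker can be stopped.
 * The lock is then dropped (ve_op_sem_up_read) for the actual wait, per
 * commit a08505af8c51. */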

https://jira.sw.ru/browse/PSBM-82490
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
---
 include/linux/kmod.h |  6 ++--
 kernel/kmod.c        | 81 +++++++++++++++++++++++++++++++---------------------
 2 files changed, 51 insertions(+), 36 deletions(-)


diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index f9c18741d1a4..9bd5776b1302 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -72,10 +72,10 @@  struct subprocess_info {
 };
 
 extern int
-call_usermodehelper_by(struct kthread_worker *worker,
-			char *path, char **argv, char **envp, int wait,
+call_usermodehelper_by(char *path, char **argv, char **envp, int wait,
 			int (*init)(struct subprocess_info *info, struct cred *new),
-			void (*cleanup)(struct subprocess_info *), void *data);
+			void (*cleanup)(struct subprocess_info *), void *data,
+			struct ve_struct *ve);
 extern int
 call_usermodehelper(char *path, char **argv, char **envp, int wait);
 
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bb7671bc0177..2b45f06cbe51 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -78,8 +78,8 @@  static void free_modprobe_argv(struct subprocess_info *info)
 	kfree(info->argv);
 }
 
-static int __call_usermodehelper_exec(struct kthread_worker *worker,
-		struct subprocess_info *sub_info, int wait);
+static int __call_usermodehelper_exec(struct subprocess_info *sub_info,
+		int wait, struct ve_struct *ve);
 
 static int call_modprobe(char *module_name, int wait, int blacklist)
 {
@@ -118,7 +118,7 @@  static int call_modprobe(char *module_name, int wait, int blacklist)
 	 * We enter to this function with the right permittions, so
 	 * it's possible to directly call __call_usermodehelper_exec()
 	 */
-	return __call_usermodehelper_exec(&khelper_worker, info, wait | UMH_KILLABLE);
+	return __call_usermodehelper_exec(info, wait | UMH_KILLABLE, NULL);
 
 free_module_name:
 	kfree(module_name);
@@ -862,6 +862,19 @@  struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
 }
 EXPORT_SYMBOL(call_usermodehelper_setup);
 
+static void ve_op_sem_down_read(struct ve_struct *ve)
+{
+	if (ve)
+		down_read(&ve->op_sem);
+}
+
+static void ve_op_sem_up_read(struct ve_struct *ve)
+{
+	if (ve)
+		up_read(&ve->op_sem);
+}
+
+
 /**
  * call_usermodehelper_exec - start a usermode application
  * @sub_info: information about the subprocessa
@@ -874,9 +887,11 @@  EXPORT_SYMBOL(call_usermodehelper_setup);
  * asynchronously if wait is not set, and runs as a child of keventd.
  * (ie. it runs with full root capabilities).
  */
-static int __call_usermodehelper_exec(struct kthread_worker *worker,
-		struct subprocess_info *sub_info, int wait)
+static int __call_usermodehelper_exec(struct subprocess_info *sub_info,
+		int wait, struct ve_struct *ve)
 {
+	struct kthread_worker *worker = ve ? &ve->ve_umh_worker
+		: &khelper_worker;
 	DECLARE_COMPLETION_ONSTACK(done);
 	int retval = 0;
 
@@ -912,7 +927,9 @@  static int __call_usermodehelper_exec(struct kthread_worker *worker,
 		goto unlock;
 
 	if (wait & UMH_KILLABLE) {
+		ve_op_sem_up_read(ve);
 		retval = wait_for_completion_killable(&done);
+		ve_op_sem_down_read(ve);
 		if (!retval)
 			goto wait_done;
 
@@ -922,7 +939,9 @@  static int __call_usermodehelper_exec(struct kthread_worker *worker,
 		/* fallthrough, umh_complete() was already called */
 	}
 
+	ve_op_sem_up_read(ve);
 	wait_for_completion(&done);
+	ve_op_sem_down_read(ve);
 wait_done:
 	retval = sub_info->retval;
 out:
@@ -937,7 +956,7 @@  int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 	if (!ve_is_super(get_exec_env()))
 		return -EPERM;
 
-	return __call_usermodehelper_exec(&khelper_worker, sub_info, wait);
+	return __call_usermodehelper_exec(sub_info, wait, NULL);
 }
 EXPORT_SYMBOL(call_usermodehelper_exec);
 
@@ -956,8 +975,8 @@  EXPORT_SYMBOL(call_usermodehelper_exec);
  */
 int call_usermodehelper(char *path, char **argv, char **envp, int wait)
 {
-	return call_usermodehelper_by(&khelper_worker, path, argv, envp,
-			wait, NULL, NULL, NULL);
+	return call_usermodehelper_by(path, argv, envp,
+			wait, NULL, NULL, NULL, NULL);
 }
 EXPORT_SYMBOL(call_usermodehelper);
 
@@ -968,37 +987,33 @@  int call_usermodehelper_fns_ve(struct ve_struct *ve,
 	void (*cleanup)(struct subprocess_info *), void *data)
 {
 	int err;
-	struct kthread_worker *khelper;
 
 	ve = get_ve(ve);
 	if (!ve)
 		return -EFAULT;
 
-	khelper = ve_is_super(ve) ? &khelper_worker : &ve->ve_umh_worker;
-
-	if (ve_is_super(ve) || (get_exec_env() == ve)) {
-		err = call_usermodehelper_by(khelper, path, argv, envp, wait, init,
-					     cleanup, data);
-		goto out_put;
-	}
-
-	if (wait > UMH_WAIT_EXEC) {
-		printk(KERN_ERR "VE#%s: Sleeping call for containers UMH is "
-				"not supported\n", ve->ve_name);
-		err = -EINVAL;
+	if (ve_is_super(ve)) {
+		err = call_usermodehelper_by(path, argv, envp,
+				wait, init, cleanup, data, NULL);
 		goto out_put;
 	}
 
+	/*
+	 * These lock is needed to remove race with ve_stop_ns for containers,
+	 * we should not be able to add work to stopped khelper, what can lead
+	 * to infinite wait for work's completion.
+	 *
+	 * Waiting for user-space with semaphore taken is wrong, so
+	 * __call_usermodehelper_exec will release it as soon as the work is
+	 * queued and wait will be done unlocked.
+	 */
 	down_read(&ve->op_sem);
 	err = -EPIPE;
-	if (!ve->is_running)
-		goto out;
-
-	err = call_usermodehelper_by(khelper, path, argv, envp, wait, init,
-				     cleanup, data);
-
-out:
+	if (ve->is_running)
+		err = call_usermodehelper_by(path, argv, envp,
+				wait, init, cleanup, data, ve);
 	up_read(&ve->op_sem);
+
 out_put:
 	put_ve(ve);
 	return err;
@@ -1006,15 +1021,15 @@  int call_usermodehelper_fns_ve(struct ve_struct *ve,
 EXPORT_SYMBOL(call_usermodehelper_fns_ve);
 #endif
 
-int call_usermodehelper_by(struct kthread_worker *worker,
-	char *path, char **argv, char **envp, int wait,
+int call_usermodehelper_by(char *path, char **argv, char **envp, int wait,
 	int (*init)(struct subprocess_info *info, struct cred *new),
-	void (*cleanup)(struct subprocess_info *), void *data)
+	void (*cleanup)(struct subprocess_info *), void *data,
+	struct ve_struct *ve)
 {
 	struct subprocess_info *info;
 	gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
 
-	if (worker == &khelper_worker && !ve_is_super(get_exec_env()))
+	if (!ve && !ve_is_super(get_exec_env()))
 		return -EPERM;
 
 	info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
@@ -1022,7 +1037,7 @@  int call_usermodehelper_by(struct kthread_worker *worker,
 	if (info == NULL)
 		return -ENOMEM;
 
-	return __call_usermodehelper_exec(worker, info, wait);
+	return __call_usermodehelper_exec(info, wait, ve);
 }
 EXPORT_SYMBOL(call_usermodehelper_by);
 

Comments

Kirill Tkhai March 21, 2018, 11:29 a.m.
On 21.03.2018 10:19, Pavel Tikhomirov wrote:
> diff --git a/include/linux/kmod.h b/include/linux/kmod.h
> index f9c18741d1a4..9bd5776b1302 100644
> --- a/include/linux/kmod.h
> +++ b/include/linux/kmod.h
> @@ -72,10 +72,10 @@ struct subprocess_info {
>  };
>  
>  extern int
> -call_usermodehelper_by(struct kthread_worker *worker,
> -			char *path, char **argv, char **envp, int wait,
> +call_usermodehelper_by(char *path, char **argv, char **envp, int wait,
>  			int (*init)(struct subprocess_info *info, struct cred *new),
> -			void (*cleanup)(struct subprocess_info *), void *data);
> +			void (*cleanup)(struct subprocess_info *), void *data,
> +			struct ve_struct *ve);

This is not only a race fix; it also contains preparations such as changing function arguments.
Can we move those into a separate patch, or are there limitations preventing that?

Thanks,
Kirill
