[v4,2/2] ve/kthread: fix race when work can be added to stopped kthread worker

Submitted by Pavel Tikhomirov on March 22, 2018, 10:31 a.m.

Details

Message ID 20180322103154.5116-2-ptikhomirov@virtuozzo.com
State New
Series "Series without cover letter"

Commit Message

Pavel Tikhomirov March 22, 2018, 10:31 a.m.
The race can be reproduced with the steps below:

1) Apply this test patch to increase the race probability:

kernel/kmod.c
@@ -977,6 +977,8 @@ int call_usermodehelper_fns_ve(struct ve_struct *ve,
        khelper = ve_is_super(ve) ? &khelper_worker : &ve->ve_umh_worker;

        if (ve_is_super(ve) || (get_exec_env() == ve)) {
+               while(ve->is_running)
+                       cond_resched();
                err = call_usermodehelper_by(khelper, path, argv, envp, wait, init,
                                             cleanup, data);
                goto out_put;

2) Set core_pattern to pipe type inside the CT:

echo "|/bin/dd of=/vz/coredumps/core-%e-%t.%p" > /proc/sys/kernel/core_pattern

3) Produce a segfault inside the container and then try to stop the
container by killing init.

The coredump creates "dd" work and adds it to ve_umh_worker, which is
already stopped and will never handle it, so the work hangs forever
and the container never stops:

[root@kuchy ~]# cat /proc/6064/stack
[<ffffffff810a86f9>] __call_usermodehelper_exec+0x179/0x1a0
[<ffffffff810a8eae>] call_usermodehelper_by+0x5e/0xa0
[<ffffffff810a901c>] call_usermodehelper_fns_ve+0xec/0x170
[<ffffffff8128600c>] do_coredump+0x4dc/0xaf0
[<ffffffff810a1cc5>] get_signal_to_deliver+0x1c5/0x5e0
[<ffffffff8102a387>] do_signal+0x57/0x6b0
[<ffffffff8102aa3f>] do_notify_resume+0x5f/0xb0
[<ffffffff816df6ec>] retint_signal+0x48/0x8c
[<ffffffffffffffff>] 0xffffffffffffffff

The fix is:

1) When __call_usermodehelper_exec() runs for a ve, check that the umh
task of that ve is not PF_EXITING before adding the work. (This check
is done between the implicit memory barriers of helper_lock() and
helper_unlock().)

2) In do_exit() the PF_EXITING flag is set first, and then ve_stop_ns()
waits for running_helpers to become 0 before stopping the umh. (There
is a full memory barrier between setting the flag and checking the
helpers.)

Now we have 2 cases:

If the __call_usermodehelper_exec() thread reaches helper_lock() after
the do_exit() thread has finished wait_khelpers(), then the former
already sees that the umh task is PF_EXITING (due to 2) and will not
queue the work.

If __call_usermodehelper_exec() reaches helper_lock() before the
do_exit() thread passes wait_khelpers(), the former may see no
PF_EXITING and may queue the work, and then do helper_unlock(). After
that, once wait_khelpers() sees running_helpers == 0, it will also see
the queued work (due to 1), so this work will be run and completed
normally.

So either way the work won't hang.
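
A minimal userspace sketch of this pairing, assuming it helps to see the
two cases in isolation: the names init_exiting, work_queued, helper_side
and stop_side are illustrative only, not the kernel symbols, and seq_cst
atomics stand in for the full barriers of helper_lock()/helper_unlock()
and the one between setting PF_EXITING and wait_khelpers():

/* Hypothetical model of the ordering argument; not the kernel code. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int running_helpers; /* models kmod.c running_helpers        */
static atomic_int init_exiting;    /* models PF_EXITING of the ve umh task */
static atomic_int work_queued;     /* models work sitting in ve_umh_worker */

/* Models __call_usermodehelper_exec() called from the coredump path. */
static void *helper_side(void *arg)
{
	atomic_fetch_add(&running_helpers, 1);         /* helper_lock()   */
	if (!atomic_load(&init_exiting))
		atomic_store(&work_queued, 1);         /* queue the work  */
	else
		printf("helper: umh exiting, refuse with -EBUSY\n");
	atomic_fetch_sub(&running_helpers, 1);         /* helper_unlock() */
	return NULL;
}

/* Models do_exit() of the ve umh task followed by ve_stop_ns(). */
static void *stop_side(void *arg)
{
	atomic_store(&init_exiting, 1);                /* PF_EXITING first   */
	while (atomic_load(&running_helpers) != 0)     /* wait_khelpers()    */
		;
	/* Any work queued before this point is visible here and can be run. */
	if (atomic_exchange(&work_queued, 0))
		printf("stop: draining queued work before stopping worker\n");
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, helper_side, NULL);
	pthread_create(&b, NULL, stop_side, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* Either the helper refused or the work was drained, never lost. */
	printf("leaked work: %d\n", atomic_load(&work_queued));
	return 0;
}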

v3: using ve_struct::op_sem outside of ve code is bad, so switch to a
better solution and remove the unneeded lock.
v4: actually remove the lock in call_usermodehelper_fns_ve

https://jira.sw.ru/browse/PSBM-82490

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
---
 include/linux/kmod.h |  1 +
 kernel/kmod.c        | 35 +++++++++++------------------------
 kernel/ve/ve.c       |  2 ++
 3 files changed, 14 insertions(+), 24 deletions(-)


diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 9bd5776b1302..cc9b9551a940 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -98,6 +98,7 @@  enum umh_disable_depth {
 extern void usermodehelper_init(void);
 
 extern int __usermodehelper_disable(enum umh_disable_depth depth);
+extern void wait_khelpers(void);
 extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth);
 
 static inline int usermodehelper_disable(void)
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9c37e8b3ab01..1d2ea4fb17bb 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -803,6 +803,12 @@  int __usermodehelper_disable(enum umh_disable_depth depth)
 	return -EAGAIN;
 }
 
+void wait_khelpers(void)
+{
+	wait_event(running_helpers_waitq,
+		atomic_read(&running_helpers) == 0);
+}
+
 static void helper_lock(void)
 {
 	atomic_inc(&running_helpers);
@@ -891,7 +897,8 @@  static int __call_usermodehelper_exec(struct subprocess_info *sub_info,
 	if (sub_info->path[0] == '\0')
 		goto out;
 
-	if (usermodehelper_disabled) {
+	if (usermodehelper_disabled ||
+	    (ve && (ve->init_task->flags & PF_EXITING))) {
 		retval = -EBUSY;
 		goto out;
 	}
@@ -975,30 +982,10 @@  int call_usermodehelper_fns_ve(struct ve_struct *ve,
 	if (!ve)
 		return -EFAULT;
 
-	if (ve_is_super(ve) || (get_exec_env() == ve)) {
-		err = call_usermodehelper_by(path, argv, envp, wait, init, cleanup,
-					     data, ve_is_super(ve) ? NULL : ve);
-		goto out_put;
-	}
-
-	if (wait > UMH_WAIT_EXEC) {
-		printk(KERN_ERR "VE#%s: Sleeping call for containers UMH is "
-				"not supported\n", ve->ve_name);
-		err = -EINVAL;
-		goto out_put;
-	}
-
-	down_read(&ve->op_sem);
-	err = -EPIPE;
-	if (!ve->is_running)
-		goto out;
+	err = call_usermodehelper_by(path, argv, envp,
+			wait, init, cleanup, data,
+			ve_is_super(ve) ? NULL : ve);
 
-	err = call_usermodehelper_by(path, argv, envp, wait, init,
-				     cleanup, data, ve);
-
-out:
-	up_read(&ve->op_sem);
-out_put:
 	put_ve(ve);
 	return err;
 }
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 68f84797a840..f562fb9e5cbf 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -556,6 +556,8 @@  void ve_stop_ns(struct pid_namespace *pid_ns)
 	if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
 		return;
 
+	wait_khelpers();
+
 	down_write(&ve->op_sem);
 	/*
 	 * Here the VE changes its state into "not running".

Comments

Kirill Tkhai March 22, 2018, 10:34 a.m.
On 22.03.2018 13:31, Pavel Tikhomirov wrote:
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
