[Devel] kernel: call task_work_run() before exit_task_namespaces()

Submitted by Andrei Vagin on Aug. 7, 2017, 5:15 p.m.

Details

Message ID 20170807171528.5650-1-avagin@openvz.org
State New
Series "kernel: call task_work_run() before exit_task_namespaces()"
Headers show

Commit Message

Andrei Vagin Aug. 7, 2017, 5:15 p.m.
From: Andrei Vagin <avagin@virtuozzo.com>

This patch solves a problem for a following case. We have a container (a
group of processes in pid and mount namespaces) with a fuse mount. An
init process exits and the kernel kills all process in its pid
namespace. There is a fuse daemon, which handle the fuse mount.
Currently the kernel kills this process and closes all its file
descriptors, but __fput() for them is postponed and they will be
called from a task_work.  Then the kernel starts destroying the mount
namespace and the fuse mount, it sees that a control descriptor for
this mount is alive and sends a request to a fuse daemon:

$ cat /proc/4353/task/4355/stack
[<ffffffffa04c3451>] request_wait_answer+0x91/0x270 [fuse]
[<ffffffffa04c36b7>] __fuse_request_send+0x87/0xe0 [fuse]
[<ffffffffa04c6c47>] fuse_request_check_and_send+0x27/0x30 [fuse]
[<ffffffffa04c6c60>] fuse_request_send+0x10/0x20 [fuse]
[<ffffffffa04d2f35>] fuse_put_super+0x55/0xc0 [fuse]
[<ffffffff81218b32>] generic_shutdown_super+0x72/0xf0
[<ffffffff81218f12>] kill_anon_super+0x12/0x20
[<ffffffffa04d2577>] fuse_kill_sb_anon+0x47/0x50 [fuse]
[<ffffffff812194a9>] deactivate_locked_super+0x49/0x80
[<ffffffff81219526>] deactivate_super+0x46/0x60
[<ffffffff81237145>] mntput_no_expire+0xc5/0x120
[<ffffffff812371c4>] mntput+0x24/0x40
[<ffffffff812372f8>] namespace_unlock+0x118/0x130
[<ffffffff81239f2b>] put_mnt_ns+0x4b/0x60
[<ffffffff810b786b>] free_nsproxy+0x1b/0x90
[<ffffffff810b7a0a>] switch_task_namespaces+0x5a/0x70
[<ffffffff810b7ae0>] exit_task_namespaces+0x10/0x20
[<ffffffff8108c883>] do_exit+0x2f3/0xb20
[<ffffffff8108d12f>] do_group_exit+0x3f/0xa0
[<ffffffff8109e760>] get_signal_to_deliver+0x1d0/0x6d0
[<ffffffff8102a357>] do_signal+0x57/0x6b0
[<ffffffff8102aa0f>] do_notify_resume+0x5f/0xb0
[<ffffffff8169273d>] int_signal+0x12/0x17
[<ffffffffffffffff>] 0xffffffffffffffff

But we know that a fuse daemon is already dead and the control
descriptor isn't closed completely, because __fput() was postponed.

This patch calls task_work_run() before destroying namespaces to
complete closing all process files.

In the upstream kernel deactivate_super() is called from a task_work too,
so there is not this problem. But we can't backport these changes from
the upstream, because they are too big:

commit 48a066e72d970a3e225a9c18690d570c736fc455
Author: Al Viro <viro@zeniv.linux.org.uk>
Date:   Sun Sep 29 22:06:07 2013 -0400

    RCU'd vfsmounts

https://jira.sw.ru/browse/PSBM-68266
Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: Andrei Vagin <avagin@openvz.org>
---
 include/linux/task_work.h | 9 +++++++--
 kernel/exit.c             | 9 +++++++++
 kernel/task_work.c        | 4 ++--
 3 files changed, 18 insertions(+), 4 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index ca5a1cf..b3af76d 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -14,11 +14,16 @@  init_task_work(struct callback_head *twork, task_work_func_t func)
 
 int task_work_add(struct task_struct *task, struct callback_head *twork, bool);
 struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
-void task_work_run(void);
+void __task_work_run(bool exiting);
+
+static inline void task_work_run(void)
+{
+	return __task_work_run(false);
+}
 
 static inline void exit_task_work(struct task_struct *task)
 {
-	task_work_run();
+	__task_work_run(true);
 }
 
 #endif	/* _LINUX_TASK_WORK_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 3c83db2..ea54a73 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -827,6 +827,15 @@  void do_exit(long code)
 	exit_fs(tsk);
 	if (group_dead)
 		disassociate_ctty(1);
+
+	/*
+	 * task_work_run() has to be called before exit_task_namespaces(),
+	 * because fuse_abort_conn() is called from __fput(). If it will not
+	 * be executed, we can hang in request_wait_answer(). We have seen this
+	 * situation when a process was the last member of a mount namespace
+	 * and the mount namespace has a vstorage fuse mount.
+	 */
+	task_work_run();
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 65bd3c9..f0000c4 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -46,7 +46,7 @@  task_work_cancel(struct task_struct *task, task_work_func_t func)
 	return work;
 }
 
-void task_work_run(void)
+void __task_work_run(bool exiting)
 {
 	struct task_struct *task = current;
 	struct callback_head *work, *head, *next;
@@ -58,7 +58,7 @@  void task_work_run(void)
 		 */
 		do {
 			work = ACCESS_ONCE(task->task_works);
-			head = !work && (task->flags & PF_EXITING) ?
+			head = !work && exiting ?
 				&work_exited : NULL;
 		} while (cmpxchg(&task->task_works, work, head) != work);
 

Comments

Konstantin Khorenko Aug. 8, 2017, 9:27 a.m.
Kirill, please review the patch.

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 08/07/2017 08:15 PM, Andrei Vagin wrote:
> From: Andrei Vagin <avagin@virtuozzo.com>
>
> This patch solves a problem for a following case. We have a container (a
> group of processes in pid and mount namespaces) with a fuse mount. An
> init process exits and the kernel kills all process in its pid
> namespace. There is a fuse daemon, which handle the fuse mount.
> Currently the kernel kills this process and closes all its file
> descriptors, but __fput() for them is postponed and they will be
> called from a task_work.  Then the kernel starts destroying the mount
> namespace and the fuse mount, it sees that a control descriptor for
> this mount is alive and sends a request to a fuse daemon:
>
> $ cat /proc/4353/task/4355/stack
> [<ffffffffa04c3451>] request_wait_answer+0x91/0x270 [fuse]
> [<ffffffffa04c36b7>] __fuse_request_send+0x87/0xe0 [fuse]
> [<ffffffffa04c6c47>] fuse_request_check_and_send+0x27/0x30 [fuse]
> [<ffffffffa04c6c60>] fuse_request_send+0x10/0x20 [fuse]
> [<ffffffffa04d2f35>] fuse_put_super+0x55/0xc0 [fuse]
> [<ffffffff81218b32>] generic_shutdown_super+0x72/0xf0
> [<ffffffff81218f12>] kill_anon_super+0x12/0x20
> [<ffffffffa04d2577>] fuse_kill_sb_anon+0x47/0x50 [fuse]
> [<ffffffff812194a9>] deactivate_locked_super+0x49/0x80
> [<ffffffff81219526>] deactivate_super+0x46/0x60
> [<ffffffff81237145>] mntput_no_expire+0xc5/0x120
> [<ffffffff812371c4>] mntput+0x24/0x40
> [<ffffffff812372f8>] namespace_unlock+0x118/0x130
> [<ffffffff81239f2b>] put_mnt_ns+0x4b/0x60
> [<ffffffff810b786b>] free_nsproxy+0x1b/0x90
> [<ffffffff810b7a0a>] switch_task_namespaces+0x5a/0x70
> [<ffffffff810b7ae0>] exit_task_namespaces+0x10/0x20
> [<ffffffff8108c883>] do_exit+0x2f3/0xb20
> [<ffffffff8108d12f>] do_group_exit+0x3f/0xa0
> [<ffffffff8109e760>] get_signal_to_deliver+0x1d0/0x6d0
> [<ffffffff8102a357>] do_signal+0x57/0x6b0
> [<ffffffff8102aa0f>] do_notify_resume+0x5f/0xb0
> [<ffffffff8169273d>] int_signal+0x12/0x17
> [<ffffffffffffffff>] 0xffffffffffffffff
>
> But we know that a fuse daemon is already dead and the control
> descriptor isn't closed completely, because __fput() was postponed.
>
> This patch calls task_work_run() before destroying namespaces to
> complete closing all process files.
>
> In the upstream kernel deactivate_super() is called from a task_work too,
> so there is not this problem. But we can't backport these changes from
> the upstream, because they are too big:
>
> commit 48a066e72d970a3e225a9c18690d570c736fc455
> Author: Al Viro <viro@zeniv.linux.org.uk>
> Date:   Sun Sep 29 22:06:07 2013 -0400
>
>     RCU'd vfsmounts
>
> https://jira.sw.ru/browse/PSBM-68266
> Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
> Signed-off-by: Andrei Vagin <avagin@openvz.org>
> ---
>  include/linux/task_work.h | 9 +++++++--
>  kernel/exit.c             | 9 +++++++++
>  kernel/task_work.c        | 4 ++--
>  3 files changed, 18 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/task_work.h b/include/linux/task_work.h
> index ca5a1cf..b3af76d 100644
> --- a/include/linux/task_work.h
> +++ b/include/linux/task_work.h
> @@ -14,11 +14,16 @@ init_task_work(struct callback_head *twork, task_work_func_t func)
>
>  int task_work_add(struct task_struct *task, struct callback_head *twork, bool);
>  struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
> -void task_work_run(void);
> +void __task_work_run(bool exiting);
> +
> +static inline void task_work_run(void)
> +{
> +	return __task_work_run(false);
> +}
>
>  static inline void exit_task_work(struct task_struct *task)
>  {
> -	task_work_run();
> +	__task_work_run(true);
>  }
>
>  #endif	/* _LINUX_TASK_WORK_H */
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 3c83db2..ea54a73 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -827,6 +827,15 @@ void do_exit(long code)
>  	exit_fs(tsk);
>  	if (group_dead)
>  		disassociate_ctty(1);
> +
> +	/*
> +	 * task_work_run() has to be called before exit_task_namespaces(),
> +	 * because fuse_abort_conn() is called from __fput(). If it will not
> +	 * be executed, we can hang in request_wait_answer(). We have seen this
> +	 * situation when a process was the last member of a mount namespace
> +	 * and the mount namespace has a vstorage fuse mount.
> +	 */
> +	task_work_run();
>  	exit_task_namespaces(tsk);
>  	exit_task_work(tsk);
>
> diff --git a/kernel/task_work.c b/kernel/task_work.c
> index 65bd3c9..f0000c4 100644
> --- a/kernel/task_work.c
> +++ b/kernel/task_work.c
> @@ -46,7 +46,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
>  	return work;
>  }
>
> -void task_work_run(void)
> +void __task_work_run(bool exiting)
>  {
>  	struct task_struct *task = current;
>  	struct callback_head *work, *head, *next;
> @@ -58,7 +58,7 @@ void task_work_run(void)
>  		 */
>  		do {
>  			work = ACCESS_ONCE(task->task_works);
> -			head = !work && (task->flags & PF_EXITING) ?
> +			head = !work && exiting ?
>  				&work_exited : NULL;
>  		} while (cmpxchg(&task->task_works, work, head) != work);
>
>
Konstantin Khorenko Aug. 8, 2017, 9:42 a.m.
On 08/08/2017 12:27 PM, Konstantin Khorenko wrote:
> Kirill, please review the patch.

Please, disregard, it's a resend with commit message updated,
i will update it during a rebase.

>
> --
> Best regards,
>
> Konstantin Khorenko,
> Virtuozzo Linux Kernel Team
>
> On 08/07/2017 08:15 PM, Andrei Vagin wrote:
>> From: Andrei Vagin <avagin@virtuozzo.com>
>>
>> This patch solves a problem for a following case. We have a container (a
>> group of processes in pid and mount namespaces) with a fuse mount. An
>> init process exits and the kernel kills all process in its pid
>> namespace. There is a fuse daemon, which handle the fuse mount.
>> Currently the kernel kills this process and closes all its file
>> descriptors, but __fput() for them is postponed and they will be
>> called from a task_work.  Then the kernel starts destroying the mount
>> namespace and the fuse mount, it sees that a control descriptor for
>> this mount is alive and sends a request to a fuse daemon:
>>
>> $ cat /proc/4353/task/4355/stack
>> [<ffffffffa04c3451>] request_wait_answer+0x91/0x270 [fuse]
>> [<ffffffffa04c36b7>] __fuse_request_send+0x87/0xe0 [fuse]
>> [<ffffffffa04c6c47>] fuse_request_check_and_send+0x27/0x30 [fuse]
>> [<ffffffffa04c6c60>] fuse_request_send+0x10/0x20 [fuse]
>> [<ffffffffa04d2f35>] fuse_put_super+0x55/0xc0 [fuse]
>> [<ffffffff81218b32>] generic_shutdown_super+0x72/0xf0
>> [<ffffffff81218f12>] kill_anon_super+0x12/0x20
>> [<ffffffffa04d2577>] fuse_kill_sb_anon+0x47/0x50 [fuse]
>> [<ffffffff812194a9>] deactivate_locked_super+0x49/0x80
>> [<ffffffff81219526>] deactivate_super+0x46/0x60
>> [<ffffffff81237145>] mntput_no_expire+0xc5/0x120
>> [<ffffffff812371c4>] mntput+0x24/0x40
>> [<ffffffff812372f8>] namespace_unlock+0x118/0x130
>> [<ffffffff81239f2b>] put_mnt_ns+0x4b/0x60
>> [<ffffffff810b786b>] free_nsproxy+0x1b/0x90
>> [<ffffffff810b7a0a>] switch_task_namespaces+0x5a/0x70
>> [<ffffffff810b7ae0>] exit_task_namespaces+0x10/0x20
>> [<ffffffff8108c883>] do_exit+0x2f3/0xb20
>> [<ffffffff8108d12f>] do_group_exit+0x3f/0xa0
>> [<ffffffff8109e760>] get_signal_to_deliver+0x1d0/0x6d0
>> [<ffffffff8102a357>] do_signal+0x57/0x6b0
>> [<ffffffff8102aa0f>] do_notify_resume+0x5f/0xb0
>> [<ffffffff8169273d>] int_signal+0x12/0x17
>> [<ffffffffffffffff>] 0xffffffffffffffff
>>
>> But we know that a fuse daemon is already dead and the control
>> descriptor isn't closed completely, because __fput() was postponed.
>>
>> This patch calls task_work_run() before destroying namespaces to
>> complete closing all process files.
>>
>> In the upstream kernel deactivate_super() is called from a task_work too,
>> so there is not this problem. But we can't backport these changes from
>> the upstream, because they are too big:
>>
>> commit 48a066e72d970a3e225a9c18690d570c736fc455
>> Author: Al Viro <viro@zeniv.linux.org.uk>
>> Date:   Sun Sep 29 22:06:07 2013 -0400
>>
>>     RCU'd vfsmounts
>>
>> https://jira.sw.ru/browse/PSBM-68266
>> Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
>> Signed-off-by: Andrei Vagin <avagin@openvz.org>
>> ---
>>  include/linux/task_work.h | 9 +++++++--
>>  kernel/exit.c             | 9 +++++++++
>>  kernel/task_work.c        | 4 ++--
>>  3 files changed, 18 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/task_work.h b/include/linux/task_work.h
>> index ca5a1cf..b3af76d 100644
>> --- a/include/linux/task_work.h
>> +++ b/include/linux/task_work.h
>> @@ -14,11 +14,16 @@ init_task_work(struct callback_head *twork, task_work_func_t func)
>>
>>  int task_work_add(struct task_struct *task, struct callback_head *twork, bool);
>>  struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
>> -void task_work_run(void);
>> +void __task_work_run(bool exiting);
>> +
>> +static inline void task_work_run(void)
>> +{
>> +	return __task_work_run(false);
>> +}
>>
>>  static inline void exit_task_work(struct task_struct *task)
>>  {
>> -	task_work_run();
>> +	__task_work_run(true);
>>  }
>>
>>  #endif	/* _LINUX_TASK_WORK_H */
>> diff --git a/kernel/exit.c b/kernel/exit.c
>> index 3c83db2..ea54a73 100644
>> --- a/kernel/exit.c
>> +++ b/kernel/exit.c
>> @@ -827,6 +827,15 @@ void do_exit(long code)
>>  	exit_fs(tsk);
>>  	if (group_dead)
>>  		disassociate_ctty(1);
>> +
>> +	/*
>> +	 * task_work_run() has to be called before exit_task_namespaces(),
>> +	 * because fuse_abort_conn() is called from __fput(). If it will not
>> +	 * be executed, we can hang in request_wait_answer(). We have seen this
>> +	 * situation when a process was the last member of a mount namespace
>> +	 * and the mount namespace has a vstorage fuse mount.
>> +	 */
>> +	task_work_run();
>>  	exit_task_namespaces(tsk);
>>  	exit_task_work(tsk);
>>
>> diff --git a/kernel/task_work.c b/kernel/task_work.c
>> index 65bd3c9..f0000c4 100644
>> --- a/kernel/task_work.c
>> +++ b/kernel/task_work.c
>> @@ -46,7 +46,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
>>  	return work;
>>  }
>>
>> -void task_work_run(void)
>> +void __task_work_run(bool exiting)
>>  {
>>  	struct task_struct *task = current;
>>  	struct callback_head *work, *head, *next;
>> @@ -58,7 +58,7 @@ void task_work_run(void)
>>  		 */
>>  		do {
>>  			work = ACCESS_ONCE(task->task_works);
>> -			head = !work && (task->flags & PF_EXITING) ?
>> +			head = !work && exiting ?
>>  				&work_exited : NULL;
>>  		} while (cmpxchg(&task->task_works, work, head) != work);
>>
>>