[2/3,v2,RHEL7] cgroup: do not use cgroup_mutex in cgroup_show_options

Submitted by Valeriy Vdovin on Oct. 29, 2020, 11 a.m.

Details

Message ID 1603969246-958334-3-git-send-email-valeriy.vdovin@virtuozzo.com
State New
Series "cgroup: do not use cgroup_mutex in cgroup_show_options"
Headers show

Commit Message

Valeriy Vdovin Oct. 29, 2020, 11 a.m.
In 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c function cgroup_show_options
started to lock cgroup_mutex, which introduced new deadlock possibility,
described below:

Thread A:
  m_start() --> down_read(&namespace_sem);
    cgroup_show_options() --> mutex_lock(&cgroup_mutex);

Thread B:
attach_task_by_pid()
  cgroup_lock_live_group -->  mutex_lock(&cgroup_mutex);
  threadgroup_lock() -->  down_write(&tsk->signal->group_rwsem);

Thread C:
 copy_process
  threadgroup_change_begin() --> down_read(&tsk->signal->group_rwsem);
  copy_namespaces
   create_new_namespaces
     copy_mnt_ns
      namespace_lock() --> down_write(&namespace_sem)

Clearly cgroup_mutex can not be locked right after locking
namespace_sem, because opposite locking order is also present in
the code and should be removed from cgroup_show_options.
After reviewing cgroup_show_options, it was established that
cgroup_mutex is not absolutely needed to guarantee safe access to root_cgrp.
It was used in combination with a call to task_cgroup_from_root to
ensure that root_cgrp lived long enough to access it's value of
release_agent path.
But in this funciton we know that root_cgrp is part of ve->root_css_set,
which holds reference to it. In turn root_css_set is referenced while
ve->ve_ns is not NULL, the check of which we already have in the code.
This means that root_cgrp is valid until ve->ve_ns is valid.
ve->ve_ns is valid until the point of rcu_synchronize in ve_drop_context,
that's why rcu_read_lock should be maintained all the time when root_cgrp
is being accessed.
The patch also removes BUG_ON from css_cgroup_from_root, because all 3 calls 
to this function pass ve->root_css_set as an argument and the above logic
applies.

https://jira.sw.ru/browse/PSBM-121438

Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
---
 kernel/cgroup.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

Patch hide | download patch | download mbox

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 27d7a5e..c7ed3c2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -644,7 +644,6 @@  static struct cgroup *css_cgroup_from_root(struct css_set *css_set,
 	struct cgroup *res = NULL;
 	struct cg_cgroup_link *link;
 
-	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 	read_lock(&css_set_lock);
 
 	list_for_each_entry(link, &css_set->cg_links, cg_link_list) {
@@ -1100,7 +1099,6 @@  static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	struct cgroup_subsys *ss;
 	struct cgroup *root_cgrp = &root->top_cgroup;
 
-	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
 	for_each_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
@@ -1112,6 +1110,7 @@  static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",xattr");
 	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
 		seq_puts(seq, ",cpuset_v2_mode");
+	rcu_read_lock();
 #ifdef CONFIG_VE
 	{
 		struct ve_struct *ve = get_exec_env();
@@ -1124,15 +1123,12 @@  static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 			 * ve->init_task is synchronized via ve->ve_ns rcu, see
 			 * ve_grab_context/drop_context.
 			 */
-			rcu_read_lock();
 			if (ve->ve_ns)
-				root_cgrp = task_cgroup_from_root(ve->init_task,
+				root_cgrp = css_cgroup_from_root(ve->root_css_set,
 					root);
-			rcu_read_unlock();
 		}
 	}
 #endif
-	rcu_read_lock();
 	release_agent = ve_get_release_agent_path(root_cgrp);
 	if (release_agent && release_agent[0])
 		seq_show_option(seq, "release_agent", release_agent);
@@ -1142,7 +1138,6 @@  static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	if (strlen(root->name))
 		seq_show_option(seq, "name", root->name);
 	mutex_unlock(&cgroup_root_mutex);
-	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 

Comments

Kirill Tkhai Oct. 29, 2020, 11:11 a.m.
On 29.10.2020 14:00, Valeriy Vdovin wrote:
> In 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c function cgroup_show_options
> started to lock cgroup_mutex, which introduced new deadlock possibility,
> described below:
> 
> Thread A:
>   m_start() --> down_read(&namespace_sem);
>     cgroup_show_options() --> mutex_lock(&cgroup_mutex);
> 
> Thread B:
> attach_task_by_pid()
>   cgroup_lock_live_group -->  mutex_lock(&cgroup_mutex);
>   threadgroup_lock() -->  down_write(&tsk->signal->group_rwsem);
> 
> Thread C:
>  copy_process
>   threadgroup_change_begin() --> down_read(&tsk->signal->group_rwsem);
>   copy_namespaces
>    create_new_namespaces
>      copy_mnt_ns
>       namespace_lock() --> down_write(&namespace_sem)
> 
> Clearly cgroup_mutex can not be locked right after locking
> namespace_sem, because opposite locking order is also present in
> the code and should be removed from cgroup_show_options.
> After reviewing cgroup_show_options, it was established that
> cgroup_mutex is not absolutely needed to guarantee safe access to root_cgrp.
> It was used in combination with a call to task_cgroup_from_root to
> ensure that root_cgrp lived long enough to access it's value of
> release_agent path.
> But in this funciton we know that root_cgrp is part of ve->root_css_set,
> which holds reference to it. In turn root_css_set is referenced while
> ve->ve_ns is not NULL, the check of which we already have in the code.
> This means that root_cgrp is valid until ve->ve_ns is valid.
> ve->ve_ns is valid until the point of rcu_synchronize in ve_drop_context,
> that's why rcu_read_lock should be maintained all the time when root_cgrp
> is being accessed.
> The patch also removes BUG_ON from css_cgroup_from_root, because all 3 calls 
> to this function pass ve->root_css_set as an argument and the above logic
> applies.
> 
> https://jira.sw.ru/browse/PSBM-121438
> 
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>

Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>

> ---
>  kernel/cgroup.c | 9 ++-------
>  1 file changed, 2 insertions(+), 7 deletions(-)
> 
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 27d7a5e..c7ed3c2 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -644,7 +644,6 @@ static struct cgroup *css_cgroup_from_root(struct css_set *css_set,
>  	struct cgroup *res = NULL;
>  	struct cg_cgroup_link *link;
>  
> -	BUG_ON(!mutex_is_locked(&cgroup_mutex));
>  	read_lock(&css_set_lock);
>  
>  	list_for_each_entry(link, &css_set->cg_links, cg_link_list) {
> @@ -1100,7 +1099,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
>  	struct cgroup_subsys *ss;
>  	struct cgroup *root_cgrp = &root->top_cgroup;
>  
> -	mutex_lock(&cgroup_mutex);
>  	mutex_lock(&cgroup_root_mutex);
>  	for_each_subsys(root, ss)
>  		seq_printf(seq, ",%s", ss->name);
> @@ -1112,6 +1110,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
>  		seq_puts(seq, ",xattr");
>  	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
>  		seq_puts(seq, ",cpuset_v2_mode");
> +	rcu_read_lock();
>  #ifdef CONFIG_VE
>  	{
>  		struct ve_struct *ve = get_exec_env();
> @@ -1124,15 +1123,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
>  			 * ve->init_task is synchronized via ve->ve_ns rcu, see
>  			 * ve_grab_context/drop_context.
>  			 */
> -			rcu_read_lock();
>  			if (ve->ve_ns)
> -				root_cgrp = task_cgroup_from_root(ve->init_task,
> +				root_cgrp = css_cgroup_from_root(ve->root_css_set,
>  					root);
> -			rcu_read_unlock();
>  		}
>  	}
>  #endif
> -	rcu_read_lock();
>  	release_agent = ve_get_release_agent_path(root_cgrp);
>  	if (release_agent && release_agent[0])
>  		seq_show_option(seq, "release_agent", release_agent);
> @@ -1142,7 +1138,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
>  	if (strlen(root->name))
>  		seq_show_option(seq, "name", root->name);
>  	mutex_unlock(&cgroup_root_mutex);
> -	mutex_unlock(&cgroup_mutex);
>  	return 0;
>  }
>  
>