cgroup: make sure a parent css isn't offlined before its children

Submitted by Andrey Ryabinin on May 15, 2020, 4:13 p.m.

Details

Message ID 20200515161309.27679-1-aryabinin@virtuozzo.com
State New
Series "cgroup: make sure a parent css isn't offlined before its children"
Headers show

Commit Message

Andrey Ryabinin May 15, 2020, 4:13 p.m.
From: Tejun Heo <tj@kernel.org>

There are three subsystem callbacks in css shutdown path -
css_offline(), css_released() and css_free().  Except for
css_released(), cgroup core didn't guarantee the order of invocation.
css_offline() or css_free() could be called on a parent css before its
children.  This behavior is unexpected and led to bugs in cpu and
memory controller.

This patch updates offline path so that a parent css is never offlined
before its children.  Each css keeps online_cnt which reaches zero iff
itself and all its children are offline and offline_css() is invoked
only after online_cnt reaches zero.

This fixes the memory controller bug and allows the fix for cpu
controller.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reported-by: Brian Christiansen <brian.o.christiansen@gmail.com>
Link: http://lkml.kernel.org/g/5698A023.9070703@de.ibm.com
Link: http://lkml.kernel.org/g/CAKB58ikDkzc8REt31WBkD99+hxNzjK4+FBmhkgS+NVrC9vjMSg@mail.gmail.com
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@vger.kernel.org

https://jira.sw.ru/browse/PSBM-104029
(cherry picked from commit aa226ff4a1ce79f229c6b7a4c0a14e17fececd01)
[aryabinin: User refcounts instead of atomics]
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
---
 include/linux/cgroup.h |  6 ++++
 kernel/cgroup.c        | 66 +++++++++++++++++++++++++-----------------
 2 files changed, 45 insertions(+), 27 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bd48216e7ac2..1c1ee0c458e0 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -242,6 +242,12 @@  struct cgroup {
 	 */
 	atomic_t count;
 
+	/*
+	 * Incremented by online self and children.  Used to guarantee that
+	 * parents are not offlined before their children.
+	 */
+	refcount_t online_cnt;
+
 	int id;				/* ida allocated in-hierarchy ID */
 
 	/*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 50d4b3df0f0a..17ba1ab9c7b0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1422,6 +1422,7 @@  static void init_cgroup_root(struct cgroupfs_root *root)
 	INIT_LIST_HEAD(&root->subsys_list);
 	INIT_LIST_HEAD(&root->root_list);
 	INIT_LIST_HEAD(&root->allcg_list);
+	refcount_set(&cgrp->online_cnt, 1);
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
 	cgrp->name = &root_cgroup_name;
@@ -4238,8 +4239,13 @@  static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
 
 	if (ss->css_online)
 		ret = ss->css_online(cgrp);
-	if (!ret)
+	if (!ret) {
 		cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
+
+		refcount_inc(&cgrp->online_cnt);
+		if (cgrp->parent)
+			refcount_inc(&cgrp->parent->online_cnt);
+	}
 	return ret;
 }
 
@@ -4509,9 +4515,11 @@  static void cgroup_css_killed(struct cgroup *cgrp)
 	if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
 		return;
 
-	/* percpu ref's of all css's are killed, kick off the next step */
-	INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
-	queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
+	if (refcount_dec_and_test(&cgrp->online_cnt)) {
+		/* percpu ref's of all css's are killed, kick off the next step */
+		INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
+		queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
+	}
 }
 
 static void css_ref_killed_fn(struct percpu_ref *ref)
@@ -4649,37 +4657,41 @@  static int cgroup_destroy_locked(struct cgroup *cgrp)
 static void cgroup_offline_fn(struct work_struct *work)
 {
 	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
-	struct cgroup *parent = cgrp->parent;
-	struct dentry *d = cgrp->dentry;
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_mutex);
 
-	/*
-	 * css_tryget() is guaranteed to fail now.  Tell subsystems to
-	 * initate destruction.
-	 */
-	for_each_subsys(cgrp->root, ss)
-		offline_css(ss, cgrp);
+	do {
+		struct cgroup *parent = cgrp->parent;
+		struct dentry *d = cgrp->dentry;
 
-	/*
-	 * Put the css refs from cgroup_destroy_locked().  Each css holds
-	 * an extra reference to the cgroup's dentry and cgroup removal
-	 * proceeds regardless of css refs.  On the last put of each css,
-	 * whenever that may be, the extra dentry ref is put so that dentry
-	 * destruction happens only after all css's are released.
-	 */
-	for_each_subsys(cgrp->root, ss)
-		css_put(cgrp->subsys[ss->subsys_id]);
+		/*
+		 * css_tryget() is guaranteed to fail now.  Tell subsystems to
+		 * initate destruction.
+		 */
+		for_each_subsys(cgrp->root, ss)
+			offline_css(ss, cgrp);
 
-	/* delete this cgroup from parent->children */
-	list_del_rcu(&cgrp->sibling);
-	list_del_init(&cgrp->allcg_node);
+		/*
+		 * Put the css refs from cgroup_destroy_locked().  Each css holds
+		 * an extra reference to the cgroup's dentry and cgroup removal
+		 * proceeds regardless of css refs.  On the last put of each css,
+		 * whenever that may be, the extra dentry ref is put so that dentry
+		 * destruction happens only after all css's are released.
+		 */
+		for_each_subsys(cgrp->root, ss)
+			css_put(cgrp->subsys[ss->subsys_id]);
 
-	dput(d);
+		/* delete this cgroup from parent->children */
+		list_del_rcu(&cgrp->sibling);
+		list_del_init(&cgrp->allcg_node);
+
+		dput(d);
 
-	set_bit(CGRP_RELEASABLE, &parent->flags);
-	check_for_release(parent);
+		set_bit(CGRP_RELEASABLE, &parent->flags);
+		check_for_release(parent);
+		cgrp = parent;
+	} while (cgrp && refcount_dec_and_test(&cgrp->online_cnt));
 
 	mutex_unlock(&cgroup_mutex);
 }

Comments

Kirill Tkhai May 19, 2020, 8:32 a.m.
On 15.05.2020 19:13, Andrey Ryabinin wrote:
> From: Tejun Heo <tj@kernel.org>
> 
> There are three subsystem callbacks in css shutdown path -
> css_offline(), css_released() and css_free().  Except for
> css_released(), cgroup core didn't guarantee the order of invocation.
> css_offline() or css_free() could be called on a parent css before its
> children.  This behavior is unexpected and led to bugs in cpu and
> memory controller.
> 
> This patch updates offline path so that a parent css is never offlined
> before its children.  Each css keeps online_cnt which reaches zero iff
> itself and all its children are offline and offline_css() is invoked
> only after online_cnt reaches zero.
> 
> This fixes the memory controller bug and allows the fix for cpu
> controller.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-and-tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
> Reported-by: Brian Christiansen <brian.o.christiansen@gmail.com>
> Link: http://lkml.kernel.org/g/5698A023.9070703@de.ibm.com
> Link: http://lkml.kernel.org/g/CAKB58ikDkzc8REt31WBkD99+hxNzjK4+FBmhkgS+NVrC9vjMSg@mail.gmail.com
> Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: stable@vger.kernel.org
> 
> https://jira.sw.ru/browse/PSBM-104029
> (cherry picked from commit aa226ff4a1ce79f229c6b7a4c0a14e17fececd01)
> [aryabinin: User refcounts instead of atomics]
> Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>

Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>

> ---
>  include/linux/cgroup.h |  6 ++++
>  kernel/cgroup.c        | 66 +++++++++++++++++++++++++-----------------
>  2 files changed, 45 insertions(+), 27 deletions(-)
> 
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index bd48216e7ac2..1c1ee0c458e0 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -242,6 +242,12 @@ struct cgroup {
>  	 */
>  	atomic_t count;
>  
> +	/*
> +	 * Incremented by online self and children.  Used to guarantee that
> +	 * parents are not offlined before their children.
> +	 */
> +	refcount_t online_cnt;
> +
>  	int id;				/* ida allocated in-hierarchy ID */
>  
>  	/*
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 50d4b3df0f0a..17ba1ab9c7b0 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1422,6 +1422,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
>  	INIT_LIST_HEAD(&root->subsys_list);
>  	INIT_LIST_HEAD(&root->root_list);
>  	INIT_LIST_HEAD(&root->allcg_list);
> +	refcount_set(&cgrp->online_cnt, 1);
>  	root->number_of_cgroups = 1;
>  	cgrp->root = root;
>  	cgrp->name = &root_cgroup_name;
> @@ -4238,8 +4239,13 @@ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
>  
>  	if (ss->css_online)
>  		ret = ss->css_online(cgrp);
> -	if (!ret)
> +	if (!ret) {
>  		cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
> +
> +		refcount_inc(&cgrp->online_cnt);
> +		if (cgrp->parent)
> +			refcount_inc(&cgrp->parent->online_cnt);
> +	}
>  	return ret;
>  }
>  
> @@ -4509,9 +4515,11 @@ static void cgroup_css_killed(struct cgroup *cgrp)
>  	if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
>  		return;
>  
> -	/* percpu ref's of all css's are killed, kick off the next step */
> -	INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
> -	queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
> +	if (refcount_dec_and_test(&cgrp->online_cnt)) {
> +		/* percpu ref's of all css's are killed, kick off the next step */
> +		INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
> +		queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
> +	}
>  }
>  
>  static void css_ref_killed_fn(struct percpu_ref *ref)
> @@ -4649,37 +4657,41 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
>  static void cgroup_offline_fn(struct work_struct *work)
>  {
>  	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
> -	struct cgroup *parent = cgrp->parent;
> -	struct dentry *d = cgrp->dentry;
>  	struct cgroup_subsys *ss;
>  
>  	mutex_lock(&cgroup_mutex);
>  
> -	/*
> -	 * css_tryget() is guaranteed to fail now.  Tell subsystems to
> -	 * initate destruction.
> -	 */
> -	for_each_subsys(cgrp->root, ss)
> -		offline_css(ss, cgrp);
> +	do {
> +		struct cgroup *parent = cgrp->parent;
> +		struct dentry *d = cgrp->dentry;
>  
> -	/*
> -	 * Put the css refs from cgroup_destroy_locked().  Each css holds
> -	 * an extra reference to the cgroup's dentry and cgroup removal
> -	 * proceeds regardless of css refs.  On the last put of each css,
> -	 * whenever that may be, the extra dentry ref is put so that dentry
> -	 * destruction happens only after all css's are released.
> -	 */
> -	for_each_subsys(cgrp->root, ss)
> -		css_put(cgrp->subsys[ss->subsys_id]);
> +		/*
> +		 * css_tryget() is guaranteed to fail now.  Tell subsystems to
> +		 * initate destruction.
> +		 */
> +		for_each_subsys(cgrp->root, ss)
> +			offline_css(ss, cgrp);
>  
> -	/* delete this cgroup from parent->children */
> -	list_del_rcu(&cgrp->sibling);
> -	list_del_init(&cgrp->allcg_node);
> +		/*
> +		 * Put the css refs from cgroup_destroy_locked().  Each css holds
> +		 * an extra reference to the cgroup's dentry and cgroup removal
> +		 * proceeds regardless of css refs.  On the last put of each css,
> +		 * whenever that may be, the extra dentry ref is put so that dentry
> +		 * destruction happens only after all css's are released.
> +		 */
> +		for_each_subsys(cgrp->root, ss)
> +			css_put(cgrp->subsys[ss->subsys_id]);
>  
> -	dput(d);
> +		/* delete this cgroup from parent->children */
> +		list_del_rcu(&cgrp->sibling);
> +		list_del_init(&cgrp->allcg_node);
> +
> +		dput(d);
>  
> -	set_bit(CGRP_RELEASABLE, &parent->flags);
> -	check_for_release(parent);
> +		set_bit(CGRP_RELEASABLE, &parent->flags);
> +		check_for_release(parent);
> +		cgrp = parent;
> +	} while (cgrp && refcount_dec_and_test(&cgrp->online_cnt));
>  
>  	mutex_unlock(&cgroup_mutex);
>  }
>