[4/7] ve/cgroup: Moved cgroup release notifications to per-ve workqueues.

Submitted by Valeriy Vdovin on April 1, 2020, 3:41 p.m.

Details

Message ID 1585755688-20348-5-git-send-email-valeriy.vdovin@virtuozzo.com
State New
Series "Make release_agent per-cgroup property. Run release_agent in proper ve."
Headers show

Commit Message

Valeriy Vdovin April 1, 2020, 3:41 p.m.
Notifications about cgroups getting empty should be executed from
inside of a VE, that owns this cgroup. This patch splits logic
of notifications execution between ve code and cgroups code. VE is
now responsible to store the list of cgroups that want to notify
about themselves. VE also now manages the execution of release_agent
work under its domain, so the usermode process is spawned in this
VE's namespaces.
VE is now responsible for declining the scheduling of release_agent
work items when it is stopping.
cgroup code is still responsible to manage setting/getting release_agent
path parameter and pass it between mounts, but now it's also
responsible for storing the owning ve as a field in the cgroup struct.

https://jira.sw.ru/browse/PSBM-83887
Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
---
 include/linux/cgroup.h |  26 +++++++++
 include/linux/ve.h     |  17 ++++++
 kernel/cgroup.c        | 156 +++++++++++++++++++++++++++++++++----------------
 kernel/ve/ve.c         |  93 ++++++++++++++++++++++++++++-
 4 files changed, 238 insertions(+), 54 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b0b8b9e..86d3e5d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -315,6 +315,9 @@  struct cgroup {
 	/* directory xattrs */
 	struct simple_xattrs xattrs;
 	u64 subgroups_limit;
+
+	/* ve_owner, responsible for running release agent. */
+	struct ve_struct *ve_owner;
 };
 
 #define MAX_CGROUP_ROOT_NAMELEN 64
@@ -466,6 +469,27 @@  struct css_set {
 };
 
 /*
+ * refcounted get/put for css_set objects
+ */
+
+void __put_css_set(struct css_set *cg, int taskexit);
+
+static inline void get_css_set(struct css_set *cg)
+{
+	atomic_inc(&cg->refcount);
+}
+
+static inline void put_css_set(struct css_set *cg)
+{
+	__put_css_set(cg, 0);
+}
+
+static inline void put_css_set_taskexit(struct css_set *cg)
+{
+	__put_css_set(cg, 1);
+}
+
+/*
  * cgroup_map_cb is an abstract callback API for reporting map-valued
  * control files
  */
@@ -636,9 +660,11 @@  int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
 int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen);
 
 int cgroup_task_count(const struct cgroup *cgrp);
+void cgroup_release_agent(struct work_struct *work);
 
 #ifdef CONFIG_VE
 void cgroup_mark_ve_root(struct ve_struct *ve);
+void cgroup_unbind_roots_from_ve(struct ve_struct *ve);
 #endif
 
 /*
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 92ec8f9..fcdb25a 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -126,7 +126,22 @@  struct ve_struct {
 #endif
 	struct kmapset_key	sysfs_perms_key;
 
+	/*
+	 * cgroups, that want to notify about becoming
+	 * empty, are linked to this release_list.
+	 */
+	struct list_head	release_list;
+	struct raw_spinlock	release_list_lock;
+
 	struct workqueue_struct	*wq;
+	struct work_struct	release_agent_work;
+
+	/*
+	 * All tasks, that belong to this ve, live
+	 * in cgroups, that are children to cgroups
+	 * that form this css_set.
+	 */
+	struct css_set __rcu	*root_css_set;
 };
 
 struct ve_devmnt {
@@ -190,6 +205,8 @@  call_usermodehelper_ve(struct ve_struct *ve, char *path, char **argv,
 }
 void do_update_load_avg_ve(void);
 
+void ve_add_to_release_list(struct ve_struct *ve, struct cgroup *cgrp);
+void ve_rm_from_release_list(struct ve_struct *ve, struct cgroup *cgrp);
 extern struct ve_struct *get_ve(struct ve_struct *ve);
 extern void put_ve(struct ve_struct *ve);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3df28a2..680ec8e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -271,10 +271,6 @@  static bool cgroup_lock_live_group(struct cgroup *cgrp)
 
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
-static LIST_HEAD(release_list);
-static DEFINE_RAW_SPINLOCK(release_list_lock);
-static void cgroup_release_agent(struct work_struct *work);
-static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
 
 /* Link structure for associating css_set objects with cgroups */
@@ -335,7 +331,7 @@  static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
  * compiled into their kernel but not actually in use */
 static int use_task_css_set_links __read_mostly;
 
-static void __put_css_set(struct css_set *cg, int taskexit)
+void __put_css_set(struct css_set *cg, int taskexit)
 {
 	struct cg_cgroup_link *link;
 	struct cg_cgroup_link *saved_link;
@@ -384,24 +380,6 @@  static void __put_css_set(struct css_set *cg, int taskexit)
 }
 
 /*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cg)
-{
-	atomic_inc(&cg->refcount);
-}
-
-static inline void put_css_set(struct css_set *cg)
-{
-	__put_css_set(cg, 0);
-}
-
-static inline void put_css_set_taskexit(struct css_set *cg)
-{
-	__put_css_set(cg, 1);
-}
-
-/*
  * compare_css_sets - helper function for find_existing_css_set().
  * @cg: candidate css_set being tested
  * @old_cg: existing css_set for a task
@@ -654,6 +632,35 @@  static struct css_set *find_css_set(
 	return res;
 }
 
+static struct cgroup *css_cgroup_from_root(struct css_set *css_set,
+					    struct cgroupfs_root *root)
+{
+	struct cgroup *res = NULL;
+
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
+	read_lock(&css_set_lock);
+	/*
+	 * No need to lock the task - since we hold cgroup_mutex the
+	 * task can't change groups, so the only thing that can happen
+	 * is that it exits and its css is set back to init_css_set.
+	 */
+	if (css_set == &init_css_set) {
+		res = &root->top_cgroup;
+	} else {
+		struct cg_cgroup_link *link;
+		list_for_each_entry(link, &css_set->cg_links, cg_link_list) {
+			struct cgroup *c = link->cgrp;
+			if (c->root == root) {
+				res = c;
+				break;
+			}
+		}
+	}
+	read_unlock(&css_set_lock);
+	BUG_ON(!res);
+	return res;
+}
+
 /*
  * Return the cgroup for "task" from the given hierarchy. Must be
  * called with cgroup_mutex held.
@@ -796,6 +803,31 @@  static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 	return name;
 }
 
+struct cgroup *cgroup_get_local_root(struct cgroup *cgrp)
+{
+	/*
+	 * Find nearest root cgroup, which might be host cgroup root
+	 * or ve cgroup root.
+	 *
+	 *    <host_root_cgroup> -> local_root
+	 *     \                    ^
+	 *      <cgroup>            |
+	 *       \                  |
+	 *        <cgroup>   --->   from here
+	 *        \
+	 *         <ve_root_cgroup> -> local_root
+	 *         \                   ^
+	 *          <cgroup>           |
+	 *          \                  |
+	 *           <cgroup>  --->    from here
+	 */
+	lockdep_assert_held(&cgroup_mutex);
+	while (cgrp->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags))
+		cgrp = cgrp->parent;
+
+	return cgrp;
+}
+
 static void cgroup_free_fn(struct work_struct *work)
 {
 	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
@@ -1638,6 +1670,7 @@  static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		const struct cred *cred;
 		int i;
 		struct css_set *cg;
+		root_cgrp->ve_owner = get_exec_env();
 
 		BUG_ON(sb->s_root != NULL);
 
@@ -4320,6 +4353,7 @@  void cgroup_mark_ve_root(struct ve_struct *ve)
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
 		cgrp = task_cgroup_from_root(ve->init_task, root);
+		cgrp->ve_owner = ve;
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
 
 		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
@@ -4328,6 +4362,20 @@  void cgroup_mark_ve_root(struct ve_struct *ve)
 	mutex_unlock(&cgroup_mutex);
 }
 
+void cgroup_unbind_roots_from_ve(struct ve_struct *ve)
+{
+	struct cgroup *cgrp;
+	struct cgroupfs_root *root;
+
+	mutex_lock(&cgroup_mutex);
+	for_each_active_root(root) {
+		cgrp = css_cgroup_from_root(ve->root_css_set, root);
+		BUG_ON(!cgrp->ve_owner);
+		cgrp->ve_owner = NULL;
+	}
+	mutex_unlock(&cgroup_mutex);
+}
+
 struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
 {
 	struct cgroup *ve_root = NULL;
@@ -4564,6 +4612,7 @@  static int cgroup_destroy_locked(struct cgroup *cgrp)
 	struct cgroup_event *event, *tmp;
 	struct cgroup_subsys *ss;
 	struct cgroup *child;
+	struct cgroup *root_cgrp;
 	bool empty;
 
 	lockdep_assert_held(&d->d_inode->i_mutex);
@@ -4620,11 +4669,9 @@  static int cgroup_destroy_locked(struct cgroup *cgrp)
 
 	set_bit(CGRP_REMOVED, &cgrp->flags);
 
-	raw_spin_lock(&release_list_lock);
-	if (!list_empty(&cgrp->release_list))
-		list_del_init(&cgrp->release_list);
-	raw_spin_unlock(&release_list_lock);
-
+	root_cgrp = cgroup_get_local_root(cgrp);
+	if (root_cgrp->ve_owner)
+		ve_rm_from_release_list(root_cgrp->ve_owner, cgrp);
 	/*
 	 * Remove @cgrp directory.  The removal puts the base ref but we
 	 * aren't quite done with @cgrp yet, so hold onto it.
@@ -5399,6 +5446,7 @@  void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 
 static void check_for_release(struct cgroup *cgrp)
 {
+	struct cgroup *root_cgrp;
 	/* All of these checks rely on RCU to keep the cgroup
 	 * structure alive */
 	if (cgroup_is_releasable(cgrp) &&
@@ -5408,17 +5456,10 @@  static void check_for_release(struct cgroup *cgrp)
 		 * already queued for a userspace notification, queue
 		 * it now
 		 */
-		int need_schedule_work = 0;
-
-		raw_spin_lock(&release_list_lock);
-		if (!cgroup_is_removed(cgrp) &&
-		    list_empty(&cgrp->release_list)) {
-			list_add(&cgrp->release_list, &release_list);
-			need_schedule_work = 1;
-		}
-		raw_spin_unlock(&release_list_lock);
-		if (need_schedule_work)
-			schedule_work(&release_agent_work);
+		root_cgrp = cgroup_get_local_root(cgrp);
+		BUG_ON(!root_cgrp);
+		if (root_cgrp->ve_owner)
+			ve_add_to_release_list(root_cgrp->ve_owner, cgrp);
 	}
 }
 
@@ -5445,24 +5486,35 @@  static void check_for_release(struct cgroup *cgrp)
  * this routine has no use for the exit status of the release agent
  * task, so no sense holding our caller up for that.
  */
-static void cgroup_release_agent(struct work_struct *work)
+void cgroup_release_agent(struct work_struct *work)
 {
-	BUG_ON(work != &release_agent_work);
+	struct ve_struct *ve;
+	ve = container_of(work, struct ve_struct, release_agent_work);
+	down_read(&ve->op_sem);
+	if (!ve->is_running) {
+		up_read(&ve->op_sem);
+		return;
+	}
+	up_read(&ve->op_sem);
 	mutex_lock(&cgroup_mutex);
-	raw_spin_lock(&release_list_lock);
-	while (!list_empty(&release_list)) {
+	raw_spin_lock(&ve->release_list_lock);
+	while (!list_empty(&ve->release_list)) {
 		char *argv[3], *envp[3];
 		int i, err;
+		const char *release_agent;
 		char *pathbuf = NULL, *agentbuf = NULL;
-		struct cgroup *cgrp = list_entry(release_list.next,
-						    struct cgroup,
-						    release_list);
+		struct cgroup *cgrp, *root_cgrp;
+
+		cgrp = list_entry(ve->release_list.next,
+				  struct cgroup,
+				  release_list);
+
 		list_del_init(&cgrp->release_list);
-		raw_spin_unlock(&release_list_lock);
+		raw_spin_unlock(&ve->release_list_lock);
 		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 		if (!pathbuf)
 			goto continue_free;
-		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+		if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, 1) < 0)
 			goto continue_free;
 		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
 		if (!agentbuf)
@@ -5483,7 +5535,9 @@  static void cgroup_release_agent(struct work_struct *work)
 		 * since the exec could involve hitting disk and hence
 		 * be a slow process */
 		mutex_unlock(&cgroup_mutex);
-		err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+
+		err = call_usermodehelper_fns_ve(ve, argv[0], argv,
+			envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
 		if (err < 0)
 			pr_warn_ratelimited("cgroup release_agent "
 					    "%s %s failed: %d\n",
@@ -5493,9 +5547,9 @@  static void cgroup_release_agent(struct work_struct *work)
  continue_free:
 		kfree(pathbuf);
 		kfree(agentbuf);
-		raw_spin_lock(&release_list_lock);
+		raw_spin_lock(&ve->release_list_lock);
 	}
-	raw_spin_unlock(&release_list_lock);
+	raw_spin_unlock(&ve->release_list_lock);
 	mutex_unlock(&cgroup_mutex);
 }
 
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 587f445..7eff0e7 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -87,6 +87,10 @@  struct ve_struct ve0 = {
 	.netif_max_nr		= INT_MAX,
 	.arp_neigh_nr		= ATOMIC_INIT(0),
 	.nd_neigh_nr		= ATOMIC_INIT(0),
+	.release_list_lock	= __RAW_SPIN_LOCK_UNLOCKED(ve0.release_list_lock),
+	.release_list		= LIST_HEAD_INIT(ve0.release_list),
+	.release_agent_work	= __WORK_INITIALIZER(ve0.release_agent_work,
+					cgroup_release_agent),
 };
 EXPORT_SYMBOL(ve0);
 
@@ -453,6 +457,9 @@  static void ve_grab_context(struct ve_struct *ve)
 
 	get_task_struct(tsk);
 	ve->init_task = tsk;
+	ve->root_css_set = tsk->cgroups;
+	get_css_set(ve->root_css_set);
+
 	ve->init_cred = (struct cred *)get_current_cred();
 	rcu_assign_pointer(ve->ve_ns, get_nsproxy(tsk->nsproxy));
 	ve->ve_netns =  get_net(ve->ve_ns->net_ns);
@@ -465,6 +472,9 @@  static void ve_drop_context(struct ve_struct *ve)
 	put_net(ve->ve_netns);
 	ve->ve_netns = NULL;
 
+	put_css_set_taskexit(ve->root_css_set);
+	ve->root_css_set = NULL;
+
 	/* Allows to dereference init_cred and init_task if ve_ns is set */
 	rcu_assign_pointer(ve->ve_ns, NULL);
 	synchronize_rcu();
@@ -477,7 +487,6 @@  static void ve_drop_context(struct ve_struct *ve)
 
 	put_task_struct(ve->init_task);
 	ve->init_task = NULL;
-
 }
 
 static const struct timespec zero_time = { };
@@ -504,6 +513,53 @@  static void ve_workqueue_stop(struct ve_struct *ve)
 	destroy_workqueue(ve->wq);
 }
 
+void ve_add_to_release_list(struct ve_struct *ve, struct cgroup *cgrp)
+{
+	int need_schedule_work = 0;
+
+	down_write(&ve->op_sem);
+
+	if (!ve->is_running)
+		goto out_unlock;
+
+	/*
+	 * Guarantee dependency is provided like so:
+	 * - release_list_lock is valid due to ve != NULL.
+	 * - is_running = 1 is valid due to ve->op_sem is held.
+	 * - ve->op_sem is valid due to ve != NULL.
+	 * - ve != NULL should be guaranteed by the caller.
+	 */
+	raw_spin_lock(&ve->release_list_lock);
+	if (!cgroup_is_removed(cgrp) &&
+	    list_empty(&cgrp->release_list)) {
+		list_add(&cgrp->release_list, &ve->release_list);
+		need_schedule_work = 1;
+	}
+	raw_spin_unlock(&ve->release_list_lock);
+	/*
+	 * Because we are under ve->op_sem lock, ve->is_running will
+	 * be set only after this is already scheduled to ve->wq.
+	 * Guaranteed to wait until this work is flushed before
+	 * proceeding with ve_stop_ns.
+	 */
+	if (need_schedule_work)
+		queue_work(ve->wq, &ve->release_agent_work);
+
+out_unlock:
+	up_write(&ve->op_sem);
+}
+
+void ve_rm_from_release_list(struct ve_struct *ve, struct cgroup *cgrp)
+{
+	/*
+	 * There is no is_running check here, because the caller
+	 * is more flexible in using ve->op_sem lock from above.
+	 */
+	raw_spin_lock(&ve->release_list_lock);
+	if (!list_empty(&cgrp->release_list))
+		list_del_init(&cgrp->release_list);
+	raw_spin_unlock(&ve->release_list_lock);
+}
 
 /* under ve->op_sem write-lock */
 static int ve_start_container(struct ve_struct *ve)
@@ -579,6 +635,8 @@  err_list:
 	return err;
 }
 
+extern void cgroup_unbind_roots_from_ve(struct ve_struct *ve);
+
 void ve_stop_ns(struct pid_namespace *pid_ns)
 {
 	struct ve_struct *ve = current->task_ve;
@@ -590,19 +648,44 @@  void ve_stop_ns(struct pid_namespace *pid_ns)
 	if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
 		return;
 
-	wait_khelpers();
-
 	down_write(&ve->op_sem);
+
+	/*
+	 * Zero-out cgroup->ve_owner for all ve roots.
+	 * After this is done cgroups will not be able to
+	 * pass through check_for_release into this ve's
+	 * release_list.
+	 */
+	cgroup_unbind_roots_from_ve(ve);
+
 	/*
 	 * Here the VE changes its state into "not running".
 	 * op_sem works as barrier for vzctl ioctls.
 	 * ve_mutex works as barrier for ve_can_attach().
+	 * is_running = 0 also blocks further insertions to
+	 * release_list.
 	 */
 	ve->is_running = 0;
 
+	up_write(&ve->op_sem);
+
+	/*
+	 * Stopping workqueue flushes all the tasks there as well as
+	 * disables all further work insertions into it. The ordering
+	 * relative to wait_khelpers does not matter.
+	 */
 	ve_workqueue_stop(ve);
 
 	/*
+	 * Wait until all calls to call_usermodehelper_by
+	 * finish to ensure that any usermode tasks launched before
+	 * ve->is_running was set to 0 has finished after this line.
+	 */
+	wait_khelpers();
+
+	down_write(&ve->op_sem);
+
+	/*
 	 * Neither it can be in pseudosuper state
 	 * anymore, setup it again if needed.
 	 */
@@ -698,6 +781,9 @@  static struct cgroup_subsys_state *ve_create(struct cgroup *cg)
 	if (!ve->ve_name)
 		goto err_name;
 
+	INIT_WORK(&ve->release_agent_work, cgroup_release_agent);
+	raw_spin_lock_init(&ve->release_list_lock);
+
 	ve->_randomize_va_space = ve0._randomize_va_space;
 
 	ve->features = VE_FEATURES_DEF;
@@ -732,6 +818,7 @@  do_init:
 	INIT_LIST_HEAD(&ve->devices);
 	INIT_LIST_HEAD(&ve->ve_list);
 	INIT_LIST_HEAD(&ve->devmnt_list);
+	INIT_LIST_HEAD(&ve->release_list);
 	mutex_init(&ve->devmnt_mutex);
 
 #ifdef CONFIG_AIO

Comments

Pavel Tikhomirov April 2, 2020, 10 a.m.
On 4/1/20 6:41 PM, Valeriy Vdovin wrote:
> Notifications about cgroups getting empty should be executed from
> inside of a VE, that owns this cgroup. This patch splits logic
> of notifications execution between ve code and cgroups code. VE is
> now responsible to store the list of cgroups that want to notify
> about themselves. VE also now manages the execution of release_agent
> work under it's domain, so the usermode process is spawned in this
> VE's namespaces.
> VE is now responsible to decline possibility of scheduling release_agent
> works when it is stopping.
> cgroup code is still responsible to manage setting/getting release_agent
> path parameter and pass it between mounts, but now it's also
> responsible to store owning ve as a filed in cgroup struct.
> 
> https://jira.sw.ru/browse/PSBM-83887
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
> ---
>   include/linux/cgroup.h |  26 +++++++++
>   include/linux/ve.h     |  17 ++++++
>   kernel/cgroup.c        | 156 +++++++++++++++++++++++++++++++++----------------
>   kernel/ve/ve.c         |  93 ++++++++++++++++++++++++++++-
>   4 files changed, 238 insertions(+), 54 deletions(-)
> 
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index b0b8b9e..86d3e5d 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -315,6 +315,9 @@ struct cgroup {
>   	/* directory xattrs */
>   	struct simple_xattrs xattrs;
>   	u64 subgroups_limit;
> +
> +	/* ve_owner, responsible for running release agent. */
> +	struct ve_struct *ve_owner;
>   };
>   
>   #define MAX_CGROUP_ROOT_NAMELEN 64
> @@ -466,6 +469,27 @@ struct css_set {
>   };
>   
>   /*
> + * refcounted get/put for css_set objects
> + */
> +
> +void __put_css_set(struct css_set *cg, int taskexit);
> +
> +static inline void get_css_set(struct css_set *cg)
> +{
> +	atomic_inc(&cg->refcount);
> +}
> +
> +static inline void put_css_set(struct css_set *cg)
> +{
> +	__put_css_set(cg, 0);
> +}
> +
> +static inline void put_css_set_taskexit(struct css_set *cg)
> +{
> +	__put_css_set(cg, 1);
> +}
> +
> +/*
>    * cgroup_map_cb is an abstract callback API for reporting map-valued
>    * control files
>    */
> @@ -636,9 +660,11 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
>   int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen);
>   
>   int cgroup_task_count(const struct cgroup *cgrp);
> +void cgroup_release_agent(struct work_struct *work);
>   
>   #ifdef CONFIG_VE
>   void cgroup_mark_ve_root(struct ve_struct *ve);
> +void cgroup_unbind_roots_from_ve(struct ve_struct *ve);
>   #endif
>   
>   /*
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 92ec8f9..fcdb25a 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -126,7 +126,22 @@ struct ve_struct {
>   #endif
>   	struct kmapset_key	sysfs_perms_key;
>   
> +	/*
> +	 * cgroups, that want to notify about becoming
> +	 * empty, are linked to this release_list.
> +	 */
> +	struct list_head	release_list;
> +	struct raw_spinlock	release_list_lock;
> +
>   	struct workqueue_struct	*wq;
> +	struct work_struct	release_agent_work;
> +
> +	/*
> +	 * All tasks, that belong to this ve, live
> +	 * in cgroups, that are children to cgroups
> +	 * that form this css_set.
> +	 */
> +	struct css_set __rcu	*root_css_set;
>   };
>   
>   struct ve_devmnt {
> @@ -190,6 +205,8 @@ call_usermodehelper_ve(struct ve_struct *ve, char *path, char **argv,
>   }
>   void do_update_load_avg_ve(void);
>   
> +void ve_add_to_release_list(struct ve_struct *ve, struct cgroup *cgrp);
> +void ve_rm_from_release_list(struct ve_struct *ve, struct cgroup *cgrp);
>   extern struct ve_struct *get_ve(struct ve_struct *ve);
>   extern void put_ve(struct ve_struct *ve);
>   
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 3df28a2..680ec8e 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -271,10 +271,6 @@ static bool cgroup_lock_live_group(struct cgroup *cgrp)
>   
>   /* the list of cgroups eligible for automatic release. Protected by
>    * release_list_lock */
> -static LIST_HEAD(release_list);
> -static DEFINE_RAW_SPINLOCK(release_list_lock);
> -static void cgroup_release_agent(struct work_struct *work);
> -static DECLARE_WORK(release_agent_work, cgroup_release_agent);
>   static void check_for_release(struct cgroup *cgrp);
>   
>   /* Link structure for associating css_set objects with cgroups */
> @@ -335,7 +331,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
>    * compiled into their kernel but not actually in use */
>   static int use_task_css_set_links __read_mostly;
>   
> -static void __put_css_set(struct css_set *cg, int taskexit)
> +void __put_css_set(struct css_set *cg, int taskexit)
>   {
>   	struct cg_cgroup_link *link;
>   	struct cg_cgroup_link *saved_link;
> @@ -384,24 +380,6 @@ static void __put_css_set(struct css_set *cg, int taskexit)
>   }
>   
>   /*
> - * refcounted get/put for css_set objects
> - */
> -static inline void get_css_set(struct css_set *cg)
> -{
> -	atomic_inc(&cg->refcount);
> -}
> -
> -static inline void put_css_set(struct css_set *cg)
> -{
> -	__put_css_set(cg, 0);
> -}
> -
> -static inline void put_css_set_taskexit(struct css_set *cg)
> -{
> -	__put_css_set(cg, 1);
> -}
> -
> -/*
>    * compare_css_sets - helper function for find_existing_css_set().
>    * @cg: candidate css_set being tested
>    * @old_cg: existing css_set for a task
> @@ -654,6 +632,35 @@ static struct css_set *find_css_set(
>   	return res;
>   }
>   
> +static struct cgroup *css_cgroup_from_root(struct css_set *css_set,
> +					    struct cgroupfs_root *root)
> +{
> +	struct cgroup *res = NULL;
> +
> +	BUG_ON(!mutex_is_locked(&cgroup_mutex));
> +	read_lock(&css_set_lock);
> +	/*
> +	 * No need to lock the task - since we hold cgroup_mutex the
> +	 * task can't change groups, so the only thing that can happen
> +	 * is that it exits and its css is set back to init_css_set.
> +	 */
> +	if (css_set == &init_css_set) {
> +		res = &root->top_cgroup;
> +	} else {
> +		struct cg_cgroup_link *link;
> +		list_for_each_entry(link, &css_set->cg_links, cg_link_list) {
> +			struct cgroup *c = link->cgrp;
> +			if (c->root == root) {
> +				res = c;
> +				break;
> +			}
> +		}
> +	}
> +	read_unlock(&css_set_lock);
> +	BUG_ON(!res);
> +	return res;
> +}
> +
>   /*
>    * Return the cgroup for "task" from the given hierarchy. Must be
>    * called with cgroup_mutex held.
> @@ -796,6 +803,31 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
>   	return name;
>   }
>   
> +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp)
> +{
> +	/*
> +	 * Find nearest root cgroup, which might be host cgroup root
> +	 * or ve cgroup root.
> +	 *
> +	 *    <host_root_cgroup> -> local_root
> +	 *     \                    ^
> +	 *      <cgroup>            |
> +	 *       \                  |
> +	 *        <cgroup>   --->   from here
> +	 *        \
> +	 *         <ve_root_cgroup> -> local_root
> +	 *         \                   ^
> +	 *          <cgroup>           |
> +	 *          \                  |
> +	 *           <cgroup>  --->    from here
> +	 */
> +	lockdep_assert_held(&cgroup_mutex);
> +	while (cgrp->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags))
> +		cgrp = cgrp->parent;
> +
> +	return cgrp;
> +}
> +
>   static void cgroup_free_fn(struct work_struct *work)
>   {
>   	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
> @@ -1638,6 +1670,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
>   		const struct cred *cred;
>   		int i;
>   		struct css_set *cg;
> +		root_cgrp->ve_owner = get_exec_env();
>   
>   		BUG_ON(sb->s_root != NULL);
>   
> @@ -4320,6 +4353,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
>   	mutex_lock(&cgroup_mutex);
>   	for_each_active_root(root) {
>   		cgrp = task_cgroup_from_root(ve->init_task, root);
> +		cgrp->ve_owner = ve;
>   		set_bit(CGRP_VE_ROOT, &cgrp->flags);
>   
>   		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
> @@ -4328,6 +4362,20 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
>   	mutex_unlock(&cgroup_mutex);
>   }
>   
> +void cgroup_unbind_roots_from_ve(struct ve_struct *ve)
> +{
> +	struct cgroup *cgrp;
> +	struct cgroupfs_root *root;
> +
> +	mutex_lock(&cgroup_mutex);
> +	for_each_active_root(root) {
> +		cgrp = css_cgroup_from_root(ve->root_css_set, root);
> +		BUG_ON(!cgrp->ve_owner);
> +		cgrp->ve_owner = NULL;
> +	}
> +	mutex_unlock(&cgroup_mutex);
> +}
> +
>   struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
>   {
>   	struct cgroup *ve_root = NULL;
> @@ -4564,6 +4612,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
>   	struct cgroup_event *event, *tmp;
>   	struct cgroup_subsys *ss;
>   	struct cgroup *child;
> +	struct cgroup *root_cgrp;
>   	bool empty;
>   
>   	lockdep_assert_held(&d->d_inode->i_mutex);
> @@ -4620,11 +4669,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
>   
>   	set_bit(CGRP_REMOVED, &cgrp->flags);
>   
> -	raw_spin_lock(&release_list_lock);
> -	if (!list_empty(&cgrp->release_list))
> -		list_del_init(&cgrp->release_list);
> -	raw_spin_unlock(&release_list_lock);
> -
> +	root_cgrp = cgroup_get_local_root(cgrp);
> +	if (root_cgrp->ve_owner)
> +		ve_rm_from_release_list(root_cgrp->ve_owner, cgrp);
>   	/*
>   	 * Remove @cgrp directory.  The removal puts the base ref but we
>   	 * aren't quite done with @cgrp yet, so hold onto it.
> @@ -5399,6 +5446,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
>   
>   static void check_for_release(struct cgroup *cgrp)
>   {
> +	struct cgroup *root_cgrp;
>   	/* All of these checks rely on RCU to keep the cgroup
>   	 * structure alive */
>   	if (cgroup_is_releasable(cgrp) &&
> @@ -5408,17 +5456,10 @@ static void check_for_release(struct cgroup *cgrp)
>   		 * already queued for a userspace notification, queue
>   		 * it now
>   		 */
> -		int need_schedule_work = 0;
> -
> -		raw_spin_lock(&release_list_lock);
> -		if (!cgroup_is_removed(cgrp) &&
> -		    list_empty(&cgrp->release_list)) {
> -			list_add(&cgrp->release_list, &release_list);
> -			need_schedule_work = 1;
> -		}
> -		raw_spin_unlock(&release_list_lock);
> -		if (need_schedule_work)
> -			schedule_work(&release_agent_work);
> +		root_cgrp = cgroup_get_local_root(cgrp);
> +		BUG_ON(!root_cgrp);
> +		if (root_cgrp->ve_owner)
> +			ve_add_to_release_list(root_cgrp->ve_owner, cgrp);
>   	}
>   }
>   
> @@ -5445,24 +5486,35 @@ static void check_for_release(struct cgroup *cgrp)
>    * this routine has no use for the exit status of the release agent
>    * task, so no sense holding our caller up for that.
>    */
> -static void cgroup_release_agent(struct work_struct *work)
> +void cgroup_release_agent(struct work_struct *work)
>   {
> -	BUG_ON(work != &release_agent_work);
> +	struct ve_struct *ve;
> +	ve = container_of(work, struct ve_struct, release_agent_work);

> +	down_read(&ve->op_sem);
> +	if (!ve->is_running) {
> +		up_read(&ve->op_sem);
> +		return;
> +	}
> +	up_read(&ve->op_sem);

Can you please explain why we have a lock for a single is_running check? 
It can switch to zero imediately after up_read.

>   	mutex_lock(&cgroup_mutex);
> -	raw_spin_lock(&release_list_lock);
> -	while (!list_empty(&release_list)) {
> +	raw_spin_lock(&ve->release_list_lock);
> +	while (!list_empty(&ve->release_list)) {
>   		char *argv[3], *envp[3];
>   		int i, err;
> +		const char *release_agent;
>   		char *pathbuf = NULL, *agentbuf = NULL;
> -		struct cgroup *cgrp = list_entry(release_list.next,
> -						    struct cgroup,
> -						    release_list);
> +		struct cgroup *cgrp, *root_cgrp;
> +
> +		cgrp = list_entry(ve->release_list.next,
> +				  struct cgroup,
> +				  release_list);
> +
>   		list_del_init(&cgrp->release_list);
> -		raw_spin_unlock(&release_list_lock);
> +		raw_spin_unlock(&ve->release_list_lock);
>   		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
>   		if (!pathbuf)
>   			goto continue_free;
> -		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
> +		if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, 1) < 0)
>   			goto continue_free;
>   		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
>   		if (!agentbuf)
> @@ -5483,7 +5535,9 @@ static void cgroup_release_agent(struct work_struct *work)
>   		 * since the exec could involve hitting disk and hence
>   		 * be a slow process */
>   		mutex_unlock(&cgroup_mutex);
> -		err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
> +
> +		err = call_usermodehelper_fns_ve(ve, argv[0], argv,
> +			envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
>   		if (err < 0)
>   			pr_warn_ratelimited("cgroup release_agent "
>   					    "%s %s failed: %d\n",
> @@ -5493,9 +5547,9 @@ static void cgroup_release_agent(struct work_struct *work)
>    continue_free:
>   		kfree(pathbuf);
>   		kfree(agentbuf);
> -		raw_spin_lock(&release_list_lock);
> +		raw_spin_lock(&ve->release_list_lock);
>   	}
> -	raw_spin_unlock(&release_list_lock);
> +	raw_spin_unlock(&ve->release_list_lock);
>   	mutex_unlock(&cgroup_mutex);
>   }
>   
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 587f445..7eff0e7 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -87,6 +87,10 @@ struct ve_struct ve0 = {
>   	.netif_max_nr		= INT_MAX,
>   	.arp_neigh_nr		= ATOMIC_INIT(0),
>   	.nd_neigh_nr		= ATOMIC_INIT(0),
> +	.release_list_lock	= __RAW_SPIN_LOCK_UNLOCKED(ve0.release_list_lock),
> +	.release_list		= LIST_HEAD_INIT(ve0.release_list),
> +	.release_agent_work	= __WORK_INITIALIZER(ve0.release_agent_work,
> +					cgroup_release_agent),
>   };
>   EXPORT_SYMBOL(ve0);
>   
> @@ -453,6 +457,9 @@ static void ve_grab_context(struct ve_struct *ve)
>   
>   	get_task_struct(tsk);
>   	ve->init_task = tsk;
> +	ve->root_css_set = tsk->cgroups;
> +	get_css_set(ve->root_css_set);
> +
>   	ve->init_cred = (struct cred *)get_current_cred();
>   	rcu_assign_pointer(ve->ve_ns, get_nsproxy(tsk->nsproxy));
>   	ve->ve_netns =  get_net(ve->ve_ns->net_ns);
> @@ -465,6 +472,9 @@ static void ve_drop_context(struct ve_struct *ve)
>   	put_net(ve->ve_netns);
>   	ve->ve_netns = NULL;
>   
> +	put_css_set_taskexit(ve->root_css_set);
> +	ve->root_css_set = NULL;
> +
>   	/* Allows to dereference init_cred and init_task if ve_ns is set */
>   	rcu_assign_pointer(ve->ve_ns, NULL);
>   	synchronize_rcu();
> @@ -477,7 +487,6 @@ static void ve_drop_context(struct ve_struct *ve)
>   
>   	put_task_struct(ve->init_task);
>   	ve->init_task = NULL;
> -
>   }
>   
>   static const struct timespec zero_time = { };
> @@ -504,6 +513,53 @@ static void ve_workqueue_stop(struct ve_struct *ve)
>   	destroy_workqueue(ve->wq);
>   }
>   
> +void ve_add_to_release_list(struct ve_struct *ve, struct cgroup *cgrp)
> +{
> +	int need_schedule_work = 0;
> +
> +	down_write(&ve->op_sem);
> +
> +	if (!ve->is_running)
> +		goto out_unlock;
> +
> +	/*
> +	 * The chain of guarantees here is as follows:
> +	 * - release_list_lock is valid because ve != NULL.
> +	 * - is_running == 1 is stable because ve->op_sem is held.
> +	 * - ve->op_sem is valid because ve != NULL.
> +	 * - ve != NULL must be guaranteed by the caller.
> +	 */
> +	raw_spin_lock(&ve->release_list_lock);
> +	if (!cgroup_is_removed(cgrp) &&
> +	    list_empty(&cgrp->release_list)) {
> +		list_add(&cgrp->release_list, &ve->release_list);
> +		need_schedule_work = 1;
> +	}
> +	raw_spin_unlock(&ve->release_list_lock);
> +	/*
> +	 * Because we hold ve->op_sem, ve->is_running can be cleared
> +	 * (in ve_stop_ns) only after this work is already scheduled
> +	 * to ve->wq. This guarantees that ve_stop_ns waits until this
> +	 * work is flushed before proceeding.
> +	 */
> +	if (need_schedule_work)
> +		queue_work(ve->wq, &ve->release_agent_work);
> +
> +out_unlock:
> +	up_write(&ve->op_sem);
> +}
> +
> +void ve_rm_from_release_list(struct ve_struct *ve, struct cgroup *cgrp)
> +{
> +	/*
> +	 * There is no is_running check here, because the caller
> +	 * is more flexible in using ve->op_sem lock from above.
> +	 */
> +	raw_spin_lock(&ve->release_list_lock);
> +	if (!list_empty(&cgrp->release_list))
> +		list_del_init(&cgrp->release_list);
> +	raw_spin_unlock(&ve->release_list_lock);
> +}
>   
>   /* under ve->op_sem write-lock */
>   static int ve_start_container(struct ve_struct *ve)
> @@ -579,6 +635,8 @@ err_list:
>   	return err;
>   }
>   
> +extern void cgroup_unbind_roots_from_ve(struct ve_struct *ve);
> +
>   void ve_stop_ns(struct pid_namespace *pid_ns)
>   {
>   	struct ve_struct *ve = current->task_ve;
> @@ -590,19 +648,44 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
>   	if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
>   		return;
>   
> -	wait_khelpers();
> -
>   	down_write(&ve->op_sem);
> +
> +	/*
> +	 * Zero-out cgroup->ve_owner for all ve roots.
> +	 * After this is done cgroups will not be able to
> +	 * pass through check_for_release into this ve's
> +	 * release_list.
> +	 */
> +	cgroup_unbind_roots_from_ve(ve);
> +
>   	/*
>   	 * Here the VE changes its state into "not running".
>   	 * op_sem works as barrier for vzctl ioctls.
>   	 * ve_mutex works as barrier for ve_can_attach().
> +	 * is_running = 0 also blocks further insertions to
> +	 * release_list.
>   	 */
>   	ve->is_running = 0;
>   
> +	up_write(&ve->op_sem);
> +
> +	/*
> +	 * Stopping workqueue flushes all the tasks there as well as
> +	 * disables all further work insertions into it. The ordering
> +	 * relative to wait_khelpers does not matter.
> +	 */
>   	ve_workqueue_stop(ve);
>   
>   	/*
> +	 * Wait until all calls to call_usermodehelper_by
> +	 * finish to ensure that any usermode tasks launched before
> +	 * ve->is_running was set to 0 have finished after this line.
> +	 */
> +	wait_khelpers();

The call to wait_khelpers is meant to wait for the completion of all
call_usermodehelper_by calls started before ve->init_task->flags was set
to PF_EXITING.

As you mentioned in Slack, we actually should wait not for all
call_usermodehelper_by calls, but only for the ones related to the
current ve. But that does not depend on is_running as you write in the
comment, as far as I can see.

> +
> +	down_write(&ve->op_sem);
> +
> +	/*
>   	 * Neither it can be in pseudosuper state
>   	 * anymore, setup it again if needed.
>   	 */
> @@ -698,6 +781,9 @@ static struct cgroup_subsys_state *ve_create(struct cgroup *cg)
>   	if (!ve->ve_name)
>   		goto err_name;
>   
> +	INIT_WORK(&ve->release_agent_work, cgroup_release_agent);
> +	raw_spin_lock_init(&ve->release_list_lock);
> +
>   	ve->_randomize_va_space = ve0._randomize_va_space;
>   
>   	ve->features = VE_FEATURES_DEF;
> @@ -732,6 +818,7 @@ do_init:
>   	INIT_LIST_HEAD(&ve->devices);
>   	INIT_LIST_HEAD(&ve->ve_list);
>   	INIT_LIST_HEAD(&ve->devmnt_list);
> +	INIT_LIST_HEAD(&ve->release_list);
>   	mutex_init(&ve->devmnt_mutex);
>   
>   #ifdef CONFIG_AIO
>