[v10,7/9] ve/cgroup: moved cgroup release notifications to per-ve workqueues.

Submitted by Valeriy Vdovin on April 16, 2020, 4:14 p.m.

Details

Message ID 1587053651-246365-8-git-send-email-valeriy.vdovin@virtuozzo.com
State New
Series "Make release_agent per-cgroup property. Run release_agent in proper ve."
Headers show

Commit Message

Valeriy Vdovin April 16, 2020, 4:14 p.m.
Notifications about cgroups getting empty should be executed from
inside of a VE, that owns this cgroup. This patch splits logic
of notifications execution between ve code and cgroups code. VE is
now responsible to store the list of cgroups that want to notify
about themselves. VE also now manages the execution of release_agent
work under it's domain, so the usermode process is spawned in this
VE's namespaces.
VE is now responsible to decline possibility of scheduling release_agent
works when it is stopping.
cgroup code is still responsible to manage setting/getting release_agent
path parameter and pass it between mounts, but now it's also
responsible to store owning ve as a filed in cgroup struct.

https://jira.sw.ru/browse/PSBM-83887

Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 include/linux/cgroup.h |   5 +++
 include/linux/ve.h     |  10 +++++
 kernel/cgroup.c        | 112 ++++++++++++++++++++++++++++++++++---------------
 kernel/ve/ve.c         |  44 +++++++++++++++++++
 4 files changed, 138 insertions(+), 33 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 1bd0fe7..d0ce3cc 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -292,6 +292,9 @@  struct cgroup {
 	/* directory xattrs */
 	struct simple_xattrs xattrs;
 	u64 subgroups_limit;
+
+	/* ve_owner, responsible for running release agent. */
+	struct ve_struct *ve_owner;
 };
 
 #define MAX_CGROUP_ROOT_NAMELEN 64
@@ -633,10 +636,12 @@  int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
 int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen);
 
 int cgroup_task_count(const struct cgroup *cgrp);
+void cgroup_release_agent(struct work_struct *work);
 
 #ifdef CONFIG_VE
 void cgroup_mark_ve_roots(struct ve_struct *ve);
 void cgroup_unmark_ve_roots(struct ve_struct *ve);
+struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
 #endif
 
 /*
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 02e00e9..787a96f 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -126,7 +126,15 @@  struct ve_struct {
 #endif
 	struct kmapset_key	sysfs_perms_key;
 
+	/*
+	 * cgroups, that want to notify about becoming
+	 * empty, are linked to this release_list.
+	 */
+	struct list_head	release_list;
+	struct raw_spinlock	release_list_lock;
+
 	struct workqueue_struct	*wq;
+	struct work_struct	release_agent_work;
 
 	/*
 	 * All tasks, that belong to this ve, live
@@ -197,6 +205,8 @@  call_usermodehelper_ve(struct ve_struct *ve, char *path, char **argv,
 }
 void do_update_load_avg_ve(void);
 
+void ve_add_to_release_list(struct cgroup *cgrp);
+void ve_rm_from_release_list(struct cgroup *cgrp);
 extern struct ve_struct *get_ve(struct ve_struct *ve);
 extern void put_ve(struct ve_struct *ve);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1dd36ed..fa1a881 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -269,10 +269,6 @@  static bool cgroup_lock_live_group(struct cgroup *cgrp)
 
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
-static LIST_HEAD(release_list);
-static DEFINE_RAW_SPINLOCK(release_list_lock);
-static void cgroup_release_agent(struct work_struct *work);
-static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
 
 /* Link structure for associating css_set objects with cgroups */
@@ -809,6 +805,42 @@  static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 	return name;
 }
 
+struct cgroup *cgroup_get_local_root(struct cgroup *cgrp)
+{
+	/*
+	 * Find nearest root cgroup, which might be host cgroup root
+	 * or ve cgroup root.
+	 *
+	 *    <host_root_cgroup> -> local_root
+	 *     \                    ^
+	 *      <cgroup>            |
+	 *       \                  |
+	 *        <cgroup>   --->   from here
+	 *        \
+	 *         <ve_root_cgroup> -> local_root
+	 *         \                   ^
+	 *          <cgroup>           |
+	 *          \                  |
+	 *           <cgroup>  --->    from here
+	 */
+	while (cgrp->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags))
+		cgrp = cgrp->parent;
+
+	return cgrp;
+}
+
+struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp)
+{
+	struct ve_struct *ve;
+	/* Caller should hold RCU */
+
+	cgrp = cgroup_get_local_root(cgrp);
+	ve = rcu_dereference(cgrp->ve_owner);
+	if (!ve)
+		ve = get_ve0();
+	return ve;
+}
+
 static void cgroup_free_fn(struct work_struct *work)
 {
 	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
@@ -1651,6 +1683,7 @@  static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		const struct cred *cred;
 		int i;
 		struct css_set *cg;
+		root_cgrp->ve_owner = get_exec_env();
 
 		BUG_ON(sb->s_root != NULL);
 
@@ -4258,6 +4291,7 @@  void cgroup_mark_ve_roots(struct ve_struct *ve)
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
 		cgrp = task_cgroup_from_root(ve->init_task, root);
+		cgrp->ve_owner = ve;
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
 
 		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
@@ -4274,9 +4308,22 @@  void cgroup_unmark_ve_roots(struct ve_struct *ve)
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
 		cgrp = css_cgroup_from_root(ve->root_css_set, root);
+		BUG_ON(!rcu_dereference_protected(cgrp->ve_owner,
+				lockdep_is_held(&cgroup_mutex)));
+		rcu_assign_pointer(cgrp->ve_owner, NULL);
 		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
 	}
 	mutex_unlock(&cgroup_mutex);
+	/* ve_owner == NULL will be visible */
+	synchronize_rcu();
+	/*
+	 * Anyone already waiting in this wq to execute
+	 * cgroup_release_agent doesn't know that ve_owner is NULL,
+	 * but we can wait for all of them at flush_workqueue.
+	 * After it is complete no other cgroup can seep through
+	 * to this ve's workqueue, so it's safe to shutdown ve.
+	 */
+	flush_workqueue(ve->wq);
 }
 
 struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
@@ -4571,11 +4618,7 @@  static int cgroup_destroy_locked(struct cgroup *cgrp)
 
 	set_bit(CGRP_REMOVED, &cgrp->flags);
 
-	raw_spin_lock(&release_list_lock);
-	if (!list_empty(&cgrp->release_list))
-		list_del_init(&cgrp->release_list);
-	raw_spin_unlock(&release_list_lock);
-
+	ve_rm_from_release_list(cgrp);
 	/*
 	 * Remove @cgrp directory.  The removal puts the base ref but we
 	 * aren't quite done with @cgrp yet, so hold onto it.
@@ -5359,17 +5402,7 @@  static void check_for_release(struct cgroup *cgrp)
 		 * already queued for a userspace notification, queue
 		 * it now
 		 */
-		int need_schedule_work = 0;
-
-		raw_spin_lock(&release_list_lock);
-		if (!cgroup_is_removed(cgrp) &&
-		    list_empty(&cgrp->release_list)) {
-			list_add(&cgrp->release_list, &release_list);
-			need_schedule_work = 1;
-		}
-		raw_spin_unlock(&release_list_lock);
-		if (need_schedule_work)
-			schedule_work(&release_agent_work);
+		ve_add_to_release_list(cgrp);
 	}
 }
 
@@ -5396,25 +5429,36 @@  static void check_for_release(struct cgroup *cgrp)
  * this routine has no use for the exit status of the release agent
  * task, so no sense holding our caller up for that.
  */
-static void cgroup_release_agent(struct work_struct *work)
+void cgroup_release_agent(struct work_struct *work)
 {
-	BUG_ON(work != &release_agent_work);
+	struct ve_struct *ve;
+	ve = container_of(work, struct ve_struct, release_agent_work);
 	mutex_lock(&cgroup_mutex);
-	raw_spin_lock(&release_list_lock);
-	while (!list_empty(&release_list)) {
+	raw_spin_lock(&ve->release_list_lock);
+	while (!list_empty(&ve->release_list)) {
 		char *argv[3], *envp[3];
 		int i, err;
 		char *pathbuf = NULL, *agentbuf = NULL;
-		struct cgroup *cgrp = list_entry(release_list.next,
-						    struct cgroup,
-						    release_list);
+		struct cgroup *cgrp, *root_cgrp;
+
+		cgrp = list_entry(ve->release_list.next,
+				  struct cgroup,
+				  release_list);
+
 		list_del_init(&cgrp->release_list);
-		raw_spin_unlock(&release_list_lock);
+		raw_spin_unlock(&ve->release_list_lock);
 		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 		if (!pathbuf)
 			goto continue_free;
-		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+		if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, 1) < 0)
 			goto continue_free;
+		rcu_read_lock();
+		root_cgrp = cgroup_get_local_root(cgrp);
+		if (root_cgrp->ve_owner != ve) {
+			rcu_read_unlock();
+			goto continue_free;
+		}
+		rcu_read_unlock();
 		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
 		if (!agentbuf)
 			goto continue_free;
@@ -5434,8 +5478,10 @@  static void cgroup_release_agent(struct work_struct *work)
 		 * since the exec could involve hitting disk and hence
 		 * be a slow process */
 		mutex_unlock(&cgroup_mutex);
-		err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
-		if (err < 0)
+
+		err = call_usermodehelper_fns_ve(ve, argv[0], argv,
+			envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
+		if (err < 0 && !(ve->init_task->flags & PF_EXITING))
 			pr_warn_ratelimited("cgroup release_agent "
 					    "%s %s failed: %d\n",
 					    agentbuf, pathbuf, err);
@@ -5444,9 +5490,9 @@  static void cgroup_release_agent(struct work_struct *work)
  continue_free:
 		kfree(pathbuf);
 		kfree(agentbuf);
-		raw_spin_lock(&release_list_lock);
+		raw_spin_lock(&ve->release_list_lock);
 	}
-	raw_spin_unlock(&release_list_lock);
+	raw_spin_unlock(&ve->release_list_lock);
 	mutex_unlock(&cgroup_mutex);
 }
 
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index db3b600..659f19d 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -87,6 +87,11 @@  struct ve_struct ve0 = {
 	.netif_max_nr		= INT_MAX,
 	.arp_neigh_nr		= ATOMIC_INIT(0),
 	.nd_neigh_nr		= ATOMIC_INIT(0),
+	.release_list_lock	= __RAW_SPIN_LOCK_UNLOCKED(
+					ve0.release_list_lock),
+	.release_list		= LIST_HEAD_INIT(ve0.release_list),
+	.release_agent_work	= __WORK_INITIALIZER(ve0.release_agent_work,
+					cgroup_release_agent),
 };
 EXPORT_SYMBOL(ve0);
 
@@ -498,6 +503,41 @@  static void ve_workqueue_stop(struct ve_struct *ve)
 	destroy_workqueue(ve->wq);
 }
 
+void ve_add_to_release_list(struct cgroup *cgrp)
+{
+	struct ve_struct *ve;
+	int need_schedule_work = 0;
+
+	rcu_read_lock();
+	ve = cgroup_get_ve_owner(cgrp);
+
+	raw_spin_lock(&ve->release_list_lock);
+	if (!cgroup_is_removed(cgrp) &&
+	    list_empty(&cgrp->release_list)) {
+		list_add(&cgrp->release_list, &ve->release_list);
+		need_schedule_work = 1;
+	}
+	raw_spin_unlock(&ve->release_list_lock);
+
+	if (need_schedule_work)
+		queue_work(ve->wq, &ve->release_agent_work);
+
+	rcu_read_unlock();
+}
+
+void ve_rm_from_release_list(struct cgroup *cgrp)
+{
+	struct ve_struct *ve;
+	rcu_read_lock();
+	ve = cgroup_get_ve_owner(cgrp);
+	rcu_read_unlock();
+
+	raw_spin_lock(&ve->release_list_lock);
+	if (!list_empty(&cgrp->release_list))
+		list_del_init(&cgrp->release_list);
+	raw_spin_unlock(&ve->release_list_lock);
+}
+
 /* under ve->op_sem write-lock */
 static int ve_start_container(struct ve_struct *ve)
 {
@@ -693,6 +733,9 @@  static struct cgroup_subsys_state *ve_create(struct cgroup *cg)
 	if (!ve->ve_name)
 		goto err_name;
 
+	INIT_WORK(&ve->release_agent_work, cgroup_release_agent);
+	raw_spin_lock_init(&ve->release_list_lock);
+
 	ve->_randomize_va_space = ve0._randomize_va_space;
 
 	ve->features = VE_FEATURES_DEF;
@@ -727,6 +770,7 @@  do_init:
 	INIT_LIST_HEAD(&ve->devices);
 	INIT_LIST_HEAD(&ve->ve_list);
 	INIT_LIST_HEAD(&ve->devmnt_list);
+	INIT_LIST_HEAD(&ve->release_list);
 	mutex_init(&ve->devmnt_mutex);
 
 #ifdef CONFIG_AIO

Comments

Pavel Tikhomirov April 17, 2020, 8:40 a.m.
> @@ -1651,6 +1683,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
>   		const struct cred *cred;
>   		int i;
>   		struct css_set *cg;
> +		root_cgrp->ve_owner = get_exec_env();

Host's root cgroup running release agent in container because it was 
first mounted in container looks strange for me. Also what will happen 
after this ve exits and is removed?

Maybe always set ve0 here ("= ve0;") instead?

I understand that implicitly we would have ve0 anyway because one can't 
mount cgroups in CT, but lets be explicit =)

>   
>   		BUG_ON(sb->s_root != NULL);
>   
> @@ -4258,6 +4291,7 @@ void cgroup_mark_ve_roots(struct ve_struct *ve)
>   	mutex_lock(&cgroup_mutex);
>   	for_each_active_root(root) {
>   		cgrp = task_cgroup_from_root(ve->init_task, root);
> +		cgrp->ve_owner = ve;
>   		set_bit(CGRP_VE_ROOT, &cgrp->flags);
>   
>   		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))