[RHEL7,v15,12/13] ve/cgroup: added release_agent to each container root cgroup.

Submitted by Valeriy Vdovin on April 22, 2020, 11:18 a.m.

Details

Message ID 1587554319-780254-13-git-send-email-valeriy.vdovin@virtuozzo.com
State New
Series "Make release_agent per-cgroup property. Run release_agent in proper ve."

Commit Message

Valeriy Vdovin April 22, 2020, 11:18 a.m.
Each container will now have access to its own cgroup release_agent
file.
Creation:
  Normally, all cgroup files are created during a call to cgroup_create
  by the cgroup_populate_dir function. It decides once which cgroup
  files to create, and they immediately become visible to userspace as
  filesystem objects.
  Due to the specifics of the container creation process, the same code
  can not be used to create the 'release_agent' file. For a VE to start
  operating, a set of ordinary cgroups is first created for each
  subsystem, and only then are the newly created cgroups converted into
  "virtual roots". So at the time cgroup_create is executed, it is not
  yet known whether the "release_agent" file should be created. This
  information only becomes available at the "conversion" step, i.e. in
  the cgroup_mark_ve_roots function. Because the file is created
  dynamically in a live cgroup, the new code uses a rather delicate
  locking sequence (a condensed sketch of it follows the list below):
    - each new "virtual root" cgroup has to get a "release_agent" file,
      so each cgroup's directory needs to be locked for the duration of
      the insertion via cgroup->dentry->d_inode->i_mutex.
    - d_inode->i_mutex has an ordering dependency with cgroup_mutex
      (see cgroup_mount/cgroup_remount): they can not be locked in the
      order {lock(cgroup_mutex), lock(inode->i_mutex)}.
    - to collect the list of cgroups that need to become virtual roots,
      we need the cgroup_mutex lock to iterate the active roots.
    - to overcome the above conflict, we first collect the list of all
      virtual root cgroups under cgroup_mutex, then release it, and
      only after that insert "release_agent" into each root under its
      inode->i_mutex.
    - to collect the list of cgroups on the stack, we use
      cgroup->cft_q_node, made specifically for that purpose and
      protected by its own cgroup_cft_mutex.

Destruction:
  Destruction is done in the reverse order of the above, within
  cgroup_unmark_ve_roots.

https://jira.sw.ru/browse/PSBM-83887

Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
---
 include/linux/cgroup.h |  2 +-
 kernel/cgroup.c        | 59 +++++++++++++++++++++++++++++++++++++++++++++-----
 kernel/ve/ve.c         |  6 ++++-
 3 files changed, 60 insertions(+), 7 deletions(-)


diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 37765f5..911dd48 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -636,7 +636,7 @@  int cgroup_task_count(const struct cgroup *cgrp);
 void cgroup_release_agent(struct work_struct *work);
 
 #ifdef CONFIG_VE
-void cgroup_mark_ve_roots(struct ve_struct *ve);
+int cgroup_mark_ve_roots(struct ve_struct *ve);
 void cgroup_unmark_ve_roots(struct ve_struct *ve);
 struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
 #endif
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3f21ed2..98cab92 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4183,7 +4183,7 @@  static struct cftype files[] = {
 	},
 	{
 		.name = "release_agent",
-		.flags = CFTYPE_ONLY_ON_ROOT,
+		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
 		.read_seq_string = cgroup_release_agent_show,
 		.write_string = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX,
@@ -4312,37 +4312,86 @@  static int subgroups_count(struct cgroup *cgroup)
 	return cgrps_count;
 }
 
+static struct cftype *get_cftype_by_name(const char *name)
+{
+	struct cftype *cft;
+	for (cft = files; cft->name[0] != '\0'; cft++) {
+		if (!strcmp(cft->name, name))
+			return cft;
+	}
+	return NULL;
+}
+
 #ifdef CONFIG_VE
-void cgroup_mark_ve_roots(struct ve_struct *ve)
+int cgroup_mark_ve_roots(struct ve_struct *ve)
 {
-	struct cgroup *cgrp;
+	struct cgroup *cgrp, *tmp;
 	struct cgroupfs_root *root;
+	int err = 0;
+	struct cftype *cft;
+	LIST_HEAD(pending);
+
+	cft = get_cftype_by_name("release_agent");
+	BUG_ON(!cft);
 
+	mutex_lock(&cgroup_cft_mutex);
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
 		cgrp = task_cgroup_from_root(ve->init_task, root);
 		rcu_assign_pointer(cgrp->ve_owner, ve);
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
+		dget(cgrp->dentry);
+		list_add_tail(&cgrp->cft_q_node, &pending);
 		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
 			link_ve_root_cpu_cgroup(cgrp);
 	}
 	mutex_unlock(&cgroup_mutex);
+	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
+		struct inode *inode = cgrp->dentry->d_inode;
+
+		if (err) {
+			dput(cgrp->dentry);
+			continue;
+		}
+
+		mutex_lock(&inode->i_mutex);
+		mutex_lock(&cgroup_mutex);
+		if (!cgroup_is_removed(cgrp))
+			err = cgroup_add_file(cgrp, NULL, cft);
+		mutex_unlock(&cgroup_mutex);
+		mutex_unlock(&inode->i_mutex);
+
+		list_del_init(&cgrp->cft_q_node);
+		dput(cgrp->dentry);
+	}
+	mutex_unlock(&cgroup_cft_mutex);
+	return err;
 }
 
 void cgroup_unmark_ve_roots(struct ve_struct *ve)
 {
-	struct cgroup *cgrp;
+	struct cgroup *cgrp, *tmp;
 	struct cgroupfs_root *root;
+	struct cftype *cft;
+	LIST_HEAD(pending);
+	cft = get_cftype_by_name("release_agent");
 
+	mutex_lock(&cgroup_cft_mutex);
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
 		cgrp = css_cgroup_from_root(ve->root_css_set, root);
+		dget(cgrp->dentry);
+		list_add_tail(&cgrp->cft_q_node, &pending);
+	}
+	mutex_unlock(&cgroup_mutex);
+	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
 		BUG_ON(!rcu_dereference_protected(cgrp->ve_owner,
 				lockdep_is_held(&cgroup_mutex)));
 		rcu_assign_pointer(cgrp->ve_owner, NULL);
 		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
+		cgroup_rm_file(cgrp, cft);
 	}
-	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgroup_cft_mutex);
 	/* ve_owner == NULL will be visible */
 	synchronize_rcu();
 
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index a12e021..5371c10 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -704,7 +704,9 @@  static int ve_start_container(struct ve_struct *ve)
 	if (err < 0)
 		goto err_iterate;
 
-	cgroup_mark_ve_roots(ve);
+	err = cgroup_mark_ve_roots(ve);
+	if (err)
+		goto err_mark_ve;
 
 	ve->is_running = 1;
 
@@ -714,6 +716,8 @@  static int ve_start_container(struct ve_struct *ve)
 
 	return 0;
 
+err_mark_ve:
+	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
 err_iterate:
 	ve_workqueue_stop(ve);
 err_workqueue: