[RHEL7,v21,12/14] ve/cgroup: added release_agent to each container root cgroup.

Submitted by Valeriy Vdovin on July 28, 2020, 5:53 p.m.

Details

Message ID 1595958806-338946-13-git-send-email-valeriy.vdovin@virtuozzo.com
State New
Series "Make release_agent per-cgroup property. Run release_agent in proper ve."

Commit Message

Valeriy Vdovin July 28, 2020, 5:53 p.m.
Each container will now have access to its own cgroup release_agent
file.
Creation:
  Normally all cgroup files are created during a call to cgroup_create
  by the cgroup_populate_dir function. It decides once which files to
  create, creates them, and they immediately become visible to userspace
  as filesystem objects.
  Due to the specifics of the container creation process, it is not
  possible to use the same code for 'release_agent' file creation. For a
  VE to start operating, a set of ordinary cgroups is first created for
  each subsystem, and only then are the newly created cgroups converted
  into "virtual roots". So at the time cgroup_create is executed, it is
  not yet known whether the "release_agent" file should be created. This
  information only becomes available at the "conversion" step, i.e. in
  the cgroup_mark_ve_roots function. Because the file is created
  dynamically in a live cgroup, a rather delicate locking sequence is
  used in the new code:
    - each new "virtual root" cgroup has to get a "release_agent" file,
      so each cgroup's directory must be locked during the insertion
      via cgroup->dentry->d_inode->i_mutex.
    - d_inode->i_mutex has an ordering dependency with cgroup_mutex
      (see cgroup_mount/cgroup_remount); they cannot be taken in the
      order {lock(cgroup_mutex), lock(inode->i_mutex)}.
    - to collect the list of cgroups that need to become virtual roots,
      cgroup_mutex must be held to iterate the active roots.
    - to resolve this conflict, the list of all virtual root cgroups is
      first collected under cgroup_mutex; the lock is then released, and
      after that "release_agent" is inserted into each root under its
      inode->i_mutex.
    - to collect the list of cgroups on the stack, cgroup->cft_q_node is
      used; it exists specifically for this purpose and is protected by
      its own cgroup_cft_mutex (see the sketch right after this list).
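
  A condensed sketch of the resulting sequence (simplified from the
  cgroup_mark_ve_roots() hunk in the diff below; ve_owner/RCU handling,
  error rollback and the cpu-cgroup link are omitted):

	struct cgroup *cgrp, *tmp;
	struct cgroupfs_root *root;
	struct cftype *cft = get_cftype_by_name("release_agent");
	int err = 0;
	LIST_HEAD(pending);

	mutex_lock(&cgroup_cft_mutex);	/* serializes users of cft_q_node */

	/* phase 1: collect the future virtual roots under cgroup_mutex */
	mutex_lock(&cgroup_mutex);
	for_each_active_root(root) {
		cgrp = css_cgroup_from_root(ve->root_css_set, root);
		set_bit(CGRP_VE_ROOT, &cgrp->flags);
		dget(cgrp->dentry);		/* pin the directory dentry */
		list_add_tail(&cgrp->cft_q_node, &pending);
	}
	mutex_unlock(&cgroup_mutex);

	/* phase 2: re-lock per cgroup in the valid i_mutex -> cgroup_mutex order */
	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
		struct inode *inode = cgrp->dentry->d_inode;

		mutex_lock(&inode->i_mutex);
		mutex_lock(&cgroup_mutex);
		if (!err && !cgroup_is_removed(cgrp))
			err = cgroup_add_file(cgrp, NULL, cft);
		mutex_unlock(&cgroup_mutex);
		mutex_unlock(&inode->i_mutex);

		list_del_init(&cgrp->cft_q_node);
		dput(cgrp->dentry);		/* drop the phase-1 reference */
	}
	mutex_unlock(&cgroup_cft_mutex);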

Destruction:
  Destruction is done in the reverse order, in cgroup_unmark_ve_roots.
  After the file is destroyed, further write operations to it must be
  prevented in case some process opened the file before the VE and the
  cgroup were destroyed. This is achieved by checking whether the cgroup
  passed down from cgroup_file_write is still a host or virtual root
  (condensed below).
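
  Condensed from the cgroup_release_agent_write() hunk in the diff below
  (locking and error paths trimmed):

	root_cgrp = cgroup_get_local_root(cgrp);
	BUG_ON(!root_cgrp);
	/*
	 * cgrp is no longer its own local root: cgroup_unmark_ve_roots()
	 * has already run, so refuse writes through the stale open file.
	 */
	if (root_cgrp != cgrp)
		ret = -EPERM;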

https://jira.sw.ru/browse/PSBM-83887

Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
---
 include/linux/cgroup.h |  2 +-
 include/linux/ve.h     |  2 +-
 kernel/cgroup.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++++----
 kernel/ve/ve.c         |  6 +++-
 4 files changed, 87 insertions(+), 9 deletions(-)

Patch

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index fc138c0..645c9fd 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -671,7 +671,7 @@  int cgroup_task_count(const struct cgroup *cgrp);
 void cgroup_release_agent(struct work_struct *work);
 
 #ifdef CONFIG_VE
-void cgroup_mark_ve_roots(struct ve_struct *ve);
+int cgroup_mark_ve_roots(struct ve_struct *ve);
 void cgroup_unmark_ve_roots(struct ve_struct *ve);
 struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
 #endif
diff --git a/include/linux/ve.h b/include/linux/ve.h
index b6662637..5bf275f 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -215,7 +215,7 @@  void do_update_load_avg_ve(void);
 void ve_add_to_release_list(struct cgroup *cgrp);
 void ve_rm_from_release_list(struct cgroup *cgrp);
 
-int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot,
+int ve_set_release_agent_path(struct cgroup *cgroot,
 	const char *release_agent);
 
 const char *ve_get_release_agent_path(struct cgroup *cgrp_root);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1d9c889..bb77804 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2360,13 +2360,28 @@  static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
 
+	/*
+	 * Call to cgroup_get_local_root is a way to make sure
+	 * that cgrp in the argument is valid "virtual root"
+	 * cgroup. If it's not - this means that cgroup_unmark_ve_roots
+	 * has already cleared CGRP_VE_ROOT flag from this cgroup while
+	 * the file was still opened by other process and
+	 * that we shouldn't allow further writings via that file.
+	 */
 	root_cgrp = cgroup_get_local_root(cgrp);
 	BUG_ON(!root_cgrp);
+
+	if (root_cgrp != cgrp) {
+		ret = -EPERM;
+		goto out;
+	}
+
 	if (root_cgrp->ve_owner)
 		ret = ve_set_release_agent_path(root_cgrp, buffer);
 	else
 		return -ENODEV;
 
+out:
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
@@ -4204,7 +4219,7 @@  static struct cftype files[] = {
 	},
 	{
 		.name = "release_agent",
-		.flags = CFTYPE_ONLY_ON_ROOT,
+		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
 		.read_seq_string = cgroup_release_agent_show,
 		.write_string = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX,
@@ -4422,39 +4437,98 @@  static int subgroups_count(struct cgroup *cgroup)
 	return cgrps_count;
 }
 
+static struct cftype *get_cftype_by_name(const char *name)
+{
+	struct cftype *cft;
+	for (cft = files; cft->name[0] != '\0'; cft++) {
+		if (!strcmp(cft->name, name))
+			return cft;
+	}
+	return NULL;
+}
+
 #ifdef CONFIG_VE
-void cgroup_mark_ve_roots(struct ve_struct *ve)
+int cgroup_mark_ve_roots(struct ve_struct *ve)
 {
-	struct cgroup *cgrp;
+	struct cgroup *cgrp, *tmp;
 	struct cgroupfs_root *root;
+	int err = 0;
+	struct cftype *cft;
+	LIST_HEAD(pending);
 
+	cft = get_cftype_by_name("release_agent");
+	BUG_ON(!cft);
+
+	mutex_lock(&cgroup_cft_mutex);
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
-		cgrp = task_cgroup_from_root(ve->init_task, root);
+		cgrp = css_cgroup_from_root(ve->root_css_set, root);
 		rcu_assign_pointer(cgrp->ve_owner, ve);
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
 
+		dget(cgrp->dentry);
+		list_add_tail(&cgrp->cft_q_node, &pending);
 		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
 			link_ve_root_cpu_cgroup(cgrp);
 	}
 	mutex_unlock(&cgroup_mutex);
 	synchronize_rcu();
+	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
+		struct inode *inode = cgrp->dentry->d_inode;
+
+		if (err) {
+			list_del_init(&cgrp->cft_q_node);
+			dput(cgrp->dentry);
+			continue;
+		}
+
+		mutex_lock(&inode->i_mutex);
+		mutex_lock(&cgroup_mutex);
+		if (!cgroup_is_removed(cgrp))
+			err = cgroup_add_file(cgrp, NULL, cft);
+		mutex_unlock(&cgroup_mutex);
+		mutex_unlock(&inode->i_mutex);
+
+		list_del_init(&cgrp->cft_q_node);
+		dput(cgrp->dentry);
+	}
+	mutex_unlock(&cgroup_cft_mutex);
+
+	if (err)
+		cgroup_unmark_ve_roots(ve);
+
+	return err;
 }
 
 void cgroup_unmark_ve_roots(struct ve_struct *ve)
 {
-	struct cgroup *cgrp;
+	struct cgroup *cgrp, *tmp;
 	struct cgroupfs_root *root;
+	struct cftype *cft;
+	LIST_HEAD(pending);
+
+	cft = get_cftype_by_name("release_agent");
 
+	mutex_lock(&cgroup_cft_mutex);
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
 		cgrp = css_cgroup_from_root(ve->root_css_set, root);
+		dget(cgrp->dentry);
+		list_add_tail(&cgrp->cft_q_node, &pending);
+	}
+	mutex_unlock(&cgroup_mutex);
+	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
+		struct inode *inode = cgrp->dentry->d_inode;
+		mutex_lock(&inode->i_mutex);
+		mutex_lock(&cgroup_mutex);
+		cgroup_rm_file(cgrp, cft);
 		BUG_ON(!rcu_dereference_protected(cgrp->ve_owner,
 				lockdep_is_held(&cgroup_mutex)));
 		rcu_assign_pointer(cgrp->ve_owner, NULL);
 		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
+		mutex_unlock(&cgroup_mutex);
+		mutex_unlock(&inode->i_mutex);
 	}
-	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgroup_cft_mutex);
 	/* ve_owner == NULL will be visible */
 	synchronize_rcu();
 
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index f03f665..8d78270 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -718,7 +718,9 @@  static int ve_start_container(struct ve_struct *ve)
 	if (err < 0)
 		goto err_iterate;
 
-	cgroup_mark_ve_roots(ve);
+	err = cgroup_mark_ve_roots(ve);
+	if (err)
+		goto err_mark_ve;
 
 	ve->is_running = 1;
 
@@ -728,6 +730,8 @@  static int ve_start_container(struct ve_struct *ve)
 
 	return 0;
 
+err_mark_ve:
+	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
 err_iterate:
 	ve_workqueue_stop(ve);
 err_workqueue:

Comments

Kirill Tkhai July 31, 2020, 8:24 a.m.
On 28.07.2020 20:53, Valeriy Vdovin wrote:
> Each container will now have access to it's own cgroup release_agent
> file.
> Creation:
>   Normally all cgroup files are created during a call to cgroup_create
>   by cgroup_populate_dir function. It creates or not creates all cgroup
>   files once and they immediately become visible to userspace as
>   filesystem objects.
>   Due to specifics of container creation process, it is not possible to
>   use the same code for 'release_agent' file creation. For VE to start
>   operating, first a list of ordinary cgroups is being created for
>   each subsystem, then the set of newly created cgroups are converted to
>   "virtual roots", so at the time when cgroup_create is executed, there
>   is no knowledge of wheather or not "release_agent" file should be
>   created. This information only comes at "convertion" step which is
>   'cgroup_mark_ve_roots' function. As the file is created dynamically
>   in a live cgroup, a rather delicate locking sequence is present in
>   the new code:
>     - each new "virtual root" cgroup will have to add "release_agent" file,
>       thus each cgroup's directory would need to be locked during
>       the insertion time by cgroup->dentry->d_inode->i_mutex.
>     - d_inode->i_mutex has an ordering dependency with cgroup_mutex
>       (see cgroup_mount/cgroup_remount). They can not be locked in order
>       {lock(cgroup_mutex), lock(inode->i_mutex)}.
>     - to collect a list of cgroups, that need to become virtual we need
>       cgroup_mutex lock to iterate active roots.
>     - to overcome the above conflict we first need to collect a list of
>       all virtual cgroups under cgroup_mutex lock, then release it and
>       after that to insert "release_agent" to each root under
>       inode->i_mutex lock.
>     - to collect a list of cgroups on stack we utilize
>       cgroup->cft_q_node, made specially for that purpose under it's own
>       cgroup_cft_mutex.
> 
> Destruction:
>   Destruction is done in reverse from the above within
>   cgroup_unmark_ve_root.
>   After file destruction we must prevent further write operations to
>   this file in case when someone has opened this file prior to VE
>   and cgroup destruction. This is achieved by checking if cgroup
>   in the argument to cgroup_file_write function has features of host
>   or virtual root.
> 
> https://jira.sw.ru/browse/PSBM-83887
> 
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
> ---
>  include/linux/cgroup.h |  2 +-
>  include/linux/ve.h     |  2 +-
>  kernel/cgroup.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++++----
>  kernel/ve/ve.c         |  6 +++-
>  4 files changed, 87 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index fc138c0..645c9fd 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -671,7 +671,7 @@ int cgroup_task_count(const struct cgroup *cgrp);
>  void cgroup_release_agent(struct work_struct *work);
>  
>  #ifdef CONFIG_VE
> -void cgroup_mark_ve_roots(struct ve_struct *ve);
> +int cgroup_mark_ve_roots(struct ve_struct *ve);
>  void cgroup_unmark_ve_roots(struct ve_struct *ve);
>  struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
>  #endif
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index b6662637..5bf275f 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -215,7 +215,7 @@ void do_update_load_avg_ve(void);
>  void ve_add_to_release_list(struct cgroup *cgrp);
>  void ve_rm_from_release_list(struct cgroup *cgrp);
>  
> -int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot,
> +int ve_set_release_agent_path(struct cgroup *cgroot,
>  	const char *release_agent);
>  
>  const char *ve_get_release_agent_path(struct cgroup *cgrp_root);
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 1d9c889..bb77804 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -2360,13 +2360,28 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
>  	if (!cgroup_lock_live_group(cgrp))
>  		return -ENODEV;
>  
> +	/*
> +	 * Call to cgroup_get_local_root is a way to make sure
> +	 * that cgrp in the argument is valid "virtual root"
> +	 * cgroup. If it's not - this means that cgroup_unmark_ve_roots
> +	 * has already cleared CGRP_VE_ROOT flag from this cgroup while
> +	 * the file was still opened by other process and
> +	 * that we shouldn't allow further writings via that file.
> +	 */
>  	root_cgrp = cgroup_get_local_root(cgrp);
>  	BUG_ON(!root_cgrp);
> +
> +	if (root_cgrp != cgrp) {
> +		ret = -EPERM;
> +		goto out;
> +	}
> +
>  	if (root_cgrp->ve_owner)
>  		ret = ve_set_release_agent_path(root_cgrp, buffer);
>  	else
>  		return -ENODEV;
>  
> +out:
>  	mutex_unlock(&cgroup_mutex);
>  	return ret;
>  }
> @@ -4204,7 +4219,7 @@ static struct cftype files[] = {
>  	},
>  	{
>  		.name = "release_agent",
> -		.flags = CFTYPE_ONLY_ON_ROOT,
> +		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
>  		.read_seq_string = cgroup_release_agent_show,
>  		.write_string = cgroup_release_agent_write,
>  		.max_write_len = PATH_MAX,
> @@ -4422,39 +4437,98 @@ static int subgroups_count(struct cgroup *cgroup)
>  	return cgrps_count;
>  }
>  
> +static struct cftype *get_cftype_by_name(const char *name)
> +{
> +	struct cftype *cft;
> +	for (cft = files; cft->name[0] != '\0'; cft++) {
> +		if (!strcmp(cft->name, name))
> +			return cft;
> +	}
> +	return NULL;
> +}
> +
>  #ifdef CONFIG_VE
> -void cgroup_mark_ve_roots(struct ve_struct *ve)
> +int cgroup_mark_ve_roots(struct ve_struct *ve)
>  {
> -	struct cgroup *cgrp;
> +	struct cgroup *cgrp, *tmp;
>  	struct cgroupfs_root *root;
> +	int err = 0;
> +	struct cftype *cft;
> +	LIST_HEAD(pending);
>  
> +	cft = get_cftype_by_name("release_agent");
> +	BUG_ON(!cft);
> +
> +	mutex_lock(&cgroup_cft_mutex);
>  	mutex_lock(&cgroup_mutex);
>  	for_each_active_root(root) {
> -		cgrp = task_cgroup_from_root(ve->init_task, root);
> +		cgrp = css_cgroup_from_root(ve->root_css_set, root);
>  		rcu_assign_pointer(cgrp->ve_owner, ve);
>  		set_bit(CGRP_VE_ROOT, &cgrp->flags);
>  
> +		dget(cgrp->dentry);
> +		list_add_tail(&cgrp->cft_q_node, &pending);
>  		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
>  			link_ve_root_cpu_cgroup(cgrp);
>  	}
>  	mutex_unlock(&cgroup_mutex);
>  	synchronize_rcu();
> +	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
> +		struct inode *inode = cgrp->dentry->d_inode;
> +
> +		if (err) {
> +			list_del_init(&cgrp->cft_q_node);
> +			dput(cgrp->dentry);
> +			continue;
> +		}

How does this work?

If cgroup_add_file() set an error on a previous iteration, then don't we do a double dput() here?

> +
> +		mutex_lock(&inode->i_mutex);
> +		mutex_lock(&cgroup_mutex);
> +		if (!cgroup_is_removed(cgrp))
> +			err = cgroup_add_file(cgrp, NULL, cft);
> +		mutex_unlock(&cgroup_mutex);
> +		mutex_unlock(&inode->i_mutex);
> +
> +		list_del_init(&cgrp->cft_q_node);
> +		dput(cgrp->dentry);
> +	}
> +	mutex_unlock(&cgroup_cft_mutex);
> +
> +	if (err)
> +		cgroup_unmark_ve_roots(ve);
> +
> +	return err;
>  }
>  
>  void cgroup_unmark_ve_roots(struct ve_struct *ve)
>  {
> -	struct cgroup *cgrp;
> +	struct cgroup *cgrp, *tmp;
>  	struct cgroupfs_root *root;
> +	struct cftype *cft;
> +	LIST_HEAD(pending);
> +
> +	cft = get_cftype_by_name("release_agent");
>  
> +	mutex_lock(&cgroup_cft_mutex);
>  	mutex_lock(&cgroup_mutex);
>  	for_each_active_root(root) {
>  		cgrp = css_cgroup_from_root(ve->root_css_set, root);
> +		dget(cgrp->dentry);
> +		list_add_tail(&cgrp->cft_q_node, &pending);
> +	}
> +	mutex_unlock(&cgroup_mutex);
> +	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
> +		struct inode *inode = cgrp->dentry->d_inode;
> +		mutex_lock(&inode->i_mutex);
> +		mutex_lock(&cgroup_mutex);
> +		cgroup_rm_file(cgrp, cft);
>  		BUG_ON(!rcu_dereference_protected(cgrp->ve_owner,
>  				lockdep_is_held(&cgroup_mutex)));
>  		rcu_assign_pointer(cgrp->ve_owner, NULL);
>  		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
> +		mutex_unlock(&cgroup_mutex);
> +		mutex_unlock(&inode->i_mutex);
>  	}
> -	mutex_unlock(&cgroup_mutex);
> +	mutex_unlock(&cgroup_cft_mutex);
>  	/* ve_owner == NULL will be visible */
>  	synchronize_rcu();
>  
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index f03f665..8d78270 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -718,7 +718,9 @@ static int ve_start_container(struct ve_struct *ve)
>  	if (err < 0)
>  		goto err_iterate;
>  
> -	cgroup_mark_ve_roots(ve);
> +	err = cgroup_mark_ve_roots(ve);
> +	if (err)
> +		goto err_mark_ve;
>  
>  	ve->is_running = 1;
>  
> @@ -728,6 +730,8 @@ static int ve_start_container(struct ve_struct *ve)
>  
>  	return 0;
>  
> +err_mark_ve:
> +	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>  err_iterate:
>  	ve_workqueue_stop(ve);
>  err_workqueue:
>
Valeriy Vdovin July 31, 2020, 9:38 a.m.
On 31.07.2020 11:24, Kirill Tkhai wrote:
> On 28.07.2020 20:53, Valeriy Vdovin wrote:
>> Each container will now have access to it's own cgroup release_agent
>> file.
>> Creation:
>>    Normally all cgroup files are created during a call to cgroup_create
>>    by cgroup_populate_dir function. It creates or not creates all cgroup
>>    files once and they immediately become visible to userspace as
>>    filesystem objects.
>>    Due to specifics of container creation process, it is not possible to
>>    use the same code for 'release_agent' file creation. For VE to start
>>    operating, first a list of ordinary cgroups is being created for
>>    each subsystem, then the set of newly created cgroups are converted to
>>    "virtual roots", so at the time when cgroup_create is executed, there
>>    is no knowledge of wheather or not "release_agent" file should be
>>    created. This information only comes at "convertion" step which is
>>    'cgroup_mark_ve_roots' function. As the file is created dynamically
>>    in a live cgroup, a rather delicate locking sequence is present in
>>    the new code:
>>      - each new "virtual root" cgroup will have to add "release_agent" file,
>>        thus each cgroup's directory would need to be locked during
>>        the insertion time by cgroup->dentry->d_inode->i_mutex.
>>      - d_inode->i_mutex has an ordering dependency with cgroup_mutex
>>        (see cgroup_mount/cgroup_remount). They can not be locked in order
>>        {lock(cgroup_mutex), lock(inode->i_mutex)}.
>>      - to collect a list of cgroups, that need to become virtual we need
>>        cgroup_mutex lock to iterate active roots.
>>      - to overcome the above conflict we first need to collect a list of
>>        all virtual cgroups under cgroup_mutex lock, then release it and
>>        after that to insert "release_agent" to each root under
>>        inode->i_mutex lock.
>>      - to collect a list of cgroups on stack we utilize
>>        cgroup->cft_q_node, made specially for that purpose under it's own
>>        cgroup_cft_mutex.
>>
>> Destruction:
>>    Destruction is done in reverse from the above within
>>    cgroup_unmark_ve_root.
>>    After file destruction we must prevent further write operations to
>>    this file in case when someone has opened this file prior to VE
>>    and cgroup destruction. This is achieved by checking if cgroup
>>    in the argument to cgroup_file_write function has features of host
>>    or virtual root.
>>
>> https://jira.sw.ru/browse/PSBM-83887
>>
>> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
>> ---
>>   include/linux/cgroup.h |  2 +-
>>   include/linux/ve.h     |  2 +-
>>   kernel/cgroup.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++++----
>>   kernel/ve/ve.c         |  6 +++-
>>   4 files changed, 87 insertions(+), 9 deletions(-)
>>
>> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
>> index fc138c0..645c9fd 100644
>> --- a/include/linux/cgroup.h
>> +++ b/include/linux/cgroup.h
>> @@ -671,7 +671,7 @@ int cgroup_task_count(const struct cgroup *cgrp);
>>   void cgroup_release_agent(struct work_struct *work);
>>   
>>   #ifdef CONFIG_VE
>> -void cgroup_mark_ve_roots(struct ve_struct *ve);
>> +int cgroup_mark_ve_roots(struct ve_struct *ve);
>>   void cgroup_unmark_ve_roots(struct ve_struct *ve);
>>   struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
>>   #endif
>> diff --git a/include/linux/ve.h b/include/linux/ve.h
>> index b6662637..5bf275f 100644
>> --- a/include/linux/ve.h
>> +++ b/include/linux/ve.h
>> @@ -215,7 +215,7 @@ void do_update_load_avg_ve(void);
>>   void ve_add_to_release_list(struct cgroup *cgrp);
>>   void ve_rm_from_release_list(struct cgroup *cgrp);
>>   
>> -int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot,
>> +int ve_set_release_agent_path(struct cgroup *cgroot,
>>   	const char *release_agent);
>>   
>>   const char *ve_get_release_agent_path(struct cgroup *cgrp_root);
>> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
>> index 1d9c889..bb77804 100644
>> --- a/kernel/cgroup.c
>> +++ b/kernel/cgroup.c
>> @@ -2360,13 +2360,28 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
>>   	if (!cgroup_lock_live_group(cgrp))
>>   		return -ENODEV;
>>   
>> +	/*
>> +	 * Call to cgroup_get_local_root is a way to make sure
>> +	 * that cgrp in the argument is valid "virtual root"
>> +	 * cgroup. If it's not - this means that cgroup_unmark_ve_roots
>> +	 * has already cleared CGRP_VE_ROOT flag from this cgroup while
>> +	 * the file was still opened by other process and
>> +	 * that we shouldn't allow further writings via that file.
>> +	 */
>>   	root_cgrp = cgroup_get_local_root(cgrp);
>>   	BUG_ON(!root_cgrp);
>> +
>> +	if (root_cgrp != cgrp) {
>> +		ret = -EPERM;
>> +		goto out;
>> +	}
>> +
>>   	if (root_cgrp->ve_owner)
>>   		ret = ve_set_release_agent_path(root_cgrp, buffer);
>>   	else
>>   		return -ENODEV;
>>   
>> +out:
>>   	mutex_unlock(&cgroup_mutex);
>>   	return ret;
>>   }
>> @@ -4204,7 +4219,7 @@ static struct cftype files[] = {
>>   	},
>>   	{
>>   		.name = "release_agent",
>> -		.flags = CFTYPE_ONLY_ON_ROOT,
>> +		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
>>   		.read_seq_string = cgroup_release_agent_show,
>>   		.write_string = cgroup_release_agent_write,
>>   		.max_write_len = PATH_MAX,
>> @@ -4422,39 +4437,98 @@ static int subgroups_count(struct cgroup *cgroup)
>>   	return cgrps_count;
>>   }
>>   
>> +static struct cftype *get_cftype_by_name(const char *name)
>> +{
>> +	struct cftype *cft;
>> +	for (cft = files; cft->name[0] != '\0'; cft++) {
>> +		if (!strcmp(cft->name, name))
>> +			return cft;
>> +	}
>> +	return NULL;
>> +}
>> +
>>   #ifdef CONFIG_VE
>> -void cgroup_mark_ve_roots(struct ve_struct *ve)
>> +int cgroup_mark_ve_roots(struct ve_struct *ve)
>>   {
>> -	struct cgroup *cgrp;
>> +	struct cgroup *cgrp, *tmp;
>>   	struct cgroupfs_root *root;
>> +	int err = 0;
>> +	struct cftype *cft;
>> +	LIST_HEAD(pending);
>>   
>> +	cft = get_cftype_by_name("release_agent");
>> +	BUG_ON(!cft);
>> +
>> +	mutex_lock(&cgroup_cft_mutex);
>>   	mutex_lock(&cgroup_mutex);
>>   	for_each_active_root(root) {
>> -		cgrp = task_cgroup_from_root(ve->init_task, root);
>> +		cgrp = css_cgroup_from_root(ve->root_css_set, root);
>>   		rcu_assign_pointer(cgrp->ve_owner, ve);
>>   		set_bit(CGRP_VE_ROOT, &cgrp->flags);
>>   
>> +		dget(cgrp->dentry);
>> +		list_add_tail(&cgrp->cft_q_node, &pending);
>>   		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
>>   			link_ve_root_cpu_cgroup(cgrp);
>>   	}
>>   	mutex_unlock(&cgroup_mutex);
>>   	synchronize_rcu();
>> +	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
>> +		struct inode *inode = cgrp->dentry->d_inode;
>> +
>> +		if (err) {
>> +			list_del_init(&cgrp->cft_q_node);
>> +			dput(cgrp->dentry);
>> +			continue;
>> +		}
> How does this work?
>
> If cgroup_add_file() sets error on previous iteration, then here we do double dput()?

Here is how it works. Before marking VE_ROOT and creating the release agent 
files, we need to know the list of cgroups that we will need to act upon. We 
can only find it under the cgroup_mutex lock. But we have a restriction: 
to create a file we need to lock inode->i_mutex, and the correct lock 
ordering is the opposite:

1. lock(inode->i_mutex), 2. lock(cgroup_mutex)

Because of that we split the whole function into two loops. First we take 
cgroup_mutex, find all the cgroups under it and put them in a list on the 
stack (struct list_head pending). Once the list is ready, we release 
cgroup_mutex and go to the second step. In the second step we iterate the 
list of cgroups and call cgroup_add_file for each item with the correct 
lock ordering. The consistency of the pending cgroups list is ensured by 
cgroup_cft_mutex, which is held across both loops.

struct cgroup has a list node field made specifically for this kind of 
situation, called "cft_q_node"; see cgroup_cfts_commit for more details.

In the first loop we collect the list on the stack and take a dget 
reference on each item. In the second loop we have to call dput the same 
number of times.

In the second loop the block "if (err) {...dput...continue;}" means: if 
any cgroup_add_file call has returned an error, skip any further file 
creation, but still clean up cgroup->cft_q_node and call dput.

I don't see a double dput per item here.
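
Annotated, the per-item reference flow in that loop is (sketch of the hunk 
quoted in this thread, comments added):

	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
		struct inode *inode = cgrp->dentry->d_inode;

		if (err) {
			/* a previous item failed: drop this item's single
			 * phase-1 reference and skip file creation */
			list_del_init(&cgrp->cft_q_node);
			dput(cgrp->dentry);
			continue;	/* the dput below is not reached */
		}

		mutex_lock(&inode->i_mutex);
		mutex_lock(&cgroup_mutex);
		if (!cgroup_is_removed(cgrp))
			err = cgroup_add_file(cgrp, NULL, cft);
		mutex_unlock(&cgroup_mutex);
		mutex_unlock(&inode->i_mutex);

		list_del_init(&cgrp->cft_q_node);
		dput(cgrp->dentry);	/* the only dput this item can reach */
	}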

>> +
>> +		mutex_lock(&inode->i_mutex);
>> +		mutex_lock(&cgroup_mutex);
>> +		if (!cgroup_is_removed(cgrp))
>> +			err = cgroup_add_file(cgrp, NULL, cft);
>> +		mutex_unlock(&cgroup_mutex);
>> +		mutex_unlock(&inode->i_mutex);
>> +
>> +		list_del_init(&cgrp->cft_q_node);
>> +		dput(cgrp->dentry);
>> +	}
>> +	mutex_unlock(&cgroup_cft_mutex);
>> +
>> +	if (err)
>> +		cgroup_unmark_ve_roots(ve);
>> +
>> +	return err;
>>   }
>>   
>>   void cgroup_unmark_ve_roots(struct ve_struct *ve)
>>   {
>> -	struct cgroup *cgrp;
>> +	struct cgroup *cgrp, *tmp;
>>   	struct cgroupfs_root *root;
>> +	struct cftype *cft;
>> +	LIST_HEAD(pending);
>> +
>> +	cft = get_cftype_by_name("release_agent");
>>   
>> +	mutex_lock(&cgroup_cft_mutex);
>>   	mutex_lock(&cgroup_mutex);
>>   	for_each_active_root(root) {
>>   		cgrp = css_cgroup_from_root(ve->root_css_set, root);
>> +		dget(cgrp->dentry);
>> +		list_add_tail(&cgrp->cft_q_node, &pending);
>> +	}
>> +	mutex_unlock(&cgroup_mutex);
>> +	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
>> +		struct inode *inode = cgrp->dentry->d_inode;
>> +		mutex_lock(&inode->i_mutex);
>> +		mutex_lock(&cgroup_mutex);
>> +		cgroup_rm_file(cgrp, cft);
>>   		BUG_ON(!rcu_dereference_protected(cgrp->ve_owner,
>>   				lockdep_is_held(&cgroup_mutex)));
>>   		rcu_assign_pointer(cgrp->ve_owner, NULL);
>>   		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
>> +		mutex_unlock(&cgroup_mutex);
>> +		mutex_unlock(&inode->i_mutex);
>>   	}
>> -	mutex_unlock(&cgroup_mutex);
>> +	mutex_unlock(&cgroup_cft_mutex);
>>   	/* ve_owner == NULL will be visible */
>>   	synchronize_rcu();
>>   
>> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
>> index f03f665..8d78270 100644
>> --- a/kernel/ve/ve.c
>> +++ b/kernel/ve/ve.c
>> @@ -718,7 +718,9 @@ static int ve_start_container(struct ve_struct *ve)
>>   	if (err < 0)
>>   		goto err_iterate;
>>   
>> -	cgroup_mark_ve_roots(ve);
>> +	err = cgroup_mark_ve_roots(ve);
>> +	if (err)
>> +		goto err_mark_ve;
>>   
>>   	ve->is_running = 1;
>>   
>> @@ -728,6 +730,8 @@ static int ve_start_container(struct ve_struct *ve)
>>   
>>   	return 0;
>>   
>> +err_mark_ve:
>> +	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>>   err_iterate:
>>   	ve_workqueue_stop(ve);
>>   err_workqueue:
>>