[2/2,v3] ve/cgroup: Added pointers to owning ve to root cgroups

Submitted by Valeriy Vdovin on March 24, 2020, 10:46 a.m.

Details

Message ID 1585046794-179675-3-git-send-email-valeriy.vdovin@virtuozzo.com
State New
Series "Make release_agent per-cgroup property. Run release_agent in proper ve."

Commit Message

Valeriy Vdovin March 24, 2020, 10:46 a.m.
Follow-up patch to the per-cgroup release_agent property. release_agent
notifications are spawned from a special kthread running under ve0, but
the newly spawned tasks should run under their own ve context. An easy
way to pass this information to the spawning thread is to add a
've_owner' field to the root cgroup. At notification time any cgroup
can be walked upwards to its root, and ve_owner can be read from there.
ve_owner is initialized at container start, when all cgroup ve roots
are known for free. For ve0 roots we do not need ve_owner set, because
we can always get a pointer to ve0. The only place where we can detach
the ve_owner knowledge from the ve root cgroups is at cgroup_exit for
ve->init_task, because that is the last time ve->init_task is still
related to its ve root cgroups (see the cgroup_exit comments).
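
In other words, the lookup at notification time boils down to the sketch
below (illustrative only; cgroup_ve_for_notification is a hypothetical
helper name, the patch open-codes this logic in cgroup_release_agent()):

	/*
	 * Illustrative sketch of the lookup described above; the
	 * helper name is hypothetical, the patch open-codes this
	 * in cgroup_release_agent().
	 */
	static struct ve_struct *cgroup_ve_for_notification(struct cgroup *cgrp)
	{
		struct cgroup *root_cgrp = cgroup_get_local_root(cgrp);

		if (root_cgrp->parent == NULL)
			return &ve0;	/* host root cgroup: always ve0 */

		/* Set at container start; NULL once the container has stopped. */
		return root_cgrp->ve_owner;
	}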

https://jira.sw.ru/browse/PSBM-83887
Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
---
 include/linux/cgroup.h |  3 +++
 kernel/cgroup.c        | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)


diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5e289fc..34fc017 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -291,6 +291,9 @@  struct cgroup {
 	struct simple_xattrs xattrs;
 	u64 subgroups_limit;
 
+	/* ve_owner, responsible for running release agent. */
+	struct ve_struct *ve_owner;
+
 	/*
 	 * Per-cgroup path to release agent binary for release
 	 * notifications.
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d1d4605..2500095 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4352,6 +4352,7 @@  int cgroup_mark_ve_root(struct ve_struct *ve)
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
 		cgrp = task_cgroup_from_root(ve->init_task, root);
+		cgrp->ve_owner = ve;
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
 		err = cgroup_add_file_on_mark_ve(cgrp);
 		if (err)
@@ -4363,6 +4364,20 @@  int cgroup_mark_ve_root(struct ve_struct *ve)
 	return err;
 }
 
+static void cgroup_unbind_roots_from_ve(struct ve_struct *ve)
+{
+	struct cgroup *cgrp;
+	struct cgroupfs_root *root;
+
+	mutex_lock(&cgroup_mutex);
+	for_each_active_root(root) {
+		cgrp = task_cgroup_from_root(ve->init_task, root);
+		BUG_ON(!cgrp->ve_owner);
+		cgrp->ve_owner = NULL;
+	}
+	mutex_unlock(&cgroup_mutex);
+}
+
 struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
 {
 	struct cgroup *ve_root = NULL;
@@ -5401,6 +5416,14 @@  void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	int i;
 
 	/*
+	 * Detect whether the task in question is ve->init_task;
+	 * if so, unbind all cgroups that want to be released under
+	 * this ve.
+	 */
+	if (!ve_is_super(tsk->task_ve) && tsk == tsk->task_ve->init_task)
+		cgroup_unbind_roots_from_ve(tsk->task_ve);
+
+	/*
 	 * Unlink from the css_set task list if necessary.
 	 * Optimistically check cg_list before taking
 	 * css_set_lock
@@ -5493,6 +5516,7 @@  static void cgroup_release_agent(struct work_struct *work)
 	raw_spin_lock(&release_list_lock);
 	while (!list_empty(&release_list)) {
 		char *argv[3], *envp[3];
+		struct ve_struct *ve = NULL;
 		int i, err;
 		const char *release_agent;
 		char *pathbuf = NULL, *agentbuf = NULL;
@@ -5507,8 +5531,24 @@  static void cgroup_release_agent(struct work_struct *work)
 			goto continue_free;
 		if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, true) < 0)
 			goto continue_free;
+
+		/*
+		 * Deduce under which VE the notifier binary is going
+		 * to be spawned. A non-root VE has its own VE-local
+		 * root cgroup, marked with the VE_ROOT flag; its
+		 * non-NULL ve_owner is set during cgroup_mark_ve_root.
+		 * The ve0 root cgroup does not need the ve_owner
+		 * field, because its ve is ve0, and it is the only
+		 * root cgroup that has no parent.
+		 */
 		root_cgrp = cgroup_get_local_root(cgrp);
+		if (root_cgrp->parent == NULL)
+			ve = &ve0;
+		else
+			ve = root_cgrp->ve_owner;
+		if (!ve)
+			goto continue_free;
 		rcu_read_lock();
+
 		release_agent = cgroup_release_agent_path(root_cgrp);
 		if (release_agent)
 			agentbuf = kstrdup(release_agent, GFP_KERNEL);
@@ -5531,7 +5571,12 @@  static void cgroup_release_agent(struct work_struct *work)
 		 * since the exec could involve hitting disk and hence
 		 * be a slow process */
 		mutex_unlock(&cgroup_mutex);
+#ifdef CONFIG_VE
+		err = call_usermodehelper_fns_ve(ve, argv[0], argv,
+			envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
+#else
 		err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+#endif
 		if (err < 0)
 			pr_warn_ratelimited("cgroup release_agent "
 					    "%s %s failed: %d\n",

Comments

Kirill Tkhai March 24, 2020, 11:16 a.m.
On 24.03.2020 13:46, Valeriy Vdovin wrote:
> Follow-up patch to the per-cgroup release_agent property. release_agent
> notifications are spawned from a special kthread running under ve0, but
> the newly spawned tasks should run under their own ve context. An easy
> way to pass this information to the spawning thread is to add a
> 've_owner' field to the root cgroup. At notification time any cgroup
> can be walked upwards to its root, and ve_owner can be read from there.
> ve_owner is initialized at container start, when all cgroup ve roots
> are known for free. For ve0 roots we do not need ve_owner set, because
> we can always get a pointer to ve0. The only place where we can detach
> the ve_owner knowledge from the ve root cgroups is at cgroup_exit for
> ve->init_task, because that is the last time ve->init_task is still
> related to its ve root cgroups (see the cgroup_exit comments).
> 
> https://jira.sw.ru/browse/PSBM-83887
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
> ---
>  include/linux/cgroup.h |  3 +++
>  kernel/cgroup.c        | 45 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 48 insertions(+)
> 
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 5e289fc..34fc017 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -291,6 +291,9 @@ struct cgroup {
>  	struct simple_xattrs xattrs;
>  	u64 subgroups_limit;
>  
> +	/* ve_owner, responsible for running release agent. */
> +	struct ve_struct *ve_owner;
> +
>  	/*
>  	 * Per-cgroup path to release agent binary for release
>  	 * notifications.
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index d1d4605..2500095 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -4352,6 +4352,7 @@ int cgroup_mark_ve_root(struct ve_struct *ve)
>  	mutex_lock(&cgroup_mutex);
>  	for_each_active_root(root) {
>  		cgrp = task_cgroup_from_root(ve->init_task, root);
> +		cgrp->ve_owner = ve;
>  		set_bit(CGRP_VE_ROOT, &cgrp->flags);
>  		err = cgroup_add_file_on_mark_ve(cgrp);
>  		if (err)
> @@ -4363,6 +4364,20 @@ int cgroup_mark_ve_root(struct ve_struct *ve)
>  	return err;
>  }
>  
> +static void cgroup_unbind_roots_from_ve(struct ve_struct *ve)
> +{
> +	struct cgroup *cgrp;
> +	struct cgroupfs_root *root;
> +
> +	mutex_lock(&cgroup_mutex);
> +	for_each_active_root(root) {
> +		cgrp = task_cgroup_from_root(ve->init_task, root);
> +		BUG_ON(!cgrp->ve_owner);
> +		cgrp->ve_owner = NULL;
> +	}
> +	mutex_unlock(&cgroup_mutex);
> +}
> +
>  struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
>  {
>  	struct cgroup *ve_root = NULL;
> @@ -5401,6 +5416,14 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
>  	int i;
>  
>  	/*
> +	 * Detect whether the task in question is ve->init_task;
> +	 * if so, unbind all cgroups that want to be released under
> +	 * this ve.
> +	 */
> +	if (!ve_is_super(tsk->task_ve) && tsk == tsk->task_ve->init_task)
> +		cgroup_unbind_roots_from_ve(tsk->task_ve);
> +
> +	/*
>  	 * Unlink from the css_set task list if necessary.
>  	 * Optimistically check cg_list before taking
>  	 * css_set_lock
> @@ -5493,6 +5516,7 @@ static void cgroup_release_agent(struct work_struct *work)
>  	raw_spin_lock(&release_list_lock);
>  	while (!list_empty(&release_list)) {
>  		char *argv[3], *envp[3];
> +		struct ve_struct *ve = NULL;
>  		int i, err;
>  		const char *release_agent;
>  		char *pathbuf = NULL, *agentbuf = NULL;
> @@ -5507,8 +5531,24 @@ static void cgroup_release_agent(struct work_struct *work)
>  			goto continue_free;
>  		if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, true) < 0)
>  			goto continue_free;
> +
> +		/*
> +		 * Deduce under which VE the notifier binary is going
> +		 * to be spawned. A non-root VE has its own VE-local
> +		 * root cgroup, marked with the VE_ROOT flag; its
> +		 * non-NULL ve_owner is set during cgroup_mark_ve_root.
> +		 * The ve0 root cgroup does not need the ve_owner
> +		 * field, because its ve is ve0, and it is the only
> +		 * root cgroup that has no parent.
> +		 */
>  		root_cgrp = cgroup_get_local_root(cgrp);
> +		if (root_cgrp->parent == NULL)
> +			ve = &ve0;
> +		else
> +			ve = root_cgrp->ve_owner;
> +		if (!ve)
> +			goto continue_free;
>  		rcu_read_lock();
> +
>  		release_agent = cgroup_release_agent_path(root_cgrp);
>  		if (release_agent)
>  			agentbuf = kstrdup(release_agent, GFP_KERNEL);
> @@ -5531,7 +5571,12 @@ static void cgroup_release_agent(struct work_struct *work)
>  		 * since the exec could involve hitting disk and hence
>  		 * be a slow process */
>  		mutex_unlock(&cgroup_mutex);
> +#ifdef CONFIG_VE
> +		err = call_usermodehelper_fns_ve(ve, argv[0], argv,
> +			envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
> +#else
>  		err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
> +#endif
>  		if (err < 0)
>  			pr_warn_ratelimited("cgroup release_agent "
>  					    "%s %s failed: %d\n",

Is there a race like the below?

[cpu 0]                                                    [cpu1]
cgroup_release_agent()
  mutex_lock(&cgroup_mutex)
  ve = root_cgrp->ve_owner
  mutex_unlock(&cgroup_mutex)

	                                                   cgroup_unbind_roots_from_ve(ve)
                                                             cgroup->ve_owner = NULL;
                                                           css_put(ve)  <--- final put


  call_usermodehelper_fns_ve(ve)
    get_ve(ve)   <--- gets zero counter
Valeriy Vdovin March 24, 2020, 12:13 p.m.
Yes, it is possible in theory. I'll patch this place. Thanks for the observation.
Kirill Tkhai March 24, 2020, 1:05 p.m.
On 24.03.2020 15:13, Valeriy Vdovin wrote:
> Yes, it is possible in theory. I'll patch this place. Thanks for the observation.

AFAIS, a simple get_ve() under the mutex should be enough.
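
A minimal sketch of that suggestion, assuming get_ve()/put_ve() are the VE
reference helpers mentioned in the race above (illustrative, not the final
patch): the reference is taken in cgroup_release_agent() while cgroup_mutex
is still held, so cgroup_unbind_roots_from_ve() cannot clear ve_owner and
drop the last reference in between.

	/* Still under cgroup_mutex: ve_owner cannot be cleared concurrently. */
	root_cgrp = cgroup_get_local_root(cgrp);
	if (root_cgrp->parent == NULL)
		ve = &ve0;
	else
		ve = root_cgrp->ve_owner;
	if (!ve)
		goto continue_free;
	get_ve(ve);	/* pin the VE before cgroup_mutex is dropped */

	/* ... build argv/envp from the release_agent path and cgroup path ... */

	mutex_unlock(&cgroup_mutex);
	err = call_usermodehelper_fns_ve(ve, argv[0], argv, envp,
					 UMH_WAIT_EXEC, NULL, NULL, NULL);
	put_ve(ve);	/* drop the reference taken under the mutex */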
