[RHEL7,v20,06/14] ve/cgroup: unmark ve-root cgroups at container stop

Submitted by Valeriy Vdovin on June 25, 2020, 2:29 p.m.

Details

Message ID 1593095386-899724-7-git-send-email-valeriy.vdovin@virtuozzo.com
State New
Series "Make release_agent per-cgroup property. Run release_agent in proper ve."
Headers show

Commit Message

Valeriy Vdovin June 25, 2020, 2:29 p.m.
Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 include/linux/cgroup.h |  1 +
 kernel/cgroup.c        | 38 ++++++++++++++++++++++++++++++++++++++
 kernel/ve/ve.c         |  2 ++
 3 files changed, 41 insertions(+)

Patch hide | download patch | download mbox

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ac60aaed..6e2c206 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -671,6 +671,7 @@  int cgroup_task_count(const struct cgroup *cgrp);
 
 #ifdef CONFIG_VE
 void cgroup_mark_ve_roots(struct ve_struct *ve);
+void cgroup_unmark_ve_roots(struct ve_struct *ve);
 #endif
 
 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ce576c5..6e3871a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -637,6 +637,31 @@  static struct css_set *find_css_set(
 }
 
 /*
+ * Walk each cgroup link of a given css_set and find a cgroup that
+ * is the child of cgroupfs_root in argument.
+ */
+static struct cgroup *css_cgroup_from_root(struct css_set *css_set,
+					    struct cgroupfs_root *root)
+{
+	struct cgroup *res = NULL;
+	struct cg_cgroup_link *link;
+
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
+	read_lock(&css_set_lock);
+
+	list_for_each_entry(link, &css_set->cg_links, cg_link_list) {
+		struct cgroup *c = link->cgrp;
+		if (c->root == root) {
+			res = c;
+			break;
+		}
+	}
+	read_unlock(&css_set_lock);
+	BUG_ON(!res);
+	return res;
+}
+
+/*
  * Return the cgroup for "task" from the given hierarchy. Must be
  * called with cgroup_mutex held.
  */
@@ -4329,6 +4354,19 @@  void cgroup_mark_ve_roots(struct ve_struct *ve)
 	mutex_unlock(&cgroup_mutex);
 }
 
+void cgroup_unmark_ve_roots(struct ve_struct *ve)
+{
+	struct cgroup *cgrp;
+	struct cgroupfs_root *root;
+
+	mutex_lock(&cgroup_mutex);
+	for_each_active_root(root) {
+		cgrp = css_cgroup_from_root(ve->root_css_set, root);
+		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
+	}
+	mutex_unlock(&cgroup_mutex);
+}
+
 struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
 {
 	struct cgroup *ve_root = NULL;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 73cfee6..711050c 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -623,6 +623,8 @@  void ve_exit_ns(struct pid_namespace *pid_ns)
 	if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
 		return;
 
+	cgroup_unmark_ve_roots(ve);
+
 	ve_workqueue_stop(ve);
 
 	/*

Comments

Kirill Tkhai July 21, 2020, 1:52 p.m.
On 25.06.2020 17:29, Valeriy Vdovin wrote:
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
> Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> ---
>  include/linux/cgroup.h |  1 +
>  kernel/cgroup.c        | 38 ++++++++++++++++++++++++++++++++++++++
>  kernel/ve/ve.c         |  2 ++
>  3 files changed, 41 insertions(+)
> 
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index ac60aaed..6e2c206 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -671,6 +671,7 @@ int cgroup_task_count(const struct cgroup *cgrp);
>  
>  #ifdef CONFIG_VE
>  void cgroup_mark_ve_roots(struct ve_struct *ve);
> +void cgroup_unmark_ve_roots(struct ve_struct *ve);
>  #endif
>  
>  /*
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index ce576c5..6e3871a 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -637,6 +637,31 @@ static struct css_set *find_css_set(
>  }
>  
>  /*
> + * Walk each cgroup link of a given css_set and find a cgroup that
> + * is the child of cgroupfs_root in argument.
> + */
> +static struct cgroup *css_cgroup_from_root(struct css_set *css_set,
> +					    struct cgroupfs_root *root)
> +{
> +	struct cgroup *res = NULL;
> +	struct cg_cgroup_link *link;
> +
> +	BUG_ON(!mutex_is_locked(&cgroup_mutex));
> +	read_lock(&css_set_lock);
> +
> +	list_for_each_entry(link, &css_set->cg_links, cg_link_list) {
> +		struct cgroup *c = link->cgrp;
> +		if (c->root == root) {
> +			res = c;
> +			break;
> +		}
> +	}
> +	read_unlock(&css_set_lock);
> +	BUG_ON(!res);
> +	return res;
> +}
> +
> +/*
>   * Return the cgroup for "task" from the given hierarchy. Must be
>   * called with cgroup_mutex held.
>   */
> @@ -4329,6 +4354,19 @@ void cgroup_mark_ve_roots(struct ve_struct *ve)
>  	mutex_unlock(&cgroup_mutex);
>  }
>  
> +void cgroup_unmark_ve_roots(struct ve_struct *ve)
> +{
> +	struct cgroup *cgrp;
> +	struct cgroupfs_root *root;
> +
> +	mutex_lock(&cgroup_mutex);
> +	for_each_active_root(root) {
> +		cgrp = css_cgroup_from_root(ve->root_css_set, root);
> +		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
> +	}
> +	mutex_unlock(&cgroup_mutex);
> +}
> +
>  struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
>  {
>  	struct cgroup *ve_root = NULL;
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 73cfee6..711050c 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -623,6 +623,8 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
>  	if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
>  		return;
>  
> +	cgroup_unmark_ve_roots(ve);

Is there a problem that ve workqueue works will run after we unmark roots?
Maybe we should call this cgroup_unmark_ve_roots() after ve_workqueue_stop()?

> +
>  	ve_workqueue_stop(ve);
>  
>  	/*
>
Valeriy Vdovin July 27, 2020, 9:10 a.m.

Kirill Tkhai July 27, 2020, 10:18 a.m.
On 27.07.2020 12:10, Valeriy Vdovin wrote:
> 
> 
> ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>> From: Kirill Tkhai <ktkhai@virtuozzo.com>
>> Sent: Tuesday, July 21, 2020 4:52 PM
>> To: Valeriy Vdovin <Valeriy.Vdovin@virtuozzo.com>; devel@openvz.org > <devel@openvz.org>
>> Cc: Valeriy Vdovin <valeriy.vdovin@virtuozz.com>
>> Subject: Re: [PATCH RHEL7 v20 06/14] ve/cgroup: unmark ve-root cgroups at container stop
>>
>> On 25.06.2020 17:29, Valeriy Vdovin wrote:
>> > Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
>> > Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>> > ---
>> >  include/linux/cgroup.h |  1 +
>> >  kernel/cgroup.c        | 38 ++++++++++++++++++++++++++++++++++++++
>> >  kernel/ve/ve.c         |  2 ++
>> >  3 files changed, 41 insertions(+)
>> >
>> > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
>> > index ac60aaed..6e2c206 100644
>> > --- a/include/linux/cgroup.h
>> > +++ b/include/linux/cgroup.h
>> > @@ -671,6 +671,7 @@ int cgroup_task_count(const struct cgroup *cgrp);
>> >
>> >  #ifdef CONFIG_VE
>> >  void cgroup_mark_ve_roots(struct ve_struct *ve);
>> > +void cgroup_unmark_ve_roots(struct ve_struct *ve);
>> >  #endif
>> >
>> >  /*
>> > diff --git a/kernel/cgroup.c b/kernel/cgroup.c
>> > index ce576c5..6e3871a 100644
>> > --- a/kernel/cgroup.c
>> > +++ b/kernel/cgroup.c
>> > @@ -637,6 +637,31 @@ static struct css_set *find_css_set(
>> >  }
>> >
>> >  /*
>> > + * Walk each cgroup link of a given css_set and find a cgroup that
>> > + * is the child of cgroupfs_root in argument.
>> > + */
>> > +static struct cgroup *css_cgroup_from_root(struct css_set *css_set,
>> > +                                         struct cgroupfs_root *root)
>> > +{
>> > +     struct cgroup *res = NULL;
>> > +     struct cg_cgroup_link *link;
>> > +
>> > +     BUG_ON(!mutex_is_locked(&cgroup_mutex));
>> > +     read_lock(&css_set_lock);
>> > +
>> > +     list_for_each_entry(link, &css_set->cg_links, cg_link_list) {
>> > +             struct cgroup *c = link->cgrp;
>> > +             if (c->root == root) {
>> > +                     res = c;
>> > +                     break;
>> > +             }
>> > +     }
>> > +     read_unlock(&css_set_lock);
>> > +     BUG_ON(!res);
>> > +     return res;
>> > +}
>> > +
>> > +/*
>> >   * Return the cgroup for "task" from the given hierarchy. Must be
>> >   * called with cgroup_mutex held.
>> >   */
>> > @@ -4329,6 +4354,19 @@ void cgroup_mark_ve_roots(struct ve_struct *ve)
>> >        mutex_unlock(&cgroup_mutex);
>> >  }
>> >
>> > +void cgroup_unmark_ve_roots(struct ve_struct *ve)
>> > +{
>> > +     struct cgroup *cgrp;
>> > +     struct cgroupfs_root *root;
>> > +
>> > +     mutex_lock(&cgroup_mutex);
>> > +     for_each_active_root(root) {
>> > +             cgrp = css_cgroup_from_root(ve->root_css_set, root);
>> > +             clear_bit(CGRP_VE_ROOT, &cgrp->flags);
>> > +     }  
>> > +     mutex_unlock(&cgroup_mutex);
>> > +}
>> > +
>> >  struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
>> >  {
>> >        struct cgroup *ve_root = NULL;
>> > diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
>> > index 73cfee6..711050c 100644
>> > --- a/kernel/ve/ve.c
>> > +++ b/kernel/ve/ve.c
>> > @@ -623,6 +623,8 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
>> >        if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
>> >                return;
>> >  
>> > +     cgroup_unmark_ve_roots(ve);
>>
>> Is there a problem that ve workqueue works will run after we unmark roots?
>> Maybe we should call this cgroup_unmark_ve_roots() after ve_workqueue_stop()?
> 
> When a cgroup becomes empty, it is decided to which workqueue it should be put to await
> release. Thus, when we unmark the ve root, we prevent any new empty cgroups from entering this
> workqueue. After that, we are safe to stop the workqueue by waiting for all the current jobs to
> complete.

But is it OK that workqueue is executing its works after root is cleared? Won't there some
security leak? If it's safe, why?

> 
>> > +
>> >        ve_workqueue_stop(ve);
>> >  
>> >        /*  
>> >
>
Valeriy Vdovin July 27, 2020, 12:49 p.m.
On 27.07.2020 13:18, Kirill Tkhai wrote:
> On 27.07.2020 12:10, Valeriy Vdovin wrote:
>>
>> ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>>> From: Kirill Tkhai <ktkhai@virtuozzo.com>
>>> Sent: Tuesday, July 21, 2020 4:52 PM
>>> To: Valeriy Vdovin <Valeriy.Vdovin@virtuozzo.com>; devel@openvz.org > <devel@openvz.org>
>>> Cc: Valeriy Vdovin <valeriy.vdovin@virtuozz.com>
>>> Subject: Re: [PATCH RHEL7 v20 06/14] ve/cgroup: unmark ve-root cgroups at container stop
>>>
>>> On 25.06.2020 17:29, Valeriy Vdovin wrote:
>>>> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
>>>> Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>>>> ---
>>>>   include/linux/cgroup.h |  1 +
>>>>   kernel/cgroup.c        | 38 ++++++++++++++++++++++++++++++++++++++
>>>>   kernel/ve/ve.c         |  2 ++
>>>>   3 files changed, 41 insertions(+)
>>>>
>>>> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
>>>> index ac60aaed..6e2c206 100644
>>>> --- a/include/linux/cgroup.h
>>>> +++ b/include/linux/cgroup.h
>>>> @@ -671,6 +671,7 @@ int cgroup_task_count(const struct cgroup *cgrp);
>>>>
>>>>   #ifdef CONFIG_VE
>>>>   void cgroup_mark_ve_roots(struct ve_struct *ve);
>>>> +void cgroup_unmark_ve_roots(struct ve_struct *ve);
>>>>   #endif
>>>>
>>>>   /*
>>>> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
>>>> index ce576c5..6e3871a 100644
>>>> --- a/kernel/cgroup.c
>>>> +++ b/kernel/cgroup.c
>>>> @@ -637,6 +637,31 @@ static struct css_set *find_css_set(
>>>>   }
>>>>
>>>>   /*
>>>> + * Walk each cgroup link of a given css_set and find a cgroup that
>>>> + * is the child of cgroupfs_root in argument.
>>>> + */
>>>> +static struct cgroup *css_cgroup_from_root(struct css_set *css_set,
>>>> +                                         struct cgroupfs_root *root)
>>>> +{
>>>> +     struct cgroup *res = NULL;
>>>> +     struct cg_cgroup_link *link;
>>>> +
>>>> +     BUG_ON(!mutex_is_locked(&cgroup_mutex));
>>>> +     read_lock(&css_set_lock);
>>>> +
>>>> +     list_for_each_entry(link, &css_set->cg_links, cg_link_list) {
>>>> +             struct cgroup *c = link->cgrp;
>>>> +             if (c->root == root) {
>>>> +                     res = c;
>>>> +                     break;
>>>> +             }
>>>> +     }
>>>> +     read_unlock(&css_set_lock);
>>>> +     BUG_ON(!res);
>>>> +     return res;
>>>> +}
>>>> +
>>>> +/*
>>>>    * Return the cgroup for "task" from the given hierarchy. Must be
>>>>    * called with cgroup_mutex held.
>>>>    */
>>>> @@ -4329,6 +4354,19 @@ void cgroup_mark_ve_roots(struct ve_struct *ve)
>>>>         mutex_unlock(&cgroup_mutex);
>>>>   }
>>>>
>>>> +void cgroup_unmark_ve_roots(struct ve_struct *ve)
>>>> +{
>>>> +     struct cgroup *cgrp;
>>>> +     struct cgroupfs_root *root;
>>>> +
>>>> +     mutex_lock(&cgroup_mutex);
>>>> +     for_each_active_root(root) {
>>>> +             cgrp = css_cgroup_from_root(ve->root_css_set, root);
>>>> +             clear_bit(CGRP_VE_ROOT, &cgrp->flags);
>>>> +     }
>>>> +     mutex_unlock(&cgroup_mutex);
>>>> +}
>>>> +
>>>>   struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
>>>>   {
>>>>         struct cgroup *ve_root = NULL;
>>>> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
>>>> index 73cfee6..711050c 100644
>>>> --- a/kernel/ve/ve.c
>>>> +++ b/kernel/ve/ve.c
>>>> @@ -623,6 +623,8 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
>>>>         if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
>>>>                 return;
>>>>   
>>>> +     cgroup_unmark_ve_roots(ve);
>>> Is there a problem that ve workqueue works will run after we unmark roots?
>>> Maybe we should call this cgroup_unmark_ve_roots() after ve_workqueue_stop()?
>> When a cgroup gets empty it's decided to which workqueue it should be put to await for
>> release. Thus when we unmark ve root, we prevent any new empty cgroups from entering this
>> workqueue. After that we are safe to stop the workqueue by waiting for all the current jobs to
>> complete.
> But is it OK that workqueue is executing its works after root is cleared? Won't there some
> security leak? If it's safe, why?

There is more than one reason.

At this point in the series, the release_agent logic still uses ve0 and 
call_usermodehelper, so right here the ordering affects nothing. Later, 
when it actually affects something — in the patch "ve/cgroup: Implemented 
logic that uses 'cgroup->ve_owner' to run release_agent notifications." — 
we do the check for this special case:

if (rcu_access_pointer(root_cgrp->ve_owner) != ve) {
   rcu_read_unlock();
   goto continue_free;
}

The per-ve release_agent workqueue always knows its "ve" and is able to 
check in this way that the virtual root is sane.

There is one more reason, which is less direct. By the time 
cgroup_unmark_ve_roots is called, ve->init_task already has the PF_EXITING 
flag set. This means that it would not be possible to launch a usermode 
task even if the above check were not present, because the per-ve workqueue 
will try to run it from the ve, and the ve->init_task & PF_EXITING check 
will evaluate to true.


>>>> +
>>>>         ve_workqueue_stop(ve);
>>>>   
>>>>         /*
>>>>