[v13,10/12] ve/cgroup: private per-cgroup-root data container

Submitted by Valeriy Vdovin on April 20, 2020, 12:13 p.m.

Details

Message ID 1587384811-813252-11-git-send-email-valeriy.vdovin@virtuozzo.com
State New
Series "Make release_agent per-cgroup property. Run release_agent in proper ve."

Commit Message

Valeriy Vdovin April 20, 2020, 12:13 p.m.
Since each ve is internally attached to a particular css_set via its
init_task, it is useful to have a container for parameters that are
common to each cgroup subsystem hierarchy rooted at the ve's virtual
root.

Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
---
 include/linux/ve.h |  7 +++++
 kernel/ve/ve.c     | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)


diff --git a/include/linux/ve.h b/include/linux/ve.h
index 787a96f..94a07df 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -137,6 +137,13 @@  struct ve_struct {
 	struct work_struct	release_agent_work;
 
 	/*
+	 * List of data, private for each root cgroup in
+	 * ve's css_set.
+	 */
+	struct list_head	per_cgroot_list;
+	struct raw_spinlock	per_cgroot_list_lock;
+
+	/*
 	 * All tasks, that belong to this ve, live
 	 * in cgroups, that are children to cgroups
 	 * that form this css_set.
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index eb266ae..a30343f 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -45,6 +45,14 @@ 
 #include <linux/vziptable_defs.h>
 #include <net/rtnetlink.h>
 
+struct per_cgroot_data {
+	struct list_head list;
+	/*
+	 * data is related to this cgroup
+	 */
+	struct cgroup *cgroot;
+};
+
 extern struct kmapset_set sysfs_ve_perms_set;
 
 static struct kmem_cache *ve_cachep;
@@ -92,6 +100,9 @@  struct ve_struct ve0 = {
 	.release_list		= LIST_HEAD_INIT(ve0.release_list),
 	.release_agent_work	= __WORK_INITIALIZER(ve0.release_agent_work,
 					cgroup_release_agent),
+	.per_cgroot_list	= LIST_HEAD_INIT(ve0.per_cgroot_list),
+	.per_cgroot_list_lock	= __RAW_SPIN_LOCK_UNLOCKED(
+					ve0.per_cgroot_list_lock),
 };
 EXPORT_SYMBOL(ve0);
 
@@ -118,6 +129,52 @@  void put_ve(struct ve_struct *ve)
 }
 EXPORT_SYMBOL(put_ve);
 
+static struct per_cgroot_data *per_cgroot_data_find_locked(
+	struct list_head *per_cgroot_list, struct cgroup *cgroot)
+{
+	struct per_cgroot_data *data;
+
+	list_for_each_entry(data, per_cgroot_list, list) {
+		if (data->cgroot == cgroot)
+			return data;
+	}
+	return NULL;
+}
+
+static inline struct per_cgroot_data *per_cgroot_get_or_create(
+	struct ve_struct *ve, struct cgroup *cgroot)
+{
+	struct per_cgroot_data *data, *other_data;
+
+	raw_spin_lock(&ve->per_cgroot_list_lock);
+	data = per_cgroot_data_find_locked(&ve->per_cgroot_list,
+		cgroot);
+	raw_spin_unlock(&ve->per_cgroot_list_lock);
+
+	if (data)
+		return data;
+
+	data = kzalloc(sizeof(struct per_cgroot_data), GFP_KERNEL);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+
+	raw_spin_lock(&ve->per_cgroot_list_lock);
+	other_data = per_cgroot_data_find_locked(&ve->per_cgroot_list,
+		cgroot);
+
+	if (other_data) {
+		raw_spin_unlock(&ve->per_cgroot_list_lock);
+		kfree(data);
+		return other_data;
+	}
+
+	data->cgroot = cgroot;
+	list_add(&data->list, &ve->per_cgroot_list);
+
+	raw_spin_unlock(&ve->per_cgroot_list_lock);
+	return data;
+}
+
 struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id)
 {
 	struct cgroup_subsys_state *css, *tmp;
@@ -612,6 +669,22 @@  err_list:
 	return err;
 }
 
+static void per_cgroot_free_all_locked(struct list_head *per_cgroot_list)
+{
+	struct per_cgroot_data *data, *saved;
+	list_for_each_entry_safe(data, saved, per_cgroot_list, list) {
+		list_del_init(&data->list);
+		kfree(data);
+	}
+}
+
+static void ve_per_cgroot_free(struct ve_struct *ve)
+{
+	raw_spin_lock(&ve->per_cgroot_list_lock);
+	per_cgroot_free_all_locked(&ve->per_cgroot_list);
+	raw_spin_unlock(&ve->per_cgroot_list_lock);
+}
+
 void ve_stop_ns(struct pid_namespace *pid_ns)
 {
 	struct ve_struct *ve = current->task_ve;
@@ -662,6 +735,8 @@  void ve_exit_ns(struct pid_namespace *pid_ns)
 
 	ve_workqueue_stop(ve);
 
+	ve_per_cgroot_free(ve);
+
 	/*
 	 * At this point all userspace tasks in container are dead.
 	 */
@@ -735,6 +810,7 @@  static struct cgroup_subsys_state *ve_create(struct cgroup *cg)
 
 	INIT_WORK(&ve->release_agent_work, cgroup_release_agent);
 	raw_spin_lock_init(&ve->release_list_lock);
+	raw_spin_lock_init(&ve->per_cgroot_list_lock);
 
 	ve->_randomize_va_space = ve0._randomize_va_space;
 
@@ -771,6 +847,7 @@  do_init:
 	INIT_LIST_HEAD(&ve->ve_list);
 	INIT_LIST_HEAD(&ve->devmnt_list);
 	INIT_LIST_HEAD(&ve->release_list);
+	INIT_LIST_HEAD(&ve->per_cgroot_list);
 	mutex_init(&ve->devmnt_mutex);
 
 #ifdef CONFIG_AIO
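
One thing worth spelling out about per_cgroot_get_or_create() above: it
follows the classic lookup/allocate/re-check pattern. kzalloc(GFP_KERNEL)
may sleep, so it cannot run while the raw spinlock is held; the helper
therefore looks up under the lock, drops it, allocates, re-takes the lock
and re-checks for a concurrent insert, freeing its own allocation if it
lost the race. Below is a minimal userspace sketch of the same pattern,
with a pthread mutex standing in for the raw spinlock; all names are
illustrative and not part of the patch.

/*
 * Userspace analogue of per_cgroot_get_or_create().  The pthread
 * mutex plays the role of the kernel raw spinlock; this illustrates
 * the pattern only and is not code from the patch.
 */
#include <pthread.h>
#include <stdlib.h>

struct node {
	struct node *next;
	void *key;		/* plays the role of data->cgroot */
};

static struct node *list_head;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct node *find_locked(void *key)
{
	struct node *n;

	for (n = list_head; n; n = n->next)
		if (n->key == key)
			return n;
	return NULL;
}

struct node *get_or_create(void *key)
{
	struct node *n, *other;

	/* Optimistic lookup: the common case needs no allocation. */
	pthread_mutex_lock(&list_lock);
	n = find_locked(key);
	pthread_mutex_unlock(&list_lock);
	if (n)
		return n;

	/*
	 * Allocate with the lock dropped: in the kernel,
	 * kzalloc(GFP_KERNEL) may sleep and so must not be called
	 * under a raw spinlock.
	 */
	n = calloc(1, sizeof(*n));
	if (!n)
		return NULL;

	/* Re-check under the lock: another thread may have raced us. */
	pthread_mutex_lock(&list_lock);
	other = find_locked(key);
	if (other) {
		pthread_mutex_unlock(&list_lock);
		free(n);	/* lost the race; drop our copy */
		return other;
	}
	n->key = key;
	n->next = list_head;
	list_head = n;
	pthread_mutex_unlock(&list_lock);
	return n;
}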

Comments

Kirill Tkhai April 21, 2020, 8:46 a.m.
On 20.04.2020 15:13, Valeriy Vdovin wrote:
> Since each ve is internally attached to a particular css_set via its
> init_task, it is useful to have a container for parameters that are
> common to each cgroup subsystem hierarchy rooted at the ve's virtual
> root.
> 
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
> 
> [... full patch trimmed ...]
What is this all for? Do you introduce per-cgroup-root data instead of
cgroup::release_agent to save memory (to avoid distributing a
release_agent pointer to every cgroup)?
Valeriy Vdovin April 21, 2020, 10:47 a.m.
On 21.04.2020 11:46, Kirill Tkhai wrote:
> On 20.04.2020 15:13, Valeriy Vdovin wrote:
>> Since each ve is internally attached to a particular css_set via its
>> init_task, it is useful to have a container for parameters that are
>> common to each cgroup subsystem hierarchy rooted at the ve's virtual
>> root.
>>
>> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
>>
>> [... full patch trimmed ...]
>
> What is this all for? Do you introduce per-cgroup-root data instead of
> cgroup::release_agent to save memory (to avoid distributing a
> release_agent pointer to every cgroup)?

struct cgroup is not the right place for the release agent. You are
right that I want to avoid a mostly unused pointer in 99.99% of
cgroups, but having the release agent in a cgroup is also semantically
misleading: it is a cgroup-root property. We could introduce something
like cgroupfs_root with container support, or even substitute one for
the other completely, but that is a heavy task.

To me, having a per-cgroup-root container in the VE structure seems
cleaner. VE is already responsible for managing multiple virtual cgroup
roots (mark/unmark_ve_roots, for example), so it can continue in its
manager role and gain an associative container that holds per-root
information (with the cgroup root as the key) under already established,
well-defined semantics, into which you simply insert more data as you
need it. Right now we already need to store release_agent in direct
association with a specific cgroup root, so we would need this
release-agent<->root association one way or another.
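
For illustration, here is a hedged sketch of how a later patch in the
series might hang the release agent path off this container. The
release_agent_path field and the ve_set_release_agent_path() helper are
assumptions invented for this sketch, not code from the series:

/*
 * Hypothetical sketch: attaching the release agent path to the
 * per-cgroup-root container.  The release_agent_path field and this
 * helper are illustrative assumptions, not part of the patch.
 */
struct per_cgroot_data {
	struct list_head list;
	struct cgroup *cgroot;		/* key: the ve's virtual root */
	char *release_agent_path;	/* assumed per-root payload */
};

static int ve_set_release_agent_path(struct ve_struct *ve,
				     struct cgroup *cgroot,
				     const char *path)
{
	struct per_cgroot_data *data;
	char *copy;

	/* Duplicate outside the raw spinlock: kstrdup may sleep. */
	copy = kstrdup(path, GFP_KERNEL);
	if (!copy)
		return -ENOMEM;

	data = per_cgroot_get_or_create(ve, cgroot);
	if (IS_ERR(data)) {
		kfree(copy);
		return PTR_ERR(data);
	}

	/* Swap in the new path under the per-cgroot list lock. */
	raw_spin_lock(&ve->per_cgroot_list_lock);
	kfree(data->release_agent_path);
	data->release_agent_path = copy;
	raw_spin_unlock(&ve->per_cgroot_list_lock);

	/*
	 * Note: with a payload like this, per_cgroot_free_all_locked()
	 * would also have to kfree() release_agent_path before freeing
	 * each entry.
	 */
	return 0;
}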