[vz7] ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list

Submitted by Konstantin Khorenko on Sept. 4, 2018, 3:55 p.m.

Details

Message ID 20180904155538.7878-1-khorenko@virtuozzo.com
State New
Series "ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list"
Headers show

Commit Message

Konstantin Khorenko Sept. 4, 2018, 3:55 p.m.
Container's cpu cgroup is linked to "ve_root_list" on CT start.
But if someone holds the CT's cpu cgroup while the CT is being stopped,
the next CT start tries to create the same cpu cgroup (this fails, as it
already exists) and links this cpu cgroup to the "ve_root_list" a second
time, thus corrupting the list.

As a consequence, calc_load_ve() goes into an endless loop.

The minimal patch is to add a marker to task_group tracking whether it is
currently linked or unlinked, and to skip the redundant linking.

How to reproduce:

 # vzctl start 200
 # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks
 # vzctl stop 200
 // At this moment VE cgroup got destroyed, but cpu cgroup is still alive
 // and linked to "ve_root_list" list

 # vzctl start 200
 // double add of same tg (same cpu cgroup) to "ve_root_list" list =>
 // list corruption => endless loop in next calc_load_ve() call

https://jira.sw.ru/browse/PSBM-88251

Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>
---
 include/linux/sched.h |  5 ++++-
 kernel/sched/core.c   | 26 ++++++++++++++++++--------
 kernel/sched/sched.h  |  1 +
 3 files changed, 23 insertions(+), 9 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 623572043ece..00d7dbca8ed6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3346,6 +3346,9 @@  void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
 #ifdef CONFIG_VE
 struct cgroup;
 extern void link_ve_root_cpu_cgroup(struct cgroup *);
-#endif
+void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp);
+#else /* CONFIG_VE */
+void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp) { }
+#endif /* CONFIG_VE */
 
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e572d2b14a7f..7b30e17b16e3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9018,10 +9018,24 @@  void link_ve_root_cpu_cgroup(struct cgroup *cgrp)
 
 	spin_lock_irqsave(&task_group_lock, flags);
 	BUG_ON(!(cgrp->subsys[cpu_cgroup_subsys_id]->flags & CSS_ONLINE));
-	list_add_rcu(&tg->ve_root_list, &ve_root_list);
+	if (!tg->linked) {
+		list_add_rcu(&tg->ve_root_list, &ve_root_list);
+		tg->linked = 1;
+	}
 	spin_unlock_irqrestore(&task_group_lock, flags);
 }
-#endif
+
+void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	unsigned long flags;
+
+	spin_lock_irqsave(&task_group_lock, flags);
+	list_del_rcu(&tg->ve_root_list);
+	tg->linked = 0;
+	spin_unlock_irqrestore(&task_group_lock, flags);
+}
+#endif /* CONFIG_VE */
 
 static void free_sched_group(struct task_group *tg)
 {
@@ -9595,6 +9609,7 @@  static int cpu_cgroup_css_online(struct cgroup *cgrp)
 
 #ifdef CONFIG_VE
 	INIT_LIST_HEAD(&tg->ve_root_list);
+	/* tg->linked == 0 due to kzalloc() in sched_create_group() */
 #endif
 	if (!cgrp->parent)
 		return 0;
@@ -9614,13 +9629,8 @@  static void cpu_cgroup_css_free(struct cgroup *cgrp)
 static void cpu_cgroup_css_offline(struct cgroup *cgrp)
 {
 	struct task_group *tg = cgroup_tg(cgrp);
-#ifdef CONFIG_VE
-	unsigned long flags;
 
-	spin_lock_irqsave(&task_group_lock, flags);
-	list_del_rcu(&tg->ve_root_list);
-	spin_unlock_irqrestore(&task_group_lock, flags);
-#endif
+	unlink_ve_root_cpu_cgroup(cgrp);
 	sched_offline_group(tg);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 98a216b6c391..5192e1bef06c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -263,6 +263,7 @@  struct task_group {
 
 #ifdef CONFIG_VE
 	struct list_head ve_root_list;
+	bool linked;
 #endif
 
 	struct taskstats __percpu *taskstats;

Comments

Andrey Ryabinin Sept. 5, 2018, 10:12 a.m.
On 09/04/2018 06:55 PM, Konstantin Khorenko wrote:
> Container's cpu cgroup is linked to "ve_root_list" on CT start.
> But if someone holds CT's cpu cgroup while CT is being stopped,
> next CT start tries to create same cpu cgroup (fails, already exists)
> and links this cpu cgroup to the "ve_root_list", thus corrupting it.
> 
> As a consequence calc_load_ve() goes in an endless loop.
> 
> Minimal patch is to add a marker to task_group linked/unlinked
> and skip redundant linking.
> 
> How to reproduce:
> 
>  # vzctl start 200
>  # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks
>  # vzctl stop 200
>  // At this moment VE cgroup got destroyed, but cpu cgroup is still alive
>  // and linked to "ve_root_list" list
> 
>  # vzctl start 200
>  // double add of same tg (same cpu cgroup) to "ve_root_list" list =>
>  // list corruption => endless loop in next calc_load_ve() call
> 
> https://jira.sw.ru/browse/PSBM-88251
> 
> Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>
> ---
Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Kirill Tkhai Sept. 5, 2018, 12:47 p.m.
On 04.09.2018 18:55, Konstantin Khorenko wrote:
> Container's cpu cgroup is linked to "ve_root_list" on CT start.
> But if someone holds CT's cpu cgroup while CT is being stopped,
> next CT start tries to create same cpu cgroup (fails, already exists)
> and links this cpu cgroup to the "ve_root_list", thus corrupting it.
> 
> As a consequence calc_load_ve() goes in an endless loop.
> 
> Minimal patch is to add a marker to task_group linked/unlinked
> and skip redundant linking.
> 
> How to reproduce:
> 
>  # vzctl start 200
>  # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks
>  # vzctl stop 200
>  // At this moment VE cgroup got destroyed, but cpu cgroup is still alive
>  // and linked to "ve_root_list" list
> 
>  # vzctl start 200
>  // double add of same tg (same cpu cgroup) to "ve_root_list" list =>
>  // list corruption => endless loop in next calc_load_ve() call
> 
> https://jira.sw.ru/browse/PSBM-88251
> 
> Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>
> ---
>  include/linux/sched.h |  5 ++++-
>  kernel/sched/core.c   | 26 ++++++++++++++++++--------
>  kernel/sched/sched.h  |  1 +
>  3 files changed, 23 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 623572043ece..00d7dbca8ed6 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -3346,6 +3346,9 @@ void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
>  #ifdef CONFIG_VE
>  struct cgroup;
>  extern void link_ve_root_cpu_cgroup(struct cgroup *);
> -#endif
> +void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp);
> +#else /* CONFIG_VE */
> +void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp) { }
> +#endif /* CONFIG_VE */
>  
>  #endif
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e572d2b14a7f..7b30e17b16e3 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -9018,10 +9018,24 @@ void link_ve_root_cpu_cgroup(struct cgroup *cgrp)
>  
>  	spin_lock_irqsave(&task_group_lock, flags);
>  	BUG_ON(!(cgrp->subsys[cpu_cgroup_subsys_id]->flags & CSS_ONLINE));
> -	list_add_rcu(&tg->ve_root_list, &ve_root_list);
> +	if (!tg->linked) {
> +		list_add_rcu(&tg->ve_root_list, &ve_root_list);
> +		tg->linked = 1;

Just an idea: we may use INIT_LIST_HEAD() on cgroup allocation,
and list_empty() here instead.

> +	}
>  	spin_unlock_irqrestore(&task_group_lock, flags);
>  }
> -#endif
> +
> +void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp)
> +{
> +	struct task_group *tg = cgroup_tg(cgrp);
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&task_group_lock, flags);
> +	list_del_rcu(&tg->ve_root_list);
> +	tg->linked = 0;
> +	spin_unlock_irqrestore(&task_group_lock, flags);
> +}
> +#endif /* CONFIG_VE */
>  
>  static void free_sched_group(struct task_group *tg)
>  {
> @@ -9595,6 +9609,7 @@ static int cpu_cgroup_css_online(struct cgroup *cgrp)
>  
>  #ifdef CONFIG_VE
>  	INIT_LIST_HEAD(&tg->ve_root_list);
> +	/* tg->linked == 0 due to kzalloc() in sched_create_group() */
>  #endif
>  	if (!cgrp->parent)
>  		return 0;
> @@ -9614,13 +9629,8 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
>  static void cpu_cgroup_css_offline(struct cgroup *cgrp)
>  {
>  	struct task_group *tg = cgroup_tg(cgrp);
> -#ifdef CONFIG_VE
> -	unsigned long flags;
>  
> -	spin_lock_irqsave(&task_group_lock, flags);
> -	list_del_rcu(&tg->ve_root_list);
> -	spin_unlock_irqrestore(&task_group_lock, flags);
> -#endif
> +	unlink_ve_root_cpu_cgroup(cgrp);
>  	sched_offline_group(tg);
>  }
>  
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 98a216b6c391..5192e1bef06c 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -263,6 +263,7 @@ struct task_group {
>  
>  #ifdef CONFIG_VE
>  	struct list_head ve_root_list;
> +	bool linked;
>  #endif
>  
>  	struct taskstats __percpu *taskstats;
>