[RHEL7,COMMIT] ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list

Submitted by Konstantin Khorenko on Sept. 5, 2018, 10:18 a.m.

Details

Message ID 201809051018.w85AIVU6011135@finist_ce7.work
State New
Series "ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list"

Commit Message

Konstantin Khorenko Sept. 5, 2018, 10:18 a.m.
The commit is pushed to "branch-rh7-3.10.0-862.11.6.vz7.71.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-862.11.6.vz7.71.8
------>
commit d68530aa0ee943861d9dc8dc898087c58ad4520f
Author: Konstantin Khorenko <khorenko@virtuozzo.com>
Date:   Wed Sep 5 13:18:31 2018 +0300

    ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list
    
    A Container's cpu cgroup is linked into "ve_root_list" on CT start.
    But if someone holds the CT's cpu cgroup while the CT is being stopped,
    the next CT start tries to create the same cpu cgroup (this fails, as it
    already exists) and links this cpu cgroup into "ve_root_list" again,
    thus corrupting the list.

    As a consequence, calc_load_ve() goes into an endless loop.

    The minimal fix is to add a linked/unlinked marker to task_group
    and skip the redundant linking.
    
    How to reproduce:
    
     # vzctl start 200
     # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks
     # vzctl stop 200
     // At this moment the VE cgroup has been destroyed, but the cpu cgroup
     // is still alive and linked into the "ve_root_list" list
    
     # vzctl start 200
     // double add of same tg (same cpu cgroup) to "ve_root_list" list =>
     // list corruption => endless loop in next calc_load_ve() call
    
    https://jira.sw.ru/browse/PSBM-88251
    
    Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>
---
 include/linux/sched.h |  5 ++++-
 kernel/sched/core.c   | 26 ++++++++++++++++++--------
 kernel/sched/sched.h  |  1 +
 3 files changed, 23 insertions(+), 9 deletions(-)


diff --git a/include/linux/sched.h b/include/linux/sched.h
index 623572043ece..00d7dbca8ed6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3346,6 +3346,9 @@  void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
 #ifdef CONFIG_VE
 struct cgroup;
 extern void link_ve_root_cpu_cgroup(struct cgroup *);
-#endif
+void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp);
+#else /* CONFIG_VE */
+void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp) { }
+#endif /* CONFIG_VE */
 
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e572d2b14a7f..7b30e17b16e3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9018,10 +9018,24 @@  void link_ve_root_cpu_cgroup(struct cgroup *cgrp)
 
 	spin_lock_irqsave(&task_group_lock, flags);
 	BUG_ON(!(cgrp->subsys[cpu_cgroup_subsys_id]->flags & CSS_ONLINE));
-	list_add_rcu(&tg->ve_root_list, &ve_root_list);
+	if (!tg->linked) {
+		list_add_rcu(&tg->ve_root_list, &ve_root_list);
+		tg->linked = 1;
+	}
 	spin_unlock_irqrestore(&task_group_lock, flags);
 }
-#endif
+
+void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	unsigned long flags;
+
+	spin_lock_irqsave(&task_group_lock, flags);
+	list_del_rcu(&tg->ve_root_list);
+	tg->linked = 0;
+	spin_unlock_irqrestore(&task_group_lock, flags);
+}
+#endif /* CONFIG_VE */
 
 static void free_sched_group(struct task_group *tg)
 {
@@ -9595,6 +9609,7 @@  static int cpu_cgroup_css_online(struct cgroup *cgrp)
 
 #ifdef CONFIG_VE
 	INIT_LIST_HEAD(&tg->ve_root_list);
+	/* tg->linked == 0 due to kzalloc() in sched_create_group() */
 #endif
 	if (!cgrp->parent)
 		return 0;
@@ -9614,13 +9629,8 @@  static void cpu_cgroup_css_free(struct cgroup *cgrp)
 static void cpu_cgroup_css_offline(struct cgroup *cgrp)
 {
 	struct task_group *tg = cgroup_tg(cgrp);
-#ifdef CONFIG_VE
-	unsigned long flags;
 
-	spin_lock_irqsave(&task_group_lock, flags);
-	list_del_rcu(&tg->ve_root_list);
-	spin_unlock_irqrestore(&task_group_lock, flags);
-#endif
+	unlink_ve_root_cpu_cgroup(cgrp);
 	sched_offline_group(tg);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 98a216b6c391..5192e1bef06c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -263,6 +263,7 @@  struct task_group {
 
 #ifdef CONFIG_VE
 	struct list_head ve_root_list;
+	bool linked;
 #endif
 
 	struct taskstats __percpu *taskstats;
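
For illustration, below is a minimal user-space sketch (not part of the patch) of the failure
mode described in the commit message. The struct and list_add() helper only mimic the pointer
updates of the kernel's <linux/list.h>, and ve_root_list / tg_node are placeholder names standing
in for the real list head and the CT's task_group. After the second add of the same node, a
forward walk never returns to the list head, which is the analogue of calc_load_ve() spinning
forever.

#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void INIT_LIST_HEAD(struct list_head *head)
{
	head->next = head;
	head->prev = head;
}

/* Same pointer updates as the kernel's list_add()/__list_add(). */
static void list_add(struct list_head *new, struct list_head *head)
{
	struct list_head *next = head->next;

	next->prev = new;
	new->next = next;
	new->prev = head;
	head->next = new;
}

int main(void)
{
	struct list_head ve_root_list, tg_node;
	struct list_head *pos;
	int steps = 0;

	INIT_LIST_HEAD(&ve_root_list);
	INIT_LIST_HEAD(&tg_node);

	list_add(&tg_node, &ve_root_list);	/* first CT start: ok */
	list_add(&tg_node, &ve_root_list);	/* second start: corruption */

	/*
	 * After the double add, tg_node.next points back to tg_node itself,
	 * so a forward walk never returns to the list head -- the analogue
	 * of calc_load_ve() looping forever.  Bound the walk to show it.
	 */
	for (pos = ve_root_list.next; pos != &ve_root_list; pos = pos->next) {
		if (++steps > 10) {
			printf("never reached the head after %d steps\n", steps);
			return 1;
		}
	}
	printf("list walk finished after %d node(s)\n", steps);
	return 0;
}

Built with plain gcc, this prints the "never reached the head" branch. The patch's tg->linked
marker avoids exactly this by skipping the second list_add_rcu() under task_group_lock.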

Comments

Konstantin Khorenko Sept. 5, 2018, 12:14 p.m.
Patch dropped, will be reworked.

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team
