[vz7,v2] ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list

Submitted by Konstantin Khorenko on Sept. 5, 2018, 1:30 p.m.

Details

Message ID 20180905133024.15202-1-khorenko@virtuozzo.com
State New
Series "ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list"
Headers show

Commit Message

Konstantin Khorenko Sept. 5, 2018, 1:30 p.m.
Container's cpu cgroup is linked to "ve_root_list" on CT start.
But if someone holds CT's cpu cgroup while CT is being stopped,
next CT start tries to create same cpu cgroup (fails, already exists)
and links this cpu cgroup to the "ve_root_list", thus corrupting it.

As a consequence, calc_load_ve() goes into an endless loop.

Let's check whether the task_group has already been linked to the list and
skip the redundant linking.

Locking scheme change:
- drop rcu for list ve_root_list, we hold spinlocks anyway
- use "load_ve_lock" spinlock for both list add/del/iterate,
  "task_group_lock" is unrelated here

How to reproduce:

 # vzctl start 200
 # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks
 # vzctl stop 200
 // At this moment the VE cgroup has been destroyed, but the cpu cgroup is
 // still alive and linked into the "ve_root_list" list

 # vzctl start 200
 // double add of same tg (same cpu cgroup) to "ve_root_list" list =>
 // list corruption => endless loop in next calc_load_ve() call

https://jira.sw.ru/browse/PSBM-88251

Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>

v2 changes:
 - change locking scheme: drop rcu, use "load_ve_lock" everywhere
 - drop tg->linked field, check if linked using list_empty()
---
 include/linux/sched.h |  5 ++++-
 kernel/sched/core.c   | 32 ++++++++++++++++++--------------
 2 files changed, 22 insertions(+), 15 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 623572043ece..00d7dbca8ed6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3346,6 +3346,9 @@  void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
 #ifdef CONFIG_VE
 struct cgroup;
 extern void link_ve_root_cpu_cgroup(struct cgroup *);
-#endif
+extern void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp);
+#else /* CONFIG_VE */
+static inline void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp) { }
+#endif /* CONFIG_VE */
 
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e572d2b14a7f..a49ea4e9b718 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2882,10 +2882,10 @@  calc_load(unsigned long load, unsigned long exp, unsigned long active)
 
 #ifdef CONFIG_VE
 static LIST_HEAD(ve_root_list);
+static DEFINE_SPINLOCK(load_ve_lock);
 
 void calc_load_ve(void)
 {
-	static DEFINE_SPINLOCK(load_ve_lock);
 	unsigned long nr_unint, nr_active;
 	struct task_group *tg;
 	int i;
@@ -2895,8 +2895,7 @@  void calc_load_ve(void)
 	 * against very rare parallel execution on two or more cpus.
 	 */
 	spin_lock(&load_ve_lock);
-	rcu_read_lock();
-	list_for_each_entry_rcu(tg, &ve_root_list, ve_root_list) {
+	list_for_each_entry(tg, &ve_root_list, ve_root_list) {
 		nr_active = 0;
 		for_each_possible_cpu(i) {
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -2916,7 +2915,6 @@  void calc_load_ve(void)
 		tg->avenrun[1] = calc_load(tg->avenrun[1], EXP_5, nr_active);
 		tg->avenrun[2] = calc_load(tg->avenrun[2], EXP_15, nr_active);
 	}
-	rcu_read_unlock();
 
 	nr_unint = nr_uninterruptible() * FIXED_1;
 
@@ -9016,12 +9014,23 @@  void link_ve_root_cpu_cgroup(struct cgroup *cgrp)
 	struct task_group *tg = cgroup_tg(cgrp);
 	unsigned long flags;
 
-	spin_lock_irqsave(&task_group_lock, flags);
+	spin_lock_irqsave(&load_ve_lock, flags);
 	BUG_ON(!(cgrp->subsys[cpu_cgroup_subsys_id]->flags & CSS_ONLINE));
-	list_add_rcu(&tg->ve_root_list, &ve_root_list);
-	spin_unlock_irqrestore(&task_group_lock, flags);
+	if (list_empty(&tg->ve_root_list))
+		list_add(&tg->ve_root_list, &ve_root_list);
+	spin_unlock_irqrestore(&load_ve_lock, flags);
 }
-#endif
+
+void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	unsigned long flags;
+
+	spin_lock_irqsave(&load_ve_lock, flags);
+	list_del_init(&tg->ve_root_list);
+	spin_unlock_irqrestore(&load_ve_lock, flags);
+}
+#endif /* CONFIG_VE */
 
 static void free_sched_group(struct task_group *tg)
 {
@@ -9614,13 +9623,8 @@  static void cpu_cgroup_css_free(struct cgroup *cgrp)
 static void cpu_cgroup_css_offline(struct cgroup *cgrp)
 {
 	struct task_group *tg = cgroup_tg(cgrp);
-#ifdef CONFIG_VE
-	unsigned long flags;
 
-	spin_lock_irqsave(&task_group_lock, flags);
-	list_del_rcu(&tg->ve_root_list);
-	spin_unlock_irqrestore(&task_group_lock, flags);
-#endif
+	unlink_ve_root_cpu_cgroup(cgrp);
 	sched_offline_group(tg);
 }
 

Comments

Kirill Tkhai Sept. 5, 2018, 1:40 p.m.
On 05.09.2018 16:30, Konstantin Khorenko wrote:
> Container's cpu cgroup is linked to "ve_root_list" on CT start.
> But if someone holds CT's cpu cgroup while CT is being stopped,
> next CT start tries to create same cpu cgroup (fails, already exists)
> and links this cpu cgroup to the "ve_root_list", thus corrupting it.
> 
> As a consequence calc_load_ve() goes in an endless loop.
> 
> Let's check if task_group has been already linked to the list and skip
> redundant linking.
> 
> Locking scheme change:
> - drop rcu for list ve_root_list, we hold spinlocks anyway
> - use "load_ve_lock" spinlock for both list add/del/iterate,
>   "task_group_lock" is unrelated here
> 
> How to reproduce:
> 
>  # vzctl start 200
>  # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks
>  # vzctl stop 200
>  // At this moment VE cgroup got destroyed, but cpu cgroup is still alive
>  // and linked to "ve_root_list" list
> 
>  # vzctl start 200
>  // double add of same tg (same cpu cgroup) to "ve_root_list" list =>
>  // list corruption => endless loop in next calc_load_ve() call
> 
> https://jira.sw.ru/browse/PSBM-88251
> 
> Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>

Acked-by: Kirill Tkhai <ktkhai@virtuozzo.com>
 
> v2 changes:
>  - change locking scheme: drop rcu, use "load_ve_lock" everywhere
>  - drop tg->linked field, check if linked using list_empty()
> ---
>  include/linux/sched.h |  5 ++++-
>  kernel/sched/core.c   | 32 ++++++++++++++++++--------------
>  2 files changed, 22 insertions(+), 15 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 623572043ece..00d7dbca8ed6 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -3346,6 +3346,9 @@ void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
>  #ifdef CONFIG_VE
>  struct cgroup;
>  extern void link_ve_root_cpu_cgroup(struct cgroup *);
> -#endif
> +extern void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp);
> +#else /* CONFIG_VE */
> +static inline void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp) { }
> +#endif /* CONFIG_VE */
>  
>  #endif
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e572d2b14a7f..a49ea4e9b718 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2882,10 +2882,10 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
>  
>  #ifdef CONFIG_VE
>  static LIST_HEAD(ve_root_list);
> +static DEFINE_SPINLOCK(load_ve_lock);
>  
>  void calc_load_ve(void)
>  {
> -	static DEFINE_SPINLOCK(load_ve_lock);
>  	unsigned long nr_unint, nr_active;
>  	struct task_group *tg;
>  	int i;
> @@ -2895,8 +2895,7 @@ void calc_load_ve(void)
>  	 * against very rare parallel execution on two or more cpus.
>  	 */
>  	spin_lock(&load_ve_lock);
> -	rcu_read_lock();
> -	list_for_each_entry_rcu(tg, &ve_root_list, ve_root_list) {
> +	list_for_each_entry(tg, &ve_root_list, ve_root_list) {
>  		nr_active = 0;
>  		for_each_possible_cpu(i) {
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -2916,7 +2915,6 @@ void calc_load_ve(void)
>  		tg->avenrun[1] = calc_load(tg->avenrun[1], EXP_5, nr_active);
>  		tg->avenrun[2] = calc_load(tg->avenrun[2], EXP_15, nr_active);
>  	}
> -	rcu_read_unlock();
>  
>  	nr_unint = nr_uninterruptible() * FIXED_1;
>  
> @@ -9016,12 +9014,23 @@ void link_ve_root_cpu_cgroup(struct cgroup *cgrp)
>  	struct task_group *tg = cgroup_tg(cgrp);
>  	unsigned long flags;
>  
> -	spin_lock_irqsave(&task_group_lock, flags);
> +	spin_lock_irqsave(&load_ve_lock, flags);
>  	BUG_ON(!(cgrp->subsys[cpu_cgroup_subsys_id]->flags & CSS_ONLINE));
> -	list_add_rcu(&tg->ve_root_list, &ve_root_list);
> -	spin_unlock_irqrestore(&task_group_lock, flags);
> +	if (list_empty(&tg->ve_root_list))
> +		list_add(&tg->ve_root_list, &ve_root_list);
> +	spin_unlock_irqrestore(&load_ve_lock, flags);
>  }
> -#endif
> +
> +void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp)
> +{
> +	struct task_group *tg = cgroup_tg(cgrp);
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&load_ve_lock, flags);
> +	list_del_init(&tg->ve_root_list);
> +	spin_unlock_irqrestore(&load_ve_lock, flags);
> +}
> +#endif /* CONFIG_VE */
>  
>  static void free_sched_group(struct task_group *tg)
>  {
> @@ -9614,13 +9623,8 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
>  static void cpu_cgroup_css_offline(struct cgroup *cgrp)
>  {
>  	struct task_group *tg = cgroup_tg(cgrp);
> -#ifdef CONFIG_VE
> -	unsigned long flags;
>  
> -	spin_lock_irqsave(&task_group_lock, flags);
> -	list_del_rcu(&tg->ve_root_list);
> -	spin_unlock_irqrestore(&task_group_lock, flags);
> -#endif
> +	unlink_ve_root_cpu_cgroup(cgrp);
>  	sched_offline_group(tg);
>  }
>  
>
Andrey Ryabinin Sept. 5, 2018, 2:15 p.m.
On 09/05/2018 04:30 PM, Konstantin Khorenko wrote:
> Container's cpu cgroup is linked to "ve_root_list" on CT start.
> But if someone holds CT's cpu cgroup while CT is being stopped,
> next CT start tries to create same cpu cgroup (fails, already exists)
> and links this cpu cgroup to the "ve_root_list", thus corrupting it.
> 
> As a consequence calc_load_ve() goes in an endless loop.
> 
> Let's check if task_group has been already linked to the list and skip
> redundant linking.
> 
> Locking scheme change:
> - drop rcu for list ve_root_list, we hold spinlocks anyway
> - use "load_ve_lock" spinlock for both list add/del/iterate,
>   "task_group_lock" is unrelated here
> 
> How to reproduce:
> 
>  # vzctl start 200
>  # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks
>  # vzctl stop 200
>  // At this moment VE cgroup got destroyed, but cpu cgroup is still alive
>  // and linked to "ve_root_list" list
> 
>  # vzctl start 200
>  // double add of same tg (same cpu cgroup) to "ve_root_list" list =>
>  // list corruption => endless loop in next calc_load_ve() call
> 
> https://jira.sw.ru/browse/PSBM-88251
> 
> Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>
> 
> v2 changes:
>  - change locking scheme: drop rcu, use "load_ve_lock" everywhere
>  - drop tg->linked field, check if linked using list_empty()
> ---

Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>