Message ID | 201809051018.w85AIVU6011135@finist_ce7.work |
---|---
State | New |
Series | "ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list" |
Headers | show |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 623572043ece..00d7dbca8ed6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3346,6 +3346,9 @@ void cpufreq_set_update_util_data(int cpu, struct update_util_data *data); #ifdef CONFIG_VE struct cgroup; extern void link_ve_root_cpu_cgroup(struct cgroup *); -#endif +void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp); +#else /* CONFIG_VE */ +void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp) { } +#endif /* CONFIG_VE */ #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e572d2b14a7f..7b30e17b16e3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9018,10 +9018,24 @@ void link_ve_root_cpu_cgroup(struct cgroup *cgrp) spin_lock_irqsave(&task_group_lock, flags); BUG_ON(!(cgrp->subsys[cpu_cgroup_subsys_id]->flags & CSS_ONLINE)); - list_add_rcu(&tg->ve_root_list, &ve_root_list); + if (!tg->linked) { + list_add_rcu(&tg->ve_root_list, &ve_root_list); + tg->linked = 1; + } spin_unlock_irqrestore(&task_group_lock, flags); } -#endif + +void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + unsigned long flags; + + spin_lock_irqsave(&task_group_lock, flags); + list_del_rcu(&tg->ve_root_list); + tg->linked = 0; + spin_unlock_irqrestore(&task_group_lock, flags); +} +#endif /* CONFIG_VE */ static void free_sched_group(struct task_group *tg) { @@ -9595,6 +9609,7 @@ static int cpu_cgroup_css_online(struct cgroup *cgrp) #ifdef CONFIG_VE INIT_LIST_HEAD(&tg->ve_root_list); + /* tg->linked == 0 due to kzalloc() in sched_create_group() */ #endif if (!cgrp->parent) return 0; @@ -9614,13 +9629,8 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp) static void cpu_cgroup_css_offline(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); -#ifdef CONFIG_VE - unsigned long flags; - spin_lock_irqsave(&task_group_lock, flags); - list_del_rcu(&tg->ve_root_list); - spin_unlock_irqrestore(&task_group_lock, flags); -#endif + 
unlink_ve_root_cpu_cgroup(cgrp); sched_offline_group(tg); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98a216b6c391..5192e1bef06c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -263,6 +263,7 @@ struct task_group { #ifdef CONFIG_VE struct list_head ve_root_list; + bool linked; #endif struct taskstats __percpu *taskstats;
Patch dropped, will be reworked. -- Best regards, Konstantin Khorenko, Virtuozzo Linux Kernel Team On 09/05/2018 01:18 PM, Konstantin Khorenko wrote: > The commit is pushed to "branch-rh7-3.10.0-862.11.6.vz7.71.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git > after rh7-3.10.0-862.11.6.vz7.71.8 > ------> > commit d68530aa0ee943861d9dc8dc898087c58ad4520f > Author: Konstantin Khorenko <khorenko@virtuozzo.com> > Date: Wed Sep 5 13:18:31 2018 +0300 > > ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list > > Container's cpu cgroup is linked to "ve_root_list" on CT start. > But if someone holds CT's cpu cgroup while CT is being stopped, > next CT start tries to create same cpu cgroup (fails, already exists) > and links this cpu cgroup to the "ve_root_list", thus corrupting it. > > As a consequence calc_load_ve() goes in an endless loop. > > Minimal patch is to add a marker to task_group linked/unlinked > and skip redundant linking. > > How to reproduce: > > # vzctl start 200 > # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks > # vzctl stop 200 > // At this moment VE cgroup got destroyed, but cpu cgroup is still alive > // and linked to "ve_root_list" list > > # vzctl start 200 > // double add of same tg (same cpu cgroup) to "ve_root_list" list => > // list corruption => endless loop in next calc_load_ve() call > > https://jira.sw.ru/browse/PSBM-88251 > > Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com> > --- > include/linux/sched.h | 5 ++++- > kernel/sched/core.c | 26 ++++++++++++++++++-------- > kernel/sched/sched.h | 1 + > 3 files changed, 23 insertions(+), 9 deletions(-) > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 623572043ece..00d7dbca8ed6 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -3346,6 +3346,9 @@ void cpufreq_set_update_util_data(int cpu, struct update_util_data *data); > #ifdef CONFIG_VE > struct cgroup; > extern void link_ve_root_cpu_cgroup(struct cgroup 
*); > -#endif > +void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp); > +#else /* CONFIG_VE */ > +void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp) { } > +#endif /* CONFIG_VE */ > > #endif > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index e572d2b14a7f..7b30e17b16e3 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -9018,10 +9018,24 @@ void link_ve_root_cpu_cgroup(struct cgroup *cgrp) > > spin_lock_irqsave(&task_group_lock, flags); > BUG_ON(!(cgrp->subsys[cpu_cgroup_subsys_id]->flags & CSS_ONLINE)); > - list_add_rcu(&tg->ve_root_list, &ve_root_list); > + if (!tg->linked) { > + list_add_rcu(&tg->ve_root_list, &ve_root_list); > + tg->linked = 1; > + } > spin_unlock_irqrestore(&task_group_lock, flags); > } > -#endif > + > +void unlink_ve_root_cpu_cgroup(struct cgroup *cgrp) > +{ > + struct task_group *tg = cgroup_tg(cgrp); > + unsigned long flags; > + > + spin_lock_irqsave(&task_group_lock, flags); > + list_del_rcu(&tg->ve_root_list); > + tg->linked = 0; > + spin_unlock_irqrestore(&task_group_lock, flags); > +} > +#endif /* CONFIG_VE */ > > static void free_sched_group(struct task_group *tg) > { > @@ -9595,6 +9609,7 @@ static int cpu_cgroup_css_online(struct cgroup *cgrp) > > #ifdef CONFIG_VE > INIT_LIST_HEAD(&tg->ve_root_list); > + /* tg->linked == 0 due to kzalloc() in sched_create_group() */ > #endif > if (!cgrp->parent) > return 0; > @@ -9614,13 +9629,8 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp) > static void cpu_cgroup_css_offline(struct cgroup *cgrp) > { > struct task_group *tg = cgroup_tg(cgrp); > -#ifdef CONFIG_VE > - unsigned long flags; > > - spin_lock_irqsave(&task_group_lock, flags); > - list_del_rcu(&tg->ve_root_list); > - spin_unlock_irqrestore(&task_group_lock, flags); > -#endif > + unlink_ve_root_cpu_cgroup(cgrp); > sched_offline_group(tg); > } > > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > index 98a216b6c391..5192e1bef06c 100644 > --- a/kernel/sched/sched.h > +++ b/kernel/sched/sched.h > 
@@ -263,6 +263,7 @@ struct task_group { > > #ifdef CONFIG_VE > struct list_head ve_root_list; > + bool linked; > #endif > > struct taskstats __percpu *taskstats; > . >
The commit is pushed to "branch-rh7-3.10.0-862.11.6.vz7.71.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-862.11.6.vz7.71.8 ------> commit d68530aa0ee943861d9dc8dc898087c58ad4520f Author: Konstantin Khorenko <khorenko@virtuozzo.com> Date: Wed Sep 5 13:18:31 2018 +0300 ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list Container's cpu cgroup is linked to "ve_root_list" on CT start. But if someone holds CT's cpu cgroup while CT is being stopped, next CT start tries to create same cpu cgroup (fails, already exists) and links this cpu cgroup to the "ve_root_list", thus corrupting it. As a consequence calc_load_ve() goes in an endless loop. Minimal patch is to add a marker to task_group linked/unlinked and skip redundant linking. How to reproduce: # vzctl start 200 # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks # vzctl stop 200 // At this moment VE cgroup got destroyed, but cpu cgroup is still alive // and linked to "ve_root_list" list # vzctl start 200 // double add of same tg (same cpu cgroup) to "ve_root_list" list => // list corruption => endless loop in next calc_load_ve() call https://jira.sw.ru/browse/PSBM-88251 Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com> --- include/linux/sched.h | 5 ++++- kernel/sched/core.c | 26 ++++++++++++++++++-------- kernel/sched/sched.h | 1 + 3 files changed, 23 insertions(+), 9 deletions(-)