[RHEL7,COMMIT] ve/sched: take nr_cpus and cpu_rate from ve root task group

Submitted by Konstantin Khorenko on Nov. 7, 2017, 9:06 a.m.

Details

Message ID 201711070906.vA796Lr1023286@finist_ce7.work
State New
Series "ve: properly handle nr_cpus and cpu_rate for nested cgroups"
Headers show

Commit Message

Konstantin Khorenko Nov. 7, 2017, 9:06 a.m.
The commit is pushed to "branch-rh7-3.10.0-693.1.1.vz7.37.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.1.1.vz7.37.21
------>
commit e661261a0f8af475ae0dd7980bd73555ff7724a1
Author: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Date:   Tue Nov 7 12:06:21 2017 +0300

    ve/sched: take nr_cpus and cpu_rate from ve root task group
    
    Patchset description:
    
    ve: properly handle nr_cpus and cpu_rate for nested cgroups
    
    https://jira.sw.ru/browse/PSBM-69678
    
    Pavel Tikhomirov (3):
      cgroup: remove rcu_read_lock from cgroup_get_ve_root
      cgroup: make cgroup_get_ve_root visible in kernel/sched/core.c
      sched: take nr_cpus and cpu_rate from ve root task group
    
    =============================================================
    This patch description:
    
    The CPU view in a container should depend only on the root cpu
    cgroup's nr_cpus/rate configuration. So replace tg->xxx references
    with tg_xxx(tg) helpers that fetch xxx from the root ve cgroup. We
    still allow setting/reading rate and nr_cpus directly in nested
    cgroups, but they are just converted to the corresponding cfs_period
    and cfs_quota setup, and do _not_ influence the in-container view
    of cpus and their stats.
    
    Also remove the excessive rcu_read_lock/unlock, as there is no rcu
    dereference in between; it looks like a leftover from task_group(),
    which differs between VZ6 and VZ7.
    
    https://jira.sw.ru/browse/PSBM-69678
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
    Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
---
 include/linux/sched.h |  2 ++
 kernel/sched/core.c   | 56 +++++++++++++++++++++++++++++++++++++++++----------
 kernel/sched/fair.c   |  9 +++++----
 3 files changed, 52 insertions(+), 15 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 84fe6cd..03c06ff6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3182,6 +3182,8 @@  static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+extern unsigned int tg_cpu_rate(struct task_group *tg);
+extern unsigned int tg_nr_cpus(struct task_group *tg);
 #ifdef CONFIG_CFS_CPULIMIT
 extern unsigned int task_nr_cpus(struct task_struct *p);
 extern unsigned int task_vcpu_id(struct task_struct *p);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7a40fa8..5b3daa1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -340,15 +340,40 @@  __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
+static inline struct task_group *cgroup_tg(struct cgroup *cgrp);
+
+static struct task_group *ve_root_tg(struct task_group *tg) {
+	struct cgroup *cg;
+
+	if (!tg)
+		return NULL;
+
+	cg = cgroup_get_ve_root(tg->css.cgroup);
+	WARN_ONCE(!cg, "Failed to find ve root cgroup, possible container configuration problem.\n");
+	return cg ? cgroup_tg(cg) : NULL;
+}
+
+unsigned int tg_cpu_rate(struct task_group *tg)
+{
+	unsigned int cpu_rate = 0;
 #ifdef CONFIG_CFS_CPULIMIT
-unsigned int task_nr_cpus(struct task_struct *p)
+	tg = ve_root_tg(tg);
+	if (tg)
+		cpu_rate = tg->cpu_rate;
+#endif
+	return cpu_rate;
+}
+
+unsigned int tg_nr_cpus(struct task_group *tg)
 {
 	unsigned int nr_cpus = 0;
 	unsigned int max_nr_cpus = num_online_cpus();
 
-	rcu_read_lock();
-	nr_cpus = task_group(p)->nr_cpus;
-	rcu_read_unlock();
+#ifdef CONFIG_CFS_CPULIMIT
+	tg = ve_root_tg(tg);
+	if (tg)
+		nr_cpus = tg->nr_cpus;
+#endif
 
 	if (!nr_cpus || nr_cpus > max_nr_cpus)
 		nr_cpus = max_nr_cpus;
@@ -356,6 +381,17 @@  unsigned int task_nr_cpus(struct task_struct *p)
 	return nr_cpus;
 }
 
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int task_nr_cpus(struct task_struct *p)
+{
+	return tg_nr_cpus(task_group(p));
+}
+
+static unsigned int task_cpu_rate(struct task_struct *p)
+{
+	return tg_cpu_rate(task_group(p));
+}
+
 unsigned int task_vcpu_id(struct task_struct *p)
 {
 	return task_cpu(p) % task_nr_cpus(p);
@@ -370,9 +406,7 @@  unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
 	if (!sysctl_sched_cpulimit_scale_cpufreq)
 		return freq;
 
-	rcu_read_lock();
-	rate = task_group(current)->cpu_rate;
-	rcu_read_unlock();
+	rate = task_cpu_rate(current);
 
 	max_rate = num_online_vcpus() * MAX_CPU_RATE;
 	if (!rate || rate >= max_rate)
@@ -9919,8 +9953,8 @@  static void cpu_cgroup_update_vcpustat(struct cgroup *cgrp)
 	spin_lock(&tg->vcpustat_lock);
 
 	now = ktime_get();
-	nr_vcpus = tg->nr_cpus ?: num_online_cpus();
-	vcpu_rate = DIV_ROUND_UP(tg->cpu_rate, nr_vcpus);
+	nr_vcpus = tg_nr_cpus(tg);
+	vcpu_rate = DIV_ROUND_UP(tg_cpu_rate(tg), nr_vcpus);
 	if (!vcpu_rate || vcpu_rate > MAX_CPU_RATE)
 		vcpu_rate = MAX_CPU_RATE;
 
@@ -10005,7 +10039,7 @@  int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
 	struct timespec boottime;
 	struct task_group *tg = cgroup_tg(cgrp);
 	bool virt = !ve_is_super(get_exec_env()) && tg != &root_task_group;
-	int nr_vcpus = tg->nr_cpus ?: num_online_cpus();
+	int nr_vcpus = tg_nr_cpus(tg);
 	struct kernel_cpustat *kcpustat;
 	unsigned long tg_nr_running = 0;
 	unsigned long tg_nr_iowait = 0;
@@ -10132,7 +10166,7 @@  int cpu_cgroup_proc_loadavg(struct cgroup *cgrp, struct cftype *cft,
 int cpu_cgroup_get_stat(struct cgroup *cgrp, struct kernel_cpustat *kstat)
 {
 	struct task_group *tg = cgroup_tg(cgrp);
-	int nr_vcpus = tg->nr_cpus ?: num_online_cpus();
+	int nr_vcpus = tg_nr_cpus(tg);
 	int i;
 
 	kernel_cpustat_zero(kstat);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c677e93..5697778 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -524,11 +524,12 @@  static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
 static inline int check_cpulimit_spread(struct task_group *tg, int target_cpu)
 {
 	int nr_cpus_active = atomic_read(&tg->nr_cpus_active);
-	int nr_cpus_limit = DIV_ROUND_UP(tg->cpu_rate, MAX_CPU_RATE);
+	int nr_cpus_limit = DIV_ROUND_UP(tg_cpu_rate(tg), MAX_CPU_RATE);
+	int nr_vcpus = tg_nr_cpus(tg);
 
-	nr_cpus_limit = nr_cpus_limit && tg->nr_cpus ?
-		min_t(int, nr_cpus_limit, tg->nr_cpus) :
-		max_t(int, nr_cpus_limit, tg->nr_cpus);
+	nr_cpus_limit = nr_cpus_limit && nr_vcpus ?
+		min_t(int, nr_cpus_limit, nr_vcpus) :
+		max_t(int, nr_cpus_limit, nr_vcpus);
 
 	if (!nr_cpus_limit || nr_cpus_active < nr_cpus_limit)
 		return 1;