[RHEL8,COMMIT] ve/sched/stat: Introduce functions to calculate vcpustat data

Submitted by Konstantin Khorenko on Nov. 3, 2020, 1 p.m.

Details

Message ID 202011031300.0A3D0xWD2050473@finist-co8.sw.ru
State New
Series "ve/proc/sched/stat: Virtualize /proc/stat in a Container"
Headers show

Commit Message

Konstantin Khorenko Nov. 3, 2020, 1 p.m.
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
------>
commit 82f2b4c771019330ed36773c00a77acb70f38204
Author: Konstantin Khorenko <khorenko@virtuozzo.com>
Date:   Wed Oct 28 15:26:59 2020 +0300

    ve/sched/stat: Introduce functions to calculate vcpustat data
    
    Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>
    Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
---
 kernel/sched/core.c    |   2 +-
 kernel/sched/cpuacct.c | 373 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 374 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 88bc46d163b3..e381085eb771 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6569,7 +6569,7 @@  void sched_move_task(struct task_struct *tsk)
 	task_rq_unlock(rq, tsk, &rf);
 }
 
-static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+/*
+ * Map a cgroup_subsys_state back to its containing task_group; a NULL
+ * css yields NULL.  "static" is dropped so kernel/sched/cpuacct.c can
+ * reuse this helper via an extern declaration.
+ */
+inline struct task_group *css_tg(struct cgroup_subsys_state *css)
 {
 	return css ? container_of(css, struct task_group, css) : NULL;
 }
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9fbb10383434..aafaee1f0722 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -5,6 +5,7 @@ 
  * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
  * (balbir@in.ibm.com).
  */
+#include <linux/kernel_stat.h>
 #include "sched.h"
 
 /* Time spent by the tasks of the CPU accounting group executing in ... */
@@ -373,3 +374,375 @@  struct cgroup_subsys cpuacct_cgrp_subsys = {
 	.legacy_cftypes	= files,
 	.early_init	= true,
 };
+
+extern inline struct task_group *css_tg(struct cgroup_subsys_state *css);
+
+/*
+ * ve_root_tg - resolve the VE (Container) root task_group for @tg.
+ *
+ * Returns the task_group of the VE root cgroup containing @tg's cgroup,
+ * or NULL when @tg is NULL or its cgroup is not inside a VE.
+ */
+static struct task_group *ve_root_tg(struct task_group *tg) {
+	struct cgroup *cg;
+
+	if (!tg)
+		return NULL;
+
+	cg = cgroup_get_ve_root1(tg->css.cgroup);
+	return cg ? css_tg(&cg->self) : NULL;
+}
+
+/*
+ * tg_cpu_rate - CPU rate limit of the VE that @tg belongs to.
+ *
+ * Returns the VE root task group's cpu_rate, or 0 (meaning "no limit")
+ * when CONFIG_CFS_CPULIMIT is disabled or @tg is not inside a VE.
+ */
+unsigned int tg_cpu_rate(struct task_group *tg)
+{
+	unsigned int cpu_rate = 0;
+#ifdef CONFIG_CFS_CPULIMIT
+	tg = ve_root_tg(tg);
+	if (tg)
+		cpu_rate = tg->cpu_rate;
+#endif
+	return cpu_rate;
+}
+
+/*
+ * tg_nr_cpus - number of (virtual) CPUs available to the VE containing @tg.
+ *
+ * Falls back to num_online_cpus() when no limit is configured, when the
+ * configured value exceeds the number of online CPUs, or when
+ * CONFIG_CFS_CPULIMIT is disabled.
+ */
+static unsigned int tg_nr_cpus(struct task_group *tg)
+{
+	unsigned int nr_cpus = 0;
+	unsigned int max_nr_cpus = num_online_cpus();
+
+#ifdef CONFIG_CFS_CPULIMIT
+	tg = ve_root_tg(tg);
+	if (tg)
+		nr_cpus = tg->nr_cpus;
+#endif
+	if (!nr_cpus || nr_cpus > max_nr_cpus)
+		nr_cpus = max_nr_cpus;
+
+	return nr_cpus;
+}
+
+/* Per-cpu kernel_cpustat of the cpuacct cgroup identified by @css. */
+struct kernel_cpustat *cpuacct_cpustat(struct cgroup_subsys_state *css, int cpu)
+{
+	return per_cpu_ptr(css_ca(css)->cpustat, cpu);
+}
+
+/*
+ * cpu_cgroup_update_stat - refresh the IDLE/IOWAIT/STEAL counters for cpu
+ * @i of the cpuacct cgroup from the cpu cgroup's sched_entity schedstats.
+ *
+ * No-op unless both CONFIG_SCHEDSTATS and CONFIG_FAIR_GROUP_SCHED are
+ * enabled (the schedstat fields used below do not exist otherwise).
+ */
+static void cpu_cgroup_update_stat(struct cgroup_subsys_state *cpu_css,
+				   struct cgroup_subsys_state *cpuacct_css,
+				   int i)
+{
+#if defined(CONFIG_SCHEDSTATS) && defined(CONFIG_FAIR_GROUP_SCHED)
+	struct task_group *tg = css_tg(cpu_css);
+	struct sched_entity *se = tg->se[i];
+	u64 *cpustat = cpuacct_cpustat(cpuacct_css, i)->cpustat;
+	u64 now = cpu_clock(i);
+	u64 delta, idle, iowait, steal;
+
+	/* root_task_group has no sched entities */
+	if (tg == &root_task_group)
+		return;
+
+	iowait = se->statistics.iowait_sum;
+	idle = se->statistics.sum_sleep_runtime;
+	steal = se->statistics.wait_sum;
+
+	/* sum_sleep_runtime includes iowait time; carve iowait out of idle */
+	if (idle > iowait)
+		idle -= iowait;
+	else
+		idle = 0;
+
+	/* account the still-in-progress sleep/block/wait interval, if any */
+	if (se->statistics.sleep_start) {
+		delta = now - se->statistics.sleep_start;
+		if ((s64)delta > 0)
+			idle += delta;
+	} else if (se->statistics.block_start) {
+		delta = now - se->statistics.block_start;
+		if ((s64)delta > 0)
+			iowait += delta;
+	} else if (se->statistics.wait_start) {
+		delta = now - se->statistics.wait_start;
+		if ((s64)delta > 0)
+			steal += delta;
+	}
+
+	/* max() keeps the published idle/iowait counters monotonic */
+	cpustat[CPUTIME_IDLE]	= max(cpustat[CPUTIME_IDLE], idle);
+	cpustat[CPUTIME_IOWAIT]	= max(cpustat[CPUTIME_IOWAIT], iowait);
+	cpustat[CPUTIME_STEAL]	= steal;
+#endif
+}
+
+/*
+ * Scale cpustat field @ind of @cur so that the vcpu's total usage moves
+ * from @cur_usage towards @target_usage.  The adjustment is proportional
+ * to the field's share of total usage, and whatever is added to (or
+ * removed from) @cur is removed from (or added to) the remainder @rem.
+ */
+static void fixup_vcpustat_delta_usage(struct kernel_cpustat *cur,
+				       struct kernel_cpustat *rem, int ind,
+				       u64 cur_usage, u64 target_usage,
+				       u64 rem_usage)
+{
+	s64 scaled_val;
+	u32 scale_pct = 0;
+
+	/* distribute the delta among USER, NICE, and SYSTEM proportionally */
+	if (cur_usage < target_usage) {
+		if ((s64)rem_usage > 0) /* sanity check to avoid div/0 */
+			scale_pct = div64_u64(100 * rem->cpustat[ind],
+					      rem_usage);
+	} else {
+		if ((s64)cur_usage > 0) /* sanity check to avoid div/0 */
+			scale_pct = div64_u64(100 * cur->cpustat[ind],
+					      cur_usage);
+	}
+
+	scaled_val = div_s64(scale_pct * (target_usage - cur_usage), 100);
+
+	/* clamp both sides at zero: rounding may overshoot slightly */
+	cur->cpustat[ind] += scaled_val;
+	if ((s64)cur->cpustat[ind] < 0)
+		cur->cpustat[ind] = 0;
+
+	rem->cpustat[ind] -= scaled_val;
+	if ((s64)rem->cpustat[ind] < 0)
+		rem->cpustat[ind] = 0;
+}
+
+/*
+ * Rescale idle-class field @ind (CPUTIME_IDLE or CPUTIME_IOWAIT) of @cur
+ * so that the two fields together sum to @target_idle, preserving their
+ * original proportions.  If the vcpu had no idle time at all, the whole
+ * target goes to IDLE and IOWAIT becomes zero.
+ */
+static void calc_vcpustat_delta_idle(struct kernel_cpustat *cur,
+				     int ind, u64 cur_idle, u64 target_idle)
+{
+	/* distribute target_idle between IDLE and IOWAIT proportionally to
+	 * what we initially had on this vcpu */
+	if ((s64)cur_idle > 0) {
+		u32 scale_pct = div64_u64(100 * cur->cpustat[ind], cur_idle);
+		cur->cpustat[ind] = div_u64(scale_pct * target_idle, 100);
+	} else {
+		cur->cpustat[ind] = ind == CPUTIME_IDLE ? target_idle : 0;
+	}
+}
+
+/*
+ * fixup_vcpustat_delta - clamp one vcpu's usage delta to @max_usage.
+ *
+ * Excess (or shortfall) of USER/NICE/SYSTEM time is exchanged with the
+ * remainder @rem; the wall time not spent busy is redistributed between
+ * IDLE and IOWAIT.  STEAL is zeroed: it must not be visible inside a
+ * Container.
+ */
+static void fixup_vcpustat_delta(struct kernel_cpustat *cur,
+				 struct kernel_cpustat *rem,
+				 u64 max_usage)
+{
+	u64 cur_usage, target_usage, rem_usage;
+	u64 cur_idle, target_idle;
+
+	cur_usage = kernel_cpustat_total_usage(cur);
+	rem_usage = kernel_cpustat_total_usage(rem);
+
+	target_usage = min(cur_usage + rem_usage,
+			max_usage);
+
+	if (cur_usage != target_usage) {
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_USER,
+				cur_usage, target_usage, rem_usage);
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_NICE,
+				cur_usage, target_usage, rem_usage);
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_SYSTEM,
+				cur_usage, target_usage, rem_usage);
+	}
+
+	/* whatever is not busy time in this interval counts as idle */
+	cur_idle = kernel_cpustat_total_idle(cur);
+	target_idle = max_usage - target_usage;
+
+	if (cur_idle != target_idle) {
+		calc_vcpustat_delta_idle(cur, CPUTIME_IDLE,
+					 cur_idle, target_idle);
+		calc_vcpustat_delta_idle(cur, CPUTIME_IOWAIT,
+					 cur_idle, target_idle);
+	}
+
+	/* do not show steal time inside ve */
+	cur->cpustat[CPUTIME_STEAL] = 0;
+}
+
+/*
+ * cpu_cgroup_update_vcpustat - recompute tg->vcpustat, mapping per-pcpu
+ * cpuacct statistics onto the VE's virtual CPUs.
+ *
+ * Physical cpu j contributes to vcpu (j % nr_vcpus).  On each update the
+ * per-pcpu deltas since the previous call are folded into the vcpus; each
+ * vcpu's busy time per interval is capped by its share of the VE's cpu
+ * rate, and the excess is spread over the other vcpus in a second pass.
+ * Serialized by tg->vcpustat_lock.
+ */
+static void cpu_cgroup_update_vcpustat(struct cgroup_subsys_state *cpu_css,
+				       struct cgroup_subsys_state *cpuacct_css)
+{
+	int i, j;
+	int nr_vcpus;
+	int vcpu_rate;
+	ktime_t now;
+	u64 max_usage;
+	struct kernel_cpustat stat_delta, stat_rem;
+	struct task_group *tg = css_tg(cpu_css);
+	int first_pass = 1;
+
+	spin_lock(&tg->vcpustat_lock);
+
+	now = ktime_get();
+	nr_vcpus = tg_nr_cpus(tg);
+	/* per-vcpu share of the VE rate; 0 or out-of-range means unlimited */
+	vcpu_rate = DIV_ROUND_UP(tg_cpu_rate(tg), nr_vcpus);
+	if (!vcpu_rate || vcpu_rate > MAX_CPU_RATE)
+		vcpu_rate = MAX_CPU_RATE;
+
+	if (!ktime_to_ns(tg->vcpustat_last_update)) {
+		/* on the first read initialize vcpu i stat as a sum of stats
+		 * over pcpus j such that j % nr_vcpus == i */
+		for (i = 0; i < nr_vcpus; i++) {
+			for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+				if (!cpu_possible(j))
+					continue;
+				kernel_cpustat_add(tg->vcpustat + i,
+						cpuacct_cpustat(cpuacct_css, j),
+						tg->vcpustat + i);
+			}
+		}
+		goto out_update_last;
+	}
+
+	/* upper bound on busy time one vcpu may report for this interval */
+	max_usage = ktime_to_ns(ktime_sub(now, tg->vcpustat_last_update));
+	max_usage = div_u64(max_usage * vcpu_rate, MAX_CPU_RATE);
+	/* don't allow to update stats too often to avoid calculation errors */
+	if (max_usage < 10)
+		goto out_unlock;
+
+	/* temporarily copy per cpu usage delta to tg->cpustat_last */
+	for_each_possible_cpu(i)
+		kernel_cpustat_sub(cpuacct_cpustat(cpuacct_css, i),
+				   tg->cpustat_last + i,
+				   tg->cpustat_last + i);
+
+	/* proceed to calculating per vcpu delta */
+	kernel_cpustat_zero(&stat_rem);
+
+again:
+	for (i = 0; i < nr_vcpus; i++) {
+		int exceeds_max;
+
+		kernel_cpustat_zero(&stat_delta);
+		for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+			if (!cpu_possible(j))
+				continue;
+			kernel_cpustat_add(&stat_delta,
+					   tg->cpustat_last + j, &stat_delta);
+		}
+
+		exceeds_max = kernel_cpustat_total_usage(&stat_delta) >=
+			      max_usage;
+		/*
+		 * On the first pass calculate delta for vcpus with usage >
+		 * max_usage in order to accumulate excess in stat_rem.
+		 *
+		 * Once the remainder is accumulated, proceed to the rest of
+		 * vcpus so that it will be distributed among them.
+		 */
+		if (exceeds_max != first_pass)
+			continue;
+
+		fixup_vcpustat_delta(&stat_delta, &stat_rem, max_usage);
+		kernel_cpustat_add(tg->vcpustat + i, &stat_delta,
+				   tg->vcpustat + i);
+	}
+
+	if (first_pass) {
+		first_pass = 0;
+		goto again;
+	}
+out_update_last:
+	/* snapshot raw per-pcpu counters for the next delta computation */
+	for_each_possible_cpu(i)
+		tg->cpustat_last[i] = *cpuacct_cpustat(cpuacct_css, i);
+	tg->vcpustat_last_update = now;
+out_unlock:
+	spin_unlock(&tg->vcpustat_lock);
+}
+
+/*
+ * cpu_cgroup_proc_stat - render /proc/stat contents for a cpu cgroup.
+ *
+ * When called for a Container (@virt true: caller is not VE0 and @tg is
+ * not the root task group), per-vcpu statistics from tg->vcpustat are
+ * shown and steal time is hidden; otherwise the raw per-pcpu cpuacct
+ * data is printed.  Also emits the intr/ctxt/btime/processes/procs_*
+ * lines expected in /proc/stat.  Returns 0.
+ */
+int cpu_cgroup_proc_stat(struct cgroup_subsys_state *cpu_css,
+			 struct cgroup_subsys_state *cpuacct_css,
+			 struct seq_file *p)
+{
+	int i;
+	s64 boot_sec;
+	u64 user, nice, system, idle, iowait, steal;
+	struct timespec64 boottime;
+	struct task_group *tg = css_tg(cpu_css);
+	bool virt = !ve_is_super(get_exec_env()) && tg != &root_task_group;
+	int nr_vcpus = tg_nr_cpus(tg);
+	struct kernel_cpustat *kcpustat;
+	unsigned long tg_nr_running = 0;
+	unsigned long tg_nr_iowait = 0;
+
+	getboottime64(&boottime);
+
+	/*
+	 * In VE0 we always show host's boottime and in VEX we show real CT
+	 * start time, even across CT migrations, as we rely on userspace to
+	 * set real_start_timespec for us on resume.
+	 */
+	boot_sec = boottime.tv_sec +
+		   get_exec_env()->real_start_time / NSEC_PER_SEC;
+
+	for_each_possible_cpu(i) {
+		cpu_cgroup_update_stat(cpu_css, cpuacct_css, i);
+
+		/* root task group has autogrouping, so this doesn't hold */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		tg_nr_running += tg->cfs_rq[i]->h_nr_running;
+		tg_nr_iowait  += tg->cfs_rq[i]->nr_iowait;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+		tg_nr_running += tg->rt_rq[i]->rt_nr_running;
+#endif
+	}
+
+	if (virt)
+		cpu_cgroup_update_vcpustat(cpu_css, cpuacct_css);
+
+	/* aggregate "cpu" line: sum over vcpus (CT) or possible pcpus (host) */
+	user = nice = system = idle = iowait = steal = 0;
+
+	for (i = 0; i < (virt ? nr_vcpus : nr_cpu_ids); i++) {
+		if (!virt && !cpu_possible(i))
+			continue;
+
+		kcpustat = virt ? tg->vcpustat + i :
+				  cpuacct_cpustat(cpuacct_css, i);
+
+		user	+= kcpustat->cpustat[CPUTIME_USER];
+		nice	+= kcpustat->cpustat[CPUTIME_NICE];
+		system	+= kcpustat->cpustat[CPUTIME_SYSTEM];
+		idle	+= kcpustat->cpustat[CPUTIME_IDLE];
+		iowait	+= kcpustat->cpustat[CPUTIME_IOWAIT];
+		steal	+= kcpustat->cpustat[CPUTIME_STEAL];
+	}
+	/* Don't scare CT users with high steal time */
+	if (!ve_is_super(get_exec_env()))
+		steal = 0;
+
+	seq_printf(p, "cpu  %llu %llu %llu %llu %llu 0 0 %llu\n",
+		   (unsigned long long)nsec_to_clock_t(user),
+		   (unsigned long long)nsec_to_clock_t(nice),
+		   (unsigned long long)nsec_to_clock_t(system),
+		   (unsigned long long)nsec_to_clock_t(idle),
+		   (unsigned long long)nsec_to_clock_t(iowait),
+		   virt ? 0ULL :
+		   (unsigned long long)nsec_to_clock_t(steal));
+
+	/* per-cpu lines: vcpus inside a CT, online pcpus on the host */
+	for (i = 0; i < (virt ? nr_vcpus : nr_cpu_ids); i++) {
+		if (!virt && !cpu_online(i))
+			continue;
+		kcpustat = virt ? tg->vcpustat + i :
+				  cpuacct_cpustat(cpuacct_css, i);
+
+		user	= kcpustat->cpustat[CPUTIME_USER];
+		nice	= kcpustat->cpustat[CPUTIME_NICE];
+		system	= kcpustat->cpustat[CPUTIME_SYSTEM];
+		idle	= kcpustat->cpustat[CPUTIME_IDLE];
+		iowait	= kcpustat->cpustat[CPUTIME_IOWAIT];
+		steal	= kcpustat->cpustat[CPUTIME_STEAL];
+		/* Don't scare CT users with high steal time */
+		if (!ve_is_super(get_exec_env()))
+			steal = 0;
+
+		seq_printf(p,
+			   "cpu%d %llu %llu %llu %llu %llu 0 0 %llu\n",
+			   i,
+			   (unsigned long long)nsec_to_clock_t(user),
+			   (unsigned long long)nsec_to_clock_t(nice),
+			   (unsigned long long)nsec_to_clock_t(system),
+			   (unsigned long long)nsec_to_clock_t(idle),
+			   (unsigned long long)nsec_to_clock_t(iowait),
+			   virt ? 0ULL :
+			   (unsigned long long)nsec_to_clock_t(steal));
+	}
+	/* interrupt counts are not virtualized per-CT */
+	seq_printf(p, "intr 0");
+
+	/*
+	 * NOTE(review): nr_context_switches() and total_forks look like
+	 * host-wide counters, not per-CT values — confirm this is intended.
+	 */
+	seq_printf(p,
+		   "\nctxt %llu\n"
+		   "btime %llu\n"
+		   "processes %lu\n"
+		   "procs_running %lu\n"
+		   "procs_blocked %lu\n",
+		   nr_context_switches(),
+		   (unsigned long long)boot_sec,
+		   total_forks,
+		   tg_nr_running,
+		   tg_nr_iowait);
+
+	return 0;
+}