[rh8] sched/stat: account ctxsw per task group

Submitted by Konstantin Khorenko on Oct. 29, 2020, 3:46 p.m.

Details

Message ID 20201029154622.1807890-1-khorenko@virtuozzo.com
State New
Series "sched/stat: account ctxsw per task group"
Headers show

Commit Message

Konstantin Khorenko Oct. 29, 2020, 3:46 p.m.
From: Vladimir Davydov <vdavydov@parallels.com>

This is a backport of diff-sched-account-ctxsw-per-task-group:

 Subject: sched: account ctxsw per task group
 Date: Fri, 28 Dec 2012 15:09:45 +0400

* [sched] the number of context switches should be reported correctly
        inside a container (CT) in /proc/stat (PSBM-18113)

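This is needed for the "ctxt" field of /proc/stat to be correct inside
containers: context switches are now counted per cfs_rq and summed over the
task group's CPUs, instead of reporting the global nr_context_switches() value.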

https://jira.sw.ru/browse/PSBM-18113

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>

(cherry picked from vz7 commit d388f0bf64adb74cd62c4deff58e181bd63d62ac)
Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>
---
 kernel/sched/cpuacct.c |  4 +++-
 kernel/sched/fair.c    | 14 ++++++++++++--
 kernel/sched/sched.h   |  3 +++
 3 files changed, 18 insertions(+), 3 deletions(-)

Patch hide | download patch | download mbox

diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 3298228565b1..2814ea059bb3 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -651,6 +651,7 @@  int cpu_cgroup_proc_stat(struct cgroup_subsys_state *cpu_css,
 	struct kernel_cpustat *kcpustat;
 	unsigned long tg_nr_running = 0;
 	unsigned long tg_nr_iowait = 0;
+	unsigned long long tg_nr_switches = 0;
 
 	getboottime64(&boottime);
 
@@ -669,6 +670,7 @@  int cpu_cgroup_proc_stat(struct cgroup_subsys_state *cpu_css,
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		tg_nr_running += tg->cfs_rq[i]->h_nr_running;
 		tg_nr_iowait  += tg->cfs_rq[i]->nr_iowait;
+		tg_nr_switches += tg->cfs_rq[i]->nr_switches;
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 		tg_nr_running += tg->rt_rq[i]->rt_nr_running;
@@ -742,7 +744,7 @@  int cpu_cgroup_proc_stat(struct cgroup_subsys_state *cpu_css,
 		   "processes %lu\n"
 		   "procs_running %lu\n"
 		   "procs_blocked %lu\n",
-		   nr_context_switches(),
+		   tg_nr_switches,
 		   (unsigned long long)boot_sec,
 		   total_forks,
 		   tg_nr_running,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6546d8511417..0b9bb108625a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4153,6 +4153,9 @@  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	clear_buddies(cfs_rq, se);
 
+	if (cfs_rq->prev == se)
+		cfs_rq->prev = NULL;
+
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
@@ -4167,8 +4170,12 @@  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (!(flags & DEQUEUE_SLEEP))
 		se->vruntime -= cfs_rq->min_vruntime;
 
-	/* return excess runtime on last dequeue */
-	return_cfs_rq_runtime(cfs_rq);
+	if (!cfs_rq->nr_running) {
+		/* return excess runtime on last dequeue */
+		return_cfs_rq_runtime(cfs_rq);
+		/* account switch to idle task */
+		cfs_rq->nr_switches++;
+	}
 
 	update_cfs_group(se);
 
@@ -4242,6 +4249,8 @@  set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 	update_stats_curr_start(cfs_rq, se);
 	cfs_rq->curr = se;
+	if (cfs_rq->prev != se)
+		cfs_rq->nr_switches++;
 
 	/*
 	 * Track our maximum slice length, if the CPU's load is at
@@ -4341,6 +4350,7 @@  static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
 		update_load_avg(cfs_rq, prev, 0);
+		cfs_rq->prev = prev;
 	}
 	cfs_rq->curr = NULL;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d8331e5b4c4f..3d55b45f1ea6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -542,6 +542,9 @@  struct cfs_rq {
 	struct sched_entity	*next;
 	struct sched_entity	*last;
 	struct sched_entity	*skip;
+	struct sched_entity	*prev;
+
+	u64 nr_switches;
 
 #ifdef	CONFIG_SCHED_DEBUG
 	unsigned int		nr_spread_over;

Comments

Andrey Ryabinin Oct. 30, 2020, 4:20 p.m.
On 10/29/20 6:46 PM, Konstantin Khorenko wrote:
> From: Vladimir Davydov <vdavydov@parallels.com>
> 
> This is a backport of diff-sched-account-ctxsw-per-task-group:
> 
>  Subject: sched: account ctxsw per task group
>  Date: Fri, 28 Dec 2012 15:09:45 +0400
> 
> * [sched] the number of context switches should be reported correctly
>         inside a CT in /proc/stat (PSBM-18113)
> 
> For /proc/stat:ctxt to be correct inside containers.
> 
> https://jira.sw.ru/browse/PSBM-18113
> 
> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> 
> (cherry picked from vz7 commit d388f0bf64adb74cd62c4deff58e181bd63d62ac)
> Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>
> ---

Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>