[rh7,4/4] /proc/<pid>/vz_latency: Show maximal allocation latency in the last second.

Submitted by Andrey Ryabinin on Aug. 17, 2018, 4:40 p.m.

Details

Message ID 20180817164025.24539-4-aryabinin@virtuozzo.com
State New
Series "Series without cover letter"
Headers show

Commit Message

Andrey Ryabinin Aug. 17, 2018, 4:40 p.m.
Add to '/proc/<pid>/vz_latency' column with maximal latency task have seen
in the last second.

E.g.:

cat /proc/1/vz_latency
Type                    Total_lat                Calls           Max (1sec)
allocatomic:                    0                  294                    0
alloc:                    3000000                43394                    0
allocmp:                        0                 1018                    0

AFAICS this changes output format but shouldn't break our the only user of
this interface - pstorage. Accordind to the pstorage code it reads this
file line by line, reads 'Total_lat' and 'Calls' fields and skips to the next
line. Thus adding new field shouldn't break it.

https://jira.sw.ru/browse/PSBM-87797
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Pavel Borzenkov <Pavel.Borzenkov@acronis.com>
---
 fs/proc/base.c         | 28 ++++++++++++++++++++--------
 include/linux/kstat.h  |  1 +
 include/linux/vzstat.h |  8 ++++++++
 kernel/exit.c          |  8 ++++++++
 mm/page_alloc.c        |  7 +++++++
 5 files changed, 44 insertions(+), 8 deletions(-)

Patch hide | download patch | download mbox

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 64bbbc387afc..2431e74dbbfd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -80,6 +80,7 @@ 
 #include <linux/audit.h>
 #include <linux/poll.h>
 #include <linux/nsproxy.h>
+#include <linux/vzstat.h>
 #include <linux/oom.h>
 #include <linux/elf.h>
 #include <linux/pid_namespace.h>
@@ -580,8 +581,8 @@  static void lastlat_seq_show(struct seq_file *m,
 		const char *name,
 		struct kstat_lat_snap_struct *snap)
 {
-	seq_printf(m, "%-12s %20Lu %20lu\n", name,
-			snap->totlat, snap->count);
+	seq_printf(m, "%-12s %20Lu %20lu %20Lu\n", name,
+			snap->totlat, snap->count, get_max_lat(snap));
 }
 static const char *alloc_descr[] = {
 	"allocatomic:",
@@ -599,8 +600,8 @@  static int proc_tid_vz_lat(struct seq_file *m, struct pid_namespace *ns,
 {
 	int i;
 
-	seq_printf(m, "%-12s %20s %20s\n",
-			"Type", "Total_lat", "Calls");
+	seq_printf(m, "%-12s %20s %20s %20s\n",
+			"Type", "Total_lat", "Calls", "Max (1sec)");
 
 	for (i = 0; i < ARRAY_SIZE(alloc_types); i++)
 		lastlat_seq_show(m, alloc_descr[i],
@@ -615,33 +616,44 @@  static int proc_tgid_vz_lat(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long flags;
 	u64 lat[ARRAY_SIZE(alloc_types)];
 	u64 count[ARRAY_SIZE(alloc_types)];
+	u64 maxlats[ARRAY_SIZE(alloc_types)];
 
 	for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
 		lat[i] = task->alloc_lat[alloc_types[i]].totlat;
 		count[i] = task->alloc_lat[alloc_types[i]].count;
+		maxlats[i] = get_max_lat(&task->alloc_lat[alloc_types[i]]);
 	}
 
 	if (lock_task_sighand(task, &flags)) {
 		struct task_struct *t = task;
+		u64 maxlat;
+
 		while_each_thread(task, t) {
 			for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
 				lat[i] += t->alloc_lat[alloc_types[i]].totlat;
 				count[i] += t->alloc_lat[alloc_types[i]].count;
+				maxlat = get_max_lat(&t->alloc_lat[alloc_types[i]]);
+				if (maxlats[i] < maxlat)
+					maxlats[i] = maxlat;
 			}
 		}
 		for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
 			lat[i] += t->signal->alloc_lat[alloc_types[i]].totlat;
 			count[i] += t->signal->alloc_lat[alloc_types[i]].count;
+			maxlat = get_max_lat(&t->signal->alloc_lat[alloc_types[i]]);
+			if (maxlats[i] < maxlat)
+				maxlats[i] = maxlat;
+
 		}
 		unlock_task_sighand(task, &flags);
 	}
 
-	seq_printf(m, "%-12s %20s %20s\n",
-			"Type", "Total_lat", "Calls");
+	seq_printf(m, "%-12s %20s %20s %20s\n",
+			"Type", "Total_lat", "Calls", "Max (1sec)");
 
 	for (i = 0; i < ARRAY_SIZE(alloc_types); i++)
-		seq_printf(m, "%-12s %20Lu %20Lu\n", alloc_descr[i],
-			lat[i], count[i]);
+		seq_printf(m, "%-12s %20Lu %20Lu %20Lu\n", alloc_descr[i],
+			lat[i], count[i], maxlats[i]);
 
 	return 0;
 }
diff --git a/include/linux/kstat.h b/include/linux/kstat.h
index b268752f2e15..45bdd52ee7d9 100644
--- a/include/linux/kstat.h
+++ b/include/linux/kstat.h
@@ -32,6 +32,7 @@  struct kstat_perf_pcpu_struct {
 struct kstat_lat_snap_struct {
 	u64 maxlat, totlat;
 	unsigned long count;
+	unsigned long time;
 };
 
 struct kstat_lat_pcpu_snap_struct {
diff --git a/include/linux/vzstat.h b/include/linux/vzstat.h
index f1475b2b763a..8ec7b4862736 100644
--- a/include/linux/vzstat.h
+++ b/include/linux/vzstat.h
@@ -9,6 +9,7 @@ 
 #ifndef __VZSTAT_H__
 #define __VZSTAT_H__
 
+#include <linux/jiffies.h>
 #include <linux/mmzone.h>
 #include <linux/kstat.h>
 
@@ -64,6 +65,13 @@  extern void KSTAT_PERF_ADD(struct kstat_perf_pcpu_struct *ptr, u64 real_time,
 	sleep_time = current->se.statistics->sum_sleep_runtime - sleep_time; \
 	KSTAT_PERF_ADD(&kstat_glob.name, start, start - sleep_time);
 
+static inline u64 get_max_lat(struct kstat_lat_snap_struct *snap)
+{
+	if (time_before(jiffies, snap->time + HZ))
+		return snap->maxlat;
+	return 0;
+}
+
 extern void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, u64 dur);
 extern void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index bf68b87a4d60..c2dc3a2c2c50 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -742,8 +742,16 @@  void kstat_add_dying(struct task_struct *tsk)
 
 	spin_lock_irq(&tsk->sighand->siglock);
 	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) {
+		u64 maxlat_tsk, maxlat_sig;
+
 		tsk->signal->alloc_lat[i].totlat += tsk->alloc_lat[i].totlat;
 		tsk->signal->alloc_lat[i].count += tsk->alloc_lat[i].count;
+		maxlat_tsk = get_max_lat(&tsk->alloc_lat[i]);
+		maxlat_sig = get_max_lat(&tsk->signal->alloc_lat[i]);
+		if (maxlat_sig < maxlat_tsk) {
+			tsk->signal->alloc_lat[i].maxlat = tsk->alloc_lat[i].maxlat;
+			tsk->signal->alloc_lat[i].time = tsk->alloc_lat[i].time;
+		}
 	}
 	spin_unlock_irq(&tsk->sighand->siglock);
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9d8c9e4eb970..0e0626e571fb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3241,6 +3241,13 @@  static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order,
 	if (in_task()) {
 		current->alloc_lat[ind].totlat += time;
 		current->alloc_lat[ind].count++;
+		if (time_before(jiffies, current->alloc_lat[ind].time + HZ)) {
+			if (current->alloc_lat[ind].maxlat < time)
+				current->alloc_lat[ind].maxlat = time;
+		} else {
+			current->alloc_lat[ind].time = jiffies;
+			current->alloc_lat[ind].maxlat = time;
+		}
 	}
 
 	if (!page)

Comments

Pavel Borzenkov Aug. 20, 2018, 7:57 a.m.
> On 17 Aug 2018, at 19:40, Andrey Ryabinin <aryabinin@virtuozzo.com> wrote:
> 
> Add to '/proc/<pid>/vz_latency' column with maximal latency task have seen
> in the last second.
> 
> E.g.:
> 
> cat /proc/1/vz_latency
> Type                    Total_lat                Calls           Max (1sec)
> allocatomic:                    0                  294                    0
> alloc:                    3000000                43394                    0
> allocmp:                        0                 1018                    0

1 second is too small.
Prometheus doesn't scrape our services that frequently as we configure it to do this once a minute.
In this case we are gonna miss 59/60 of the data.

Please use 2-minute maximums.

> 
> AFAICS this changes output format but shouldn't break our the only user of
> this interface - pstorage. Accordind to the pstorage code it reads this
> file line by line, reads 'Total_lat' and 'Calls' fields and skips to the next
> line. Thus adding new field shouldn't break it.
> 
> https://jira.sw.ru/browse/PSBM-87797
> Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
> Cc: Pavel Borzenkov <Pavel.Borzenkov@acronis.com>
> ---
> fs/proc/base.c         | 28 ++++++++++++++++++++--------
> include/linux/kstat.h  |  1 +
> include/linux/vzstat.h |  8 ++++++++
> kernel/exit.c          |  8 ++++++++
> mm/page_alloc.c        |  7 +++++++
> 5 files changed, 44 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 64bbbc387afc..2431e74dbbfd 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -80,6 +80,7 @@
> #include <linux/audit.h>
> #include <linux/poll.h>
> #include <linux/nsproxy.h>
> +#include <linux/vzstat.h>
> #include <linux/oom.h>
> #include <linux/elf.h>
> #include <linux/pid_namespace.h>
> @@ -580,8 +581,8 @@ static void lastlat_seq_show(struct seq_file *m,
> 		const char *name,
> 		struct kstat_lat_snap_struct *snap)
> {
> -	seq_printf(m, "%-12s %20Lu %20lu\n", name,
> -			snap->totlat, snap->count);
> +	seq_printf(m, "%-12s %20Lu %20lu %20Lu\n", name,
> +			snap->totlat, snap->count, get_max_lat(snap));
> }
> static const char *alloc_descr[] = {
> 	"allocatomic:",
> @@ -599,8 +600,8 @@ static int proc_tid_vz_lat(struct seq_file *m, struct pid_namespace *ns,
> {
> 	int i;
> 
> -	seq_printf(m, "%-12s %20s %20s\n",
> -			"Type", "Total_lat", "Calls");
> +	seq_printf(m, "%-12s %20s %20s %20s\n",
> +			"Type", "Total_lat", "Calls", "Max (1sec)");
> 
> 	for (i = 0; i < ARRAY_SIZE(alloc_types); i++)
> 		lastlat_seq_show(m, alloc_descr[i],
> @@ -615,33 +616,44 @@ static int proc_tgid_vz_lat(struct seq_file *m, struct pid_namespace *ns,
> 	unsigned long flags;
> 	u64 lat[ARRAY_SIZE(alloc_types)];
> 	u64 count[ARRAY_SIZE(alloc_types)];
> +	u64 maxlats[ARRAY_SIZE(alloc_types)];
> 
> 	for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
> 		lat[i] = task->alloc_lat[alloc_types[i]].totlat;
> 		count[i] = task->alloc_lat[alloc_types[i]].count;
> +		maxlats[i] = get_max_lat(&task->alloc_lat[alloc_types[i]]);
> 	}
> 
> 	if (lock_task_sighand(task, &flags)) {
> 		struct task_struct *t = task;
> +		u64 maxlat;
> +
> 		while_each_thread(task, t) {
> 			for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
> 				lat[i] += t->alloc_lat[alloc_types[i]].totlat;
> 				count[i] += t->alloc_lat[alloc_types[i]].count;
> +				maxlat = get_max_lat(&t->alloc_lat[alloc_types[i]]);
> +				if (maxlats[i] < maxlat)
> +					maxlats[i] = maxlat;
> 			}
> 		}
> 		for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
> 			lat[i] += t->signal->alloc_lat[alloc_types[i]].totlat;
> 			count[i] += t->signal->alloc_lat[alloc_types[i]].count;
> +			maxlat = get_max_lat(&t->signal->alloc_lat[alloc_types[i]]);
> +			if (maxlats[i] < maxlat)
> +				maxlats[i] = maxlat;
> +
> 		}
> 		unlock_task_sighand(task, &flags);
> 	}
> 
> -	seq_printf(m, "%-12s %20s %20s\n",
> -			"Type", "Total_lat", "Calls");
> +	seq_printf(m, "%-12s %20s %20s %20s\n",
> +			"Type", "Total_lat", "Calls", "Max (1sec)");
> 
> 	for (i = 0; i < ARRAY_SIZE(alloc_types); i++)
> -		seq_printf(m, "%-12s %20Lu %20Lu\n", alloc_descr[i],
> -			lat[i], count[i]);
> +		seq_printf(m, "%-12s %20Lu %20Lu %20Lu\n", alloc_descr[i],
> +			lat[i], count[i], maxlats[i]);
> 
> 	return 0;
> }
> diff --git a/include/linux/kstat.h b/include/linux/kstat.h
> index b268752f2e15..45bdd52ee7d9 100644
> --- a/include/linux/kstat.h
> +++ b/include/linux/kstat.h
> @@ -32,6 +32,7 @@ struct kstat_perf_pcpu_struct {
> struct kstat_lat_snap_struct {
> 	u64 maxlat, totlat;
> 	unsigned long count;
> +	unsigned long time;
> };
> 
> struct kstat_lat_pcpu_snap_struct {
> diff --git a/include/linux/vzstat.h b/include/linux/vzstat.h
> index f1475b2b763a..8ec7b4862736 100644
> --- a/include/linux/vzstat.h
> +++ b/include/linux/vzstat.h
> @@ -9,6 +9,7 @@
> #ifndef __VZSTAT_H__
> #define __VZSTAT_H__
> 
> +#include <linux/jiffies.h>
> #include <linux/mmzone.h>
> #include <linux/kstat.h>
> 
> @@ -64,6 +65,13 @@ extern void KSTAT_PERF_ADD(struct kstat_perf_pcpu_struct *ptr, u64 real_time,
> 	sleep_time = current->se.statistics->sum_sleep_runtime - sleep_time; \
> 	KSTAT_PERF_ADD(&kstat_glob.name, start, start - sleep_time);
> 
> +static inline u64 get_max_lat(struct kstat_lat_snap_struct *snap)
> +{
> +	if (time_before(jiffies, snap->time + HZ))
> +		return snap->maxlat;
> +	return 0;
> +}
> +
> extern void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, u64 dur);
> extern void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p);
> 
> diff --git a/kernel/exit.c b/kernel/exit.c
> index bf68b87a4d60..c2dc3a2c2c50 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -742,8 +742,16 @@ void kstat_add_dying(struct task_struct *tsk)
> 
> 	spin_lock_irq(&tsk->sighand->siglock);
> 	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) {
> +		u64 maxlat_tsk, maxlat_sig;
> +
> 		tsk->signal->alloc_lat[i].totlat += tsk->alloc_lat[i].totlat;
> 		tsk->signal->alloc_lat[i].count += tsk->alloc_lat[i].count;
> +		maxlat_tsk = get_max_lat(&tsk->alloc_lat[i]);
> +		maxlat_sig = get_max_lat(&tsk->signal->alloc_lat[i]);
> +		if (maxlat_sig < maxlat_tsk) {
> +			tsk->signal->alloc_lat[i].maxlat = tsk->alloc_lat[i].maxlat;
> +			tsk->signal->alloc_lat[i].time = tsk->alloc_lat[i].time;
> +		}
> 	}
> 	spin_unlock_irq(&tsk->sighand->siglock);
> #endif
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 9d8c9e4eb970..0e0626e571fb 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3241,6 +3241,13 @@ static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order,
> 	if (in_task()) {
> 		current->alloc_lat[ind].totlat += time;
> 		current->alloc_lat[ind].count++;
> +		if (time_before(jiffies, current->alloc_lat[ind].time + HZ)) {
> +			if (current->alloc_lat[ind].maxlat < time)
> +				current->alloc_lat[ind].maxlat = time;
> +		} else {
> +			current->alloc_lat[ind].time = jiffies;
> +			current->alloc_lat[ind].maxlat = time;
> +		}
> 	}
> 
> 	if (!page)
> -- 
> 2.16.4
>
Andrey Ryabinin Aug. 20, 2018, noon
On 08/20/2018 10:57 AM, Pavel Borzenkov wrote:
> 
> 
>> On 17 Aug 2018, at 19:40, Andrey Ryabinin <aryabinin@virtuozzo.com> wrote:
>>
>> Add to '/proc/<pid>/vz_latency' column with maximal latency task have seen
>> in the last second.
>>
>> E.g.:
>>
>> cat /proc/1/vz_latency
>> Type                    Total_lat                Calls           Max (1sec)
>> allocatomic:                    0                  294                    0
>> alloc:                    3000000                43394                    0
>> allocmp:                        0                 1018                    0
> 
> 1 second is too small.
> Prometheus doesn't scrape our services that frequently as we configure it to do this once a minute.
> In this case we are gonna miss 59/60 of the data.
> 
> Please use 2-minute maximums.
> 

Sure, no problem.

Also note that technically this is not maximum in last period of time.
It's maximum since the last reset of max latency with reset happening once
per period of time (2 minutes in this patch).
The true max latency in last X-sec/min is technically impossible to implement
as this would require to remember all allocations times and and latencies
in last X-sec/min. 
We can record max latencies every 10 sec for the past 2min and based on that
calculate the max latency in last 2 minutes (+/-10sec).
But the same logic can be implemented in user-space with less performance cost.
In the kernel we would need to put that complex logic into every allocation,
while in userspace this can be done only for tasks we want to monitor.