[rh7,v2,4/4] /proc/<pid>/vz_latency: Show maximal allocation latency in the last second.

Submitted by Andrey Ryabinin on Aug. 20, 2018, 12:03 p.m.

Details

Message ID 20180820120308.13857-4-aryabinin@virtuozzo.com
State New
Series "Series without cover letter"
Headers show

Commit Message

Andrey Ryabinin Aug. 20, 2018, 12:03 p.m.
Add to '/proc/<pid>/vz_latency' column with maximal latency task have seen
in the last 2 minutes.

E.g.:

cat /proc/1/vz_latency
Type                    Total_lat                Calls           Max (2min)
allocatomic:                    0                  294                    0
alloc:                    3000000                43394                    0
allocmp:                        0                 1018                    0

AFAICS this changes output format but shouldn't break our the only user of
this interface - pstorage. Accordind to the pstorage code it reads this
file line by line, reads 'Total_lat' and 'Calls' fields and skips to the next
line. Thus adding new field shouldn't break it.

Note: technically this is not maximum in last period of time.
It's maximum since the last reset of max latency with reset happening once
per period of time (2 minutes in this patch).
The true max latency in last X-sec/min is technically impossible to implement
as this would require to remember all allocations times and and latencies
in last X-sec/min. However, we can record max latencies every 10 sec for the
past 2min and based on that calculate the max latency in last 2 minutes (+/-10sec).
But the same logic can be implemented in user-space with less performance cost,
since in the kernel we would need to put that complex logic into every allocation,
while in userspace this can be done only for task we want to monitor.

https://jira.sw.ru/browse/PSBM-87797
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Pavel Borzenkov <Pavel.Borzenkov@acronis.com>
---

Changes since v1:
 - change period from 1sec to 2min

 fs/proc/base.c         | 28 ++++++++++++++++++++--------
 include/linux/kstat.h  |  1 +
 include/linux/vzstat.h | 10 ++++++++++
 kernel/exit.c          |  8 ++++++++
 mm/page_alloc.c        |  8 ++++++++
 5 files changed, 47 insertions(+), 8 deletions(-)

Patch hide | download patch | download mbox

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 64bbbc387afc..719e93b42ff5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -80,6 +80,7 @@ 
 #include <linux/audit.h>
 #include <linux/poll.h>
 #include <linux/nsproxy.h>
+#include <linux/vzstat.h>
 #include <linux/oom.h>
 #include <linux/elf.h>
 #include <linux/pid_namespace.h>
@@ -580,8 +581,8 @@  static void lastlat_seq_show(struct seq_file *m,
 		const char *name,
 		struct kstat_lat_snap_struct *snap)
 {
-	seq_printf(m, "%-12s %20Lu %20lu\n", name,
-			snap->totlat, snap->count);
+	seq_printf(m, "%-12s %20Lu %20lu %20Lu\n", name,
+			snap->totlat, snap->count, get_max_lat(snap));
 }
 static const char *alloc_descr[] = {
 	"allocatomic:",
@@ -599,8 +600,8 @@  static int proc_tid_vz_lat(struct seq_file *m, struct pid_namespace *ns,
 {
 	int i;
 
-	seq_printf(m, "%-12s %20s %20s\n",
-			"Type", "Total_lat", "Calls");
+	seq_printf(m, "%-12s %20s %20s %20s\n",
+			"Type", "Total_lat", "Calls", "Max (2min)");
 
 	for (i = 0; i < ARRAY_SIZE(alloc_types); i++)
 		lastlat_seq_show(m, alloc_descr[i],
@@ -615,33 +616,44 @@  static int proc_tgid_vz_lat(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long flags;
 	u64 lat[ARRAY_SIZE(alloc_types)];
 	u64 count[ARRAY_SIZE(alloc_types)];
+	u64 maxlats[ARRAY_SIZE(alloc_types)];
 
 	for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
 		lat[i] = task->alloc_lat[alloc_types[i]].totlat;
 		count[i] = task->alloc_lat[alloc_types[i]].count;
+		maxlats[i] = get_max_lat(&task->alloc_lat[alloc_types[i]]);
 	}
 
 	if (lock_task_sighand(task, &flags)) {
 		struct task_struct *t = task;
+		u64 maxlat;
+
 		while_each_thread(task, t) {
 			for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
 				lat[i] += t->alloc_lat[alloc_types[i]].totlat;
 				count[i] += t->alloc_lat[alloc_types[i]].count;
+				maxlat = get_max_lat(&t->alloc_lat[alloc_types[i]]);
+				if (maxlats[i] < maxlat)
+					maxlats[i] = maxlat;
 			}
 		}
 		for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
 			lat[i] += t->signal->alloc_lat[alloc_types[i]].totlat;
 			count[i] += t->signal->alloc_lat[alloc_types[i]].count;
+			maxlat = get_max_lat(&t->signal->alloc_lat[alloc_types[i]]);
+			if (maxlats[i] < maxlat)
+				maxlats[i] = maxlat;
+
 		}
 		unlock_task_sighand(task, &flags);
 	}
 
-	seq_printf(m, "%-12s %20s %20s\n",
-			"Type", "Total_lat", "Calls");
+	seq_printf(m, "%-12s %20s %20s %20s\n",
+			"Type", "Total_lat", "Calls", "Max (1sec)");
 
 	for (i = 0; i < ARRAY_SIZE(alloc_types); i++)
-		seq_printf(m, "%-12s %20Lu %20Lu\n", alloc_descr[i],
-			lat[i], count[i]);
+		seq_printf(m, "%-12s %20Lu %20Lu %20Lu\n", alloc_descr[i],
+			lat[i], count[i], maxlats[i]);
 
 	return 0;
 }
diff --git a/include/linux/kstat.h b/include/linux/kstat.h
index b268752f2e15..45bdd52ee7d9 100644
--- a/include/linux/kstat.h
+++ b/include/linux/kstat.h
@@ -32,6 +32,7 @@  struct kstat_perf_pcpu_struct {
 struct kstat_lat_snap_struct {
 	u64 maxlat, totlat;
 	unsigned long count;
+	unsigned long time;
 };
 
 struct kstat_lat_pcpu_snap_struct {
diff --git a/include/linux/vzstat.h b/include/linux/vzstat.h
index f1475b2b763a..9df7d4ff9a17 100644
--- a/include/linux/vzstat.h
+++ b/include/linux/vzstat.h
@@ -9,6 +9,7 @@ 
 #ifndef __VZSTAT_H__
 #define __VZSTAT_H__
 
+#include <linux/jiffies.h>
 #include <linux/mmzone.h>
 #include <linux/kstat.h>
 
@@ -64,6 +65,15 @@  extern void KSTAT_PERF_ADD(struct kstat_perf_pcpu_struct *ptr, u64 real_time,
 	sleep_time = current->se.statistics->sum_sleep_runtime - sleep_time; \
 	KSTAT_PERF_ADD(&kstat_glob.name, start, start - sleep_time);
 
+#define KSTAT_ALLOC_MAX_LAT_PERIOD (120*HZ)
+
+static inline u64 get_max_lat(struct kstat_lat_snap_struct *snap)
+{
+	if (time_before(jiffies, snap->time + KSTAT_ALLOC_MAX_LAT_PERIOD))
+		return snap->maxlat;
+	return 0;
+}
+
 extern void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, u64 dur);
 extern void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index bf68b87a4d60..c2dc3a2c2c50 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -742,8 +742,16 @@  void kstat_add_dying(struct task_struct *tsk)
 
 	spin_lock_irq(&tsk->sighand->siglock);
 	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) {
+		u64 maxlat_tsk, maxlat_sig;
+
 		tsk->signal->alloc_lat[i].totlat += tsk->alloc_lat[i].totlat;
 		tsk->signal->alloc_lat[i].count += tsk->alloc_lat[i].count;
+		maxlat_tsk = get_max_lat(&tsk->alloc_lat[i]);
+		maxlat_sig = get_max_lat(&tsk->signal->alloc_lat[i]);
+		if (maxlat_sig < maxlat_tsk) {
+			tsk->signal->alloc_lat[i].maxlat = tsk->alloc_lat[i].maxlat;
+			tsk->signal->alloc_lat[i].time = tsk->alloc_lat[i].time;
+		}
 	}
 	spin_unlock_irq(&tsk->sighand->siglock);
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9d8c9e4eb970..e8b0f16de295 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3241,6 +3241,14 @@  static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order,
 	if (in_task()) {
 		current->alloc_lat[ind].totlat += time;
 		current->alloc_lat[ind].count++;
+		if (time_before(jiffies, current->alloc_lat[ind].time +
+					KSTAT_ALLOC_MAX_LAT_PERIOD)) {
+			if (current->alloc_lat[ind].maxlat < time)
+				current->alloc_lat[ind].maxlat = time;
+		} else {
+			current->alloc_lat[ind].time = jiffies;
+			current->alloc_lat[ind].maxlat = time;
+		}
 	}
 
 	if (!page)

Comments

Pavel Borzenkov Aug. 20, 2018, 1:48 p.m.
> On 20 Aug 2018, at 15:03, Andrey Ryabinin <aryabinin@virtuozzo.com> wrote:
> 
> Add to '/proc/<pid>/vz_latency' column with maximal latency task have seen
> in the last 2 minutes.
> 
> E.g.:
> 
> cat /proc/1/vz_latency
> Type                    Total_lat                Calls           Max (2min)
> allocatomic:                    0                  294                    0
> alloc:                    3000000                43394                    0
> allocmp:                        0                 1018                    0
> 
> AFAICS this changes output format but shouldn't break our the only user of
> this interface - pstorage. Accordind to the pstorage code it reads this
> file line by line, reads 'Total_lat' and 'Calls' fields and skips to the next
> line. Thus adding new field shouldn't break it.
> 
> Note: technically this is not maximum in last period of time.
> It's maximum since the last reset of max latency with reset happening once
> per period of time (2 minutes in this patch).
> The true max latency in last X-sec/min is technically impossible to implement
> as this would require to remember all allocations times and and latencies
> in last X-sec/min. However, we can record max latencies every 10 sec for the
> past 2min and based on that calculate the max latency in last 2 minutes (+/-10sec).
> But the same logic can be implemented in user-space with less performance cost,
> since in the kernel we would need to put that complex logic into every allocation,
> while in userspace this can be done only for task we want to monitor.
> 
> https://jira.sw.ru/browse/PSBM-87797
> Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
> Cc: Pavel Borzenkov <Pavel.Borzenkov@acronis.com>
> ---
> 
> Changes since v1:
> - change period from 1sec to 2min
> 
> fs/proc/base.c         | 28 ++++++++++++++++++++--------
> include/linux/kstat.h  |  1 +
> include/linux/vzstat.h | 10 ++++++++++
> kernel/exit.c          |  8 ++++++++
> mm/page_alloc.c        |  8 ++++++++
> 5 files changed, 47 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 64bbbc387afc..719e93b42ff5 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -80,6 +80,7 @@
> #include <linux/audit.h>
> #include <linux/poll.h>
> #include <linux/nsproxy.h>
> +#include <linux/vzstat.h>
> #include <linux/oom.h>
> #include <linux/elf.h>
> #include <linux/pid_namespace.h>
> @@ -580,8 +581,8 @@ static void lastlat_seq_show(struct seq_file *m,
> 		const char *name,
> 		struct kstat_lat_snap_struct *snap)
> {
> -	seq_printf(m, "%-12s %20Lu %20lu\n", name,
> -			snap->totlat, snap->count);
> +	seq_printf(m, "%-12s %20Lu %20lu %20Lu\n", name,
> +			snap->totlat, snap->count, get_max_lat(snap));
> }
> static const char *alloc_descr[] = {
> 	"allocatomic:",
> @@ -599,8 +600,8 @@ static int proc_tid_vz_lat(struct seq_file *m, struct pid_namespace *ns,
> {
> 	int i;
> 
> -	seq_printf(m, "%-12s %20s %20s\n",
> -			"Type", "Total_lat", "Calls");
> +	seq_printf(m, "%-12s %20s %20s %20s\n",
> +			"Type", "Total_lat", "Calls", "Max (2min)");
> 
> 	for (i = 0; i < ARRAY_SIZE(alloc_types); i++)
> 		lastlat_seq_show(m, alloc_descr[i],
> @@ -615,33 +616,44 @@ static int proc_tgid_vz_lat(struct seq_file *m, struct pid_namespace *ns,
> 	unsigned long flags;
> 	u64 lat[ARRAY_SIZE(alloc_types)];
> 	u64 count[ARRAY_SIZE(alloc_types)];
> +	u64 maxlats[ARRAY_SIZE(alloc_types)];
> 
> 	for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
> 		lat[i] = task->alloc_lat[alloc_types[i]].totlat;
> 		count[i] = task->alloc_lat[alloc_types[i]].count;
> +		maxlats[i] = get_max_lat(&task->alloc_lat[alloc_types[i]]);
> 	}
> 
> 	if (lock_task_sighand(task, &flags)) {
> 		struct task_struct *t = task;
> +		u64 maxlat;
> +
> 		while_each_thread(task, t) {
> 			for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
> 				lat[i] += t->alloc_lat[alloc_types[i]].totlat;
> 				count[i] += t->alloc_lat[alloc_types[i]].count;
> +				maxlat = get_max_lat(&t->alloc_lat[alloc_types[i]]);
> +				if (maxlats[i] < maxlat)
> +					maxlats[i] = maxlat;
> 			}
> 		}
> 		for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
> 			lat[i] += t->signal->alloc_lat[alloc_types[i]].totlat;
> 			count[i] += t->signal->alloc_lat[alloc_types[i]].count;
> +			maxlat = get_max_lat(&t->signal->alloc_lat[alloc_types[i]]);
> +			if (maxlats[i] < maxlat)
> +				maxlats[i] = maxlat;
> +
> 		}
> 		unlock_task_sighand(task, &flags);
> 	}
> 
> -	seq_printf(m, "%-12s %20s %20s\n",
> -			"Type", "Total_lat", "Calls");
> +	seq_printf(m, "%-12s %20s %20s %20s\n",
> +			"Type", "Total_lat", "Calls", "Max (1sec)");

Wrong header? Should be 2min

> 
> 	for (i = 0; i < ARRAY_SIZE(alloc_types); i++)
> -		seq_printf(m, "%-12s %20Lu %20Lu\n", alloc_descr[i],
> -			lat[i], count[i]);
> +		seq_printf(m, "%-12s %20Lu %20Lu %20Lu\n", alloc_descr[i],
> +			lat[i], count[i], maxlats[i]);
> 
> 	return 0;
> }
> diff --git a/include/linux/kstat.h b/include/linux/kstat.h
> index b268752f2e15..45bdd52ee7d9 100644
> --- a/include/linux/kstat.h
> +++ b/include/linux/kstat.h
> @@ -32,6 +32,7 @@ struct kstat_perf_pcpu_struct {
> struct kstat_lat_snap_struct {
> 	u64 maxlat, totlat;
> 	unsigned long count;
> +	unsigned long time;
> };
> 
> struct kstat_lat_pcpu_snap_struct {
> diff --git a/include/linux/vzstat.h b/include/linux/vzstat.h
> index f1475b2b763a..9df7d4ff9a17 100644
> --- a/include/linux/vzstat.h
> +++ b/include/linux/vzstat.h
> @@ -9,6 +9,7 @@
> #ifndef __VZSTAT_H__
> #define __VZSTAT_H__
> 
> +#include <linux/jiffies.h>
> #include <linux/mmzone.h>
> #include <linux/kstat.h>
> 
> @@ -64,6 +65,15 @@ extern void KSTAT_PERF_ADD(struct kstat_perf_pcpu_struct *ptr, u64 real_time,
> 	sleep_time = current->se.statistics->sum_sleep_runtime - sleep_time; \
> 	KSTAT_PERF_ADD(&kstat_glob.name, start, start - sleep_time);
> 
> +#define KSTAT_ALLOC_MAX_LAT_PERIOD (120*HZ)
> +
> +static inline u64 get_max_lat(struct kstat_lat_snap_struct *snap)
> +{
> +	if (time_before(jiffies, snap->time + KSTAT_ALLOC_MAX_LAT_PERIOD))
> +		return snap->maxlat;
> +	return 0;
> +}
> +
> extern void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, u64 dur);
> extern void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p);
> 
> diff --git a/kernel/exit.c b/kernel/exit.c
> index bf68b87a4d60..c2dc3a2c2c50 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -742,8 +742,16 @@ void kstat_add_dying(struct task_struct *tsk)
> 
> 	spin_lock_irq(&tsk->sighand->siglock);
> 	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) {
> +		u64 maxlat_tsk, maxlat_sig;
> +
> 		tsk->signal->alloc_lat[i].totlat += tsk->alloc_lat[i].totlat;
> 		tsk->signal->alloc_lat[i].count += tsk->alloc_lat[i].count;
> +		maxlat_tsk = get_max_lat(&tsk->alloc_lat[i]);
> +		maxlat_sig = get_max_lat(&tsk->signal->alloc_lat[i]);
> +		if (maxlat_sig < maxlat_tsk) {
> +			tsk->signal->alloc_lat[i].maxlat = tsk->alloc_lat[i].maxlat;
> +			tsk->signal->alloc_lat[i].time = tsk->alloc_lat[i].time;
> +		}
> 	}
> 	spin_unlock_irq(&tsk->sighand->siglock);
> #endif
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 9d8c9e4eb970..e8b0f16de295 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3241,6 +3241,14 @@ static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order,
> 	if (in_task()) {
> 		current->alloc_lat[ind].totlat += time;
> 		current->alloc_lat[ind].count++;
> +		if (time_before(jiffies, current->alloc_lat[ind].time +
> +					KSTAT_ALLOC_MAX_LAT_PERIOD)) {
> +			if (current->alloc_lat[ind].maxlat < time)
> +				current->alloc_lat[ind].maxlat = time;
> +		} else {
> +			current->alloc_lat[ind].time = jiffies;
> +			current->alloc_lat[ind].maxlat = time;
> +		}
> 	}
> 
> 	if (!page)
> -- 
> 2.16.4
>
Andrey Ryabinin Aug. 20, 2018, 1:50 p.m.
On 08/20/2018 04:48 PM, Pavel Borzenkov wrote:

>> -	seq_printf(m, "%-12s %20s %20s\n",
>> -			"Type", "Total_lat", "Calls");
>> +	seq_printf(m, "%-12s %20s %20s %20s\n",
>> +			"Type", "Total_lat", "Calls", "Max (1sec)");
> 
> Wrong header? Should be 2min
> 

Yep, missed that spot.
Denis V. Lunev Aug. 21, 2018, 3:37 p.m.
On 08/20/2018 03:03 PM, Andrey Ryabinin wrote:
> Add to '/proc/<pid>/vz_latency' column with maximal latency task have seen
> in the last 2 minutes.
>
> E.g.:
>
> cat /proc/1/vz_latency
> Type                    Total_lat                Calls           Max (2min)
> allocatomic:                    0                  294                    0
> alloc:                    3000000                43394                    0
> allocmp:                        0                 1018                    0
>
> AFAICS this changes output format but shouldn't break our the only user of
> this interface - pstorage. Accordind to the pstorage code it reads this
> file line by line, reads 'Total_lat' and 'Calls' fields and skips to the next
> line. Thus adding new field shouldn't break it.
>
> Note: technically this is not maximum in last period of time.
> It's maximum since the last reset of max latency with reset happening once
> per period of time (2 minutes in this patch).
> The true max latency in last X-sec/min is technically impossible to implement
> as this would require to remember all allocations times and and latencies
> in last X-sec/min. However, we can record max latencies every 10 sec for the
> past 2min and based on that calculate the max latency in last 2 minutes (+/-10sec).
> But the same logic can be implemented in user-space with less performance cost,
> since in the kernel we would need to put that complex logic into every allocation,
> while in userspace this can be done only for task we want to monitor.
>
> https://jira.sw.ru/browse/PSBM-87797
> Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
> Cc: Pavel Borzenkov <Pavel.Borzenkov@acronis.com>
> ---
>
> Changes since v1:
>  - change period from 1sec to 2min
>
>  fs/proc/base.c         | 28 ++++++++++++++++++++--------
>  include/linux/kstat.h  |  1 +
>  include/linux/vzstat.h | 10 ++++++++++
>  kernel/exit.c          |  8 ++++++++
>  mm/page_alloc.c        |  8 ++++++++
>  5 files changed, 47 insertions(+), 8 deletions(-)
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 64bbbc387afc..719e93b42ff5 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -80,6 +80,7 @@
>  #include <linux/audit.h>
>  #include <linux/poll.h>
>  #include <linux/nsproxy.h>
> +#include <linux/vzstat.h>
>  #include <linux/oom.h>
>  #include <linux/elf.h>
>  #include <linux/pid_namespace.h>
> @@ -580,8 +581,8 @@ static void lastlat_seq_show(struct seq_file *m,
>  		const char *name,
>  		struct kstat_lat_snap_struct *snap)
>  {
> -	seq_printf(m, "%-12s %20Lu %20lu\n", name,
> -			snap->totlat, snap->count);
> +	seq_printf(m, "%-12s %20Lu %20lu %20Lu\n", name,
> +			snap->totlat, snap->count, get_max_lat(snap));
>  }
>  static const char *alloc_descr[] = {
>  	"allocatomic:",
> @@ -599,8 +600,8 @@ static int proc_tid_vz_lat(struct seq_file *m, struct pid_namespace *ns,
>  {
>  	int i;
>  
> -	seq_printf(m, "%-12s %20s %20s\n",
> -			"Type", "Total_lat", "Calls");
> +	seq_printf(m, "%-12s %20s %20s %20s\n",
> +			"Type", "Total_lat", "Calls", "Max (2min)");
>  
>  	for (i = 0; i < ARRAY_SIZE(alloc_types); i++)
>  		lastlat_seq_show(m, alloc_descr[i],
> @@ -615,33 +616,44 @@ static int proc_tgid_vz_lat(struct seq_file *m, struct pid_namespace *ns,
>  	unsigned long flags;
>  	u64 lat[ARRAY_SIZE(alloc_types)];
>  	u64 count[ARRAY_SIZE(alloc_types)];
> +	u64 maxlats[ARRAY_SIZE(alloc_types)];
>  
>  	for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
>  		lat[i] = task->alloc_lat[alloc_types[i]].totlat;
>  		count[i] = task->alloc_lat[alloc_types[i]].count;
> +		maxlats[i] = get_max_lat(&task->alloc_lat[alloc_types[i]]);
>  	}
>  
>  	if (lock_task_sighand(task, &flags)) {
>  		struct task_struct *t = task;
> +		u64 maxlat;
> +
>  		while_each_thread(task, t) {
>  			for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
>  				lat[i] += t->alloc_lat[alloc_types[i]].totlat;
>  				count[i] += t->alloc_lat[alloc_types[i]].count;
> +				maxlat = get_max_lat(&t->alloc_lat[alloc_types[i]]);
> +				if (maxlats[i] < maxlat)
> +					maxlats[i] = maxlat;
>  			}
>  		}
>  		for (i = 0; i < ARRAY_SIZE(alloc_types); i++) {
>  			lat[i] += t->signal->alloc_lat[alloc_types[i]].totlat;
>  			count[i] += t->signal->alloc_lat[alloc_types[i]].count;
> +			maxlat = get_max_lat(&t->signal->alloc_lat[alloc_types[i]]);
> +			if (maxlats[i] < maxlat)
> +				maxlats[i] = maxlat;
> +
>  		}
>  		unlock_task_sighand(task, &flags);
>  	}
>  
> -	seq_printf(m, "%-12s %20s %20s\n",
> -			"Type", "Total_lat", "Calls");
> +	seq_printf(m, "%-12s %20s %20s %20s\n",
> +			"Type", "Total_lat", "Calls", "Max (1sec)");
>  
>  	for (i = 0; i < ARRAY_SIZE(alloc_types); i++)
> -		seq_printf(m, "%-12s %20Lu %20Lu\n", alloc_descr[i],
> -			lat[i], count[i]);
> +		seq_printf(m, "%-12s %20Lu %20Lu %20Lu\n", alloc_descr[i],
> +			lat[i], count[i], maxlats[i]);
>  
>  	return 0;
>  }
> diff --git a/include/linux/kstat.h b/include/linux/kstat.h
> index b268752f2e15..45bdd52ee7d9 100644
> --- a/include/linux/kstat.h
> +++ b/include/linux/kstat.h
> @@ -32,6 +32,7 @@ struct kstat_perf_pcpu_struct {
>  struct kstat_lat_snap_struct {
>  	u64 maxlat, totlat;
>  	unsigned long count;
> +	unsigned long time;
>  };
>  
>  struct kstat_lat_pcpu_snap_struct {
> diff --git a/include/linux/vzstat.h b/include/linux/vzstat.h
> index f1475b2b763a..9df7d4ff9a17 100644
> --- a/include/linux/vzstat.h
> +++ b/include/linux/vzstat.h
> @@ -9,6 +9,7 @@
>  #ifndef __VZSTAT_H__
>  #define __VZSTAT_H__
>  
> +#include <linux/jiffies.h>
>  #include <linux/mmzone.h>
>  #include <linux/kstat.h>
>  
> @@ -64,6 +65,15 @@ extern void KSTAT_PERF_ADD(struct kstat_perf_pcpu_struct *ptr, u64 real_time,
>  	sleep_time = current->se.statistics->sum_sleep_runtime - sleep_time; \
>  	KSTAT_PERF_ADD(&kstat_glob.name, start, start - sleep_time);
>  
> +#define KSTAT_ALLOC_MAX_LAT_PERIOD (120*HZ)
> +
> +static inline u64 get_max_lat(struct kstat_lat_snap_struct *snap)
> +{
> +	if (time_before(jiffies, snap->time + KSTAT_ALLOC_MAX_LAT_PERIOD))
> +		return snap->maxlat;
> +	return 0;
> +}
> +
>  extern void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, u64 dur);
>  extern void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p);
>  
> diff --git a/kernel/exit.c b/kernel/exit.c
> index bf68b87a4d60..c2dc3a2c2c50 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -742,8 +742,16 @@ void kstat_add_dying(struct task_struct *tsk)
>  
>  	spin_lock_irq(&tsk->sighand->siglock);
>  	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) {
> +		u64 maxlat_tsk, maxlat_sig;
> +
>  		tsk->signal->alloc_lat[i].totlat += tsk->alloc_lat[i].totlat;
>  		tsk->signal->alloc_lat[i].count += tsk->alloc_lat[i].count;
> +		maxlat_tsk = get_max_lat(&tsk->alloc_lat[i]);
> +		maxlat_sig = get_max_lat(&tsk->signal->alloc_lat[i]);
> +		if (maxlat_sig < maxlat_tsk) {
> +			tsk->signal->alloc_lat[i].maxlat = tsk->alloc_lat[i].maxlat;
> +			tsk->signal->alloc_lat[i].time = tsk->alloc_lat[i].time;
> +		}
>  	}
>  	spin_unlock_irq(&tsk->sighand->siglock);
>  #endif
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 9d8c9e4eb970..e8b0f16de295 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3241,6 +3241,14 @@ static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order,
>  	if (in_task()) {
>  		current->alloc_lat[ind].totlat += time;
>  		current->alloc_lat[ind].count++;
> +		if (time_before(jiffies, current->alloc_lat[ind].time +
> +					KSTAT_ALLOC_MAX_LAT_PERIOD)) {
> +			if (current->alloc_lat[ind].maxlat < time)
> +				current->alloc_lat[ind].maxlat = time;
> +		} else {
> +			current->alloc_lat[ind].time = jiffies;
> +			current->alloc_lat[ind].maxlat = time;
> +		}
>  	}
>  
>  	if (!page)
as far as I remember we have discussed a little bit better collection
algorithm. Let us assume that we want 1m maximum. We store
2 counters bound to 2 time periods shifted by 1/2. We return to the
user the counter, which is in the 2nd half the counter. We will not
have a 0 latency if we'll come at too bad place.

[----------]
      [-----------]

Den