[RHEL7,COMMIT] ms/mm: scale kswapd watermarks in proportion to memory

Submitted by Konstantin Khorenko on May 21, 2019, 3:40 p.m.

Details

Message ID 201905211540.x4LFegjR026223@finist-ce7.sw.ru
State New
Series "Series without cover letter"
Headers show

Commit Message

Konstantin Khorenko May 21, 2019, 3:40 p.m.
The commit is pushed to "branch-rh7-3.10.0-957.12.2.vz7.96.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-957.12.2.vz7.96.1
------>
commit 4ae4f1a8d63da37f1730a12f68c5146935b3a92c
Author: Johannes Weiner <hannes@cmpxchg.org>
Date:   Tue May 21 18:40:42 2019 +0300

    ms/mm: scale kswapd watermarks in proportion to memory
    
    In machines with 140G of memory and enterprise flash storage, we have
    seen read and write bursts routinely exceed the kswapd watermarks and
    cause thundering herds in direct reclaim.  Unfortunately, the only way
    to tune kswapd aggressiveness is through adjusting min_free_kbytes - the
    system's emergency reserves - which is entirely unrelated to the
    system's latency requirements.  In order to get kswapd to maintain a
    250M buffer of free memory, the emergency reserves need to be set to 1G.
    That is a lot of memory wasted for no good reason.
    
    On the other hand, it's reasonable to assume that allocation bursts and
    overall allocation concurrency scale with memory capacity, so it makes
    sense to make kswapd aggressiveness a function of that as well.
    
    Change the kswapd watermark scale factor from the currently fixed 25% of
    the tunable emergency reserve to a tunable 0.1% of memory.
    
    Beyond 1G of memory, this will produce bigger watermark steps than the
    current formula in default settings.  Ensure that the new formula never
    chooses steps smaller than that, i.e.  25% of the emergency reserve.
    
    On a 140G machine, this raises the default watermark steps - the
    distance between min and low, and low and high - from 16M to 143M.
    
    Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
    Acked-by: Mel Gorman <mgorman@suse.de>
    Acked-by: Rik van Riel <riel@redhat.com>
    Acked-by: David Rientjes <rientjes@google.com>
    Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    
    https://jira.sw.ru/browse/PSBM-94102
    (cherry picked from commit 795ae7a0de6b834a0cc202aa55c190ef81496665)
    Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
---
 Documentation/sysctl/vm.txt | 18 ++++++++++++++++++
 include/linux/mm.h          |  1 +
 include/linux/mmzone.h      |  2 ++
 kernel/sysctl.c             | 10 ++++++++++
 mm/page_alloc.c             | 29 +++++++++++++++++++++++++++--
 5 files changed, 58 insertions(+), 2 deletions(-)

Patch hide | download patch | download mbox

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index bd869389291e..0708463bf737 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -782,6 +782,24 @@  never reclaim dentries and inodes due to memory pressure and this can easily
 lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100
 causes the kernel to prefer to reclaim dentries and inodes.
 
+=============================================================
+
+watermark_scale_factor:
+
+This factor controls the aggressiveness of kswapd. It defines the
+amount of memory left in a node/system before kswapd is woken up and
+how much memory needs to be free before kswapd goes back to sleep.
+
+The unit is in fractions of 10,000. The default value of 10 means the
+distances between watermarks are 0.1% of the available memory in the
+node/system. The maximum value is 1000, or 10% of memory.
+
+A high rate of threads entering direct reclaim (allocstall) or kswapd
+going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate
+that the number of free pages kswapd maintains for latency reasons is
+too small for the allocation bursts occurring in the system. This knob
+can then be used to tune kswapd aggressiveness accordingly.
+
 ==============================================================
 
 zone_reclaim_mode:
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6bd1cef6e840..83a180536a16 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2002,6 +2002,7 @@  extern void zone_pcp_reset(struct zone *zone);
 
 /* page_alloc.c */
 extern int min_free_kbytes;
+extern int watermark_scale_factor;
 
 /* nommu.c */
 extern atomic_long_t mmap_pages_allocated;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 70e925d41445..b0df91a6aceb 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1015,6 +1015,8 @@  static inline int is_dma(struct zone *zone)
 struct ctl_table;
 int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
+int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
+					void __user *, size_t *, loff_t *);
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 85e9d5e9be83..272b05541176 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -130,6 +130,7 @@  static int __maybe_unused two = 2;
 static int __maybe_unused four = 4;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
+static int one_thousand = 1000;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
@@ -1488,6 +1489,15 @@  static struct ctl_table vm_table[] = {
 		.proc_handler	= min_free_kbytes_sysctl_handler,
 		.extra1		= &zero,
 	},
+	{
+		.procname	= "watermark_scale_factor",
+		.data		= &watermark_scale_factor,
+		.maxlen		= sizeof(watermark_scale_factor),
+		.mode		= 0644,
+		.proc_handler	= watermark_scale_factor_sysctl_handler,
+		.extra1		= &one,
+		.extra2		= &one_thousand,
+	},
 	{
 		.procname	= "percpu_pagelist_fraction",
 		.data		= &percpu_pagelist_fraction,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 45da030d8da2..50b761c9ce10 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -234,6 +234,7 @@  static char * const zone_names[REAL_MAX_ZONES] = {
 
 int min_free_kbytes = 1024;
 int user_min_free_kbytes;
+int watermark_scale_factor = 10;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
@@ -6788,8 +6789,17 @@  static void __setup_per_zone_wmarks(void)
 			zone->watermark[WMARK_MIN] = tmp;
 		}
 
-		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
-		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+		/*
+		 * Set the kswapd watermarks distance according to the
+		 * scale factor in proportion to available memory, but
+		 * ensure a minimum size on small systems.
+		 */
+		tmp = max_t(u64, tmp >> 2,
+			    mult_frac(zone->managed_pages,
+				      watermark_scale_factor, 10000));
+
+		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
+		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
 
 		__mod_zone_page_state(zone, NR_ALLOC_BATCH,
 				      high_wmark_pages(zone) -
@@ -6867,6 +6877,21 @@  int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 	return 0;
 }
 
+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	if (write)
+		setup_per_zone_wmarks();
+
+	return 0;
+}
+
 #ifdef CONFIG_NUMA
 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)