[RHEL7,COMMIT] mm: allow kmem limit bypassing if reclaimable slabs detected

Submitted by Konstantin Khorenko on Feb. 11, 2019, 3:25 p.m.

Details

Message ID 201902111525.x1BFPrnJ010904@finist-ce7.sw.ru
State New
Series "mm: allow kmem limit bypassing if reclaimable slabs detected"
Headers show

Commit Message

Konstantin Khorenko Feb. 11, 2019, 3:25 p.m.
The commit is pushed to "branch-rh7-3.10.0-957.1.3.vz7.83.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-957.1.3.vz7.83.11
------>
commit 1bbcb753b7f965b35c68312b11dfaa4ca65b9ed3
Author: Konstantin Khorenko <khorenko@virtuozzo.com>
Date:   Fri Feb 8 14:00:11 2019 +0300

    mm: allow kmem limit bypassing if reclaimable slabs detected
    
    If we generate a lot of kmem (dentries and inodes in particular)
    we may hit cgroup kmem limit in GFP_NOFS context (e.g. in
    ext4_alloc_inode()) and fail to free reclaimable inodes due to NOFS
    context.
    
    Detect reclaimable kmem on hitting the limit and allow bypassing the
    limit - reclaim will happen on the next kmem alloc in GFP_KERNEL context.
    
    Honor "vm.vfs_cache_min_ratio" sysctl and don't bypass in case the
    amount of reclaimable kmem is not enough.
    
    https://jira.sw.ru/browse/PSBM-91566
    
    Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>
---
 fs/super.c      |  3 ++-
 mm/memcontrol.c | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/fs/super.c b/fs/super.c
index 16b42a54dc55..02c9dc94e555 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -53,7 +53,7 @@  static char *sb_writers_name[SB_FREEZE_LEVELS] = {
 	"sb_internal",
 };
 
-static bool dcache_is_low(struct mem_cgroup *memcg)
+bool dcache_is_low(struct mem_cgroup *memcg)
 {
 	unsigned long anon, file, dcache;
 	int vfs_cache_min_ratio = READ_ONCE(sysctl_vfs_cache_min_ratio);
@@ -71,6 +71,7 @@  static bool dcache_is_low(struct mem_cgroup *memcg)
 	return dcache / vfs_cache_min_ratio <
 			(anon + file + dcache) / 100;
 }
+EXPORT_SYMBOL(dcache_is_low);
 
 /*
  * One thing we have to be careful of with a per-sb shrinker is that we don't
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cc7171eaf360..84c29cf659da 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2958,6 +2958,30 @@  void memcg_css_release_check_kmem(struct cgroup_subsys_state *css)
 }
 EXPORT_SYMBOL(memcg_css_release_check_kmem);
 
+extern bool dcache_is_low(struct mem_cgroup *memcg);
+
+/*
+ * Do we have anything to reclaim in memcg kmem?
+ * Have to honor vfs_cache_min_ratio here because if dcache_is_low()
+ * we won't reclaim dcache at all in do_shrink_slab().
+ */
+static bool kmem_reclaim_is_low(struct mem_cgroup *memcg)
+{
+#define	KMEM_RECLAIM_LOW_MARK	32
+
+	unsigned long dcache;
+	int vfs_cache_min_ratio = READ_ONCE(sysctl_vfs_cache_min_ratio);
+
+	if (vfs_cache_min_ratio <= 0) {
+		dcache = mem_cgroup_read_stat2_fast(memcg,
+				MEM_CGROUP_STAT_SLAB_RECLAIMABLE);
+
+		return dcache < KMEM_RECLAIM_LOW_MARK;
+	}
+
+	return dcache_is_low(memcg);
+}
+
 /**
  * mem_cgroup_try_charge - try charging a memcg
  * @memcg: memcg to charge
@@ -3108,6 +3132,17 @@  static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
 	if (fatal_signal_pending(current))
 		goto bypass;
 
+	/*
+	 * We might have [a lot of] reclaimable kmem which we cannot reclaim in
+	 * the current context, e.g. a lot of inodes/dentries while trying to
+	 * allocate kmem for a new inode with GFP_NOFS.
+	 * Thus overcharge kmem now, it will be reclaimed on next allocation in
+	 * usual GFP_KERNEL context.
+	 */
+	if (flags & MEM_CGROUP_RECLAIM_KMEM &&
+	    !kmem_reclaim_is_low(mem_over_limit))
+		goto bypass;
+
 	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch * PAGE_SIZE));
 
 nomem: