[Devel,RHEL7,COMMIT] ms/mm: memcontrol: revert use of root_mem_cgroup res_counter

Submitted by Konstantin Khorenko on Jan. 16, 2017, 4:27 p.m.

Details

Message ID 201701161627.v0GGRGU4029082@finist_cl7.x64_64.work.ct
State New
Series "Series without cover letter"
Headers show

Commit Message

Konstantin Khorenko Jan. 16, 2017, 4:27 p.m.
The commit is pushed to "branch-rh7-3.10.0-514.vz7.27.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.vz7.27.10
------>
commit b47182055b23197e9037214bdd631bfb73bf251c
Author: Johannes Weiner <hannes@cmpxchg.org>
Date:   Mon Jan 16 20:27:16 2017 +0400

    ms/mm: memcontrol: revert use of root_mem_cgroup res_counter
    
    Dave Hansen reports a massive scalability regression in an uncontained
    page fault benchmark with more than 30 concurrent threads, which he
    bisected down to 05b843012335 ("mm: memcontrol: use root_mem_cgroup
    res_counter") and pin-pointed on res_counter spinlock contention.
    
    That change relied on the per-cpu charge caches to mostly swallow the
    res_counter costs, but it's apparent that the caches don't scale yet.
    
    Revert memcg back to bypassing res_counters on the root level in order
    to restore performance for uncontained workloads.
    
    Reported-by: Dave Hansen <dave@sr71.net>
    Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
    Tested-by: Dave Hansen <dave.hansen@intel.com>
    Acked-by: Michal Hocko <mhocko@suse.cz>
    Acked-by: Vladimir Davydov <vdavydov@parallels.com>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    
    https://jira.sw.ru/browse/PSBM-51558
    (cherry picked from commit ce00a967377baadf2481521e131771adc7652856)
    Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
---
 mm/memcontrol.c | 73 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 41 insertions(+), 32 deletions(-)

Patch hide | download patch | download mbox

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 16bb6aa..6c11788 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4158,8 +4158,8 @@  out:
 }
 
 
-static unsigned long tree_stat(struct mem_cgroup *memcg,
-			       enum mem_cgroup_stat_index idx)
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
+					       enum mem_cgroup_stat_index idx)
 {
 	struct mem_cgroup *iter;
 	long val = 0;
@@ -4173,6 +4173,30 @@  static unsigned long tree_stat(struct mem_cgroup *memcg,
 	return val;
 }
 
+/* Return @memcg's memory (or memory+swap if @swap) usage in bytes. */
+static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+{
+	u64 val;
+
+	if (!mem_cgroup_is_root(memcg)) {
+		/* Page counters count pages; report bytes like root. */
+		val = page_counter_read(swap ? &memcg->memsw : &memcg->memory);
+		return val << PAGE_SHIFT;
+	}
+
+	/*
+	 * Root is not charged to the page counters; sum hierarchy stats.
+	 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
+	 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
+	 */
+	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
+	if (swap)
+		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
+
+	return val << PAGE_SHIFT;
+}
+
 void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
 {
 	int nid;
@@ -4181,12 +4205,12 @@  void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
 	for_each_online_node(nid)
 		mem_cgroup_get_nr_pages(memcg, nid, mi->pages);
 
-	mi->slab_reclaimable = tree_stat(memcg,
+	mi->slab_reclaimable = mem_cgroup_recursive_stat(memcg,
 					MEM_CGROUP_STAT_SLAB_RECLAIMABLE);
-	mi->slab_unreclaimable = tree_stat(memcg,
+	mi->slab_unreclaimable = mem_cgroup_recursive_stat(memcg,
 					MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE);
-	mi->cached = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
-	mi->shmem = tree_stat(memcg, MEM_CGROUP_STAT_SHMEM);
+	mi->cached = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+	mi->shmem = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
 }
 
 int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
@@ -4200,33 +4224,15 @@  int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
 	free += page_counter_read(&memcg->dcache);
 
 	/* assume file cache is reclaimable */
-	free += tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
+	free += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
 
 	/* but do not count shmem pages as they can't be purged,
 	 * only swapped out */
-	free -= tree_stat(memcg, MEM_CGROUP_STAT_SHMEM);
+	free -= mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
 
 	return free < pages ? -ENOMEM : 0;
 }
 
-static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
-{
-	u64 val;
-
-	if (mem_cgroup_is_root(memcg)) {
-		val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
-		val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
-		if (swap)
-			val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
-	} else {
-		if (!swap)
-			val = page_counter_read(&memcg->memory);
-		else
-			val = page_counter_read(&memcg->memsw);
-	}
-	return val << PAGE_SHIFT;
-}
-
 enum {
 	RES_USAGE,
 	RES_LIMIT,
@@ -6760,7 +6766,8 @@  void mem_cgroup_uncharge_swap(swp_entry_t entry)
 	rcu_read_lock();
 	memcg = mem_cgroup_lookup(id);
 	if (memcg) {
-		page_counter_uncharge(&memcg->memsw, 1);
+		if (!mem_cgroup_is_root(memcg))
+			page_counter_uncharge(&memcg->memsw, 1);
 		mem_cgroup_swap_statistics(memcg, false);
 		css_put(&memcg->css);
 	}
@@ -6919,12 +6926,14 @@  static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 {
 	unsigned long flags;
 
-	if (nr_mem)
-		page_counter_uncharge(&memcg->memory, nr_mem);
-	if (nr_memsw)
-		page_counter_uncharge(&memcg->memsw, nr_memsw);
+	if (!mem_cgroup_is_root(memcg)) {
+		if (nr_mem)
+			page_counter_uncharge(&memcg->memory, nr_mem);
+		if (nr_memsw)
+			page_counter_uncharge(&memcg->memsw, nr_memsw);
 
-	memcg_oom_recover(memcg);
+		memcg_oom_recover(memcg);
+	}
 
 	local_irq_save(flags);
 	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);