[RHEL7,COMMIT] memcg: access beyond end of shrinker_map

Submitted by Konstantin Khorenko on Jan. 17, 2020, 9:45 a.m.

Details

Message ID 202001170945.00H9jBMb032270@finist-ce7.sw.ru
State New
Series "memcg: access beyond end of shrinker_map"
Headers show

Commit Message

Konstantin Khorenko Jan. 17, 2020, 9:45 a.m.
The commit is pushed to "branch-rh7-3.10.0-1062.7.1.vz7.130.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1062.7.1.vz7.130.10
------>
commit 2ad49c8b916379544fa9dd35f5059dd9ffb0ce1b
Author: Vasily Averin <vvs@virtuozzo.com>
Date:   Fri Jan 17 12:45:11 2020 +0300

    memcg: access beyond end of shrinker_map
    
    vz7 kernel can remove in-use memcg from parent-chield hierarchy tree,
    and skip resize of its shrinker_map during registration of new shrinkers.
    It allows memcg user to access memory beyond of end non-resized
    shrinker_map, and corrupt next slab element.
    
    CPU A                    CPU B                       CPU C
    got memcg            cgroup_rmdir()                 sget_userns()
    shrink_slab_memcg()  cgroup_destroy_locked()        register_shrinker()
      .                  cgroup_css_killed()            register_memcg_shrinker()
      .                   cgroup_offline_fn()           down_write(&shrinker_rwsem)
      .                   list_del_rcu(&cgrp->sibling)  memcg_expand_shrinker_maps()
    wait for shrinker_rwsem                             re-allocs shrinker_maps of all memcg
        .                                                but ignores memcg used on CPU A
        .                                                because it was removed by CPU B
        .                                               releases shrinker_rwsem
    accesses and modifies memory
     beyond end of own shrinker_map
      while using of new memcg_shrinkers
      registered on CPU C
    
    v4:
    - corrected WARN_ON condition
    
    v3:
    - corrected map->nr_max value, "- 1" was removed
    - nr_max update inside shrink_slab_memcg()
    
    v2 changes:
    - map_size was in bytes, shrinker_nr_max was in bytes (thanks to ktkhai@)
    - map_size was replaced by nr_max -- maximal allowed bit number
    - added WARN_ONs to detect another possible corruption places
    
    https://jira.sw.ru/browse/PSBM-100509
    https://jira.sw.ru/browse/PSBM-100694
    
    Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
    Acked-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 include/linux/memcontrol.h | 1 +
 mm/memcontrol.c            | 4 ++++
 mm/vmscan.c                | 7 +++++--
 3 files changed, 10 insertions(+), 2 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c1e50faf00e59..485a3d354b78b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -51,6 +51,7 @@  struct mem_cgroup_reclaim_cookie {
  */
 struct memcg_shrinker_map {
 	struct rcu_head rcu;
+	int nr_max;
 	unsigned long map[0];
 };
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 077361eb399bf..0514f9b2b2308 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -754,6 +754,7 @@  static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
 		/* Set all old bits, clear all new bits */
 		memset(new->map, (int)0xff, old_size);
 		memset((void *)new->map + old_size, 0, size - old_size);
+		new->nr_max = size * BITS_PER_BYTE;
 
 		rcu_assign_pointer(memcg->info.nodeinfo[nid]->shrinker_map, new);
 		call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
@@ -797,6 +798,7 @@  static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
 			ret = -ENOMEM;
 			break;
 		}
+		map->nr_max = size * BITS_PER_BYTE;
 		rcu_assign_pointer(memcg->info.nodeinfo[nid]->shrinker_map, map);
 	}
 	mutex_unlock(&memcg_shrinker_map_mutex);
@@ -841,6 +843,7 @@  void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 		map = rcu_dereference(memcg->info.nodeinfo[nid]->shrinker_map);
 		/* Pairs with smp mb in shrink_slab() */
 		smp_mb__before_atomic();
+		WARN_ON(shrinker_id >= map->nr_max);
 		set_bit(shrinker_id, map->map);
 		rcu_read_unlock();
 	}
@@ -857,6 +860,7 @@  void memcg_clear_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id
 	 */
 	rcu_read_lock();
 	map = rcu_dereference(memcg->info.nodeinfo[nid]->shrinker_map);
+	WARN_ON(shrinker_id >= map->nr_max);
 	clear_bit(shrinker_id, map->map);
 	rcu_read_unlock();
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index de7e51584f6f9..ec3443ad161bd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -488,7 +488,7 @@  static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 {
 	struct memcg_shrinker_map *map;
 	unsigned long ret, freed = 0;
-	int i;
+	int i, nr_max;
 
 	if (!memcg_kmem_enabled())
 		return 0;
@@ -501,7 +501,9 @@  static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 	if (unlikely(!map))
 		goto unlock;
 
-	for_each_set_bit(i, map->map, shrinker_nr_max) {
+	nr_max = min(shrinker_nr_max, map->nr_max);
+
+	for_each_set_bit(i, map->map, nr_max) {
 		struct shrink_control sc = {
 			.gfp_mask = gfp_mask,
 			.nid = nid,
@@ -569,6 +571,7 @@  static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 		}
 		put_shrinker(shrinker);
 		map = memcg_nid_shrinker_map(memcg, nid);
+		nr_max = min(shrinker_nr_max, map->nr_max);
 
 		if (rwsem_is_contended(&shrinker_rwsem)) {
 			freed = freed ? : 1;