[Devel,rh7,2/2] mm/memcg: reclaim only kmem if kmem limit reached.

Submitted by Andrey Ryabinin on Aug. 25, 2017, 3:38 p.m.

Details

Message ID 20170825153828.24990-2-aryabinin@virtuozzo.com
State New
Series "Series without cover letter"

Commit Message

Andrey Ryabinin Aug. 25, 2017, 3:38 p.m.
If the kmem limit of a memcg is reached, we go into memory reclaim
and reclaim everything we can, including page cache and anon memory.
Reclaiming page cache or anon memory won't help, since only the kmem
usage needs to be lowered. This patch fixes the problem by avoiding
non-kmem reclaim when the kmem limit is hit.
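
To illustrate the intended usage (an editorial sketch, not part of the
submitted patch; the "hit_kmem_limit" and "may_swap" conditions are
hypothetical stand-ins for what the charge path actually computes), a
caller that only exceeded the kmem limit would now request slab-only
reclaim through the new flags argument, while regular reclaim keeps
passing 0 or MEM_CGROUP_RECLAIM_NOSWAP:

	/*
	 * Editorial sketch: requesting kmem-only reclaim via the new
	 * flags argument of try_to_free_mem_cgroup_pages().
	 */
	int flags = 0;

	if (hit_kmem_limit)			/* hypothetical condition */
		flags |= MEM_CGROUP_RECLAIM_KMEM;   /* skip LRUs, shrink slab only */
	if (!may_swap)				/* hypothetical condition */
		flags |= MEM_CGROUP_RECLAIM_NOSWAP; /* don't touch swap */

	try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX, GFP_KERNEL, flags);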

https://jira.sw.ru/browse/PSBM-69226
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
---
 include/linux/memcontrol.h | 10 ++++++++++
 include/linux/swap.h       |  2 +-
 mm/memcontrol.c            | 30 ++++++++++++++++--------------
 mm/vmscan.c                | 31 ++++++++++++++++++++++++-------
 4 files changed, 51 insertions(+), 22 deletions(-)


diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1a52e58ab7de..1d6bc80c4c90 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -45,6 +45,16 @@  struct mem_cgroup_reclaim_cookie {
 	unsigned int generation;
 };
 
+/*
+ * Reclaim flags for mem_cgroup_hierarchical_reclaim
+ */
+#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
+#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
+#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
+#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+#define MEM_CGROUP_RECLAIM_KMEM_BIT	0x2
+#define MEM_CGROUP_RECLAIM_KMEM		(1 << MEM_CGROUP_RECLAIM_KMEM_BIT)
+
 #ifdef CONFIG_MEMCG
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 			  gfp_t gfp_mask, struct mem_cgroup **memcgp);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index bd162f9bef0d..bd47451ec95a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -324,7 +324,7 @@  extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
 						  unsigned long nr_pages,
-						  gfp_t gfp_mask, bool noswap);
+						  gfp_t gfp_mask, int flags);
 extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						struct zone *zone,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 97824e281d7a..f9a5f3819a31 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -511,16 +511,6 @@  enum res_type {
 #define OOM_CONTROL		(0)
 
 /*
- * Reclaim flags for mem_cgroup_hierarchical_reclaim
- */
-#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
-#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
-#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
-#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
-#define MEM_CGROUP_RECLAIM_KMEM_BIT	0x2
-#define MEM_CGROUP_RECLAIM_KMEM		(1 << MEM_CGROUP_RECLAIM_KMEM_BIT)
-
-/*
  * The memcg_create_mutex will be held whenever a new cgroup is created.
  * As a consequence, any change that needs to protect against new child cgroups
  * appearing has to hold it as well.
@@ -2137,7 +2127,7 @@  static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 		if (loop)
 			drain_all_stock_async(memcg);
 		total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
-						      gfp_mask, noswap);
+						      gfp_mask, flags);
 		if (test_thread_flag(TIF_MEMDIE) ||
 		    fatal_signal_pending(current))
 			return 1;
@@ -2150,6 +2140,16 @@  static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 			break;
 		if (mem_cgroup_margin(memcg, flags & MEM_CGROUP_RECLAIM_KMEM))
 			break;
+
+		/*
+		 * Try harder to reclaim dcache. dcache reclaim may
+		 * temporarily fail due to dcache->dlock being held
+		 * by someone else. We must try harder to avoid premature
+		 * slab allocation failures.
+		 */
+		if (flags & MEM_CGROUP_RECLAIM_KMEM &&
+		    page_counter_read(&memcg->dcache))
+			continue;
 		/*
 		 * If nothing was reclaimed after two attempts, there
 		 * may be no reclaimable pages in this hierarchy.
@@ -2778,11 +2778,13 @@  static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
 	struct mem_cgroup *mem_over_limit;
 	struct page_counter *counter;
 	unsigned long nr_reclaimed;
-	unsigned long flags = 0;
+	unsigned long flags;
 
 	if (mem_cgroup_is_root(memcg))
 		goto done;
 retry:
+	flags = 0;
+
 	if (consume_stock(memcg, nr_pages)) {
 		if (!kmem_charge)
 			goto done;
@@ -4138,7 +4140,7 @@  static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 			return -EINTR;
 
 		progress = try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
-							GFP_KERNEL, false);
+							GFP_KERNEL, 0);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -4573,7 +4575,7 @@  static int mem_cgroup_high_write(struct cgroup *cont, struct cftype *cft,
 	usage = page_counter_read(&memcg->memory);
 	if (usage > nr_pages)
 		try_to_free_mem_cgroup_pages(memcg, usage - nr_pages,
-					     GFP_KERNEL, false);
+					     GFP_KERNEL, 0);
 	return 0;
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 277bd37bd430..a5db5940bb1e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -88,6 +88,9 @@  struct scan_control {
 	/* Scan (total_size >> priority) pages at once */
 	int priority;
 
+	/* Reclaim only slab */
+	bool slab_only;
+
 	/*
 	 * The memory cgroup that hit its limit and as a result is the
 	 * primary target of this reclaim invocation.
@@ -2346,6 +2349,7 @@  static void shrink_zone(struct zone *zone, struct scan_control *sc,
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long nr_reclaimed, nr_scanned;
 	gfp_t slab_gfp = sc->gfp_mask;
+	bool slab_only = sc->slab_only;
 
 	/* Disable fs-related IO for direct reclaim */
 	if (!sc->target_mem_cgroup &&
@@ -2372,14 +2376,24 @@  static void shrink_zone(struct zone *zone, struct scan_control *sc,
 			if (!sc->may_thrash && mem_cgroup_low(root, memcg))
 				continue;
 
-			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 			scanned = sc->nr_scanned;
-			shrink_lruvec(lruvec, sc, &lru_pages);
-			zone_lru_pages += lru_pages;
 
-			if (memcg && is_classzone)
+			if (!slab_only) {
+				lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+				shrink_lruvec(lruvec, sc, &lru_pages);
+				zone_lru_pages += lru_pages;
+			}
+
+			if (memcg && is_classzone) {
 				shrink_slab(slab_gfp, zone_to_nid(zone),
 					    memcg, sc->priority, false);
+				if (reclaim_state) {
+					sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+					sc->nr_scanned += reclaim_state->reclaimed_slab;
+					reclaim_state->reclaimed_slab = 0;
+				}
+
+			}
 
 			/*
 			 * Direct reclaim and kswapd have to scan all memory
@@ -2902,15 +2916,17 @@  unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
-					   bool noswap)
+					   int flags)
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
+	struct reclaim_state reclaim_state = { 0 };
 	int nid;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
-		.may_swap = !noswap,
+		.may_swap = !(flags & MEM_CGROUP_RECLAIM_NOSWAP),
+		.slab_only = flags & MEM_CGROUP_RECLAIM_KMEM,
 		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
 		.order = 0,
 		.priority = DEF_PRIORITY,
@@ -2933,10 +2949,11 @@  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					    sc.may_writepage,
 					    sc.gfp_mask);
 
+	current->reclaim_state = &reclaim_state;
 	current->flags |= PF_MEMALLOC | PF_MEMCG_RECLAIM;
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 	current->flags &= ~(PF_MEMALLOC | PF_MEMCG_RECLAIM);
-
+	current->reclaim_state = NULL;
 	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
 
 	return nr_reclaimed;
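
In short (an editorial summary, simplified from the hunks above rather
than literal kernel code), hitting the kmem limit now drives a reclaim
loop that touches only slab and keeps retrying while dentries remain
charged:

	/*
	 * Editorial sketch of mem_cgroup_reclaim() behaviour after this
	 * patch. MEM_CGROUP_RECLAIM_KMEM makes try_to_free_mem_cgroup_pages()
	 * run with sc.slab_only = true, so shrink_zone() skips the LRU
	 * lists and only calls the slab shrinkers.
	 */
	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
		total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
						      gfp_mask, flags);
		if (mem_cgroup_margin(memcg, flags & MEM_CGROUP_RECLAIM_KMEM))
			break;		/* enough room below the limit */
		if ((flags & MEM_CGROUP_RECLAIM_KMEM) &&
		    page_counter_read(&memcg->dcache))
			continue;	/* dentry reclaim may fail transiently; retry */
		if (loop > 1 && total == 0)
			break;		/* nothing reclaimable left */
	}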

Comments

Stanislav Kinsburskiy Aug. 28, 2017, 9:02 a.m.
On 25.08.2017 18:38, Andrey Ryabinin wrote:
> If the kmem limit of a memcg is reached, we go into memory reclaim
> and reclaim everything we can, including page cache and anon memory.
> Reclaiming page cache or anon memory won't help, since only the kmem
> usage needs to be lowered. This patch fixes the problem by avoiding
> non-kmem reclaim when the kmem limit is hit.
> 

Can't there be a situation where some object in anon memory or the page cache (indirectly) holds some object in kmem?

Andrey Ryabinin Aug. 28, 2017, 9:50 a.m.
On 08/28/2017 12:02 PM, Stanislav Kinsburskiy wrote:
> 
> 
> On 25.08.2017 18:38, Andrey Ryabinin wrote:
>> If the kmem limit of a memcg is reached, we go into memory reclaim
>> and reclaim everything we can, including page cache and anon memory.
>> Reclaiming page cache or anon memory won't help, since only the kmem
>> usage needs to be lowered. This patch fixes the problem by avoiding
>> non-kmem reclaim when the kmem limit is hit.
>>
> 
> Can't there be a situation where some object in anon memory or the page cache (indirectly) holds some object in kmem?
> 

None that I know of.
Konstantin Khorenko Aug. 31, 2017, 9:58 a.m.
Do we want to push it to mainstream as well?

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

Andrey Ryabinin Aug. 31, 2017, 12:27 p.m.
On 08/31/2017 12:58 PM, Konstantin Khorenko wrote:
> Do we want to push it to mainstream as well?
> 

I don't think so. Distributions are slowly moving towards cgroup v2, where
the kmem limit simply doesn't exist. And for legacy cgroup v1, the lack of reclaim on
kmem limit hit wasn't a mistake but a deliberate choice. There is no clear use case for this,
and it adds a lot of complexity to the reclaim code and just looks a bit ugly.

