[rh7,8/8] mm/swap: track shadow entries of swapped anon pages

Submitted by Andrey Ryabinin on Feb. 12, 2019, 3:39 p.m.

Details

Message ID 20190212153915.20204-8-aryabinin@virtuozzo.com
State New
Series "Series without cover letter"
Headers show

Commit Message

Andrey Ryabinin Feb. 12, 2019, 3:39 p.m.
This is mostly a copy of the page cache implementation. Record refault
information when a page is swapped out, and read it back on swap in.

https://pmc.acronis.com/browse/VSTOR-19037
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
---
 drivers/staging/zcache/zcache-main.c |   2 +-
 include/linux/swap.h                 |  10 +--
 mm/shmem.c                           |   2 +-
 mm/swap_state.c                      | 123 ++++++++++++++++++++++++---
 mm/swapfile.c                        |   2 +-
 mm/tswap.c                           |   2 +-
 mm/vmscan.c                          |   6 +-
 mm/workingset.c                      |   3 +-
 8 files changed, 125 insertions(+), 25 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/staging/zcache/zcache-main.c b/drivers/staging/zcache/zcache-main.c
index 01e8446b04d0..732be2143e64 100644
--- a/drivers/staging/zcache/zcache-main.c
+++ b/drivers/staging/zcache/zcache-main.c
@@ -948,7 +948,7 @@  static int zcache_get_swap_cache_page(int type, pgoff_t offset,
 		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
 		__set_page_locked(new_page);
 		SetPageSwapBacked(new_page);
-		err = __add_to_swap_cache(new_page, entry);
+		err = __add_to_swap_cache(new_page, entry, NULL);
 		if (likely(!err)) {
 			radix_tree_preload_end();
 			lru_cache_add_anon(new_page);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7797cb88870b..2985b5f90ce5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -455,9 +455,9 @@  extern struct address_space *swapper_spaces[];
 extern unsigned long total_swapcache_pages(void);
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *, struct list_head *list);
-extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
-extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
-extern void __delete_from_swap_cache(struct page *);
+extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t, void **);
+extern int __add_to_swap_cache(struct page *page, swp_entry_t entry, void **shadow);
+extern void __delete_from_swap_cache(struct page *, void *shadow);
 extern void delete_from_swap_cache(struct page *);
 extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
@@ -592,12 +592,12 @@  static inline int add_to_swap(struct page *page, struct list_head *list)
 }
 
 static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
-							gfp_t gfp_mask)
+				gfp_t gfp_mask, void **shadow)
 {
 	return -1;
 }
 
-static inline void __delete_from_swap_cache(struct page *page)
+static inline void __delete_from_swap_cache(struct page *page, void *shadow)
 {
 }
 
diff --git a/mm/shmem.c b/mm/shmem.c
index cda801a5496b..b25e1423d407 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -995,7 +995,7 @@  static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	if (list_empty(&info->swaplist))
 		list_add_tail(&info->swaplist, &shmem_swaplist);
 
-	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+	if (add_to_swap_cache(page, swap, GFP_ATOMIC, NULL) == 0) {
 		spin_lock(&info->lock);
 		shmem_recalc_inode(inode);
 		info->swapped++;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 83e48a7edb28..3931364e78a3 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -91,10 +91,12 @@  void show_swap_cache_info(void)
  * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
  * but sets SwapCache flag and private instead of mapping and index.
  */
-int __add_to_swap_cache(struct page *page, swp_entry_t entry)
+int __add_to_swap_cache(struct page *page, swp_entry_t entry, void **shadow)
 {
 	int error;
+	void **slot;
 	struct address_space *address_space;
+	struct radix_tree_node *node;
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(PageSwapCache(page), page);
@@ -106,13 +108,46 @@  int __add_to_swap_cache(struct page *page, swp_entry_t entry)
 
 	address_space = swap_address_space(entry);
 	spin_lock_irq(&address_space->tree_lock);
-	error = radix_tree_insert(&address_space->page_tree,
-					entry.val, page);
-	if (likely(!error)) {
-		address_space->nrpages++;
-		__inc_zone_page_state(page, NR_FILE_PAGES);
-		INC_CACHE_INFO(add_total);
+	error = __radix_tree_create(&address_space->page_tree, entry.val, 0,
+				&node, &slot);
+	if (error)
+		goto out;
+	if (*slot) {
+		void *p;
+
+		p = radix_tree_deref_slot_protected(slot,
+						&address_space->tree_lock);
+		if (!radix_tree_very_exceptional_entry(p)) {
+			error = -EEXIST;
+			goto out;
+		}
+
+		address_space->nrexceptional--;
+		if (shadow)
+			*shadow = p;
+		if (node)
+			workingset_node_shadows_dec(node);
 	}
+	radix_tree_replace_slot(slot, page);
+	address_space->nrpages++;
+	__inc_zone_page_state(page, NR_FILE_PAGES);
+	INC_CACHE_INFO(add_total);
+	if (node) {
+		workingset_node_pages_inc(node);
+		/*
+		 * Don't track node that contains actual pages.
+		 *
+		 * Avoid acquiring the list_lru lock if already
+		 * untracked.  The list_empty() test is safe as
+		 * node->private_list is protected by
+		 * mapping->tree_lock.
+		 */
+		if (!list_empty(&node->private_list))
+			list_lru_del(&workingset_shadow_nodes,
+				     &node->private_list);
+	}
+
+out:
 	spin_unlock_irq(&address_space->tree_lock);
 
 	if (unlikely(error)) {
@@ -131,23 +166,78 @@  int __add_to_swap_cache(struct page *page, swp_entry_t entry)
 }
 
 
-int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
+int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask,
+		void **shadow)
 {
 	int error;
 
 	error = radix_tree_maybe_preload(gfp_mask);
 	if (!error) {
-		error = __add_to_swap_cache(page, entry);
+		error = __add_to_swap_cache(page, entry, shadow);
 		radix_tree_preload_end();
 	}
 	return error;
 }
+static void page_swap_cache_delete(struct address_space *mapping,
+				   struct page *page, void *shadow)
+{
+	struct radix_tree_node *node;
+	void **slot;
+
+	VM_BUG_ON(!PageLocked(page));
+
+	__radix_tree_lookup(&mapping->page_tree, page_private(page), &node, &slot);
+	radix_tree_clear_tags(&mapping->page_tree, node, slot);
+
+	if (!node) {
+		/*
+		 * We need a node to properly account shadow
+		 * entries. Don't plant any without. XXX
+		 */
+		shadow = NULL;
+	}
+
+	radix_tree_replace_slot(slot, shadow);
+
+	if (shadow) {
+		mapping->nrexceptional++;
+		/*
+		 * Make sure the nrexceptional update is committed before
+		 * the nrpages update so that final truncate racing
+		 * with reclaim does not see both counters 0 at the
+		 * same time and miss a shadow entry.
+		 */
+		smp_wmb();
+	}
+
+	if (!node)
+		return;
+
+	workingset_node_pages_dec(node);
+	if (shadow)
+		workingset_node_shadows_inc(node);
+	else
+		if (__radix_tree_delete_node(&mapping->page_tree, node))
+			return;
+
+	/*
+	 * Track node that only contains shadow entries.
+	 *
+	 * Avoid acquiring the list_lru lock if already tracked.  The
+	 * list_empty() test is safe as node->private_list is
+	 * protected by mapping->tree_lock.
+	 */
+	if (!workingset_node_pages(node) && list_empty(&node->private_list)) {
+		node->private_data = mapping;
+		list_lru_add(&workingset_shadow_nodes, &node->private_list);
+	}
+}
 
 /*
  * This must be called only on pages that have
  * been verified to be in the swap cache.
  */
-void __delete_from_swap_cache(struct page *page)
+void __delete_from_swap_cache(struct page *page, void *shadow)
 {
 	swp_entry_t entry;
 	struct address_space *address_space;
@@ -158,7 +248,7 @@  void __delete_from_swap_cache(struct page *page)
 
 	entry.val = page_private(page);
 	address_space = swap_address_space(entry);
-	radix_tree_delete(&address_space->page_tree, page_private(page));
+	page_swap_cache_delete(address_space, page, shadow);
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
 	address_space->nrpages--;
@@ -203,7 +293,7 @@  int add_to_swap(struct page *page, struct list_head *list)
 	 * Add it to the swap cache.
 	 */
 	err = add_to_swap_cache(page, entry,
-			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
+			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
 
 	if (!err) {
 		return 1;
@@ -232,7 +322,7 @@  void delete_from_swap_cache(struct page *page)
 
 	address_space = swap_address_space(entry);
 	spin_lock_irq(&address_space->tree_lock);
-	__delete_from_swap_cache(page);
+	__delete_from_swap_cache(page, NULL);
 	spin_unlock_irq(&address_space->tree_lock);
 
 	swapcache_free(entry);
@@ -323,6 +413,7 @@  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 {
 	struct page *found_page, *new_page = NULL;
 	struct address_space *swapper_space = swap_address_space(entry);
+	void *shadow = NULL;
 	int err;
 	*new_page_allocated = false;
 
@@ -395,9 +486,13 @@  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
 		__set_page_locked(new_page);
 		SetPageSwapBacked(new_page);
-		err = __add_to_swap_cache(new_page, entry);
+		err = __add_to_swap_cache(new_page, entry, &shadow);
 		if (likely(!err)) {
 			radix_tree_preload_end();
+			if (shadow && workingset_refault(shadow)) {
+				SetPageActive(new_page);
+				workingset_activation(new_page);
+			}
 			/*
 			 * Initiate read into locked page and return.
 			 */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 14043e6bf776..ffc3981c8c60 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1208,7 +1208,7 @@  int reuse_swap_page(struct page *page)
 
 			address_space = swap_address_space(entry);
 			spin_lock_irq(&address_space->tree_lock);
-			__delete_from_swap_cache(page);
+			__delete_from_swap_cache(page, NULL);
 			spin_unlock_irq(&address_space->tree_lock);
 
 			/* the page is still in use, do not uncharge */
diff --git a/mm/tswap.c b/mm/tswap.c
index 112a13d223d6..8b18bd17afcf 100644
--- a/mm/tswap.c
+++ b/mm/tswap.c
@@ -213,7 +213,7 @@  static int tswap_evict_page(struct page *page)
 		goto out_free_swapcache;
 
 	SetPageSwapBacked(page);
-	err = __add_to_swap_cache(page, entry);
+	err = __add_to_swap_cache(page, entry, NULL);
 	if (err) {
 		ClearPageSwapBacked(page);
 		/* __add_to_swap_cache clears page->private on failure */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 583ba1abfc44..fe034747bb31 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -781,8 +781,12 @@  static int __remove_mapping(struct address_space *mapping, struct page *page,
 
 	if (PageSwapCache(page)) {
 		swp_entry_t swap = { .val = page_private(page) };
+		void *shadow = NULL;
+
 		mem_cgroup_swapout(page, swap);
-		__delete_from_swap_cache(page);
+
+		shadow = workingset_eviction(mapping, page);
+		__delete_from_swap_cache(page, shadow);
 		spin_unlock_irq(&mapping->tree_lock);
 		swapcache_free(swap);
 	} else {
diff --git a/mm/workingset.c b/mm/workingset.c
index 0b4cf96bb026..46865ad551ce 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -275,7 +275,8 @@  bool workingset_refault(void *shadow)
 	}
 	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 	refault = atomic_long_read(&lruvec->inactive_age);
-	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+		lruvec_lru_size(lruvec, LRU_ACTIVE_ANON);
 	rcu_read_unlock();
 
 	/*