[RHEL7,COMMIT] ms/mm, swap: fix race between swap count continuation operations

Submitted by Konstantin Khorenko on July 2, 2018, 10:17 a.m.

Details

Message ID 201807021017.w62AHH2Z001305@finist_ce7.work
State New
Series "Series without cover letter"

Commit Message

The commit is pushed to "branch-rh7-3.10.0-862.3.2.vz7.61.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-862.3.2.vz7.61.7
------>
commit 757082d5c9cb56745148b7bed529f7436cd1b322
Author: Huang Ying <ying.huang@intel.com>
Date:   Mon Jul 2 13:17:17 2018 +0300

    ms/mm, swap: fix race between swap count continuation operations
    
    One page may store a set of sis->swap_map
    (swap_info_struct->swap_map) entries that span multiple swap clusters.
    
    If some of those entries have sis->swap_map[offset] > SWAP_MAP_MAX,
    multiple continuation pages are used to store the overflowing part of
    the counts, and these pages are linked together via page->lru.  This
    is called swap count continuation.  Previously, sis->lock was used to
    serialize access to these continuation pages.  But to improve the
    scalability of __swap_duplicate(), the swap cluster lock may now be
    used in swap_count_continued().  This may race with
    add_swap_count_continuation() operating on a nearby swap cluster whose
    sis->swap_map entries are stored in the same page.
    
    In practice, this race can corrupt the swap count, leading to
    unfreeable swap entries, software lockups, etc.
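    
    To make the race concrete, here is a minimal userspace analogue in
    plain C with pthreads (a sketch, not kernel code; cluster_lock,
    cont_page_count, bump and ITERS are illustrative names).  Two threads
    each hold a different per-"cluster" lock, yet both perform a
    read-modify-write on a count that lives in the same shared
    "continuation page", so increments can be lost:
    
        /*
         * Userspace sketch of the race.  Two "clusters" share one
         * continuation page; each thread takes only its own cluster
         * lock, so writes to the shared count may be lost.
         */
        #include <pthread.h>
        #include <stdio.h>
        
        #define ITERS 1000000
        
        static pthread_mutex_t cluster_lock[2] = {
            PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
        };
        static int cont_page_count;  /* shared by both clusters */
        
        static void *bump(void *arg)
        {
            int cluster = (int)(long)arg;
        
            for (int i = 0; i < ITERS; i++) {
                pthread_mutex_lock(&cluster_lock[cluster]);
                cont_page_count++;  /* racy read-modify-write */
                pthread_mutex_unlock(&cluster_lock[cluster]);
            }
            return NULL;
        }
        
        int main(void)
        {
            pthread_t t0, t1;
        
            pthread_create(&t0, NULL, bump, (void *)0L);
            pthread_create(&t1, NULL, bump, (void *)1L);
            pthread_join(t0, NULL);
            pthread_join(t1, NULL);
        
            /* Expect 2*ITERS; typically less: increments are lost. */
            printf("count = %d (expected %d)\n",
                   cont_page_count, 2 * ITERS);
            return 0;
        }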
    
    To fix the race, a new spin lock called cont_lock is added to struct
    swap_info_struct to protect the swap count continuation page list.
    This is a lock at the swap device level, so its scalability is not
    great.  But it is still much better than the original sis->lock,
    because it is only acquired/released when swap count continuation is
    actually used, which is rare in practice.  If scalability turns out to
    be an issue for some workloads, the lock can be split into
    finer-grained locks.
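    
    Continuing the userspace sketch above (again illustrative, not the
    kernel implementation): the fix corresponds to one device-level lock
    taken only around the shared continuation page, nested inside the
    per-cluster lock, mirroring si->cont_lock in the patch below:
    
        /* Device-level lock, analogous to si->cont_lock. */
        static pthread_mutex_t cont_lock = PTHREAD_MUTEX_INITIALIZER;
        
        static void *bump_fixed(void *arg)
        {
            int cluster = (int)(long)arg;
        
            for (int i = 0; i < ITERS; i++) {
                pthread_mutex_lock(&cluster_lock[cluster]);
                /*
                 * cont_lock is taken only for the (rare) continuation
                 * page access, so the common per-cluster paths stay
                 * scalable.
                 */
                pthread_mutex_lock(&cont_lock);
                cont_page_count++;  /* now a serialized RMW */
                pthread_mutex_unlock(&cont_lock);
                pthread_mutex_unlock(&cluster_lock[cluster]);
            }
            return NULL;
        }
    
    Running the same main() with bump_fixed instead of bump yields the
    expected count.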
    
    Link: http://lkml.kernel.org/r/20171017081320.28133-1-ying.huang@intel.com
    Fixes: 235b62176712 ("mm/swap: add cluster lock")
    Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
    Cc: Johannes Weiner <hannes@cmpxchg.org>
    Cc: Shaohua Li <shli@kernel.org>
    Cc: Tim Chen <tim.c.chen@intel.com>
    Cc: Michal Hocko <mhocko@suse.com>
    Cc: Aaron Lu <aaron.lu@intel.com>
    Cc: Dave Hansen <dave.hansen@intel.com>
    Cc: Andi Kleen <ak@linux.intel.com>
    Cc: Minchan Kim <minchan@kernel.org>
    Cc: Hugh Dickins <hughd@google.com>
    Cc: <stable@vger.kernel.org>    [4.11+]
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    
    https://jira.sw.ru/browse/PSBM-86091
    (cherry picked from commit 2628bd6fc052bd85e9864dae4de494d8a6313391)
    Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
---
 include/linux/swap.h |  4 ++++
 mm/swapfile.c        | 23 +++++++++++++++++------
 2 files changed, 21 insertions(+), 6 deletions(-)


diff --git a/include/linux/swap.h b/include/linux/swap.h
index 439097e86af6..c7adb0bf6e5b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -281,6 +281,10 @@ struct swap_info_struct {
 	RH_KABI_EXTEND(struct work_struct discard_work) /* discard worker */
 	RH_KABI_EXTEND(struct swap_cluster_list discard_clusters) /* discard clusters list */
 	RH_KABI_EXTEND(struct percpu_cluster __percpu *percpu_cluster) /* per cpu's swap location */
+	spinlock_t cont_lock;		/*
+					 * protect swap count continuation page
+					 * list.
+					 */
 };
 
 /* linux/mm/workingset.c */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f6f09d2756ff..a174990d72b9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2487,6 +2487,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 	p->flags = SWP_USED;
 	spin_unlock(&swap_lock);
 	spin_lock_init(&p->lock);
+	spin_lock_init(&p->cont_lock);
 
 	return p;
 }
@@ -3162,6 +3163,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 	head = vmalloc_to_page(si->swap_map + offset);
 	offset &= ~PAGE_MASK;
 
+	spin_lock(&si->cont_lock);
 	/*
 	 * Page allocation does not initialize the page's lru field,
 	 * but it does always reset its private field.
@@ -3181,7 +3183,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 		 * a continuation page, free our allocation and use this one.
 		 */
 		if (!(count & COUNT_CONTINUED))
-			goto out;
+			goto out_unlock_cont;
 
 		map = kmap_atomic(list_page) + offset;
 		count = *map;
@@ -3192,11 +3194,13 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 		 * free our allocation and use this one.
 		 */
 		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
-			goto out;
+			goto out_unlock_cont;
 	}
 
 	list_add_tail(&page->lru, &head->lru);
 	page = NULL;			/* now it's attached, don't free it */
+out_unlock_cont:
+	spin_unlock(&si->cont_lock);
 out:
 	unlock_cluster(ci);
 	spin_unlock(&si->lock);
@@ -3221,6 +3225,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
 	struct page *head;
 	struct page *page;
 	unsigned char *map;
+	bool ret;
 
 	head = vmalloc_to_page(si->swap_map + offset);
 	if (page_private(head) != SWP_CONTINUED) {
@@ -3228,6 +3233,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
 		return false;		/* need to add count continuation */
 	}
 
+	spin_lock(&si->cont_lock);
 	offset &= ~PAGE_MASK;
 	page = list_entry(head->lru.next, struct page, lru);
 	map = kmap_atomic(page) + offset;
@@ -3248,8 +3254,10 @@ static bool swap_count_continued(struct swap_info_struct *si,
 		if (*map == SWAP_CONT_MAX) {
 			kunmap_atomic(map);
 			page = list_entry(page->lru.next, struct page, lru);
-			if (page == head)
-				return false;	/* add count continuation */
+			if (page == head) {
+				ret = false;	/* add count continuation */
+				goto out;
+			}
 			map = kmap_atomic(page) + offset;
 init_map:		*map = 0;		/* we didn't zero the page */
 		}
@@ -3262,7 +3270,7 @@ init_map:		*map = 0;		/* we didn't zero the page */
 			kunmap_atomic(map);
 			page = list_entry(page->lru.prev, struct page, lru);
 		}
-		return true;			/* incremented */
+		ret = true;			/* incremented */
 
 	} else {				/* decrementing */
 		/*
@@ -3288,8 +3296,11 @@ init_map:		*map = 0;		/* we didn't zero the page */
 			kunmap_atomic(map);
 			page = list_entry(page->lru.prev, struct page, lru);
 		}
-		return count == COUNT_CONTINUED;
+		ret = count == COUNT_CONTINUED;
 	}
+out:
+	spin_unlock(&si->cont_lock);
+	return ret;
 }
 
 /*
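
The resulting lock nesting, as read from the diff above (a summary, not
code from the tree):

    add_swap_count_continuation():  si->lock -> cluster lock -> si->cont_lock
    swap_count_continued():         cluster lock or si->lock (held by caller) -> si->cont_lock

si->cont_lock is always innermost, so the new lock introduces no ordering
constraint against the existing si->lock and cluster locks.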