[RHEL8,COMMIT] userfaultfd: wp: support swap and page migration

Submitted by Konstantin Khorenko on April 20, 2020, 7:34 a.m.

Details

Message ID 202004200734.03K7Yg1g016816@finist_co8.work.ct
State New
Series "Series without cover letter"
Headers show

Commit Message

Konstantin Khorenko April 20, 2020, 7:34 a.m.
The commit is pushed to "branch-rh8-4.18.0-80.1.2.vz8.3.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-80.1.2.vz8.3.6
------>
commit 020b23314620b6cdfbb9ca434ca67d00a4801642
Author: Peter Xu <peterx@redhat.com>
Date:   Mon Apr 20 10:34:41 2020 +0300

    userfaultfd: wp: support swap and page migration
    
    For either swap and page migration, we all use the bit 2 of the entry to
    identify whether this entry is uffd write-protected.  It plays a similar
    role as the existing soft dirty bit in swap entries but only for keeping
    the uffd-wp tracking for a specific PTE/PMD.
    
    Something special here is that when we want to recover the uffd-wp bit
    from a swap/migration entry to the PTE bit we'll also need to take care of
    the _PAGE_RW bit and make sure it's cleared, otherwise even with the
    _PAGE_UFFD_WP bit we can't trap it at all.
    
    In change_pte_range() we do nothing for uffd if the PTE is a swap entry.
    That can lead to data mismatch if the page that we are going to write
    protect is swapped out when sending the UFFDIO_WRITEPROTECT.  This patch
    also applies/removes the uffd-wp bit even for the swap entries.
    
    Signed-off-by: Peter Xu <peterx@redhat.com>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Cc: Andrea Arcangeli <aarcange@redhat.com>
    Cc: Bobby Powers <bobbypowers@gmail.com>
    Cc: Brian Geffon <bgeffon@google.com>
    Cc: David Hildenbrand <david@redhat.com>
    Cc: Denis Plotnikov <dplotnikov@virtuozzo.com>
    Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
    Cc: Hugh Dickins <hughd@google.com>
    Cc: Jerome Glisse <jglisse@redhat.com>
    Cc: Johannes Weiner <hannes@cmpxchg.org>
    Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
    Cc: Martin Cracauer <cracauer@cons.org>
    Cc: Marty McFadden <mcfadden8@llnl.gov>
    Cc: Maya Gokhale <gokhale2@llnl.gov>
    Cc: Mel Gorman <mgorman@suse.de>
    Cc: Mike Kravetz <mike.kravetz@oracle.com>
    Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
    Cc: Pavel Emelyanov <xemul@openvz.org>
    Cc: Rik van Riel <riel@redhat.com>
    Cc: Shaohua Li <shli@fb.com>
    Link: http://lkml.kernel.org/r/20200220163112.11409-11-peterx@redhat.com
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    
    https://jira.sw.ru/browse/PSBM-102938
    (cherry picked from commit f45ec5ff16a75f96dac8c89862d75f1d8739efd4)
    Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
---
 include/linux/swapops.h |  2 ++
 mm/huge_memory.c        |  3 +++
 mm/memory.c             |  8 ++++++++
 mm/migrate.c            |  6 ++++++
 mm/mprotect.c           | 28 +++++++++++++++++-----------
 mm/rmap.c               |  6 ++++++
 6 files changed, 42 insertions(+), 11 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 1d3877c39a00..affbbbe7abcb 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -69,6 +69,8 @@  static inline swp_entry_t pte_to_swp_entry(pte_t pte)
 
 	if (pte_swp_soft_dirty(pte))
 		pte = pte_swp_clear_soft_dirty(pte);
+	if (pte_swp_uffd_wp(pte))
+		pte = pte_swp_clear_uffd_wp(pte);
 	arch_entry = __pte_to_swp_entry(pte);
 	return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b26c2daf3547..78c5eef073dd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2154,6 +2154,7 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		write = is_write_migration_entry(entry);
 		young = false;
 		soft_dirty = pmd_swp_soft_dirty(old_pmd);
+		uffd_wp = pmd_swp_uffd_wp(old_pmd);
 	} else {
 		page = pmd_page(old_pmd);
 		if (pmd_dirty(old_pmd))
@@ -2186,6 +2187,8 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			entry = swp_entry_to_pte(swp_entry);
 			if (soft_dirty)
 				entry = pte_swp_mksoft_dirty(entry);
+			if (uffd_wp)
+				entry = pte_swp_mkuffd_wp(entry);
 		} else {
 			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
 			entry = maybe_mkwrite(entry, vma);
diff --git a/mm/memory.c b/mm/memory.c
index 7e0189d729e3..c256bf4d297c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -985,6 +985,8 @@  copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 				pte = swp_entry_to_pte(entry);
 				if (pte_swp_soft_dirty(*src_pte))
 					pte = pte_swp_mksoft_dirty(pte);
+				if (pte_swp_uffd_wp(*src_pte))
+					pte = pte_swp_mkuffd_wp(pte);
 				set_pte_at(src_mm, addr, src_pte, pte);
 			}
 		} else if (is_device_private_entry(entry)) {
@@ -1014,6 +1016,8 @@  copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			    is_cow_mapping(vm_flags)) {
 				make_device_private_entry_read(&entry);
 				pte = swp_entry_to_pte(entry);
+				if (pte_swp_uffd_wp(*src_pte))
+					pte = pte_swp_mkuffd_wp(pte);
 				set_pte_at(src_mm, addr, src_pte, pte);
 			}
 		}
@@ -3079,6 +3083,10 @@  int do_swap_page(struct vm_fault *vmf)
 	flush_icache_page(vma, page);
 	if (pte_swp_soft_dirty(vmf->orig_pte))
 		pte = pte_mksoft_dirty(pte);
+	if (pte_swp_uffd_wp(vmf->orig_pte)) {
+		pte = pte_mkuffd_wp(pte);
+		pte = pte_wrprotect(pte);
+	}
 	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
 	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
 	vmf->orig_pte = pte;
diff --git a/mm/migrate.c b/mm/migrate.c
index cc06cfef2fa1..1814dea49329 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -241,11 +241,15 @@  static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 		entry = pte_to_swp_entry(*pvmw.pte);
 		if (is_write_migration_entry(entry))
 			pte = maybe_mkwrite(pte, vma);
+		else if (pte_swp_uffd_wp(*pvmw.pte))
+			pte = pte_mkuffd_wp(pte);
 
 		if (unlikely(is_zone_device_page(new))) {
 			if (is_device_private_page(new)) {
 				entry = make_device_private_entry(new, pte_write(pte));
 				pte = swp_entry_to_pte(entry);
+				if (pte_swp_uffd_wp(*pvmw.pte))
+					pte = pte_mkuffd_wp(pte);
 			} else if (is_device_public_page(new)) {
 				pte = pte_mkdevmap(pte);
 				flush_dcache_page(new);
@@ -2324,6 +2328,8 @@  static int migrate_vma_collect_pmd(pmd_t *pmdp,
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pte))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			if (pte_uffd_wp(pte))
+				swp_pte = pte_swp_mkuffd_wp(swp_pte);
 			set_pte_at(mm, addr, ptep, swp_pte);
 
 			/*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 2df60e64f139..475e18ba7131 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -140,11 +140,11 @@  static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			}
 			ptep_modify_prot_commit(mm, addr, pte, ptent);
 			pages++;
-		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
+		} else if (is_swap_pte(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
+			pte_t newpte;
 
 			if (is_write_migration_entry(entry)) {
-				pte_t newpte;
 				/*
 				 * A protection check is difficult so
 				 * just be safe and disable write
@@ -153,22 +153,28 @@  static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				newpte = swp_entry_to_pte(entry);
 				if (pte_swp_soft_dirty(oldpte))
 					newpte = pte_swp_mksoft_dirty(newpte);
-				set_pte_at(mm, addr, pte, newpte);
-
-				pages++;
-			}
-
-			if (is_write_device_private_entry(entry)) {
-				pte_t newpte;
-
+				if (pte_swp_uffd_wp(oldpte))
+					newpte = pte_swp_mkuffd_wp(newpte);
+			} else if (is_write_device_private_entry(entry)) {
 				/*
 				 * We do not preserve soft-dirtiness. See
 				 * copy_one_pte() for explanation.
 				 */
 				make_device_private_entry_read(&entry);
 				newpte = swp_entry_to_pte(entry);
-				set_pte_at(mm, addr, pte, newpte);
+				if (pte_swp_uffd_wp(oldpte))
+					newpte = pte_swp_mkuffd_wp(newpte);
+			} else {
+				newpte = oldpte;
+			}
 
+			if (uffd_wp)
+				newpte = pte_swp_mkuffd_wp(newpte);
+			else if (uffd_wp_resolve)
+				newpte = pte_swp_clear_uffd_wp(newpte);
+
+			if (!pte_same(oldpte, newpte)) {
+				set_pte_at(mm, addr, pte, newpte);
 				pages++;
 			}
 		}
diff --git a/mm/rmap.c b/mm/rmap.c
index eb477809a5c0..d9f37eccbc02 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1427,6 +1427,8 @@  static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pteval))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			if (pte_uffd_wp(pteval))
+				swp_pte = pte_swp_mkuffd_wp(swp_pte);
 			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
 			/*
 			 * No need to invalidate here it will synchronize on
@@ -1519,6 +1521,8 @@  static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pteval))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			if (pte_uffd_wp(pteval))
+				swp_pte = pte_swp_mkuffd_wp(swp_pte);
 			set_pte_at(mm, address, pvmw.pte, swp_pte);
 			/*
 			 * No need to invalidate here it will synchronize on
@@ -1585,6 +1589,8 @@  static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 			swp_pte = swp_entry_to_pte(entry);
 			if (pte_soft_dirty(pteval))
 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
+			if (pte_uffd_wp(pteval))
+				swp_pte = pte_swp_mkuffd_wp(swp_pte);
 			set_pte_at(mm, address, pvmw.pte, swp_pte);
 			/* Invalidate as we cleared the pte */
 			mmu_notifier_invalidate_range(mm, address,