[RHEL7,COMMIT] ms/Revert "mm: page_alloc: skip over regions of invalid pfns where possible"

Submitted by Konstantin Khorenko on April 27, 2018, 10 a.m.

Details

Message ID 201804271000.w3RA0ufj032197@finist_ce7.work
State New
Series "mm/page_alloc: fix false positive BUG_ON on uninitialized page structure due to pageblock alignment"
Headers show

Commit Message

Konstantin Khorenko April 27, 2018, 10 a.m.
The commit is pushed to "branch-rh7-3.10.0-693.21.1.vz7.47.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.21.1.vz7.47.1
------>
commit 70b9d7d5791d062e18e47565f463ad6face86623
Author: Daniel Vacek <neelx@redhat.com>
Date:   Fri Apr 27 13:00:55 2018 +0300

    ms/Revert "mm: page_alloc: skip over regions of invalid pfns where possible"
    
    This reverts commit b92df1de5d28 ("mm: page_alloc: skip over regions of
    invalid pfns where possible").  The commit is meant to be a boot init
    speed up skipping the loop in memmap_init_zone() for invalid pfns.
    
    But given some specific memory mapping on x86_64 (or more generally
    theoretically anywhere but on arm with CONFIG_HAVE_ARCH_PFN_VALID) the
    implementation also skips valid pfns which is plain wrong and causes
    'kernel BUG at mm/page_alloc.c:1389!'
    
      crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1
      kernel BUG at mm/page_alloc.c:1389!
      invalid opcode: 0000 [#1] SMP
      --
      RIP: 0010: move_freepages+0x15e/0x160
      --
      Call Trace:
        move_freepages_block+0x73/0x80
        __rmqueue+0x263/0x460
        get_page_from_freelist+0x7e1/0x9e0
        __alloc_pages_nodemask+0x176/0x420
      --
    
      crash> page_init_bug -v | grep RAM
      <struct resource 0xffff88067fffd2f8>          1000 -        9bfff       System RAM (620.00 KiB)
      <struct resource 0xffff88067fffd3a0>        100000 -     430bffff       System RAM (  1.05 GiB = 1071.75 MiB = 1097472.00 KiB)
      <struct resource 0xffff88067fffd410>      4b0c8000 -     4bf9cfff       System RAM ( 14.83 MiB = 15188.00 KiB)
      <struct resource 0xffff88067fffd480>      4bfac000 -     646b1fff       System RAM (391.02 MiB = 400408.00 KiB)
      <struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
      <struct resource 0xffff88067fffd640>     100000000 -    67fffffff       System RAM ( 22.00 GiB)
    
      crash> page_init_bug | head -6
      <struct resource 0xffff88067fffd560>      7b788000 -     7b7fffff       System RAM (480.00 KiB)
      <struct page 0xffffea0001ede200>   1fffff00000000  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
      <struct page 0xffffea0001ede200>       505736 505344 <struct page 0xffffea0001ed8000> 505855 <struct page 0xffffea0001edffc0>
      <struct page 0xffffea0001ed8000>                0  0 <struct pglist_data 0xffff88047ffd9000> 0 <struct zone 0xffff88047ffd9000> DMA               1       4095
      <struct page 0xffffea0001edffc0>   1fffff00000400  0 <struct pglist_data 0xffff88047ffd9000> 1 <struct zone 0xffff88047ffd9800> DMA32          4096    1048575
      BUG, zones differ!
    
      crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000
            PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
      ffffea0001e00000  78000000                0        0  0 0
      ffffea0001ed7fc0  7b5ff000                0        0  0 0
      ffffea0001ed8000  7b600000                0        0  0 0       <<<<
      ffffea0001ede1c0  7b787000                0        0  0 0
      ffffea0001ede200  7b788000                0        0  1 1fffff00000000
    
    Link: http://lkml.kernel.org/r/20180316143855.29838-1-neelx@redhat.com
    Fixes: b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible")
    Signed-off-by: Daniel Vacek <neelx@redhat.com>
    Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
    Acked-by: Michal Hocko <mhocko@suse.com>
    Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
    Cc: Vlastimil Babka <vbabka@suse.cz>
    Cc: Mel Gorman <mgorman@techsingularity.net>
    Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
    Cc: Paul Burton <paul.burton@imgtec.com>
    Cc: <stable@vger.kernel.org>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    
    Just an additional fix while investigating:
    https://jira.sw.ru/browse/PSBM-83746
    
    (cherry picked from commit f59f1caf72ba00d519c793c3deb32cd3be32edc2)
    Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>
---
 include/linux/memblock.h |  1 -
 mm/memblock.c            | 28 ----------------------------
 mm/page_alloc.c          | 11 +----------
 3 files changed, 1 insertion(+), 39 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 5a439c937c3c..dad171f37f66 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -186,7 +186,6 @@  int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
 			    unsigned long  *end_pfn);
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
 			  unsigned long *out_end_pfn, int *out_nid);
-unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
 
 /**
  * for_each_mem_pfn_range - early memory pfn range iterator
diff --git a/mm/memblock.c b/mm/memblock.c
index fbc8071d9f43..e08475cd1c0a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1061,34 +1061,6 @@  void __init_memblock __next_mem_pfn_range(int *idx, int nid,
 		*out_nid = r->nid;
 }
 
-unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
-						      unsigned long max_pfn)
-{
-	struct memblock_type *type = &memblock.memory;
-	unsigned int right = type->cnt;
-	unsigned int mid, left = 0;
-	phys_addr_t addr = PFN_PHYS(pfn + 1);
-
-	do {
-		mid = (right + left) / 2;
-
-		if (addr < type->regions[mid].base)
-			right = mid;
-		else if (addr >= (type->regions[mid].base +
-				  type->regions[mid].size))
-			left = mid + 1;
-		else {
-			/* addr is within the region, so pfn + 1 is valid */
-			return min(pfn + 1, max_pfn);
-		}
-	} while (left < right);
-
-	if (right == type->cnt)
-		return max_pfn;
-	else
-		return min(PHYS_PFN(type->regions[right].base), max_pfn);
-}
-
 /**
  * memblock_set_node - set node ID on memblock regions
  * @base: base of area to set node ID for
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7fa5f026434c..e658abb6ff10 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4717,17 +4717,8 @@  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		if (context != MEMMAP_EARLY)
 			goto not_early;
 
-		if (!early_pfn_valid(pfn)) {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-			/*
-			 * Skip to the pfn preceding the next valid one (or
-			 * end_pfn), such that we hit a valid pfn (or end_pfn)
-			 * on our next iteration of the loop.
-			 */
-			pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
-#endif
+		if (!early_pfn_valid(pfn))
 			continue;
-		}
 		if (!early_pfn_in_nid(pfn, nid))
 			continue;
 		if (!update_defer_init(pgdat, pfn, size, end_pfn,