Message ID | 201708311451.v7VEpP7b016908@finist_ce7.work |
---|---|
State | New |
Series | "ms/mm/huge_memory.c: respect FOLL_FORCE/FOLL_COW for thp" |
Headers | show |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 477610d..5a07e76 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1317,6 +1317,18 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	return ret;
 }
 
+/*
+ * foll_force can write to even unwritable pmd's, but only
+ * after we've gone through a cow cycle and they are dirty.
+ */
+static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
+					unsigned int flags)
+{
+	return pmd_write(pmd) ||
+	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) &&
+		page && PageAnon(page));
+}
+
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 				   unsigned long addr,
 				   pmd_t *pmd,
@@ -1327,9 +1339,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 
 	assert_spin_locked(pmd_lockptr(mm, pmd));
 
-	if (flags & FOLL_WRITE && !pmd_write(*pmd))
-		goto out;
-
 	/* Avoid dumping huge zero page */
 	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
 		return ERR_PTR(-EFAULT);
@@ -1340,6 +1349,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 
 	page = pmd_page(*pmd);
 	VM_BUG_ON_PAGE(!PageHead(page), page);
+
+	if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, page, flags))
+		return NULL;
+
 	if (flags & FOLL_TOUCH) {
 		pmd_t _pmd;
 		/*
Please consider preparing a ReadyKernel patch for it. https://readykernel.com/ -- Best regards, Konstantin Khorenko, Virtuozzo Linux Kernel Team On 08/31/2017 05:51 PM, Konstantin Khorenko wrote: > The commit is pushed to "branch-rh7-3.10.0-514.26.1.vz7.35.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git > after rh7-3.10.0-514.26.1.vz7.35.5 > ------> > commit f5b413ea4e53d819c8b4e4a4927fb563bd3ec24f > Author: Keno Fischer <keno@juliacomputing.com> > Date: Thu Aug 31 17:51:25 2017 +0300 > > ms/mm/huge_memory.c: respect FOLL_FORCE/FOLL_COW for thp > > commit 8310d48b125d19fcd9521d83b8293e63eb1646aa upstream. > > In commit 19be0eaffa3a ("mm: remove gup_flags FOLL_WRITE games from > __get_user_pages()"), the mm code was changed from unsetting FOLL_WRITE > after a COW was resolved to setting the (newly introduced) FOLL_COW > instead. Simultaneously, the check in gup.c was updated to still allow > writes with FOLL_FORCE set if FOLL_COW had also been set. > > However, a similar check in huge_memory.c was forgotten. As a result, > remote memory writes to ro regions of memory backed by transparent huge > pages cause an infinite loop in the kernel (handle_mm_fault sets > FOLL_COW and returns 0 causing a retry, but follow_trans_huge_pmd bails > out immediately because `(flags & FOLL_WRITE) && !pmd_write(*pmd)` is > true). > > While in this state the process is still SIGKILLable, but little else > works (e.g. no ptrace attach, no other signals). 
This is easily > reproduced with the following code (assuming thp are set to always): > > #include <assert.h> > #include <fcntl.h> > #include <stdint.h> > #include <stdio.h> > #include <string.h> > #include <sys/mman.h> > #include <sys/stat.h> > #include <sys/types.h> > #include <sys/wait.h> > #include <unistd.h> > > #define TEST_SIZE 5 * 1024 * 1024 > > int main(void) { > int status; > pid_t child; > int fd = open("/proc/self/mem", O_RDWR); > void *addr = mmap(NULL, TEST_SIZE, PROT_READ, > MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); > assert(addr != MAP_FAILED); > pid_t parent_pid = getpid(); > if ((child = fork()) == 0) { > void *addr2 = mmap(NULL, TEST_SIZE, PROT_READ | PROT_WRITE, > MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); > assert(addr2 != MAP_FAILED); > memset(addr2, 'a', TEST_SIZE); > pwrite(fd, addr2, TEST_SIZE, (uintptr_t)addr); > return 0; > } > assert(child == waitpid(child, &status, 0)); > assert(WIFEXITED(status) && WEXITSTATUS(status) == 0); > return 0; > } > > Fix this by updating follow_trans_huge_pmd in huge_memory.c analogously > to the update in gup.c in the original commit. The same pattern exists > in follow_devmap_pmd. However, we should not be able to reach that > check with FOLL_COW set, so add WARN_ONCE to make sure we notice if we > ever do. > > [akpm@linux-foundation.org: coding-style fixes] > Link: http://lkml.kernel.org/r/20170106015025.GA38411@juliacomputing.com > Signed-off-by: Keno Fischer <keno@juliacomputing.com> > Acked-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> > Cc: Greg Thelen <gthelen@google.com> > Cc: Nicholas Piggin <npiggin@gmail.com> > Cc: Willy Tarreau <w@1wt.eu> > Cc: Oleg Nesterov <oleg@redhat.com> > Cc: Kees Cook <keescook@chromium.org> > Cc: Andy Lutomirski <luto@kernel.org> > Cc: Michal Hocko <mhocko@suse.com> > Cc: Hugh Dickins <hughd@google.com> > Signed-off-by: Andrew Morton <akpm@linux-foundation.org> > Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> > [bwh: Backported to 3.2: > - Drop change to follow_devmap_pmd() > - pmd_dirty() is not available; check the page flags as in > can_follow_write_pte() > - Adjust context] > Signed-off-by: Ben Hutchings <ben@decadent.org.uk> > [mhocko: > This has been forward ported from the 3.2 stable tree. > And fixed to return NULL.] > Reviewed-by: Michal Hocko <mhocko@suse.cz> > Signed-off-by: Jiri Slaby <jslaby@suse.cz> > Signed-off-by: Willy Tarreau <w@1wt.eu> > > https://jira.sw.ru/browse/PSBM-70151 > > Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com> > --- > mm/huge_memory.c | 19 ++++++++++++++++--- > 1 file changed, 16 insertions(+), 3 deletions(-) > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 477610d..5a07e76 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -1317,6 +1317,18 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, > return ret; > } > > +/* > + * foll_force can write to even unwritable pmd's, but only > + * after we've gone through a cow cycle and they are dirty. 
> + */ > +static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, > + unsigned int flags) > +{ > + return pmd_write(pmd) || > + ((flags & FOLL_FORCE) && (flags & FOLL_COW) && > + page && PageAnon(page)); > +} > + > struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, > unsigned long addr, > pmd_t *pmd, > @@ -1327,9 +1339,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, > > assert_spin_locked(pmd_lockptr(mm, pmd)); > > - if (flags & FOLL_WRITE && !pmd_write(*pmd)) > - goto out; > - > /* Avoid dumping huge zero page */ > if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) > return ERR_PTR(-EFAULT); > @@ -1340,6 +1349,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, > > page = pmd_page(*pmd); > VM_BUG_ON_PAGE(!PageHead(page), page); > + > + if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, page, flags)) > + return NULL; > + > if (flags & FOLL_TOUCH) { > pmd_t _pmd; > /* > . >