From 12564485ed8caac3c18572793ec01330792c7191 Mon Sep 17 00:00:00 2001 From: Shawn Anastasio Date: Fri, 21 Aug 2020 13:55:56 -0500 Subject: Revert "powerpc/64s: Remove PROT_SAO support" This reverts commit 5c9fa16e8abd342ce04dc830c1ebb2a03abf6c05. Since PROT_SAO can still be useful for certain classes of software, reintroduce it. Concerns about guest migration for LPARs using SAO will be addressed next. Signed-off-by: Shawn Anastasio Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200821185558.35561-2-shawn@anastas.io --- mm/ksm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 0aa2247bddd7..90a625b02a1d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2453,6 +2453,10 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (vma_is_dax(vma)) return 0; +#ifdef VM_SAO + if (*vm_flags & VM_SAO) + return 0; +#endif #ifdef VM_SPARC_ADI if (*vm_flags & VM_SPARC_ADI) return 0; -- cgit v1.2.3 From 9fa2dd946743ae6f30dc4830da19147bf100a7f2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 3 Sep 2020 13:40:28 -0700 Subject: mm: fix pin vs. gup mismatch with gate pages Gate pages were missed when converting from get to pin_user_pages(). This can lead to refcount imbalances. This is reliably and quickly reproducible running the x86 selftests when vsyscall=emulate is enabled (the default). Fix by using try_grab_page() with appropriate flags passed. The long story: Today, pin_user_pages() and get_user_pages() are similar interfaces for manipulating page reference counts. However, "pins" use a "bias" value and manipulate the actual reference count by 1024 instead of 1 used by plain "gets". That means that pin_user_pages() must be matched with unpin_user_pages() and can't be mixed with a plain put_user_pages() or put_page(). Enter gate pages, like the vsyscall page. They are pages usually in the kernel image, but which are mapped to userspace. Userspace is allowed access to them, including interfaces using get/pin_user_pages(). The refcount of these kernel pages is manipulated just like a normal user page on the get/pin side so that the put/unpin side can work the same for normal user pages or gate pages. get_gate_page() uses try_get_page() which only bumps the refcount by 1, not 1024, even if called in the pin_user_pages() path. If someone pins a gate page, this happens: pin_user_pages() get_gate_page() try_get_page() // bump refcount +1 ... some time later unpin_user_pages() page_ref_sub_and_test(page, 1024)) ... and boom, we get a refcount off by 1023. This is reliably and quickly reproducible running the x86 selftests when booted with vsyscall=emulate (the default). The selftests use ptrace(), but I suspect anything using pin_user_pages() on gate pages could hit this. To fix it, simply use try_grab_page() instead of try_get_page(), and pass 'gup_flags' in so that FOLL_PIN can be respected. This bug traces back to the very beginning of the FOLL_PIN support in commit 3faa52c03f44 ("mm/gup: track FOLL_PIN pages"), which showed up in the 5.7 release. Signed-off-by: Dave Hansen Fixes: 3faa52c03f44 ("mm/gup: track FOLL_PIN pages") Reported-by: Peter Zijlstra Reviewed-by: John Hubbard Acked-by: Andy Lutomirski Cc: x86@kernel.org Cc: Jann Horn Cc: Andrew Morton Cc: Kirill A. Shutemov Signed-off-by: Linus Torvalds --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 6f47697f8fb0..0d8d76f10ac6 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -843,7 +843,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, goto unmap; *page = pte_page(*pte); } - if (unlikely(!try_get_page(*page))) { + if (unlikely(!try_grab_page(*page, gup_flags))) { ret = -ENOMEM; goto unmap; } -- cgit v1.2.3 From 09854ba94c6aad7886996bfbee2530b3d8a7f4f4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 21 Aug 2020 19:49:55 -0400 Subject: mm: do_wp_page() simplification How about we just make sure we're the only possible valid user fo the page before we bother to reuse it? Simplify, simplify, simplify. And get rid of the nasty serialization on the page lock at the same time. [peterx: add subject prefix] Signed-off-by: Linus Torvalds Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- mm/memory.c | 59 +++++++++++++++++------------------------------------------ 1 file changed, 17 insertions(+), 42 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 3ecad55103ad..56ae945c6845 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2924,50 +2924,25 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * not dirty accountable. */ if (PageAnon(vmf->page)) { - int total_map_swapcount; - if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) || - page_count(vmf->page) != 1)) + struct page *page = vmf->page; + + /* PageKsm() doesn't necessarily raise the page refcount */ + if (PageKsm(page) || page_count(page) != 1) + goto copy; + if (!trylock_page(page)) + goto copy; + if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) { + unlock_page(page); goto copy; - if (!trylock_page(vmf->page)) { - get_page(vmf->page); - pte_unmap_unlock(vmf->pte, vmf->ptl); - lock_page(vmf->page); - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); - if (!pte_same(*vmf->pte, vmf->orig_pte)) { - update_mmu_tlb(vma, vmf->address, vmf->pte); - unlock_page(vmf->page); - pte_unmap_unlock(vmf->pte, vmf->ptl); - put_page(vmf->page); - return 0; - } - put_page(vmf->page); - } - if (PageKsm(vmf->page)) { - bool reused = reuse_ksm_page(vmf->page, vmf->vma, - vmf->address); - unlock_page(vmf->page); - if (!reused) - goto copy; - wp_page_reuse(vmf); - return VM_FAULT_WRITE; - } - if (reuse_swap_page(vmf->page, &total_map_swapcount)) { - if (total_map_swapcount == 1) { - /* - * The page is all ours. Move it to - * our anon_vma so the rmap code will - * not search our parent or siblings. - * Protected against the rmap code by - * the page lock. - */ - page_move_anon_rmap(vmf->page, vma); - } - unlock_page(vmf->page); - wp_page_reuse(vmf); - return VM_FAULT_WRITE; } - unlock_page(vmf->page); + /* + * Ok, we've got the only map reference, and the only + * page count reference, and the page is locked, + * it's dark out, and we're wearing sunglasses. Hit it. + */ + wp_page_reuse(vmf); + unlock_page(page); + return VM_FAULT_WRITE; } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { return wp_page_shared(vmf); -- cgit v1.2.3 From 1a0cf26323c80e2f1c58fc04f15686de61bfab0c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 21 Aug 2020 19:49:56 -0400 Subject: mm/ksm: Remove reuse_ksm_page() Remove the function as the last reference has gone away with the do_wp_page() changes. Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 7 ------- mm/ksm.c | 25 ------------------------- 2 files changed, 32 deletions(-) (limited to 'mm') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index e48b1e453ff5..161e8164abcf 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -53,8 +53,6 @@ struct page *ksm_might_need_to_copy(struct page *page, void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); -bool reuse_ksm_page(struct page *page, - struct vm_area_struct *vma, unsigned long address); #else /* !CONFIG_KSM */ @@ -88,11 +86,6 @@ static inline void rmap_walk_ksm(struct page *page, static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) { } -static inline bool reuse_ksm_page(struct page *page, - struct vm_area_struct *vma, unsigned long address) -{ - return false; -} #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ diff --git a/mm/ksm.c b/mm/ksm.c index 4102034cd55a..12958287fac3 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2660,31 +2660,6 @@ again: goto again; } -bool reuse_ksm_page(struct page *page, - struct vm_area_struct *vma, - unsigned long address) -{ -#ifdef CONFIG_DEBUG_VM - if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || - WARN_ON(!page_mapped(page)) || - WARN_ON(!PageLocked(page))) { - dump_page(page, "reuse_ksm_page"); - return false; - } -#endif - - if (PageSwapCache(page) || !page_stable_node(page)) - return false; - /* Prohibit parallel get_ksm_page() */ - if (!page_ref_freeze(page, 1)) - return false; - - page_move_anon_rmap(page, vma); - page->index = linear_page_index(vma, address); - page_ref_unfreeze(page, 1); - - return true; -} #ifdef CONFIG_MIGRATION void ksm_migrate_page(struct page *newpage, struct page *oldpage) { -- cgit v1.2.3 From a308c71bf1e6e19cc2e4ced31853ee0fc7cb439a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 21 Aug 2020 19:49:57 -0400 Subject: mm/gup: Remove enfornced COW mechanism With the more strict (but greatly simplified) page reuse logic in do_wp_page(), we can safely go back to the world where cow is not enforced with writes. This essentially reverts commit 17839856fd58 ("gup: document and work around 'COW can break either way' issue"). There are some context differences due to some changes later on around it: 2170ecfa7688 ("drm/i915: convert get_user_pages() --> pin_user_pages()", 2020-06-03) 376a34efa4ee ("mm/gup: refactor and de-duplicate gup_fast() code", 2020-06-03) Some lines moved back and forth with those, but this revert patch should have striped out and covered all the enforced cow bits anyways. Suggested-by: Linus Torvalds Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 8 ------ mm/gup.c | 40 ++++------------------------- mm/huge_memory.c | 7 ++--- 3 files changed, 9 insertions(+), 46 deletions(-) (limited to 'mm') diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index c31a6744daee..853184db8e83 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -599,14 +599,6 @@ static int i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj) GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); - /* - * Using __get_user_pages_fast() with a read-only - * access is questionable. A read-only page may be - * COW-broken, and then this might end up giving - * the wrong side of the COW.. - * - * We may or may not care. - */ if (pvec) { /* defer to worker if malloc fails */ if (!i915_gem_object_is_readonly(obj)) diff --git a/mm/gup.c b/mm/gup.c index 6f47697f8fb0..e3bd0d0b5120 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -381,22 +381,13 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, } /* - * FOLL_FORCE or a forced COW break can write even to unwritable pte's, - * but only after we've gone through a COW cycle and they are dirty. + * FOLL_FORCE can write to even unwritable pte's, but only + * after we've gone through a COW cycle and they are dirty. */ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) { - return pte_write(pte) || ((flags & FOLL_COW) && pte_dirty(pte)); -} - -/* - * A (separate) COW fault might break the page the other way and - * get_user_pages() would return the page from what is now the wrong - * VM. So we need to force a COW break at GUP time even for reads. - */ -static inline bool should_force_cow_break(struct vm_area_struct *vma, unsigned int flags) -{ - return is_cow_mapping(vma->vm_flags) && (flags & (FOLL_GET | FOLL_PIN)); + return pte_write(pte) || + ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); } static struct page *follow_page_pte(struct vm_area_struct *vma, @@ -1075,11 +1066,9 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, goto out; } if (is_vm_hugetlb_page(vma)) { - if (should_force_cow_break(vma, foll_flags)) - foll_flags |= FOLL_WRITE; i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, - foll_flags, locked); + gup_flags, locked); if (locked && *locked == 0) { /* * We've got a VM_FAULT_RETRY @@ -1093,10 +1082,6 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } } - - if (should_force_cow_break(vma, foll_flags)) - foll_flags |= FOLL_WRITE; - retry: /* * If we have a pending SIGKILL, don't keep faulting pages and @@ -2763,19 +2748,6 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, return -EFAULT; /* - * The FAST_GUP case requires FOLL_WRITE even for pure reads, - * because get_user_pages() may need to cause an early COW in - * order to avoid confusing the normal COW routines. So only - * targets that are already writable are safe to do by just - * looking at the page tables. - * - * NOTE! With FOLL_FAST_ONLY we allow read-only gup_fast() here, - * because there is no slow path to fall back on. But you'd - * better be careful about possible COW pages - you'll get _a_ - * COW page, but not necessarily the one you intended to get - * depending on what COW event happens after this. COW may break - * the page copy in a random direction. - * * Disable interrupts. The nested form is used, in order to allow * full, general purpose use of this routine. * @@ -2788,8 +2760,6 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, */ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { unsigned long fast_flags = gup_flags; - if (!(gup_flags & FOLL_FAST_ONLY)) - fast_flags |= FOLL_WRITE; local_irq_save(flags); gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 78c84bee7e29..6f74f3d93839 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1312,12 +1312,13 @@ fallback: } /* - * FOLL_FORCE or a forced COW break can write even to unwritable pmd's, - * but only after we've gone through a COW cycle and they are dirty. + * FOLL_FORCE can write to even unwritable pmd's, but only + * after we've gone through a COW cycle and they are dirty. */ static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) { - return pmd_write(pmd) || ((flags & FOLL_COW) && pmd_dirty(pmd)); + return pmd_write(pmd) || + ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); } struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, -- cgit v1.2.3 From 798a6b87ecd72828a6c6b5469aaa2032a57e92b7 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 21 Aug 2020 19:49:58 -0400 Subject: mm: Add PGREUSE counter This accounts for wp_page_reuse() case, where we reused a page for COW. Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 1 + mm/memory.c | 1 + mm/vmstat.c | 1 + 3 files changed, 3 insertions(+) (limited to 'mm') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 24fc7c3ae7d6..58d3f91baad1 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -30,6 +30,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGFAULT, PGMAJFAULT, PGLAZYFREED, PGREFILL, + PGREUSE, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, PGSCAN_KSWAPD, diff --git a/mm/memory.c b/mm/memory.c index 56ae945c6845..20d93001ef93 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2619,6 +2619,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) update_mmu_cache(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); + count_vm_event(PGREUSE); } /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 3fb23a21f6dd..907a5045bfa3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1198,6 +1198,7 @@ const char * const vmstat_text[] = { "pglazyfreed", "pgrefill", + "pgreuse", "pgsteal_kswapd", "pgsteal_direct", "pgscan_kswapd", -- cgit v1.2.3