Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 193
1 file changed, 115 insertions(+), 78 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a57e1be41401..f044962ad9df 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,12 +66,6 @@ static bool hugetlb_cma_page(struct page *page, unsigned int order)
 #endif
 static unsigned long hugetlb_cma_size __initdata;

-/*
- * Minimum page order among possible hugepage sizes, set to a proper value
- * at boot time.
- */
-static unsigned int minimum_order __read_mostly = UINT_MAX;
-
 __initdata LIST_HEAD(huge_boot_pages);

 /* for command line parsing */
@@ -1135,7 +1129,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)

 	lockdep_assert_held(&hugetlb_lock);
 	list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-		if (pin && !is_pinnable_page(page))
+		if (pin && !is_longterm_pinnable_page(page))
 			continue;

 		if (PageHWPoison(page))
@@ -2152,11 +2146,17 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
 	unsigned long pfn;
 	struct page *page;
 	int rc = 0;
+	unsigned int order;
+	struct hstate *h;

 	if (!hugepages_supported())
 		return rc;

-	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
+	order = huge_page_order(&default_hstate);
+	for_each_hstate(h)
+		order = min(order, huge_page_order(h));
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
 		page = pfn_to_page(pfn);
 		rc = dissolve_free_huge_page(page);
 		if (rc)
@@ -2766,8 +2766,7 @@ retry:
		 * Fail with -EBUSY if not possible.
		 */
		spin_unlock_irq(&hugetlb_lock);
-		if (!isolate_huge_page(old_page, list))
-			ret = -EBUSY;
+		ret = isolate_hugetlb(old_page, list);
		spin_lock_irq(&hugetlb_lock);
		goto free_new;
	} else if (!HPageFreed(old_page)) {
@@ -2843,7 +2842,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
 	if (hstate_is_gigantic(h))
 		return -ENOMEM;

-	if (page_count(head) && isolate_huge_page(head, list))
+	if (page_count(head) && !isolate_hugetlb(head, list))
 		ret = 0;
 	else if (!page_count(head))
 		ret = alloc_and_dissolve_huge_page(h, head, list);
@@ -3149,9 +3148,6 @@ static void __init hugetlb_init_hstates(void)
 	struct hstate *h, *h2;

 	for_each_hstate(h) {
-		if (minimum_order > huge_page_order(h))
-			minimum_order = huge_page_order(h);
-
 		/* oversize hugepages were init'ed in early boot */
 		if (!hstate_is_gigantic(h))
 			hugetlb_hstate_alloc_pages(h);
@@ -3176,7 +3172,6 @@ static void __init hugetlb_init_hstates(void)
			h->demote_order = h2->order;
		}
	}
-	VM_BUG_ON(minimum_order == UINT_MAX);
 }

 static void __init report_hugepages(void)
@@ -4482,22 +4477,20 @@ int hugetlb_report_node_meminfo(char *buf, int len, int nid)
			     nid, h->surplus_huge_pages_node[nid]);
 }

-void hugetlb_show_meminfo(void)
+void hugetlb_show_meminfo_node(int nid)
 {
 	struct hstate *h;
-	int nid;

 	if (!hugepages_supported())
 		return;

-	for_each_node_state(nid, N_MEMORY)
-		for_each_hstate(h)
-			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
-				nid,
-				h->nr_huge_pages_node[nid],
-				h->free_huge_pages_node[nid],
-				h->surplus_huge_pages_node[nid],
-				huge_page_size(h) / SZ_1K);
+	for_each_hstate(h)
+		printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+			nid,
+			h->nr_huge_pages_node[nid],
+			h->free_huge_pages_node[nid],
+			h->surplus_huge_pages_node[nid],
+			huge_page_size(h) / SZ_1K);
 }

 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
@@ -4732,6 +4725,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	unsigned long npages = pages_per_huge_page(h);
 	struct address_space *mapping = src_vma->vm_file->f_mapping;
 	struct mmu_notifier_range range;
+	unsigned long last_addr_mask;
 	int ret = 0;

 	if (cow) {
@@ -4751,11 +4745,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
		i_mmap_lock_read(mapping);
	}

+	last_addr_mask = hugetlb_mask_last_page(h);
 	for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
		spinlock_t *src_ptl, *dst_ptl;
		src_pte = huge_pte_offset(src, addr, sz);
-		if (!src_pte)
+		if (!src_pte) {
+			addr |= last_addr_mask;
			continue;
+		}
		dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
		if (!dst_pte) {
			ret = -ENOMEM;
@@ -4772,8 +4769,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
		 * after taking the lock below.
		 */
		dst_entry = huge_ptep_get(dst_pte);
-		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
+		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+			addr |= last_addr_mask;
			continue;
+		}

		dst_ptl = huge_pte_lock(h, dst, dst_pte);
		src_ptl = huge_pte_lockptr(h, src, src_pte);
@@ -4788,8 +4787,13 @@ again:
			 * sharing with another vma.
			 */
			;
-		} else if (unlikely(is_hugetlb_entry_migration(entry) ||
-				    is_hugetlb_entry_hwpoisoned(entry))) {
+		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
+			bool uffd_wp = huge_pte_uffd_wp(entry);
+
+			if (!userfaultfd_wp(dst_vma) && uffd_wp)
+				entry = huge_pte_clear_uffd_wp(entry);
+			set_huge_pte_at(dst, addr, dst_pte, entry);
+		} else if (unlikely(is_hugetlb_entry_migration(entry))) {
			swp_entry_t swp_entry = pte_to_swp_entry(entry);
			bool uffd_wp = huge_pte_uffd_wp(entry);

@@ -4803,12 +4807,11 @@ again:
				entry = swp_entry_to_pte(swp_entry);
				if (userfaultfd_wp(src_vma) && uffd_wp)
					entry = huge_pte_mkuffd_wp(entry);
-				set_huge_swap_pte_at(src, addr, src_pte,
-						     entry, sz);
+				set_huge_pte_at(src, addr, src_pte, entry);
			}
			if (!userfaultfd_wp(dst_vma) && uffd_wp)
				entry = huge_pte_clear_uffd_wp(entry);
-			set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+			set_huge_pte_at(dst, addr, dst_pte, entry);
		} else if (unlikely(is_pte_marker(entry))) {
			/*
			 * We copy the pte marker only if the dst vma has
@@ -4875,7 +4878,7 @@ again:
				 * table protection not changing it to point
				 * to a new page.
				 *
-				 * See Documentation/vm/mmu_notifier.rst
+				 * See Documentation/mm/mmu_notifier.rst
				 */
				huge_ptep_set_wrprotect(src, addr, src_pte);
				entry = huge_pte_wrprotect(entry);
@@ -4934,7 +4937,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 	unsigned long sz = huge_page_size(h);
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long old_end = old_addr + len;
-	unsigned long old_addr_copy;
+	unsigned long last_addr_mask;
 	pte_t *src_pte, *dst_pte;
 	struct mmu_notifier_range range;
 	bool shared_pmd = false;
@@ -4949,23 +4952,23 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 	flush_cache_range(vma, range.start, range.end);

 	mmu_notifier_invalidate_range_start(&range);
+	last_addr_mask = hugetlb_mask_last_page(h);
 	/* Prevent race with file truncation */
 	i_mmap_lock_write(mapping);
 	for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
		src_pte = huge_pte_offset(mm, old_addr, sz);
-		if (!src_pte)
+		if (!src_pte) {
+			old_addr |= last_addr_mask;
+			new_addr |= last_addr_mask;
			continue;
+		}
		if (huge_pte_none(huge_ptep_get(src_pte)))
			continue;

-		/* old_addr arg to huge_pmd_unshare() is a pointer and so the
-		 * arg may be modified. Pass a copy instead to preserve the
-		 * value in old_addr.
-		 */
-		old_addr_copy = old_addr;
-
-		if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
+		if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
			shared_pmd = true;
+			old_addr |= last_addr_mask;
+			new_addr |= last_addr_mask;
			continue;
		}

@@ -4999,6 +5002,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 	struct mmu_notifier_range range;
+	unsigned long last_addr_mask;
 	bool force_flush = false;

 	WARN_ON(!is_vm_hugetlb_page(vma));
@@ -5019,17 +5023,21 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
				end);
 	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
 	mmu_notifier_invalidate_range_start(&range);
+	last_addr_mask = hugetlb_mask_last_page(h);
 	address = start;
 	for (; address < end; address += sz) {
		ptep = huge_pte_offset(mm, address, sz);
-		if (!ptep)
+		if (!ptep) {
+			address |= last_addr_mask;
			continue;
+		}

		ptl = huge_pte_lock(h, mm, ptep);
-		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+		if (huge_pmd_unshare(mm, vma, address, ptep)) {
			spin_unlock(ptl);
			tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
			force_flush = true;
+			address |= last_addr_mask;
			continue;
		}

@@ -5414,19 +5422,25 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
 int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
			   pgoff_t idx)
 {
+	struct folio *folio = page_folio(page);
 	struct inode *inode = mapping->host;
 	struct hstate *h = hstate_inode(inode);
-	int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+	int err;

-	if (err)
+	__folio_set_locked(folio);
+	err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
+
+	if (unlikely(err)) {
+		__folio_clear_locked(folio);
		return err;
+	}
 	ClearHPageRestoreReserve(page);

 	/*
-	 * set page dirty so that it will not be removed from cache/file
+	 * mark folio dirty so that it will not be removed from cache/file
	 * by non-hugetlbfs specific code paths.
	 */
-	set_page_dirty(page);
+	folio_mark_dirty(folio);

 	spin_lock(&inode->i_lock);
 	inode->i_blocks += blocks_per_huge_page(h);
@@ -5703,7 +5717,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
	 */
 	entry = huge_ptep_get(ptep);
 	if (unlikely(is_hugetlb_entry_migration(entry))) {
-		migration_entry_wait_huge(vma, mm, ptep);
+		migration_entry_wait_huge(vma, ptep);
		return 0;
 	} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
		return VM_FAULT_HWPOISON_LARGE |
@@ -5947,6 +5961,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,

		page = alloc_huge_page(dst_vma, dst_addr, 0);
		if (IS_ERR(page)) {
+			put_page(*pagep);
			ret = -ENOMEM;
			*pagep = NULL;
			goto out;
@@ -6040,8 +6055,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,

 	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

-	(void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
-					dst_vma->vm_flags & VM_WRITE);
 	hugetlb_count_add(pages_per_huge_page(h), dst_mm);

 	/* No need to invalidate - it was non-present before */
@@ -6293,6 +6306,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	unsigned long pages = 0, psize = huge_page_size(h);
 	bool shared_pmd = false;
 	struct mmu_notifier_range range;
+	unsigned long last_addr_mask;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;

@@ -6309,14 +6323,17 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, range.start, range.end);

 	mmu_notifier_invalidate_range_start(&range);
+	last_addr_mask = hugetlb_mask_last_page(h);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (; address < end; address += psize) {
		spinlock_t *ptl;
		ptep = huge_pte_offset(mm, address, psize);
-		if (!ptep)
+		if (!ptep) {
+			address |= last_addr_mask;
			continue;
+		}
		ptl = huge_pte_lock(h, mm, ptep);
-		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+		if (huge_pmd_unshare(mm, vma, address, ptep)) {
			/*
			 * When uffd-wp is enabled on the vma, unshare
			 * shouldn't happen at all. Warn about it if it
@@ -6326,6 +6343,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
			pages++;
			spin_unlock(ptl);
			shared_pmd = true;
+			address |= last_addr_mask;
			continue;
		}
		pte = huge_ptep_get(ptep);
@@ -6351,8 +6369,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
				newpte = pte_swp_mkuffd_wp(newpte);
			else if (uffd_wp_resolve)
				newpte = pte_swp_clear_uffd_wp(newpte);
-			set_huge_swap_pte_at(mm, address, ptep,
-					     newpte, psize);
+			set_huge_pte_at(mm, address, ptep, newpte);
			pages++;
		}
		spin_unlock(ptl);
@@ -6403,7 +6420,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
	 * No need to call mmu_notifier_invalidate_range() we are downgrading
	 * page table protection not changing it to point to a new page.
	 *
-	 * See Documentation/vm/mmu_notifier.rst
+	 * See Documentation/mm/mmu_notifier.rst
	 */
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	mmu_notifier_invalidate_range_end(&range);
@@ -6749,11 +6766,11 @@ out:
 * 0 the underlying pte page is not shared, or it is the last user
 */
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
-					unsigned long *addr, pte_t *ptep)
+					unsigned long addr, pte_t *ptep)
 {
-	pgd_t *pgd = pgd_offset(mm, *addr);
-	p4d_t *p4d = p4d_offset(pgd, *addr);
-	pud_t *pud = pud_offset(p4d, *addr);
+	pgd_t *pgd = pgd_offset(mm, addr);
+	p4d_t *p4d = p4d_offset(pgd, addr);
+	pud_t *pud = pud_offset(p4d, addr);

 	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
 	BUG_ON(page_count(virt_to_page(ptep)) == 0);
@@ -6763,14 +6780,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 	pud_clear(pud);
 	put_page(virt_to_page(ptep));
 	mm_dec_nr_pmds(mm);
-	/*
-	 * This update of passed address optimizes loops sequentially
-	 * processing addresses in increments of huge page size (PMD_SIZE
-	 * in this case). By clearing the pud, a PUD_SIZE area is unmapped.
-	 * Update address to the 'last page' in the cleared area so that
-	 * calling loop can move to first page past this area.
-	 */
-	*addr |= PUD_SIZE - PMD_SIZE;
 	return 1;
 }

@@ -6782,7 +6791,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 }

 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
-				unsigned long *addr, pte_t *ptep)
+				unsigned long addr, pte_t *ptep)
 {
 	return 0;
 }
@@ -6865,6 +6874,37 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
 	return (pte_t *)pmd;
 }

+/*
+ * Return a mask that can be used to update an address to the last huge
+ * page in a page table page mapping size. Used to skip non-present
+ * page table entries when linearly scanning address ranges. Architectures
+ * with unique huge page to page table relationships can define their own
+ * version of this routine.
+ */
+unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+	unsigned long hp_size = huge_page_size(h);
+
+	if (hp_size == PUD_SIZE)
+		return P4D_SIZE - PUD_SIZE;
+	else if (hp_size == PMD_SIZE)
+		return PUD_SIZE - PMD_SIZE;
+	else
+		return 0UL;
+}
+
+#else
+
+/* See description above. Architectures can provide their own version. */
+__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+	if (huge_page_size(h) == PMD_SIZE)
+		return PUD_SIZE - PMD_SIZE;
+#endif
+	return 0UL;
+}
+
 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */

 /*
@@ -6928,7 +6968,7 @@ retry:
 	} else {
		if (is_hugetlb_entry_migration(pte)) {
			spin_unlock(ptl);
-			__migration_entry_wait(mm, (pte_t *)pmd, ptl);
+			__migration_entry_wait_huge((pte_t *)pmd, ptl);
			goto retry;
		}
		/*
@@ -6960,15 +7000,15 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
 	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
 }

-bool isolate_huge_page(struct page *page, struct list_head *list)
+int isolate_hugetlb(struct page *page, struct list_head *list)
 {
-	bool ret = true;
+	int ret = 0;

 	spin_lock_irq(&hugetlb_lock);
 	if (!PageHeadHuge(page) ||
	    !HPageMigratable(page) ||
	    !get_page_unless_zero(page)) {
-		ret = false;
+		ret = -EBUSY;
		goto unlock;
 	}
 	ClearHPageMigratable(page);
@@ -7088,21 +7128,18 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	mmu_notifier_invalidate_range_start(&range);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (address = start; address < end; address += PUD_SIZE) {
-		unsigned long tmp = address;
-
		ptep = huge_pte_offset(mm, address, sz);
		if (!ptep)
			continue;
		ptl = huge_pte_lock(h, mm, ptep);
-		/* We don't want 'address' to be changed */
-		huge_pmd_unshare(mm, vma, &tmp, ptep);
+		huge_pmd_unshare(mm, vma, address, ptep);
		spin_unlock(ptl);
 	}
 	flush_hugetlb_tlb_range(vma, start, end);
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	/*
	 * No need to call mmu_notifier_invalidate_range(), see
-	 * Documentation/vm/mmu_notifier.rst.
+	 * Documentation/mm/mmu_notifier.rst.
	 */
 	mmu_notifier_invalidate_range_end(&range);
 }
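Several hunks above add the same "addr |= last_addr_mask" step in place of the old trick where huge_pmd_unshare() advanced the caller's address through a pointer argument. The sketch below is illustrative only (the walker and its name are not part of this patch); it shows how a linear scan is expected to combine hugetlb_mask_last_page() with the loop increment so that a missing page table page is skipped in a single iteration.

/*
 * Illustrative walker, not taken from the patch: it mirrors the skip
 * pattern the hunks add to copy_hugetlb_page_range(),
 * __unmap_hugepage_range() and hugetlb_change_protection().
 */
static void scan_hugetlb_range(struct mm_struct *mm, struct hstate *h,
			       unsigned long start, unsigned long end)
{
	unsigned long sz = huge_page_size(h);
	unsigned long last_addr_mask = hugetlb_mask_last_page(h);
	unsigned long addr;
	pte_t *ptep;

	for (addr = start; addr < end; addr += sz) {
		ptep = huge_pte_offset(mm, addr, sz);
		if (!ptep) {
			/*
			 * No page table page maps this range: move addr to
			 * the last huge page the missing table would cover,
			 * so the loop's "addr += sz" steps past the whole gap.
			 */
			addr |= last_addr_mask;
			continue;
		}
		/* ... process the entry at ptep under the pte lock ... */
	}
}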
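The rename from isolate_huge_page() to isolate_hugetlb() also flips the return convention from bool (true on success) to int (0 on success, -EBUSY on failure), which is why the test in isolate_or_dissolve_huge_page() gains a "!" and alloc_and_dissolve_huge_page() can forward the value directly. A minimal before/after sketch with a made-up caller name:

/* try_isolate() is hypothetical; only the calling convention matters. */
static int try_isolate(struct page *page, struct list_head *list)
{
	/*
	 * Old API, bool return:
	 *	if (!isolate_huge_page(page, list))
	 *		return -EBUSY;
	 *	return 0;
	 */

	/* New API: 0 on success, -EBUSY if the page cannot be isolated. */
	return isolate_hugetlb(page, list);
}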
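With the boot-time minimum_order global removed, dissolve_free_huge_pages() recomputes the smallest registered hugepage order each time it is called. Factored out as a helper purely for illustration (the helper name is invented; the patch open-codes this loop):

/* Invented helper; dissolve_free_huge_pages() open-codes this computation. */
static unsigned int smallest_hugepage_order(void)
{
	struct hstate *h;
	unsigned int order = huge_page_order(&default_hstate);

	for_each_hstate(h)
		order = min(order, huge_page_order(h));
	return order;
}

Scanning the PFN range in steps of 1 << order keeps the behaviour of the old minimum_order-based stride without needing the global to be set up during boot.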