summaryrefslogtreecommitdiff
path: root/mm/vmalloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r--mm/vmalloc.c650
1 files changed, 498 insertions, 152 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d5f2a84e488a..d33894d7b27a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -34,7 +34,7 @@
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
-
+#include <linux/pgtable.h>
#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
@@ -42,6 +42,19 @@
#include "internal.h"
#include "pgalloc-track.h"
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+static bool __ro_after_init vmap_allow_huge = true;
+
+static int __init set_nohugevmalloc(char *str)
+{
+ vmap_allow_huge = false;
+ return 0;
+}
+early_param("nohugevmalloc", set_nohugevmalloc);
+#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+static const bool vmap_allow_huge = false;
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+
bool is_vmalloc_addr(const void *x)
{
unsigned long addr = (unsigned long)x;
@@ -68,6 +81,218 @@ static void free_work(struct work_struct *w)
}
/*** Page table manipulation functions ***/
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ pte_t *pte;
+ u64 pfn;
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
+ if (!pte)
+ return -ENOMEM;
+ do {
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+ pfn++;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
+ return 0;
+}
+
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < PMD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pmd_supported(prot))
+ return 0;
+
+ if ((end - addr) != PMD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+ return 0;
+
+ if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+ return 0;
+
+ return pmd_set_huge(pmd, phys_addr, prot);
+}
+
+static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+
+ if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_PMD_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+ return -ENOMEM;
+ } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < PUD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pud_supported(prot))
+ return 0;
+
+ if ((end - addr) != PUD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PUD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+ return 0;
+
+ if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+ return 0;
+
+ return pud_set_huge(pud, phys_addr, prot);
+}
+
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+
+ if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_PUD_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
+ max_page_shift, mask))
+ return -ENOMEM;
+ } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < P4D_SHIFT)
+ return 0;
+
+ if (!arch_vmap_p4d_supported(prot))
+ return 0;
+
+ if ((end - addr) != P4D_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, P4D_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+ return 0;
+
+ if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+ return 0;
+
+ return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
+ max_page_shift)) {
+ *mask |= PGTBL_P4D_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
+ max_page_shift, mask))
+ return -ENOMEM;
+ } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_range_noflush(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ pgd_t *pgd;
+ unsigned long start;
+ unsigned long next;
+ int err;
+ pgtbl_mod_mask mask = 0;
+
+ might_sleep();
+ BUG_ON(addr >= end);
+
+ start = addr;
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
+ max_page_shift, &mask);
+ if (err)
+ break;
+ } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return err;
+}
+
+int vmap_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ int err;
+
+ err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift);
+ flush_cache_vmap(addr, end);
+
+ return err;
+}
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
@@ -153,22 +378,20 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
} while (p4d++, addr = next, addr != end);
}
-/**
- * unmap_kernel_range_noflush - unmap kernel VM area
- * @start: start of the VM area to unmap
- * @size: size of the VM area to unmap
+/*
+ * vunmap_range_noflush is similar to vunmap_range, but does not
+ * flush caches or TLBs.
*
- * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify
- * should have been allocated using get_vm_area() and its friends.
+ * The caller is responsible for calling flush_cache_vmap() before calling
+ * this function, and flush_tlb_kernel_range after it has returned
+ * successfully (and before the addresses are expected to cause a page fault
+ * or be re-mapped for something else, if TLB flushes are being delayed or
+ * coalesced).
*
- * NOTE:
- * This function does NOT do any cache flushing. The caller is responsible
- * for calling flush_cache_vunmap() on to-be-mapped areas before calling this
- * function and flush_tlb_kernel_range() after.
+ * This is an internal function only. Do not use outside mm/.
*/
-void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
+void vunmap_range_noflush(unsigned long start, unsigned long end)
{
- unsigned long end = start + size;
unsigned long next;
pgd_t *pgd;
unsigned long addr = start;
@@ -189,7 +412,23 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
arch_sync_kernel_mappings(start, end);
}
-static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
+/**
+ * vunmap_range - unmap kernel virtual addresses
+ * @addr: start of the VM area to unmap
+ * @end: end of the VM area to unmap (non-inclusive)
+ *
+ * Clears any present PTEs in the virtual address range, flushes TLBs and
+ * caches. Any subsequent access to the address before it has been re-mapped
+ * is a kernel bug.
+ */
+void vunmap_range(unsigned long addr, unsigned long end)
+{
+ flush_cache_vunmap(addr, end);
+ vunmap_range_noflush(addr, end);
+ flush_tlb_kernel_range(addr, end);
+}
+
+static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -217,7 +456,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-static int vmap_pmd_range(pud_t *pud, unsigned long addr,
+static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -229,13 +468,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
- if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
-static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
+static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -247,13 +486,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
-static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
+static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -265,37 +504,18 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (p4d++, addr = next, addr != end);
return 0;
}
-/**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
- * @addr: start of the VM area to map
- * @size: size of the VM area to map
- * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should
- * have been allocated using get_vm_area() and its friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is responsible for
- * calling flush_cache_vmap() on to-be-mapped areas before calling this
- * function.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
- pgprot_t prot, struct page **pages)
+static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages)
{
unsigned long start = addr;
- unsigned long end = addr + size;
- unsigned long next;
pgd_t *pgd;
+ unsigned long next;
int err = 0;
int nr = 0;
pgtbl_mod_mask mask = 0;
@@ -306,7 +526,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
next = pgd_addr_end(addr, end);
if (pgd_bad(*pgd))
mask |= PGTBL_PGD_MODIFIED;
- err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
+ err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
@@ -317,14 +537,61 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
return 0;
}
-int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
- struct page **pages)
+/*
+ * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
+ * flush caches.
+ *
+ * The caller is responsible for calling flush_cache_vmap() after this
+ * function returns successfully and before the addresses are accessed.
+ *
+ * This is an internal function only. Do not use outside mm/.
+ */
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
{
- int ret;
+ unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
+
+ WARN_ON(page_shift < PAGE_SHIFT);
+
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
+ page_shift == PAGE_SHIFT)
+ return vmap_small_pages_range_noflush(addr, end, prot, pages);
+
+ for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
+ int err;
+
+ err = vmap_range_noflush(addr, addr + (1UL << page_shift),
+ __pa(page_address(pages[i])), prot,
+ page_shift);
+ if (err)
+ return err;
+
+ addr += 1UL << page_shift;
+ }
+
+ return 0;
+}
+
+/**
+ * vmap_pages_range - map pages to a kernel virtual address
+ * @addr: start of the VM area to map
+ * @end: end of the VM area to map (non-inclusive)
+ * @prot: page protection flags to use
+ * @pages: pages to map (always PAGE_SIZE pages)
+ * @page_shift: maximum shift that the pages may be mapped with, @pages must
+ * be aligned and contiguous up to at least this shift.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int vmap_pages_range(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ int err;
- ret = map_kernel_range_noflush(start, size, prot, pages);
- flush_cache_vmap(start, start + size);
- return ret;
+ err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+ flush_cache_vmap(addr, end);
+ return err;
}
int is_vmalloc_or_module_addr(const void *x)
@@ -343,7 +610,9 @@ int is_vmalloc_or_module_addr(const void *x)
}
/*
- * Walk a vmap address to the struct page it maps.
+ * Walk a vmap address to the struct page it maps. Huge vmap mappings will
+ * return the tail page that corresponds to the base page address, which
+ * matches small vmap mappings.
*/
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
@@ -363,25 +632,33 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (pgd_none(*pgd))
return NULL;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+ return NULL; /* XXX: no allowance for huge pgd */
+ if (WARN_ON_ONCE(pgd_bad(*pgd)))
+ return NULL;
+
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d))
return NULL;
- pud = pud_offset(p4d, addr);
+ if (p4d_leaf(*p4d))
+ return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(p4d_bad(*p4d)))
+ return NULL;
- /*
- * Don't dereference bad PUD or PMD (below) entries. This will also
- * identify huge mappings, which we may encounter on architectures
- * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
- * identified as vmalloc addresses by is_vmalloc_addr(), but are
- * not [unambiguously] associated with a struct page, so there is
- * no correct value to return for them.
- */
- WARN_ON_ONCE(pud_bad(*pud));
- if (pud_none(*pud) || pud_bad(*pud))
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
return NULL;
+ if (pud_leaf(*pud))
+ return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(pud_bad(*pud)))
+ return NULL;
+
pmd = pmd_offset(pud, addr);
- WARN_ON_ONCE(pmd_bad(*pmd));
- if (pmd_none(*pmd) || pmd_bad(*pmd))
+ if (pmd_none(*pmd))
+ return NULL;
+ if (pmd_leaf(*pmd))
+ return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(pmd_bad(*pmd)))
return NULL;
ptep = pte_offset_map(pmd, addr);
@@ -389,6 +666,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (pte_present(pte))
page = pte_page(pte);
pte_unmap(ptep);
+
return page;
}
EXPORT_SYMBOL(vmalloc_to_page);
@@ -1152,6 +1430,29 @@ static void free_vmap_area(struct vmap_area *va)
spin_unlock(&free_vmap_area_lock);
}
+static inline void
+preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
+{
+ struct vmap_area *va = NULL;
+
+ /*
+ * Preload this CPU with one extra vmap_area object. It is used
+ * when fit type of free area is NE_FIT_TYPE. It guarantees that
+ * a CPU that does an allocation is preloaded.
+ *
+ * We do it in non-atomic context, thus it allows us to use more
+ * permissive allocation masks to be more stable under low memory
+ * condition and high memory pressure.
+ */
+ if (!this_cpu_read(ne_fit_preload_node))
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+
+ spin_lock(lock);
+
+ if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
+ kmem_cache_free(vmap_area_cachep, va);
+}
+
/*
* Allocate a region of KVA of the specified size and alignment, within the
* vstart and vend.
@@ -1161,7 +1462,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask)
{
- struct vmap_area *va, *pva;
+ struct vmap_area *va;
unsigned long addr;
int purged = 0;
int ret;
@@ -1187,43 +1488,14 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
retry:
- /*
- * Preload this CPU with one extra vmap_area object. It is used
- * when fit type of free area is NE_FIT_TYPE. Please note, it
- * does not guarantee that an allocation occurs on a CPU that
- * is preloaded, instead we minimize the case when it is not.
- * It can happen because of cpu migration, because there is a
- * race until the below spinlock is taken.
- *
- * The preload is done in non-atomic context, thus it allows us
- * to use more permissive allocation masks to be more stable under
- * low memory condition and high memory pressure. In rare case,
- * if not preloaded, GFP_NOWAIT is used.
- *
- * Set "pva" to NULL here, because of "retry" path.
- */
- pva = NULL;
-
- if (!this_cpu_read(ne_fit_preload_node))
- /*
- * Even if it fails we do not really care about that.
- * Just proceed as it is. If needed "overflow" path
- * will refill the cache we allocate from.
- */
- pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
-
- spin_lock(&free_vmap_area_lock);
-
- if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
- kmem_cache_free(vmap_area_cachep, pva);
+ preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
+ addr = __alloc_vmap_area(size, align, vstart, vend);
+ spin_unlock(&free_vmap_area_lock);
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
*/
- addr = __alloc_vmap_area(size, align, vstart, vend);
- spin_unlock(&free_vmap_area_lock);
-
if (unlikely(addr == vend))
goto overflow;
@@ -1231,7 +1503,6 @@ retry:
va->va_end = addr + size;
va->vm = NULL;
-
spin_lock(&vmap_area_lock);
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
@@ -1448,7 +1719,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
+ vunmap_range_noflush(va->va_start, va->va_end);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
@@ -1726,7 +1997,7 @@ static void vb_free(unsigned long addr, unsigned long size)
offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
- unmap_kernel_range_noflush(addr, size);
+ vunmap_range_noflush(addr, addr + size);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(addr, addr + size);
@@ -1762,7 +2033,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
rcu_read_lock();
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
spin_lock(&vb->lock);
- if (vb->dirty) {
+ if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
unsigned long va_start = vb->va->va_start;
unsigned long s, e;
@@ -1879,16 +2150,36 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
kasan_unpoison_vmalloc(mem, size);
- if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
+ if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
+ pages, PAGE_SHIFT) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
+
return mem;
}
EXPORT_SYMBOL(vm_map_ram);
static struct vm_struct *vmlist __initdata;
+static inline unsigned int vm_area_page_order(struct vm_struct *vm)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+ return vm->page_order;
+#else
+ return 0;
+#endif
+}
+
+static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+ vm->page_order = order;
+#else
+ BUG_ON(order != 0);
+#endif
+}
+
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
@@ -2023,23 +2314,6 @@ void __init vmalloc_init(void)
vmap_initialized = true;
}
-/**
- * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
- * @addr: start of the VM area to unmap
- * @size: size of the VM area to unmap
- *
- * Similar to unmap_kernel_range_noflush() but flushes vcache before
- * the unmapping and tlb after.
- */
-void unmap_kernel_range(unsigned long addr, unsigned long size)
-{
- unsigned long end = addr + size;
-
- flush_cache_vunmap(addr, end);
- unmap_kernel_range_noflush(addr, size);
- flush_tlb_kernel_range(addr, end);
-}
-
static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
@@ -2199,6 +2473,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
{
int i;
+ /* HUGE_VMALLOC passes small pages to set_direct_map */
for (i = 0; i < area->nr_pages; i++)
if (page_address(area->pages[i]))
set_direct_map(area->pages[i]);
@@ -2208,6 +2483,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
{
unsigned long start = ULONG_MAX, end = 0;
+ unsigned int page_order = vm_area_page_order(area);
int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
int flush_dmap = 0;
int i;
@@ -2232,11 +2508,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
* map. Find the start and end range of the direct mappings to make sure
* the vm_unmap_aliases() flush includes the direct map.
*/
- for (i = 0; i < area->nr_pages; i++) {
+ for (i = 0; i < area->nr_pages; i += 1U << page_order) {
unsigned long addr = (unsigned long)page_address(area->pages[i]);
if (addr) {
+ unsigned long page_size;
+
+ page_size = PAGE_SIZE << page_order;
start = min(addr, start);
- end = max(addr + PAGE_SIZE, end);
+ end = max(addr + page_size, end);
flush_dmap = 1;
}
}
@@ -2277,13 +2556,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) {
+ unsigned int page_order = vm_area_page_order(area);
int i;
- for (i = 0; i < area->nr_pages; i++) {
+ for (i = 0; i < area->nr_pages; i += 1U << page_order) {
struct page *page = area->pages[i];
BUG_ON(!page);
- __free_pages(page, 0);
+ __free_pages(page, page_order);
}
atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
@@ -2402,6 +2682,7 @@ void *vmap(struct page **pages, unsigned int count,
unsigned long flags, pgprot_t prot)
{
struct vm_struct *area;
+ unsigned long addr;
unsigned long size; /* In bytes */
might_sleep();
@@ -2414,8 +2695,9 @@ void *vmap(struct page **pages, unsigned int count,
if (!area)
return NULL;
- if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
- pages) < 0) {
+ addr = (unsigned long)area->addr;
+ if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
+ pages, PAGE_SHIFT) < 0) {
vunmap(area->addr);
return NULL;
}
@@ -2474,15 +2756,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node)
+ pgprot_t prot, unsigned int page_shift,
+ int node)
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)area->addr;
+ unsigned long size = get_vm_area_size(area);
unsigned long array_size;
- unsigned int i;
+ unsigned int nr_small_pages = size >> PAGE_SHIFT;
+ unsigned int page_order;
struct page **pages;
+ unsigned int i;
- array_size = (unsigned long)nr_pages * sizeof(struct page *);
+ array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
gfp_mask |= __GFP_NOWARN;
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
@@ -2497,42 +2783,60 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
if (!pages) {
free_vm_area(area);
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "page array size %lu allocation failed",
+ nr_small_pages * PAGE_SIZE, array_size);
return NULL;
}
area->pages = pages;
- area->nr_pages = nr_pages;
+ area->nr_pages = nr_small_pages;
+ set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
- for (i = 0; i < area->nr_pages; i++) {
- struct page *page;
+ page_order = vm_area_page_order(area);
- if (node == NUMA_NO_NODE)
- page = alloc_page(gfp_mask);
- else
- page = alloc_pages_node(node, gfp_mask, 0);
+ /*
+ * Careful, we allocate and map page_order pages, but tracking is done
+ * per PAGE_SIZE page so as to keep the vm_struct APIs independent of
+ * the physical/mapped size.
+ */
+ for (i = 0; i < area->nr_pages; i += 1U << page_order) {
+ struct page *page;
+ int p;
+ /* Compound pages required for remap_vmalloc_page */
+ page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vfree() */
area->nr_pages = i;
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "page order %u allocation failed",
+ area->nr_pages * PAGE_SIZE, page_order);
goto fail;
}
- area->pages[i] = page;
+
+ for (p = 0; p < (1U << page_order); p++)
+ area->pages[i + p] = page + p;
+
if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
- prot, pages) < 0)
+ if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "failed to map pages",
+ area->nr_pages * PAGE_SIZE);
goto fail;
+ }
return area->addr;
fail:
- warn_alloc(gfp_mask, NULL,
- "vmalloc: allocation failure, allocated %ld of %ld bytes",
- (area->nr_pages*PAGE_SIZE), area->size);
__vfree(area->addr);
return NULL;
}
@@ -2563,19 +2867,54 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
struct vm_struct *area;
void *addr;
unsigned long real_size = size;
+ unsigned long real_align = align;
+ unsigned int shift = PAGE_SHIFT;
- size = PAGE_ALIGN(size);
- if (!size || (size >> PAGE_SHIFT) > totalram_pages())
- goto fail;
+ if (WARN_ON_ONCE(!size))
+ return NULL;
+
+ if ((size >> PAGE_SHIFT) > totalram_pages()) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "exceeds total pages", real_size);
+ return NULL;
+ }
- area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
+ if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) &&
+ arch_vmap_pmd_supported(prot)) {
+ unsigned long size_per_node;
+
+ /*
+ * Try huge pages. Only try for PAGE_KERNEL allocations,
+ * others like modules don't yet expect huge pages in
+ * their allocations due to apply_to_page_range not
+ * supporting them.
+ */
+
+ size_per_node = size;
+ if (node == NUMA_NO_NODE)
+ size_per_node /= num_online_nodes();
+ if (size_per_node >= PMD_SIZE) {
+ shift = PMD_SHIFT;
+ align = max(real_align, 1UL << shift);
+ size = ALIGN(real_size, 1UL << shift);
+ }
+ }
+
+again:
+ size = PAGE_ALIGN(size);
+ area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
vm_flags, start, end, node, gfp_mask, caller);
- if (!area)
+ if (!area) {
+ warn_alloc(gfp_mask, NULL,
+ "vmalloc size %lu allocation failure: "
+ "vm_struct allocation failed", real_size);
goto fail;
+ }
- addr = __vmalloc_area_node(area, gfp_mask, prot, node);
+ addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
if (!addr)
- return NULL;
+ goto fail;
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -2589,8 +2928,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return addr;
fail:
- warn_alloc(gfp_mask, NULL,
- "vmalloc: allocation failure: %lu bytes", real_size);
+ if (shift > PAGE_SHIFT) {
+ shift = PAGE_SHIFT;
+ align = real_align;
+ size = real_size;
+ goto again;
+ }
+
return NULL;
}
@@ -2894,7 +3238,10 @@ long vread(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
spin_lock(&vmap_area_lock);
- list_for_each_entry(va, &vmap_area_list, list) {
+ va = __find_vmap_area((unsigned long)addr);
+ if (!va)
+ goto finished;
+ list_for_each_entry_from(va, &vmap_area_list, list) {
if (!count)
break;
@@ -3072,7 +3419,6 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
return 0;
}
-EXPORT_SYMBOL(remap_vmalloc_range_partial);
/**
* remap_vmalloc_range - map vmalloc pages to userspace