From be45a4902c7caa717fee6b2f671e59b396ed395c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 12:13:30 -0400 Subject: mm/swap: cache maximum swapfile size when init swap We used to have swapfile_maximum_size() fetching a maximum value of swapfile size per-arch. As the caller of max_swapfile_size() grows, this patch introduce a variable "swapfile_maximum_size" and cache the value of old max_swapfile_size(), so that we don't need to calculate the value every time. Caching the value in swapfile_init() is safe because when reaching the phase we should have initialized all the relevant information. Here the major arch to take care of is x86, which defines the max swapfile size based on L1TF mitigation. Here both X86_BUG_L1TF or l1tf_mitigation should have been setup properly when reaching swapfile_init(). As a reference, the code path looks like this for x86: - start_kernel - setup_arch - early_cpu_init - early_identify_cpu --> setup X86_BUG_L1TF - parse_early_param - l1tf_cmdline --> set l1tf_mitigation - check_bugs - l1tf_select_mitigation --> set l1tf_mitigation - arch_call_rest_init - rest_init - kernel_init - kernel_init_freeable - do_basic_setup - do_initcalls --> calls swapfile_init() (initcall level 4) The swapfile size only depends on swp pte format on non-x86 archs, so caching it is safe too. Since at it, rename max_swapfile_size() to arch_max_swapfile_size() because arch can define its own function, so it's more straightforward to have "arch_" as its prefix. At the meantime, export swapfile_maximum_size to replace the old usages of max_swapfile_size(). [peterx@redhat.com: declare arch_max_swapfile_size) in swapfile.h] Link: https://lkml.kernel.org/r/YxTh1GuC6ro5fKL5@xz-m1.local Link: https://lkml.kernel.org/r/20220811161331.37055-7-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: "Huang, Ying" Cc: Alistair Popple Cc: Andi Kleen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Minchan Kim Cc: Nadav Amit Cc: Vlastimil Babka Cc: Dave Hansen Signed-off-by: Andrew Morton --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 82a042c03824..9121bc1b9453 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1054,7 +1054,7 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) } #ifdef CONFIG_SWAP -unsigned long max_swapfile_size(void) +unsigned long arch_max_swapfile_size(void) { unsigned long pages; -- cgit v1.2.3 From eed9a328aa1ae6ac1edaa026957e6882f57de0dd Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 01:59:59 -0600 Subject: mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some architectures support the accessed bit in non-leaf PMD entries, e.g., x86 sets the accessed bit in a non-leaf PMD entry when using it as part of linear address translation [1]. Page table walkers that clear the accessed bit may use this capability to reduce their search space. Note that: 1. Although an inline function is preferable, this capability is added as a configuration option for consistency with the existing macros. 2. Due to the little interest in other varieties, this capability was only tested on Intel and AMD CPUs. Thanks to the following developers for their efforts [2][3]. Randy Dunlap Stephen Rothwell [1]: Intel 64 and IA-32 Architectures Software Developer's Manual Volume 3 (June 2021), section 4.8 [2] https://lore.kernel.org/r/bfdcc7c8-922f-61a9-aa15-7e7250f04af7@infradead.org/ [3] https://lore.kernel.org/r/20220413151513.5a0d7a7e@canb.auug.org.au/ Link: https://lkml.kernel.org/r/20220918080010.2920238-3-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Barry Song Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/Kconfig | 8 ++++++++ arch/x86/Kconfig | 1 + arch/x86/include/asm/pgtable.h | 3 ++- arch/x86/mm/pgtable.c | 5 ++++- include/linux/pgtable.h | 4 ++-- 5 files changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/Kconfig b/arch/Kconfig index 5dbf11a5ba4e..1c2599618eeb 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1415,6 +1415,14 @@ config DYNAMIC_SIGFRAME config HAVE_ARCH_NODE_DEV_GROUP bool +config ARCH_HAS_NONLEAF_PMD_YOUNG + bool + help + Architectures that select this option are capable of setting the + accessed bit in non-leaf PMD entries when using them as part of linear + address translations. Page table walkers that clear the accessed bit + may use this capability to reduce their search space. + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f9920f1341c8..674d694a665e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -85,6 +85,7 @@ config X86 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2 select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_COPY_MC if X86_64 select ARCH_HAS_SET_MEMORY diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index dc5f7d8ef68a..5059799bebe3 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -815,7 +815,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) static inline int pmd_bad(pmd_t pmd) { - return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; + return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != + (_KERNPG_TABLE & ~_PAGE_ACCESSED); } static inline unsigned long pages_to_mb(unsigned long npg) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index a932d7712d85..8525f2876fb4 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, return ret; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { @@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, return ret; } +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp) { diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 375e8e7e64f4..a108b60a6962 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -213,7 +213,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) @@ -234,7 +234,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, BUILD_BUG(); return 0; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -- cgit v1.2.3 From 1a167ddd3c561b21a76187a81530a167e3522261 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:43 +0200 Subject: x86: kmsan: pgtable: reduce vmalloc space KMSAN is going to use 3/4 of existing vmalloc space to hold the metadata, therefore we lower VMALLOC_END to make sure vmalloc() doesn't allocate past the first 1/4. Link: https://lkml.kernel.org/r/20220915150417.722975-10-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable_64_types.h | 47 ++++++++++++++++++++++++++++++++- arch/x86/mm/init_64.c | 2 +- 2 files changed, 47 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 70e360a2e5fb..04f36063ad54 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -139,7 +139,52 @@ extern unsigned int ptrs_per_p4d; # define VMEMMAP_START __VMEMMAP_BASE_L4 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ -#define VMALLOC_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1) +/* + * End of the region for which vmalloc page tables are pre-allocated. + * For non-KMSAN builds, this is the same as VMALLOC_END. + * For KMSAN builds, VMALLOC_START..VMEMORY_END is 4 times bigger than + * VMALLOC_START..VMALLOC_END (see below). + */ +#define VMEMORY_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1) + +#ifndef CONFIG_KMSAN +#define VMALLOC_END VMEMORY_END +#else +/* + * In KMSAN builds vmalloc area is four times smaller, and the remaining 3/4 + * are used to keep the metadata for virtual pages. The memory formerly + * belonging to vmalloc area is now laid out as follows: + * + * 1st quarter: VMALLOC_START to VMALLOC_END - new vmalloc area + * 2nd quarter: KMSAN_VMALLOC_SHADOW_START to + * VMALLOC_END+KMSAN_VMALLOC_SHADOW_OFFSET - vmalloc area shadow + * 3rd quarter: KMSAN_VMALLOC_ORIGIN_START to + * VMALLOC_END+KMSAN_VMALLOC_ORIGIN_OFFSET - vmalloc area origins + * 4th quarter: KMSAN_MODULES_SHADOW_START to KMSAN_MODULES_ORIGIN_START + * - shadow for modules, + * KMSAN_MODULES_ORIGIN_START to + * KMSAN_MODULES_ORIGIN_START + MODULES_LEN - origins for modules. + */ +#define VMALLOC_QUARTER_SIZE ((VMALLOC_SIZE_TB << 40) >> 2) +#define VMALLOC_END (VMALLOC_START + VMALLOC_QUARTER_SIZE - 1) + +/* + * vmalloc metadata addresses are calculated by adding shadow/origin offsets + * to vmalloc address. + */ +#define KMSAN_VMALLOC_SHADOW_OFFSET VMALLOC_QUARTER_SIZE +#define KMSAN_VMALLOC_ORIGIN_OFFSET (VMALLOC_QUARTER_SIZE << 1) + +#define KMSAN_VMALLOC_SHADOW_START (VMALLOC_START + KMSAN_VMALLOC_SHADOW_OFFSET) +#define KMSAN_VMALLOC_ORIGIN_START (VMALLOC_START + KMSAN_VMALLOC_ORIGIN_OFFSET) + +/* + * The shadow/origin for modules are placed one by one in the last 1/4 of + * vmalloc space. + */ +#define KMSAN_MODULES_SHADOW_START (VMALLOC_END + KMSAN_VMALLOC_ORIGIN_OFFSET + 1) +#define KMSAN_MODULES_ORIGIN_START (KMSAN_MODULES_SHADOW_START + MODULES_LEN) +#endif /* CONFIG_KMSAN */ #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0fe690ebc269..39b6bfcaa0ed 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1287,7 +1287,7 @@ static void __init preallocate_vmalloc_pages(void) unsigned long addr; const char *lvl; - for (addr = VMALLOC_START; addr <= VMALLOC_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + for (addr = VMALLOC_START; addr <= VMEMORY_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) { pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; -- cgit v1.2.3 From b073d7f8aee4ebf05d10e3380df377b73120cf16 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:48 +0200 Subject: mm: kmsan: maintain KMSAN metadata for page operations Insert KMSAN hooks that make the necessary bookkeeping changes: - poison page shadow and origins in alloc_pages()/free_page(); - clear page shadow and origins in clear_page(), copy_user_highpage(); - copy page metadata in copy_highpage(), wp_page_copy(); - handle vmap()/vunmap()/iounmap(); Link: https://lkml.kernel.org/r/20220915150417.722975-15-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/page_64.h | 7 ++ arch/x86/mm/ioremap.c | 3 + include/linux/highmem.h | 3 + include/linux/kmsan.h | 145 +++++++++++++++++++++++++++++++++++++++++ mm/internal.h | 6 ++ mm/kmsan/hooks.c | 86 ++++++++++++++++++++++++ mm/kmsan/shadow.c | 113 ++++++++++++++++++++++++++++++++ mm/memory.c | 2 + mm/page_alloc.c | 11 ++++ mm/vmalloc.c | 20 +++++- 10 files changed, 394 insertions(+), 2 deletions(-) create mode 100644 include/linux/kmsan.h (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index baa70451b8df..198e03e59ca1 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -8,6 +8,8 @@ #include #include +#include + /* duplicated to the one in bootmem.h */ extern unsigned long max_pfn; extern unsigned long phys_base; @@ -47,6 +49,11 @@ void clear_page_erms(void *page); static inline void clear_page(void *page) { + /* + * Clean up KMSAN metadata for the page being cleared. The assembly call + * below clobbers @page, so we perform unpoisoning before it. + */ + kmsan_unpoison_memory(page, PAGE_SIZE); alternative_call_2(clear_page_orig, clear_page_rep, X86_FEATURE_REP_GOOD, clear_page_erms, X86_FEATURE_ERMS, diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 1ad0228f8ceb..78c5bc654cff 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -479,6 +480,8 @@ void iounmap(volatile void __iomem *addr) return; } + kmsan_iounmap_page_range((unsigned long)addr, + (unsigned long)addr + get_vm_area_size(p)); memtype_free(p->phys_addr, p->phys_addr + get_vm_area_size(p)); /* Finally remove it */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 25679035ca28..e9912da5441b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -311,6 +312,7 @@ static inline void copy_user_highpage(struct page *to, struct page *from, vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); + kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } @@ -326,6 +328,7 @@ static inline void copy_highpage(struct page *to, struct page *from) vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); + kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h new file mode 100644 index 000000000000..b36bf3db835e --- /dev/null +++ b/include/linux/kmsan.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN API for subsystems. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_H +#define _LINUX_KMSAN_H + +#include +#include +#include + +struct page; + +#ifdef CONFIG_KMSAN + +/** + * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. + * @page: struct page pointer returned by alloc_pages(). + * @order: order of allocated struct page. + * @flags: GFP flags used by alloc_pages() + * + * KMSAN marks 1<<@order pages starting at @page as uninitialized, unless + * @flags contain __GFP_ZERO. + */ +void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags); + +/** + * kmsan_free_page() - Notify KMSAN about a free_pages() call. + * @page: struct page pointer passed to free_pages(). + * @order: order of deallocated struct page. + * + * KMSAN marks freed memory as uninitialized. + */ +void kmsan_free_page(struct page *page, unsigned int order); + +/** + * kmsan_copy_page_meta() - Copy KMSAN metadata between two pages. + * @dst: destination page. + * @src: source page. + * + * KMSAN copies the contents of metadata pages for @src into the metadata pages + * for @dst. If @dst has no associated metadata pages, nothing happens. + * If @src has no associated metadata pages, @dst metadata pages are unpoisoned. + */ +void kmsan_copy_page_meta(struct page *dst, struct page *src); + +/** + * kmsan_map_kernel_range_noflush() - Notify KMSAN about a vmap. + * @start: start of vmapped range. + * @end: end of vmapped range. + * @prot: page protection flags used for vmap. + * @pages: array of pages. + * @page_shift: page_shift passed to vmap_range_noflush(). + * + * KMSAN maps shadow and origin pages of @pages into contiguous ranges in + * vmalloc metadata address range. + */ +void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); + +/** + * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. + * @start: start of vunmapped range. + * @end: end of vunmapped range. + * + * KMSAN unmaps the contiguous metadata ranges created by + * kmsan_map_kernel_range_noflush(). + */ +void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end); + +/** + * kmsan_ioremap_page_range() - Notify KMSAN about a ioremap_page_range() call. + * @addr: range start. + * @end: range end. + * @phys_addr: physical range start. + * @prot: page protection flags used for ioremap_page_range(). + * @page_shift: page_shift argument passed to vmap_range_noflush(). + * + * KMSAN creates new metadata pages for the physical pages mapped into the + * virtual memory. + */ +void kmsan_ioremap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift); + +/** + * kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call. + * @start: range start. + * @end: range end. + * + * KMSAN unmaps the metadata pages for the given range and, unlike for + * vunmap_page_range(), also deallocates them. + */ +void kmsan_iounmap_page_range(unsigned long start, unsigned long end); + +#else + +static inline int kmsan_alloc_page(struct page *page, unsigned int order, + gfp_t flags) +{ + return 0; +} + +static inline void kmsan_free_page(struct page *page, unsigned int order) +{ +} + +static inline void kmsan_copy_page_meta(struct page *dst, struct page *src) +{ +} + +static inline void kmsan_vmap_pages_range_noflush(unsigned long start, + unsigned long end, + pgprot_t prot, + struct page **pages, + unsigned int page_shift) +{ +} + +static inline void kmsan_vunmap_range_noflush(unsigned long start, + unsigned long end) +{ +} + +static inline void kmsan_ioremap_page_range(unsigned long start, + unsigned long end, + phys_addr_t phys_addr, + pgprot_t prot, + unsigned int page_shift) +{ +} + +static inline void kmsan_iounmap_page_range(unsigned long start, + unsigned long end) +{ +} + +#endif + +#endif /* _LINUX_KMSAN_H */ diff --git a/mm/internal.h b/mm/internal.h index e497ab14c984..fea3cba15484 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -818,8 +818,14 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, } #endif +int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); + void vunmap_range_noflush(unsigned long start, unsigned long end); +void __vunmap_range_noflush(unsigned long start, unsigned long end); + int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 4ac62fa67a02..040111bb9f6a 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -26,6 +27,91 @@ * skipping effects of functions like memset() inside instrumented code. */ +static unsigned long vmalloc_shadow(unsigned long addr) +{ + return (unsigned long)kmsan_get_metadata((void *)addr, + KMSAN_META_SHADOW); +} + +static unsigned long vmalloc_origin(unsigned long addr) +{ + return (unsigned long)kmsan_get_metadata((void *)addr, + KMSAN_META_ORIGIN); +} + +void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end) +{ + __vunmap_range_noflush(vmalloc_shadow(start), vmalloc_shadow(end)); + __vunmap_range_noflush(vmalloc_origin(start), vmalloc_origin(end)); + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); +} + +/* + * This function creates new shadow/origin pages for the physical pages mapped + * into the virtual memory. If those physical pages already had shadow/origin, + * those are ignored. + */ +void kmsan_ioremap_page_range(unsigned long start, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift) +{ + gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO; + struct page *shadow, *origin; + unsigned long off = 0; + int nr; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + nr = (end - start) / PAGE_SIZE; + kmsan_enter_runtime(); + for (int i = 0; i < nr; i++, off += PAGE_SIZE) { + shadow = alloc_pages(gfp_mask, 1); + origin = alloc_pages(gfp_mask, 1); + __vmap_pages_range_noflush( + vmalloc_shadow(start + off), + vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow, + PAGE_SHIFT); + __vmap_pages_range_noflush( + vmalloc_origin(start + off), + vmalloc_origin(start + off + PAGE_SIZE), prot, &origin, + PAGE_SHIFT); + } + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); + kmsan_leave_runtime(); +} + +void kmsan_iounmap_page_range(unsigned long start, unsigned long end) +{ + unsigned long v_shadow, v_origin; + struct page *shadow, *origin; + int nr; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + nr = (end - start) / PAGE_SIZE; + kmsan_enter_runtime(); + v_shadow = (unsigned long)vmalloc_shadow(start); + v_origin = (unsigned long)vmalloc_origin(start); + for (int i = 0; i < nr; + i++, v_shadow += PAGE_SIZE, v_origin += PAGE_SIZE) { + shadow = kmsan_vmalloc_to_page_or_null((void *)v_shadow); + origin = kmsan_vmalloc_to_page_or_null((void *)v_origin); + __vunmap_range_noflush(v_shadow, vmalloc_shadow(end)); + __vunmap_range_noflush(v_origin, vmalloc_origin(end)); + if (shadow) + __free_pages(shadow, 1); + if (origin) + __free_pages(origin, 1); + } + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); + kmsan_leave_runtime(); +} + /* Functions from kmsan-checks.h follow. */ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) { diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index acc5279acc3b..8c81a059beea 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -145,3 +145,116 @@ void *kmsan_get_metadata(void *address, bool is_origin) return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off; } + +void kmsan_copy_page_meta(struct page *dst, struct page *src) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + if (!dst || !page_has_metadata(dst)) + return; + if (!src || !page_has_metadata(src)) { + kmsan_internal_unpoison_memory(page_address(dst), PAGE_SIZE, + /*checked*/ false); + return; + } + + kmsan_enter_runtime(); + __memcpy(shadow_ptr_for(dst), shadow_ptr_for(src), PAGE_SIZE); + __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE); + kmsan_leave_runtime(); +} + +void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) +{ + bool initialized = (flags & __GFP_ZERO) || !kmsan_enabled; + struct page *shadow, *origin; + depot_stack_handle_t handle; + int pages = 1 << order; + + if (!page) + return; + + shadow = shadow_page_for(page); + origin = origin_page_for(page); + + if (initialized) { + __memset(page_address(shadow), 0, PAGE_SIZE * pages); + __memset(page_address(origin), 0, PAGE_SIZE * pages); + return; + } + + /* Zero pages allocated by the runtime should also be initialized. */ + if (kmsan_in_runtime()) + return; + + __memset(page_address(shadow), -1, PAGE_SIZE * pages); + kmsan_enter_runtime(); + handle = kmsan_save_stack_with_flags(flags, /*extra_bits*/ 0); + kmsan_leave_runtime(); + /* + * Addresses are page-aligned, pages are contiguous, so it's ok + * to just fill the origin pages with @handle. + */ + for (int i = 0; i < PAGE_SIZE * pages / sizeof(handle); i++) + ((depot_stack_handle_t *)page_address(origin))[i] = handle; +} + +void kmsan_free_page(struct page *page, unsigned int order) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + kmsan_internal_poison_memory(page_address(page), + PAGE_SIZE << compound_order(page), + GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + +void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) +{ + unsigned long shadow_start, origin_start, shadow_end, origin_end; + struct page **s_pages, **o_pages; + int nr, mapped; + + if (!kmsan_enabled) + return; + + shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW); + shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW); + if (!shadow_start) + return; + + nr = (end - start) / PAGE_SIZE; + s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); + o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); + if (!s_pages || !o_pages) + goto ret; + for (int i = 0; i < nr; i++) { + s_pages[i] = shadow_page_for(pages[i]); + o_pages[i] = origin_page_for(pages[i]); + } + prot = __pgprot(pgprot_val(prot) | _PAGE_NX); + prot = PAGE_KERNEL; + + origin_start = vmalloc_meta((void *)start, KMSAN_META_ORIGIN); + origin_end = vmalloc_meta((void *)end, KMSAN_META_ORIGIN); + kmsan_enter_runtime(); + mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot, + s_pages, page_shift); + KMSAN_WARN_ON(mapped); + mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot, + o_pages, page_shift); + KMSAN_WARN_ON(mapped); + kmsan_leave_runtime(); + flush_tlb_kernel_range(shadow_start, shadow_end); + flush_tlb_kernel_range(origin_start, origin_end); + flush_cache_vmap(shadow_start, shadow_end); + flush_cache_vmap(origin_start, origin_end); + +ret: + kfree(s_pages); + kfree(o_pages); +} diff --git a/mm/memory.c b/mm/memory.c index b3ed17219d77..118e5f023597 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -3136,6 +3137,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) delayacct_wpcopy_end(); return 0; } + kmsan_copy_page_meta(new_page, old_page); } if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e8ea824e765..1db1ac74ef14 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1400,6 +1401,7 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); + kmsan_free_page(page, order); if (unlikely(PageHWPoison(page)) && !order) { /* @@ -3808,6 +3810,14 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, /* * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ + +/* + * Do not instrument rmqueue() with KMSAN. This function may call + * __msan_poison_alloca() through a call to set_pfnblock_flags_mask(). + * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it + * may call rmqueue() again, which will result in a deadlock. + */ +__no_sanitize_memory static inline struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, @@ -5560,6 +5570,7 @@ out: } trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); + kmsan_alloc_page(page, order, alloc_gfp); return page; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a991b909866f..ccaa461998f3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -320,6 +320,9 @@ int ioremap_page_range(unsigned long addr, unsigned long end, err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), ioremap_max_page_shift); flush_cache_vmap(addr, end); + if (!err) + kmsan_ioremap_page_range(addr, end, phys_addr, prot, + ioremap_max_page_shift); return err; } @@ -416,7 +419,7 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, * * This is an internal function only. Do not use outside mm/. */ -void vunmap_range_noflush(unsigned long start, unsigned long end) +void __vunmap_range_noflush(unsigned long start, unsigned long end) { unsigned long next; pgd_t *pgd; @@ -438,6 +441,12 @@ void vunmap_range_noflush(unsigned long start, unsigned long end) arch_sync_kernel_mappings(start, end); } +void vunmap_range_noflush(unsigned long start, unsigned long end) +{ + kmsan_vunmap_range_noflush(start, end); + __vunmap_range_noflush(start, end); +} + /** * vunmap_range - unmap kernel virtual addresses * @addr: start of the VM area to unmap @@ -575,7 +584,7 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, * * This is an internal function only. Do not use outside mm/. */ -int vmap_pages_range_noflush(unsigned long addr, unsigned long end, +int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { unsigned int i, nr = (end - addr) >> PAGE_SHIFT; @@ -601,6 +610,13 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, return 0; } +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); +} + /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map -- cgit v1.2.3 From 93324e6842148cfdb44d1437fb586b957ace1f8c Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:06 +0200 Subject: x86: kmsan: disable instrumentation of unsupported code Instrumenting some files with KMSAN will result in kernel being unable to link, boot or crashing at runtime for various reasons (e.g. infinite recursion caused by instrumentation hooks calling instrumented code again). Completely omit KMSAN instrumentation in the following places: - arch/x86/boot and arch/x86/realmode/rm, as KMSAN doesn't work for i386; - arch/x86/entry/vdso, which isn't linked with KMSAN runtime; - three files in arch/x86/kernel - boot problems; - arch/x86/mm/cpu_entry_area.c - recursion. Link: https://lkml.kernel.org/r/20220915150417.722975-33-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/boot/Makefile | 1 + arch/x86/boot/compressed/Makefile | 1 + arch/x86/entry/vdso/Makefile | 3 +++ arch/x86/kernel/Makefile | 2 ++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/mm/Makefile | 2 ++ arch/x86/realmode/rm/Makefile | 1 + 7 files changed, 11 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index ffec8bb01ba8..9860ca5979f8 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -12,6 +12,7 @@ # Sanitizer runtimes are unavailable and cannot be linked for early boot code. KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Kernel does not boot with kcov instrumentation here. diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 35ce1a64068b..3a261abb6d15 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -20,6 +20,7 @@ # Sanitizer runtimes are unavailable and cannot be linked for early boot code. KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 12f6c4d714cd..ce4eb7e44e5b 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -11,6 +11,9 @@ include $(srctree)/lib/vdso/Makefile # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n +KMSAN_SANITIZE_vclock_gettime.o := n +KMSAN_SANITIZE_vgetcpu.o := n + UBSAN_SANITIZE := n KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a20a5ebfacd7..ac564c5d7b1f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -33,6 +33,8 @@ KASAN_SANITIZE_sev.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. KCSAN_SANITIZE := n +KMSAN_SANITIZE_head$(BITS).o := n +KMSAN_SANITIZE_nmi.o := n # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 9661e3e802be..f10a921ee756 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,6 +12,7 @@ endif # If these files are instrumented, boot hangs during the first second. KCOV_INSTRUMENT_common.o := n KCOV_INSTRUMENT_perf_event.o := n +KMSAN_SANITIZE_common.o := n # As above, instrumenting secondary CPU boot code causes boot hangs. KCSAN_SANITIZE_common.o := n diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 829c1409ffbd..afb6f7187dad 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -14,6 +14,8 @@ KASAN_SANITIZE_pgprot.o := n # Disable KCSAN entirely, because otherwise we get warnings that some functions # reference __initdata sections. KCSAN_SANITIZE := n +# Avoid recursion by not calling KMSAN hooks for CEA code. +KMSAN_SANITIZE_cpu_entry_area.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 83f1b6a56449..f614009d3e4e 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -10,6 +10,7 @@ # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. -- cgit v1.2.3 From 3f1e2c7a9099c1ed32c67f12cdf432ba782cf51f Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:10 +0200 Subject: x86: kmsan: sync metadata pages on page fault KMSAN assumes shadow and origin pages for every allocated page are accessible. For pages between [VMALLOC_START, VMALLOC_END] those metadata pages start at KMSAN_VMALLOC_SHADOW_START and KMSAN_VMALLOC_ORIGIN_START, therefore we must sync a bigger memory region. Link: https://lkml.kernel.org/r/20220915150417.722975-37-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/mm/fault.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fa71a5d12e87..d728791be8ac 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -260,7 +260,7 @@ static noinline int vmalloc_fault(unsigned long address) } NOKPROBE_SYMBOL(vmalloc_fault); -void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end) { unsigned long addr; @@ -284,6 +284,27 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +{ + __arch_sync_kernel_mappings(start, end); +#ifdef CONFIG_KMSAN + /* + * KMSAN maintains two additional metadata page mappings for the + * [VMALLOC_START, VMALLOC_END) range. These mappings start at + * KMSAN_VMALLOC_SHADOW_START and KMSAN_VMALLOC_ORIGIN_START and + * have to be synced together with the vmalloc memory mapping. + */ + if (start >= VMALLOC_START && end < VMALLOC_END) { + __arch_sync_kernel_mappings( + start - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START, + end - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START); + __arch_sync_kernel_mappings( + start - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START, + end - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START); + } +#endif +} + static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; -- cgit v1.2.3 From ce732a7520b093091c345cba1b84542d1abd83ed Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 28 Sep 2022 14:32:19 +0200 Subject: x86: kmsan: handle CPU entry area Among other data, CPU entry area holds exception stacks, so addresses from this area can be passed to kmsan_get_metadata(). This previously led to kmsan_get_metadata() returning NULL, which in turn resulted in a warning that triggered further attempts to call kmsan_get_metadata() in the exception context, which quickly exhausted the exception stack. This patch allocates shadow and origin for the CPU entry area on x86 and introduces arch_kmsan_get_meta_or_null(), which performs arch-specific metadata mapping. Link: https://lkml.kernel.org/r/20220928123219.1101883-1-glider@google.com Signed-off-by: Alexander Potapenko Fixes: 21d723a7c1409 ("kmsan: add KMSAN runtime core") Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + arch/x86/include/asm/kmsan.h | 32 ++++++++++++++++++++++++++++++++ arch/x86/mm/Makefile | 3 +++ arch/x86/mm/kmsan_shadow.c | 20 ++++++++++++++++++++ mm/kmsan/shadow.c | 6 +++++- 5 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 arch/x86/mm/kmsan_shadow.c (limited to 'arch/x86/mm') diff --git a/MAINTAINERS b/MAINTAINERS index 3c7dfe9bb712..456b07f02803 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11379,6 +11379,7 @@ L: kasan-dev@googlegroups.com S: Maintained F: Documentation/dev-tools/kmsan.rst F: arch/*/include/asm/kmsan.h +F: arch/*/mm/kmsan_* F: include/linux/kmsan*.h F: lib/Kconfig.kmsan F: mm/kmsan/ diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h index a790b865d0a6..8fa6ac0e2d76 100644 --- a/arch/x86/include/asm/kmsan.h +++ b/arch/x86/include/asm/kmsan.h @@ -11,9 +11,41 @@ #ifndef MODULE +#include #include #include +DECLARE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_shadow); +DECLARE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_origin); + +/* + * Functions below are declared in the header to make sure they are inlined. + * They all are called from kmsan_get_metadata() for every memory access in + * the kernel, so speed is important here. + */ + +/* + * Compute metadata addresses for the CPU entry area on x86. + */ +static inline void *arch_kmsan_get_meta_or_null(void *addr, bool is_origin) +{ + unsigned long addr64 = (unsigned long)addr; + char *metadata_array; + unsigned long off; + int cpu; + + if ((addr64 < CPU_ENTRY_AREA_BASE) || + (addr64 >= (CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE))) + return NULL; + cpu = (addr64 - CPU_ENTRY_AREA_BASE) / CPU_ENTRY_AREA_SIZE; + off = addr64 - (unsigned long)get_cpu_entry_area(cpu); + if ((off < 0) || (off >= CPU_ENTRY_AREA_SIZE)) + return NULL; + metadata_array = is_origin ? cpu_entry_area_origin : + cpu_entry_area_shadow; + return &per_cpu(metadata_array[off], cpu); +} + /* * Taken from arch/x86/mm/physaddr.h to avoid using an instrumented version. */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index afb6f7187dad..c80febc44cd2 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -46,6 +46,9 @@ obj-$(CONFIG_HIGHMEM) += highmem_32.o KASAN_SANITIZE_kasan_init_$(BITS).o := n obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o +KMSAN_SANITIZE_kmsan_shadow.o := n +obj-$(CONFIG_KMSAN) += kmsan_shadow.o + obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/mm/kmsan_shadow.c b/arch/x86/mm/kmsan_shadow.c new file mode 100644 index 000000000000..bee2ec4a3bfa --- /dev/null +++ b/arch/x86/mm/kmsan_shadow.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * x86-specific bits of KMSAN shadow implementation. + * + * Copyright (C) 2022 Google LLC + * Author: Alexander Potapenko + */ + +#include +#include + +/* + * Addresses within the CPU entry area (including e.g. exception stacks) do not + * have struct page entries corresponding to them, so they need separate + * handling. + * arch_kmsan_get_meta_or_null() (declared in the header) maps the addresses in + * CPU entry area to addresses in cpu_entry_area_shadow/cpu_entry_area_origin. + */ +DEFINE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_shadow); +DEFINE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_origin); diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 6e90a806a704..21e3e196ec3c 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -126,6 +125,7 @@ void *kmsan_get_metadata(void *address, bool is_origin) { u64 addr = (u64)address, pad, off; struct page *page; + void *ret; if (is_origin && !IS_ALIGNED(addr, KMSAN_ORIGIN_SIZE)) { pad = addr % KMSAN_ORIGIN_SIZE; @@ -136,6 +136,10 @@ void *kmsan_get_metadata(void *address, bool is_origin) kmsan_internal_is_module_addr(address)) return (void *)vmalloc_meta(address, is_origin); + ret = arch_kmsan_get_meta_or_null(address, is_origin); + if (ret) + return ret; + page = virt_to_page_or_null(address); if (!page) return NULL; -- cgit v1.2.3