diff options
80 files changed, 1910 insertions, 1228 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 3a238644c811..4c12a290bee5 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1971,11 +1971,12 @@ and is between 256 and 4096 characters. It is defined in the file Format: { 0 | 1 } See arch/parisc/kernel/pdc_chassis.c - percpu_alloc= [X86] Select which percpu first chunk allocator to use. - Allowed values are one of "lpage", "embed" and "4k". - See comments in arch/x86/kernel/setup_percpu.c for - details on each allocator. This parameter is primarily - for debugging and performance comparison. + percpu_alloc= Select which percpu first chunk allocator to use. + Currently supported values are "embed" and "page". + Archs may support subset or none of the selections. + See comments in mm/percpu.c for details on each + allocator. This parameter is primarily for debugging + and performance comparison. pf. [PARIDE] See Documentation/blockdev/paride.txt. @@ -325,7 +325,7 @@ CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \ MODFLAGS = -DMODULE CFLAGS_MODULE = $(MODFLAGS) AFLAGS_MODULE = $(MODFLAGS) -LDFLAGS_MODULE = +LDFLAGS_MODULE = -T $(srctree)/scripts/module-common.lds CFLAGS_KERNEL = AFLAGS_KERNEL = CFLAGS_GCOV = -fprofile-arcs -ftest-coverage diff --git a/arch/alpha/include/asm/percpu.h b/arch/alpha/include/asm/percpu.h index b663f1f10b6a..2c12378e3aa9 100644 --- a/arch/alpha/include/asm/percpu.h +++ b/arch/alpha/include/asm/percpu.h @@ -1,102 +1,18 @@ #ifndef __ALPHA_PERCPU_H #define __ALPHA_PERCPU_H -#include <linux/compiler.h> -#include <linux/threads.h> -#include <linux/percpu-defs.h> - -/* - * Determine the real variable name from the name visible in the - * kernel sources. - */ -#define per_cpu_var(var) per_cpu__##var - -#ifdef CONFIG_SMP - -/* - * per_cpu_offset() is the offset that has to be added to a - * percpu variable to get to the instance for a certain processor. - */ -extern unsigned long __per_cpu_offset[NR_CPUS]; - -#define per_cpu_offset(x) (__per_cpu_offset[x]) - -#define __my_cpu_offset per_cpu_offset(raw_smp_processor_id()) -#ifdef CONFIG_DEBUG_PREEMPT -#define my_cpu_offset per_cpu_offset(smp_processor_id()) -#else -#define my_cpu_offset __my_cpu_offset -#endif - -#ifndef MODULE -#define SHIFT_PERCPU_PTR(var, offset) RELOC_HIDE(&per_cpu_var(var), (offset)) -#define PER_CPU_DEF_ATTRIBUTES -#else /* - * To calculate addresses of locally defined variables, GCC uses 32-bit - * displacement from the GP. Which doesn't work for per cpu variables in - * modules, as an offset to the kernel per cpu area is way above 4G. + * To calculate addresses of locally defined variables, GCC uses + * 32-bit displacement from the GP. Which doesn't work for per cpu + * variables in modules, as an offset to the kernel per cpu area is + * way above 4G. * - * This forces allocation of a GOT entry for per cpu variable using - * ldq instruction with a 'literal' relocation. - */ -#define SHIFT_PERCPU_PTR(var, offset) ({ \ - extern int simple_identifier_##var(void); \ - unsigned long __ptr, tmp_gp; \ - asm ( "br %1, 1f \n\ - 1: ldgp %1, 0(%1) \n\ - ldq %0, per_cpu__" #var"(%1)\t!literal" \ - : "=&r"(__ptr), "=&r"(tmp_gp)); \ - (typeof(&per_cpu_var(var)))(__ptr + (offset)); }) - -#define PER_CPU_DEF_ATTRIBUTES __used - -#endif /* MODULE */ - -/* - * A percpu variable may point to a discarded regions. The following are - * established ways to produce a usable pointer from the percpu variable - * offset. + * Always use weak definitions for percpu variables in modules. */ -#define per_cpu(var, cpu) \ - (*SHIFT_PERCPU_PTR(var, per_cpu_offset(cpu))) -#define __get_cpu_var(var) \ - (*SHIFT_PERCPU_PTR(var, my_cpu_offset)) -#define __raw_get_cpu_var(var) \ - (*SHIFT_PERCPU_PTR(var, __my_cpu_offset)) - -#else /* ! SMP */ - -#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var))) -#define __get_cpu_var(var) per_cpu_var(var) -#define __raw_get_cpu_var(var) per_cpu_var(var) - -#define PER_CPU_DEF_ATTRIBUTES - -#endif /* SMP */ - -#ifdef CONFIG_SMP -#define PER_CPU_BASE_SECTION ".data.percpu" -#else -#define PER_CPU_BASE_SECTION ".data" -#endif - -#ifdef CONFIG_SMP - -#ifdef MODULE -#define PER_CPU_SHARED_ALIGNED_SECTION "" -#else -#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned" -#endif -#define PER_CPU_FIRST_SECTION ".first" - -#else - -#define PER_CPU_SHARED_ALIGNED_SECTION "" -#define PER_CPU_FIRST_SECTION "" - +#if defined(MODULE) && defined(CONFIG_SMP) +#define ARCH_NEEDS_WEAK_PER_CPU #endif -#define PER_CPU_ATTRIBUTES +#include <asm-generic/percpu.h> #endif /* __ALPHA_PERCPU_H */ diff --git a/arch/alpha/include/asm/tlbflush.h b/arch/alpha/include/asm/tlbflush.h index 9d87aaa08c0d..e89e0c2e15b1 100644 --- a/arch/alpha/include/asm/tlbflush.h +++ b/arch/alpha/include/asm/tlbflush.h @@ -2,6 +2,7 @@ #define _ALPHA_TLBFLUSH_H #include <linux/mm.h> +#include <linux/sched.h> #include <asm/compiler.h> #include <asm/pgalloc.h> diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index b9d6568e5f7f..6dc03c35caa0 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -134,13 +134,6 @@ SECTIONS __bss_stop = .; _end = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - } - .mdebug 0 : { *(.mdebug) } @@ -150,4 +143,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + + DISCARDS } diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 69371028a202..5cc4812c9763 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -83,6 +83,7 @@ SECTIONS EXIT_TEXT EXIT_DATA *(.exitcall.exit) + *(.discard) *(.ARM.exidx.exit.text) *(.ARM.extab.exit.text) #ifndef CONFIG_HOTPLUG_CPU diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S index 7910d41eb886..c4b56654349a 100644 --- a/arch/avr32/kernel/vmlinux.lds.S +++ b/arch/avr32/kernel/vmlinux.lds.S @@ -124,14 +124,11 @@ SECTIONS _end = .; } + DWARF_DEBUG + /* When something in the kernel is NOT compiled as a module, the module * cleanup code and data are put into these segments. Both can then be * thrown away, as cleanup code is never called unless it's a module. */ - /DISCARD/ : { - EXIT_DATA - *(.exitcall.exit) - } - - DWARF_DEBUG + DISCARDS } diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index 6ac307ca0d80..d7ffe299b979 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -277,8 +277,5 @@ SECTIONS DWARF_DEBUG - /DISCARD/ : - { - *(.exitcall.exit) - } + DISCARDS } diff --git a/arch/blackfin/mm/sram-alloc.c b/arch/blackfin/mm/sram-alloc.c index 0bc3c4ef0aad..99e4dbb1dfd1 100644 --- a/arch/blackfin/mm/sram-alloc.c +++ b/arch/blackfin/mm/sram-alloc.c @@ -42,9 +42,9 @@ #include <asm/mem_map.h> #include "blackfin_sram.h" -static DEFINE_PER_CPU(spinlock_t, l1sram_lock) ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(spinlock_t, l1_data_sram_lock) ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(spinlock_t, l1_inst_sram_lock) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1sram_lock); +static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1_data_sram_lock); +static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1_inst_sram_lock); static spinlock_t l2_sram_lock ____cacheline_aligned_in_smp; /* the data structure for L1 scratchpad and DATA SRAM */ diff --git a/arch/cris/include/asm/mmu_context.h b/arch/cris/include/asm/mmu_context.h index 72ba08dcfd18..1d45fd6365b7 100644 --- a/arch/cris/include/asm/mmu_context.h +++ b/arch/cris/include/asm/mmu_context.h @@ -17,7 +17,8 @@ extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, * registers like cr3 on the i386 */ -extern volatile DEFINE_PER_CPU(pgd_t *,current_pgd); /* defined in arch/cris/mm/fault.c */ +/* defined in arch/cris/mm/fault.c */ +DECLARE_PER_CPU(pgd_t *, current_pgd); static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S index 0d2adfc794d4..6c81836b9229 100644 --- a/arch/cris/kernel/vmlinux.lds.S +++ b/arch/cris/kernel/vmlinux.lds.S @@ -140,12 +140,7 @@ SECTIONS _end = .; __end = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - } - dram_end = dram_start + (CONFIG_ETRAX_DRAM_SIZE - __CONFIG_ETRAX_VMEM_SIZE)*1024*1024; + + DISCARDS } diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index f925115e3250..4a7cdd9ea1ee 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -29,7 +29,7 @@ extern void die_if_kernel(const char *, struct pt_regs *, long); /* current active page directory */ -volatile DEFINE_PER_CPU(pgd_t *,current_pgd); +DEFINE_PER_CPU(pgd_t *, current_pgd); unsigned long cris_signal_return_page; /* diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index 22d9787406ed..7dbf41f68b52 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -177,6 +177,8 @@ SECTIONS .debug_ranges 0 : { *(.debug_ranges) } .comment 0 : { *(.comment) } + + DISCARDS } __kernel_image_size_no_bss = __bss_start - __kernel_image_start; diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S index 43a87b9085b6..662b02ecb86e 100644 --- a/arch/h8300/kernel/vmlinux.lds.S +++ b/arch/h8300/kernel/vmlinux.lds.S @@ -152,9 +152,6 @@ SECTIONS __end = . ; __ramstart = .; } - /DISCARD/ : { - *(.exitcall.exit) - } .romfs : { *(.romfs*) @@ -165,4 +162,6 @@ SECTIONS COMMAND_START = . - 0x200 ; __ramend = . ; } + + DISCARDS } diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index e6246119932a..011a1cdf0eb5 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -89,6 +89,9 @@ config GENERIC_TIME_VSYSCALL bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool y diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 1b23ec126b63..1de86c96801d 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -855,11 +855,17 @@ identify_cpu (struct cpuinfo_ia64 *c) c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1)); } +/* + * In UP configuration, setup_per_cpu_areas() is defined in + * include/linux/percpu.h + */ +#ifdef CONFIG_SMP void __init setup_per_cpu_areas (void) { /* start_kernel() requires this... */ } +#endif /* * Do the following calculations: diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index f0c521b0ba4c..93ebfea43c6c 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -58,7 +58,8 @@ static struct local_tlb_flush_counts { unsigned int count; } __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS]; -static DEFINE_PER_CPU(unsigned short, shadow_flush_counts[NR_CPUS]) ____cacheline_aligned; +static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned short [NR_CPUS], + shadow_flush_counts); #define IPI_CALL_FUNC 0 #define IPI_CPU_STOP 1 diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 4a95e86b9ac2..eb4214d1c5af 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -24,14 +24,14 @@ PHDRS { } SECTIONS { - /* Sections to be discarded */ + /* unwind exit sections must be discarded before the rest of the + sections get included. */ /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) *(.IA_64.unwind.exit.text) *(.IA_64.unwind_info.exit.text) - } + *(.comment) + *(.note) + } v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */ phys_start = _start - LOAD_OFFSET; @@ -316,7 +316,7 @@ SECTIONS .debug_funcnames 0 : { *(.debug_funcnames) } .debug_typenames 0 : { *(.debug_typenames) } .debug_varnames 0 : { *(.debug_varnames) } - /* These must appear regardless of . */ - /DISCARD/ : { *(.comment) } - /DISCARD/ : { *(.note) } + + /* Default discards */ + DISCARDS } diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index e456f062f241..ece1bf994499 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second); DEFINE_PER_CPU(struct sn_hub_info_s, __sn_hub_info); EXPORT_PER_CPU_SYMBOL(__sn_hub_info); -DEFINE_PER_CPU(short, __sn_cnodeid_to_nasid[MAX_COMPACT_NODES]); +DEFINE_PER_CPU(short [MAX_COMPACT_NODES], __sn_cnodeid_to_nasid); EXPORT_PER_CPU_SYMBOL(__sn_cnodeid_to_nasid); DEFINE_PER_CPU(struct nodepda_s *, __sn_nodepda); diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index 4179adf6c624..de5e21cca6a5 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -120,13 +120,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - } - /* Stabs debugging sections. */ .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } @@ -135,4 +128,7 @@ SECTIONS .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds index 01d212bb05a6..47eac19e8f61 100644 --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -82,13 +82,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - } - /* Stabs debugging sections. */ .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } @@ -97,4 +90,7 @@ SECTIONS .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds index c192f773db96..03efaf04d7d7 100644 --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -77,13 +77,6 @@ __init_begin = .; _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - } - .crap : { /* Stabs debugging sections. */ *(.stab) @@ -96,4 +89,6 @@ __init_begin = .; *(.note) } + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S index b7fe505e358d..2736a5e309c0 100644 --- a/arch/m68knommu/kernel/vmlinux.lds.S +++ b/arch/m68knommu/kernel/vmlinux.lds.S @@ -184,12 +184,6 @@ SECTIONS { __init_end = .; } > INIT - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - } - .bss : { . = ALIGN(4); _sbss = . ; @@ -200,5 +194,6 @@ SECTIONS { _end = . ; } > BSS + DISCARDS } diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S index d34d38dcd12c..ec5fa91a48d8 100644 --- a/arch/microblaze/kernel/vmlinux.lds.S +++ b/arch/microblaze/kernel/vmlinux.lds.S @@ -23,8 +23,8 @@ SECTIONS { _stext = . ; *(.text .text.*) *(.fixup) - - *(.exitcall.exit) + EXIT_TEXT + EXIT_CALL SCHED_TEXT LOCK_TEXT KPROBES_TEXT @@ -162,4 +162,6 @@ SECTIONS { } . = ALIGN(4096); _end = .; + + DISCARDS } diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 58738c8d754f..1474c18fb777 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -176,17 +176,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - - /* ABI crap starts here */ - *(.MIPS.options) - *(.options) - *(.pdr) - *(.reginfo) - } - /* These mark the ABI of the kernel for debuggers. */ .mdebug.abi32 : { KEEP(*(.mdebug.abi32)) @@ -212,4 +201,14 @@ SECTIONS *(.gptab.bss) *(.gptab.sbss) } + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { + /* ABI crap starts here */ + *(.MIPS.options) + *(.options) + *(.pdr) + *(.reginfo) + } } diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S index f4aa07934654..76f41bdb79c4 100644 --- a/arch/mn10300/kernel/vmlinux.lds.S +++ b/arch/mn10300/kernel/vmlinux.lds.S @@ -115,12 +115,10 @@ SECTIONS . = ALIGN(PAGE_SIZE); pg0 = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_CALL - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index fd2cc4fd2b65..aea1784edbd1 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -237,9 +237,12 @@ SECTIONS /* freed after init ends here */ _end = . ; + STABS_DEBUG + .note 0 : { *(.note) } + /* Sections to be discarded */ + DISCARDS /DISCARD/ : { - *(.exitcall.exit) #ifdef CONFIG_64BIT /* temporary hack until binutils is fixed to not emit these * for static binaries @@ -252,7 +255,4 @@ SECTIONS *(.gnu.hash) #endif } - - STABS_DEBUG - .note 0 : { *(.note) } } diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d00131ca0835..2c42e1526d03 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -49,6 +49,9 @@ config GENERIC_HARDIRQS_NO__DO_IRQ config HAVE_SETUP_PER_CPU_AREA def_bool PPC64 +config NEED_PER_CPU_EMBED_FIRST_CHUNK + def_bool PPC64 + config IRQ_PER_CPU bool default y diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 1f6816003ebe..aa6e4500635f 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -57,6 +57,7 @@ #include <asm/cache.h> #include <asm/page.h> #include <asm/mmu.h> +#include <asm/mmu-hash64.h> #include <asm/firmware.h> #include <asm/xmon.h> #include <asm/udbg.h> @@ -569,25 +570,53 @@ void cpu_die(void) } #ifdef CONFIG_SMP -void __init setup_per_cpu_areas(void) +#define PCPU_DYN_SIZE () + +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - int i; - unsigned long size; - char *ptr; - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE); -#ifdef CONFIG_MODULES - if (size < PERCPU_ENOUGH_ROOM) - size = PERCPU_ENOUGH_ROOM; -#endif + return __alloc_bootmem_node(NODE_DATA(cpu_to_node(cpu)), size, align, + __pa(MAX_DMA_ADDRESS)); +} - for_each_possible_cpu(i) { - ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); +static void __init pcpu_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} - paca[i].data_offset = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - } +static int pcpu_cpu_distance(unsigned int from, unsigned int to) +{ + if (cpu_to_node(from) == cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; +} + +void __init setup_per_cpu_areas(void) +{ + const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; + size_t atom_size; + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Linear mapping is one of 4K, 1M and 16M. For 4K, no need + * to group units. For larger mappings, use 1M atom which + * should be large enough to contain a number of units. + */ + if (mmu_linear_psize == MMU_PAGE_4K) + atom_size = PAGE_SIZE; + else + atom_size = 1 << 20; + + rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + paca[cpu].data_offset = delta + pcpu_unit_offsets[cpu]; } #endif diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 8ef8a14abc95..244e3658983c 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -37,12 +37,6 @@ jiffies = jiffies_64 + 4; #endif SECTIONS { - /* Sections to be discarded. */ - /DISCARD/ : { - *(.exitcall.exit) - EXIT_DATA - } - . = KERNELBASE; /* @@ -298,4 +292,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); _end = . ; PROVIDE32 (end = .); + + /* Sections to be discarded. */ + DISCARDS } diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c index ab5fb48b3e90..687fddaa24c5 100644 --- a/arch/powerpc/mm/stab.c +++ b/arch/powerpc/mm/stab.c @@ -31,7 +31,7 @@ struct stab_entry { #define NR_STAB_CACHE_ENTRIES 8 static DEFINE_PER_CPU(long, stab_cache_ptr); -static DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]); +static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache); /* * Create a segment table entry for the given esid/vsid pair. diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c index f6e04bcc70ef..51ffde40af2b 100644 --- a/arch/powerpc/platforms/ps3/smp.c +++ b/arch/powerpc/platforms/ps3/smp.c @@ -37,7 +37,7 @@ */ #define MSG_COUNT 4 -static DEFINE_PER_CPU(unsigned int, ps3_ipi_virqs[MSG_COUNT]); +static DEFINE_PER_CPU(unsigned int [MSG_COUNT], ps3_ipi_virqs); static void do_message_pass(int target, int msg) { diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h index 408d60b4f75b..f7ad8719d02d 100644 --- a/arch/s390/include/asm/percpu.h +++ b/arch/s390/include/asm/percpu.h @@ -1,37 +1,21 @@ #ifndef __ARCH_S390_PERCPU__ #define __ARCH_S390_PERCPU__ -#include <linux/compiler.h> -#include <asm/lowcore.h> - /* * s390 uses its own implementation for per cpu data, the offset of * the cpu local data area is cached in the cpu's lowcore memory. - * For 64 bit module code s390 forces the use of a GOT slot for the - * address of the per cpu variable. This is needed because the module - * may be more than 4G above the per cpu area. */ -#if defined(__s390x__) && defined(MODULE) - -#define SHIFT_PERCPU_PTR(ptr,offset) (({ \ - extern int simple_identifier_##var(void); \ - unsigned long *__ptr; \ - asm ( "larl %0, %1@GOTENT" \ - : "=a" (__ptr) : "X" (ptr) ); \ - (typeof(ptr))((*__ptr) + (offset)); })) - -#else - -#define SHIFT_PERCPU_PTR(ptr, offset) (({ \ - extern int simple_identifier_##var(void); \ - unsigned long __ptr; \ - asm ( "" : "=a" (__ptr) : "0" (ptr) ); \ - (typeof(ptr)) (__ptr + (offset)); })) +#define __my_cpu_offset S390_lowcore.percpu_offset +/* + * For 64 bit module code, the module may be more than 4G above the + * per cpu area, use weak definitions to force the compiler to + * generate external references. + */ +#if defined(CONFIG_SMP) && defined(__s390x__) && defined(MODULE) +#define ARCH_NEEDS_WEAK_PER_CPU #endif -#define __my_cpu_offset S390_lowcore.percpu_offset - #include <asm-generic/percpu.h> #endif /* __ARCH_S390_PERCPU__ */ diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 7315f9e67e1d..bc15ef93e656 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -84,13 +84,10 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_DATA - *(.exitcall.exit) - } - /* Debugging sections. */ STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index f53c76acaede..0ce254bca92f 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -163,16 +163,14 @@ SECTIONS _end = . ; } + STABS_DEBUG + DWARF_DEBUG + /* * When something in the kernel is NOT compiled as a module, the * module cleanup code and data are put into these segments. Both * can then be thrown away, as cleanup code is never called unless * it's a module. */ - /DISCARD/ : { - *(.exitcall.exit) - } - - STABS_DEBUG - DWARF_DEBUG + DISCARDS } diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 2bd5c287538a..86b82348b97c 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -99,7 +99,7 @@ config AUDIT_ARCH config HAVE_SETUP_PER_CPU_AREA def_bool y if SPARC64 -config HAVE_DYNAMIC_PER_CPU_AREA +config NEED_PER_CPU_EMBED_FIRST_CHUNK def_bool y if SPARC64 config GENERIC_HARDIRQS_NO__DO_IRQ diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 3691907a43b4..ff68373ce6d6 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1389,8 +1389,8 @@ void smp_send_stop(void) * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, - unsigned long align) +static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, + size_t align) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); #ifdef CONFIG_NEED_MULTIPLE_NODES @@ -1415,127 +1415,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } -static size_t pcpur_size __initdata; -static void **pcpur_ptrs __initdata; - -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +static void __init pcpu_free_bootmem(void *ptr, size_t size) { - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpur_size) - return NULL; - - return virt_to_page(pcpur_ptrs[cpu] + off); + free_bootmem(__pa(ptr), size); } -#define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL) - -static void __init pcpu_map_range(unsigned long start, unsigned long end, - struct page *page) +static int pcpu_cpu_distance(unsigned int from, unsigned int to) { - unsigned long pfn = page_to_pfn(page); - unsigned long pte_base; - - BUG_ON((pfn<<PAGE_SHIFT)&(PCPU_CHUNK_SIZE - 1UL)); - - pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U | - _PAGE_CP_4U | _PAGE_CV_4U | - _PAGE_P_4U | _PAGE_W_4U); - if (tlb_type == hypervisor) - pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V | - _PAGE_CP_4V | _PAGE_CV_4V | - _PAGE_P_4V | _PAGE_W_4V); - - while (start < end) { - pgd_t *pgd = pgd_offset_k(start); - unsigned long this_end; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pud = pud_offset(pgd, start); - if (pud_none(*pud)) { - pmd_t *new; - - new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); - pud_populate(&init_mm, pud, new); - } - - pmd = pmd_offset(pud, start); - if (!pmd_present(*pmd)) { - pte_t *new; - - new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); - pmd_populate_kernel(&init_mm, pmd, new); - } - - pte = pte_offset_kernel(pmd, start); - this_end = (start + PMD_SIZE) & PMD_MASK; - if (this_end > end) - this_end = end; - - while (start < this_end) { - unsigned long paddr = pfn << PAGE_SHIFT; - - pte_val(*pte) = (paddr | pte_base); - - start += PAGE_SIZE; - pte++; - pfn++; - } - } + if (cpu_to_node(from) == cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; } void __init setup_per_cpu_areas(void) { - size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start; - static struct vm_struct vm; - unsigned long delta, cpu; - size_t pcpu_unit_size; - size_t ptrs_size; - - pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE; - + unsigned long delta; + unsigned int cpu; + int rc; - ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); - - for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE, - PCPU_CHUNK_SIZE); - - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PCPU_CHUNK_SIZE - pcpur_size); - - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); - } - - /* allocate address and map */ - vm.flags = VM_ALLOC; - vm.size = nr_cpu_ids * PCPU_CHUNK_SIZE; - vm_area_register_early(&vm, PCPU_CHUNK_SIZE); - - for_each_possible_cpu(cpu) { - unsigned long start = (unsigned long) vm.addr; - unsigned long end; - - start += cpu * PCPU_CHUNK_SIZE; - end = start + PCPU_CHUNK_SIZE; - pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu])); - } - - pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size, - PERCPU_MODULE_RESERVE, dyn_size, - PCPU_CHUNK_SIZE, vm.addr, NULL); - - free_bootmem(__pa(pcpur_ptrs), ptrs_size); + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, 4 << 20, + pcpu_cpu_distance, pcpu_alloc_bootmem, + pcpu_free_bootmem); + if (rc) + panic("failed to initialize first chunk (%d)", rc); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; - for_each_possible_cpu(cpu) { - __per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; - } + for_each_possible_cpu(cpu) + __per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; /* Setup %g5 for the boot cpu. */ __local_per_cpu_offset = __per_cpu_offset(smp_processor_id()); diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index fcbbd000ec08..866390feb683 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -171,12 +171,8 @@ SECTIONS } _end = . ; - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - } - STABS_DEBUG DWARF_DEBUG + + DISCARDS } diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S index cb0248616d49..37ecc5577a9a 100644 --- a/arch/um/include/asm/common.lds.S +++ b/arch/um/include/asm/common.lds.S @@ -123,8 +123,3 @@ __initramfs_end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - } - diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index 9975e1ab44fb..715a188c0472 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -156,4 +156,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + + DISCARDS } diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 11b835248b86..2ebd39765db8 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -100,4 +100,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + + DISCARDS } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a800b0faaad6..e98e81a04971 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -150,7 +150,10 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y -config HAVE_DYNAMIC_PER_CPU_AREA +config NEED_PER_CPU_EMBED_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_PAGE_FIRST_CHUNK def_bool y config HAVE_CPUMASK_OF_CPU_MAP diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 04eacefcfd26..b65a36defeb7 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -168,15 +168,6 @@ do { \ /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); -#ifdef CONFIG_NEED_MULTIPLE_NODES -void *pcpu_lpage_remapped(void *kaddr); -#else -static inline void *pcpu_lpage_remapped(void *kaddr) -{ - return NULL; -} -#endif - #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 6b2a52dd0403..dca325c03999 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -30,8 +30,8 @@ #include <asm/apic.h> #include <asm/desc.h> -static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); -static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); +static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); +static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); static DEFINE_PER_CPU(int, cpu_priv_count); static DEFINE_MUTEX(cpu_debug_lock); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9bfe9d2ea615..fdd51b554355 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1101,7 +1101,7 @@ void mce_log_therm_throt_event(__u64 status) */ static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); static void mcheck_timer(unsigned long data) @@ -1120,7 +1120,7 @@ static void mcheck_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(next_interval); + n = &__get_cpu_var(mce_next_interval); if (mce_notify_irq()) *n = max(*n/2, HZ/100); else @@ -1335,7 +1335,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) static void mce_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); + int *n = &__get_cpu_var(mce_next_interval); if (mce_ignore_ce) return; @@ -1935,7 +1935,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); + __get_cpu_var(mce_next_interval)); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 1fecba404fd8..8cd5224943b5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -69,7 +69,7 @@ struct threshold_bank { struct threshold_block *blocks; cpumask_var_t cpus; }; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); +static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); #ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f9cd0849bd42..2732e2c1e4d3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1211,7 +1211,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) x86_pmu_disable_counter(hwc, idx); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -1253,7 +1253,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (left > x86_pmu.max_period) left = x86_pmu.max_period; - per_cpu(prev_left[idx], smp_processor_id()) = left; + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* * The hw counter starts counting from this counter offset, @@ -1470,7 +1470,7 @@ void perf_counter_print_debug(void) rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); - prev_left = per_cpu(prev_left[idx], cpu); + prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); @@ -2110,8 +2110,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) entry->ip[entry->nr++] = ip; } -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); static DEFINE_PER_CPU(int, in_nmi_frame); @@ -2264,9 +2264,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_callchain_entry *entry; if (in_nmi()) - entry = &__get_cpu_var(nmi_entry); + entry = &__get_cpu_var(pmc_nmi_entry); else - entry = &__get_cpu_var(irq_entry); + entry = &__get_cpu_var(pmc_irq_entry); entry->nr = 0; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 07d81916f212..d559af913e1f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset); #define PERCPU_FIRST_CHUNK_RESERVE 0 #endif +#ifdef CONFIG_X86_32 /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void) #endif return false; } +#endif /** * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu @@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } /* - * Large page remap allocator - * - * This allocator uses PMD page as unit. A PMD page is allocated for - * each cpu and each is remapped into vmalloc area using PMD mapping. - * As PMD page is quite large, only part of it is used for the first - * chunk. Unused part is returned to the bootmem allocator. - * - * So, the PMD pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more PMD TLB entry pressure but still is much - * better than only using 4k mappings while still being NUMA friendly. + * Helpers for first chunk memory allocation */ -#ifdef CONFIG_NEED_MULTIPLE_NODES -struct pcpul_ent { - unsigned int cpu; - void *ptr; -}; - -static size_t pcpul_size; -static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; - -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpul_size) - return NULL; - - return virt_to_page(pcpul_map[cpu].ptr + off); + return pcpu_alloc_bootmem(cpu, size, align); } -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static void __init pcpu_fc_free(void *ptr, size_t size) { - size_t map_size, dyn_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - if (!chosen) { - size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = nr_cpu_ids * PMD_SIZE; - - /* on non-NUMA, embedding is better */ - if (!pcpu_need_numa()) - return -EINVAL; - - /* don't consume more than 20% of vmalloc area */ - if (tot_size > vm_size / 5) { - pr_info("PERCPU: too large chunk size %zuMB for " - "large page remap\n", tot_size >> 20); - return -EINVAL; - } - } - - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. - */ - pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - if (pcpul_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0])); - pcpul_map = alloc_bootmem(map_size); - - for_each_possible_cpu(cpu) { - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, - PMD_SIZE); - if (!pcpul_map[cpu].ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), - PMD_SIZE - pcpul_size); - - memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); - } - - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = nr_cpu_ids * PMD_SIZE; - vm_area_register_early(&pcpul_vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + - cpu * PMD_SIZE); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), - PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); - } - - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < nr_cpu_ids - 1; i++) - for (j = i + 1; j < nr_cpu_ids; j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - return ret; - -enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; + free_bootmem(__pa(ptr), size); } -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu PMD - * mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used PMD - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. - */ -void *pcpu_lpage_remapped(void *kaddr) +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { - void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); - unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; - int left = 0, right = nr_cpu_ids - 1; - int pos; - - /* pcpul in use at all? */ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < pmd_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > pmd_addr) - right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * PMD_SIZE + offset; - } - } - - return NULL; -} +#ifdef CONFIG_NEED_MULTIPLE_NODES + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; #else -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) -{ - return -EINVAL; -} + return LOCAL_DISTANCE; #endif - -/* - * Embedding allocator - * - * The first chunk is sized to just contain the static area plus - * module and dynamic reserves and embedded into linear physical - * mapping so that it can use PMD mapping without additional TLB - * pressure. - */ -static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, embedding allocation doesn't play well with - * NUMA. - */ - if (!chosen && (!cpu_has_pse || pcpu_need_numa())) - return -EINVAL; - - return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); } -/* - * 4k page allocator - * - * This is the basic allocator. Static percpu area is allocated - * page-by-page and most of initialization is done by the generic - * setup function. - */ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; - return NULL; -} - -static void __init pcpu4k_populate_pte(unsigned long addr) +static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_4k(size_t static_size) -{ - size_t pages_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - pcpu4k_nr_static_pages = PFN_UP(static_size); - - /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids - * sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); - - /* allocate and copy */ - j = 0; - for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { - void *ptr; - - ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); - goto enomem; - } - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pcpu4k_pages[j++] = virt_to_page(ptr); - } - - /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); - - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, - -1, NULL, pcpu4k_populate_pte); - goto out_free_ar; - -enomem: - while (--j >= 0) - free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); - return ret; -} - -/* for explicit first chunk allocator selection */ -static char pcpu_chosen_alloc[16] __initdata; - -static int __init percpu_alloc_setup(char *str) -{ - strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); - return 0; -} -early_param("percpu_alloc", percpu_alloc_setup); - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu) void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; unsigned int cpu; unsigned long delta; - size_t pcpu_unit_size; - ssize_t ret; + int rc; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* - * Allocate percpu area. If PSE is supported, try to make use - * of large page mappings. Please read comments on top of - * each allocator for details. + * Allocate percpu area. Embedding allocator is our favorite; + * however, on NUMA configurations, it can result in very + * sparse unit mapping and vmalloc area isn't spacious enough + * on 32bit. Use page in that case. */ - ret = -EINVAL; - if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "4k")) { - if (!strcmp(pcpu_chosen_alloc, "lpage")) - ret = setup_pcpu_lpage(static_size, true); - else if (!strcmp(pcpu_chosen_alloc, "embed")) - ret = setup_pcpu_embed(static_size, true); - else - pr_warning("PERCPU: unknown allocator %s " - "specified\n", pcpu_chosen_alloc); - if (ret < 0) - pr_warning("PERCPU: %s allocator failed (%zd), " - "falling back to 4k\n", - pcpu_chosen_alloc, ret); - } - } else { - ret = setup_pcpu_lpage(static_size, false); - if (ret < 0) - ret = setup_pcpu_embed(static_size, false); +#ifdef CONFIG_X86_32 + if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif + rc = -EINVAL; + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE; + const size_t dyn_size = PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + + rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, atom_size, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); + if (rc < 0) + pr_warning("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } - if (ret < 0) - ret = setup_pcpu_4k(static_size); - if (ret < 0) - panic("cannot allocate static percpu area (%zu bytes, err=%zd)", - static_size, ret); - - pcpu_unit_size = ret; + if (rc < 0) + rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 9fc178255c04..0ccb57d5ee35 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -348,15 +348,12 @@ SECTIONS _end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.eh_frame) } } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e245775ec856..24952fdc7e40 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -12,6 +12,7 @@ #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/pfn.h> +#include <linux/percpu.h> #include <asm/e820.h> #include <asm/processor.h> @@ -686,7 +687,7 @@ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); - unsigned long vaddr, remapped; + unsigned long vaddr; int ret; if (cpa->pfn >= max_pfn_mapped) @@ -744,24 +745,6 @@ static int cpa_process_alias(struct cpa_data *cpa) } #endif - /* - * If the PMD page was partially used for per-cpu remapping, - * the recycled area needs to be split and modified. Because - * the area is always proper subset of a PMD page - * cpa->numpages is guaranteed to be 1 for these areas, so - * there's no need to loop over and check for further remaps. - */ - remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); - if (remapped) { - WARN_ON(cpa->numpages > 1); - alias_cpa = *cpa; - alias_cpa.vaddr = &remapped; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); - if (ret) - return ret; - } - return 0; } diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index 41c159cd872f..921b6ff3b645 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -280,15 +280,6 @@ SECTIONS *(.ResetVector.text) } - /* Sections to be discarded */ - /DISCARD/ : - { - *(.exit.literal) - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - } - .xt.lit : { *(.xt.lit) } .xt.prop : { *(.xt.prop) } @@ -321,4 +312,8 @@ SECTIONS *(.xt.lit) *(.gnu.linkonce.p*) } + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.exit.literal) } } diff --git a/block/as-iosched.c b/block/as-iosched.c index 7a12cf6ee1d3..ce8ba57c6557 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -146,7 +146,7 @@ enum arq_state { #define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) #define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, as_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -161,7 +161,7 @@ static void as_antic_stop(struct as_data *ad); static void free_as_io_context(struct as_io_context *aic) { kfree(aic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(as_ioc_count); if (ioc_gone) { /* * AS scheduler is exiting, grab exit lock and check @@ -169,7 +169,7 @@ static void free_as_io_context(struct as_io_context *aic) * complete ioc_gone and set it back to NULL. */ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -211,7 +211,7 @@ static struct as_io_context *alloc_as_io_context(void) ret->seek_total = 0; ret->seek_samples = 0; ret->seek_mean = 0; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(as_ioc_count); } return ret; @@ -1507,7 +1507,7 @@ static void __exit as_exit(void) ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(as_ioc_count)) wait_for_completion(&all_gone); synchronize_rcu(); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0e3814b662af..1ca813b16e78 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -48,7 +48,7 @@ static int cfq_slice_idle = HZ / 125; static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -1415,7 +1415,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) cic = container_of(head, struct cfq_io_context, rcu_head); kmem_cache_free(cfq_ioc_pool, cic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(cfq_ioc_count); if (ioc_gone) { /* @@ -1424,7 +1424,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) * complete ioc_gone and set it back to NULL */ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -1550,7 +1550,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) INIT_HLIST_NODE(&cic->cic_list); cic->dtor = cfq_free_io_context; cic->exit = cfq_exit_io_context; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(cfq_ioc_count); } return cic; @@ -2654,7 +2654,7 @@ static void __exit cfq_exit(void) * this also protects us from entering cfq_slab_kill() with * pending RCU callbacks */ - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(cfq_ioc_count)) wait_for_completion(&all_gone); cfq_slab_kill(); } diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index bdea7e2f94ba..bc33ddc9c97c 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -71,7 +71,7 @@ struct cpu_dbs_info_s { */ struct mutex timer_mutex; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@ -137,7 +137,7 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info, + struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info, freq->cpu); struct cpufreq_policy *policy; @@ -297,7 +297,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info = &per_cpu(cs_cpu_dbs_info, j); dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) @@ -387,7 +387,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cputime64_t cur_wall_time, cur_idle_time; unsigned int idle_time, wall_time; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -521,7 +521,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int j; int rc; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); + this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: @@ -538,7 +538,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index d6ba14276bb1..d7a528c80de8 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -78,7 +78,7 @@ struct cpu_dbs_info_s { */ struct mutex timer_mutex; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@ -149,7 +149,8 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy, unsigned int freq_hi, freq_lo; unsigned int index = 0; unsigned int jiffies_total, jiffies_hi, jiffies_lo; - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu); + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, + policy->cpu); if (!dbs_info->freq_table) { dbs_info->freq_lo = 0; @@ -192,7 +193,7 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy, static void ondemand_powersave_bias_init_cpu(int cpu) { - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu); + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); dbs_info->freq_table = cpufreq_frequency_get_table(cpu); dbs_info->freq_lo = 0; } @@ -297,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info = &per_cpu(od_cpu_dbs_info, j); dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) @@ -388,7 +389,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) unsigned int load, load_freq; int freq_avg; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -535,7 +536,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int j; int rc; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); + this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: @@ -553,7 +554,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, dbs_enable++; for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --git a/drivers/xen/events.c b/drivers/xen/events.c index abad71b1632b..2f57276e87a2 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -47,10 +47,10 @@ static DEFINE_SPINLOCK(irq_mapping_update_lock); /* IRQ <-> VIRQ mapping. */ -static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; +static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ -static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; +static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; /* Interrupt types. */ enum xen_irq_type { @@ -602,6 +602,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static DEFINE_PER_CPU(unsigned, xed_nesting_count); + /* * Search the CPUs pending events bitmasks. For each one found, map * the event number to an irq, and feed it into do_IRQ() for @@ -617,7 +619,6 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - static DEFINE_PER_CPU(unsigned, nesting_count); unsigned count; exit_idle(); @@ -628,7 +629,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) vcpu_info->evtchn_upcall_pending = 0; - if (__get_cpu_var(nesting_count)++) + if (__get_cpu_var(xed_nesting_count)++) goto out; #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ @@ -653,8 +654,8 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) BUG_ON(!irqs_disabled()); - count = __get_cpu_var(nesting_count); - __get_cpu_var(nesting_count) = 0; + count = __get_cpu_var(xed_nesting_count); + __get_cpu_var(xed_nesting_count) = 0; } while(count != 1); out: diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 6ad76bf5fb40..a43223af98b6 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -33,13 +33,10 @@ * BSS_SECTION(0, 0, 0) * _end = .; * - * /DISCARD/ : { - * EXIT_TEXT - * EXIT_DATA - * EXIT_CALL - * } * STABS_DEBUG * DWARF_DEBUG + * + * DISCARDS // must be the last * } * * [__init_begin, __init_end] is the init section that may be freed after init @@ -626,6 +623,23 @@ #define INIT_RAM_FS #endif +/* + * Default discarded sections. + * + * Some archs want to discard exit text/data at runtime rather than + * link time due to cross-section references such as alt instructions, + * bug table, eh_frame, etc. DISCARDS must be the last of output + * section definitions so that such archs put those in earlier section + * definitions. + */ +#define DISCARDS \ + /DISCARD/ : { \ + EXIT_TEXT \ + EXIT_DATA \ + EXIT_CALL \ + *(.discard) \ + } + /** * PERCPU_VADDR - define output section for percpu area * @vaddr: explicit base address (optional) diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 0761491b3eec..9bd03193ecd4 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -10,22 +10,70 @@ /* * Base implementations of per-CPU variable declarations and definitions, where * the section in which the variable is to be placed is provided by the - * 'section' argument. This may be used to affect the parameters governing the + * 'sec' argument. This may be used to affect the parameters governing the * variable's storage. * * NOTE! The sections for the DECLARE and for the DEFINE must match, lest * linkage errors occur due the compiler generating the wrong code to access * that section. */ -#define DECLARE_PER_CPU_SECTION(type, name, section) \ - extern \ - __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ - PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name - -#define DEFINE_PER_CPU_SECTION(type, name, section) \ - __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ - PER_CPU_ATTRIBUTES PER_CPU_DEF_ATTRIBUTES \ +#define __PCPU_ATTRS(sec) \ + __attribute__((section(PER_CPU_BASE_SECTION sec))) \ + PER_CPU_ATTRIBUTES + +#define __PCPU_DUMMY_ATTRS \ + __attribute__((section(".discard"), unused)) + +/* + * s390 and alpha modules require percpu variables to be defined as + * weak to force the compiler to generate GOT based external + * references for them. This is necessary because percpu sections + * will be located outside of the usually addressable area. + * + * This definition puts the following two extra restrictions when + * defining percpu variables. + * + * 1. The symbol must be globally unique, even the static ones. + * 2. Static percpu variables cannot be defined inside a function. + * + * Archs which need weak percpu definitions should define + * ARCH_NEEDS_WEAK_PER_CPU in asm/percpu.h when necessary. + * + * To ensure that the generic code observes the above two + * restrictions, if CONFIG_DEBUG_FORCE_WEAK_PER_CPU is set weak + * definition is used for all cases. + */ +#if defined(ARCH_NEEDS_WEAK_PER_CPU) || defined(CONFIG_DEBUG_FORCE_WEAK_PER_CPU) +/* + * __pcpu_scope_* dummy variable is used to enforce scope. It + * receives the static modifier when it's used in front of + * DEFINE_PER_CPU() and will trigger build failure if + * DECLARE_PER_CPU() is used for the same variable. + * + * __pcpu_unique_* dummy variable is used to enforce symbol uniqueness + * such that hidden weak symbol collision, which will cause unrelated + * variables to share the same address, can be detected during build. + */ +#define DECLARE_PER_CPU_SECTION(type, name, sec) \ + extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ + extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name + +#define DEFINE_PER_CPU_SECTION(type, name, sec) \ + __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ + __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \ + __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES __weak \ + __typeof__(type) per_cpu__##name +#else +/* + * Normal declaration and definition macros. + */ +#define DECLARE_PER_CPU_SECTION(type, name, sec) \ + extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name + +#define DEFINE_PER_CPU_SECTION(type, name, sec) \ + __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES \ __typeof__(type) per_cpu__##name +#endif /* * Variant on the per-CPU variable declaration/definition theme used for diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 26fd9d12f050..878836ca999c 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -34,7 +34,7 @@ #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10) @@ -57,19 +57,70 @@ #endif extern void *pcpu_base_addr; +extern const unsigned long *pcpu_unit_offsets; -typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); -typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); +struct pcpu_group_info { + int nr_units; /* aligned # of units */ + unsigned long base_offset; /* base address offset */ + unsigned int *cpu_map; /* unit->cpu map, empty + * entries contain NR_CPUS */ +}; + +struct pcpu_alloc_info { + size_t static_size; + size_t reserved_size; + size_t dyn_size; + size_t unit_size; + size_t atom_size; + size_t alloc_size; + size_t __ai_size; /* internal, don't use */ + int nr_groups; /* 0 if grouping unnecessary */ + struct pcpu_group_info groups[]; +}; -extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size, - void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn); +enum pcpu_fc { + PCPU_FC_AUTO, + PCPU_FC_EMBED, + PCPU_FC_PAGE, -extern ssize_t __init pcpu_embed_first_chunk( - size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size); + PCPU_FC_NR, +}; +extern const char *pcpu_fc_names[PCPU_FC_NR]; + +extern enum pcpu_fc pcpu_chosen_fc; + +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, + size_t align); +typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); +typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); +typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); + +extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units); +extern void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai); + +extern struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn); + +extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr); + +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK +extern int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn); +#endif + +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +extern int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn); +#endif /* * Use this to get to a cpu's version of the per-cpu object @@ -80,7 +131,7 @@ extern ssize_t __init pcpu_embed_first_chunk( extern void *__alloc_reserved_percpu(size_t size, size_t align); -#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ struct percpu_data { void *ptrs[1]; @@ -99,11 +150,15 @@ struct percpu_data { (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ extern void *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void *__pdata); +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +extern void __init setup_per_cpu_areas(void); +#endif + #else /* CONFIG_SMP */ #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) @@ -124,6 +179,13 @@ static inline void free_percpu(void *p) kfree(p); } +static inline void __init setup_per_cpu_areas(void) { } + +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} + #endif /* CONFIG_SMP */ #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index a43ebec3a7b9..227c2a585e4f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -115,4 +115,10 @@ extern rwlock_t vmlist_lock; extern struct vm_struct *vmlist; extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); +struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, + const size_t *sizes, int nr_vms, + size_t align, gfp_t gfp_mask); + +void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); + #endif /* _LINUX_VMALLOC_H */ diff --git a/init/main.c b/init/main.c index b34fd8e5edef..63904bb6ae37 100644 --- a/init/main.c +++ b/init/main.c @@ -353,7 +353,6 @@ static void __init smp_init(void) #define smp_init() do { } while (0) #endif -static inline void setup_per_cpu_areas(void) { } static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } @@ -374,29 +373,6 @@ static void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; - -EXPORT_SYMBOL(__per_cpu_offset); - -static void __init setup_per_cpu_areas(void) -{ - unsigned long size, i; - char *ptr; - unsigned long nr_possible_cpus = num_possible_cpus(); - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); - ptr = alloc_bootmem_pages(size * nr_possible_cpus); - - for_each_possible_cpu(i) { - __per_cpu_offset[i] = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - ptr += size; - } -} -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ - /* Called by boot processor to activate the rest. */ static void __init smp_init(void) { diff --git a/kernel/module.c b/kernel/module.c index 46580edff0cb..05ce49ced8f6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -369,7 +369,7 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) @@ -394,7 +394,7 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; @@ -540,7 +540,7 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e7f60f8e31ed..8cb94a52d1bb 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -106,16 +106,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader, void __weak perf_counter_print_debug(void) { } -static DEFINE_PER_CPU(int, disable_count); +static DEFINE_PER_CPU(int, perf_disable_count); void __perf_disable(void) { - __get_cpu_var(disable_count)++; + __get_cpu_var(perf_disable_count)++; } bool __perf_enable(void) { - return !--__get_cpu_var(disable_count); + return !--__get_cpu_var(perf_disable_count); } void perf_disable(void) diff --git a/kernel/sched.c b/kernel/sched.c index e27a53685ed9..d9db3fb17573 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -295,12 +295,12 @@ struct task_group root_task_group; /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); #endif /* CONFIG_RT_GROUP_SCHED */ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 78b1ed230177..97e2c4d2e9eb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1432,7 +1432,7 @@ static __init void event_trace_self_tests(void) #ifdef CONFIG_FUNCTION_TRACER -static DEFINE_PER_CPU(atomic_t, test_event_disable); +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void function_test_events_call(unsigned long ip, unsigned long parent_ip) @@ -1449,7 +1449,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) pc = preempt_count(); resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; @@ -1468,7 +1468,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); out: - atomic_dec(&per_cpu(test_event_disable, cpu)); + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); ftrace_preempt_enable(resched); } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7dbd5d9c29a4..55d2acc607a1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -805,6 +805,21 @@ config DEBUG_BLOCK_EXT_DEVT Say N if you are unsure. +config DEBUG_FORCE_WEAK_PER_CPU + bool "Force weak per-cpu definitions" + depends on DEBUG_KERNEL + help + s390 and alpha require percpu variables in modules to be + defined weak to work around addressing range issue which + puts the following two restrictions on percpu variable + definitions. + + 1. percpu symbols must be unique whether static or not + 2. percpu variables can't be defined inside a function + + To ensure that generic code follows the above rules, this + option forces all percpu variables to be defined as weak. + config LKDTM tristate "Linux Kernel Dump Test Tool Module" depends on DEBUG_KERNEL diff --git a/mm/Makefile b/mm/Makefile index 147a7a7873c4..ea4b18bd3960 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA obj-$(CONFIG_SMP) += percpu.o else obj-$(CONFIG_SMP) += allocpercpu.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index dfdee6a47359..df34ceae0c67 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -5,6 +5,8 @@ */ #include <linux/mm.h> #include <linux/module.h> +#include <linux/bootmem.h> +#include <asm/sections.h> #ifndef cache_line_size #define cache_line_size() L1_CACHE_BYTES @@ -147,3 +149,29 @@ void free_percpu(void *__pdata) kfree(__percpu_disguise(__pdata)); } EXPORT_SYMBOL_GPL(free_percpu); + +/* + * Generic percpu area setup. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; + +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + unsigned long size, i; + char *ptr; + unsigned long nr_possible_cpus = num_possible_cpus(); + + /* Copy section for each CPU (we discard the original) */ + size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ptr = alloc_bootmem_pages(size * nr_possible_cpus); + + for_each_possible_cpu(i) { + __per_cpu_offset[i] = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + ptr += size; + } +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index d5292fc6f523..177a5169bbde 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -36,7 +36,7 @@ struct test_node { }; static LIST_HEAD(test_list); -static DEFINE_PER_CPU(void *, test_pointer); +static DEFINE_PER_CPU(void *, kmemleak_test_pointer); /* * Some very simple testing. This function needs to be extended for @@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void) } for_each_possible_cpu(i) { - per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); + per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); pr_info("kmemleak: kmalloc(129) = %p\n", - per_cpu(test_pointer, i)); + per_cpu(kmemleak_test_pointer, i)); } return 0; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 25e7770309b8..dd73d29c15a8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -604,6 +604,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) } } +static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -621,7 +623,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; unsigned long ratelimit; unsigned long *p; @@ -634,7 +635,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, * tasks in balance_dirty_pages(). Period. */ preempt_disable(); - p = &__get_cpu_var(ratelimits); + p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { *p = 0; diff --git a/mm/percpu.c b/mm/percpu.c index 3311c8919f37..43d8cacfdaa5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -8,12 +8,13 @@ * * This is percpu allocator which can handle both static and dynamic * areas. Percpu areas are allocated in chunks in vmalloc area. Each - * chunk is consisted of nr_cpu_ids units and the first chunk is used - * for static percpu variables in the kernel image (special boot time - * alloc/init handling necessary as these areas need to be brought up - * before allocation services are running). Unit grows as necessary - * and all units grow or shrink in unison. When a chunk is filled up, - * another chunk is allocated. ie. in vmalloc area + * chunk is consisted of boot-time determined number of units and the + * first chunk is used for static percpu variables in the kernel image + * (special boot time alloc/init handling necessary as these areas + * need to be brought up before allocation services are running). + * Unit grows as necessary and all units grow or shrink in unison. + * When a chunk is filled up, another chunk is allocated. ie. in + * vmalloc area * * c0 c1 c2 * ------------------- ------------------- ------------ @@ -22,11 +23,13 @@ * * Allocation is done in offset-size areas of single unit space. Ie, * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, - * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring - * percpu base registers pcpu_unit_size apart. + * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to + * cpus. On NUMA, the mapping can be non-linear and even sparse. + * Percpu access can be done by configuring percpu base registers + * according to cpu to unit mapping and pcpu_unit_size. * - * There are usually many small percpu allocations many of them as - * small as 4 bytes. The allocator organizes chunks into lists + * There are usually many small percpu allocations many of them being + * as small as 4 bytes. The allocator organizes chunks into lists * according to free size and tries to allocate from the fullest one. * Each chunk keeps the maximum contiguous area size hint which is * guaranteed to be eqaul to or larger than the maximum contiguous @@ -43,7 +46,7 @@ * * To use this allocator, arch code should do the followings. * - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be @@ -55,7 +58,9 @@ #include <linux/bitmap.h> #include <linux/bootmem.h> +#include <linux/err.h> #include <linux/list.h> +#include <linux/log2.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> @@ -89,25 +94,38 @@ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ - struct vm_struct *vm; /* mapped vmalloc region */ + void *base_addr; /* base address of this chunk */ int map_used; /* # of map entries used */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + struct vm_struct **vms; /* mapped vmalloc regions */ bool immutable; /* no [de]population allowed */ - struct page **page; /* points to page array */ - struct page *page_ar[]; /* #cpus * UNIT_PAGES */ + unsigned long populated[]; /* populated bitmap */ }; static int pcpu_unit_pages __read_mostly; static int pcpu_unit_size __read_mostly; -static int pcpu_chunk_size __read_mostly; +static int pcpu_nr_units __read_mostly; +static int pcpu_atom_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; +/* cpus with the lowest and highest unit numbers */ +static unsigned int pcpu_first_unit_cpu __read_mostly; +static unsigned int pcpu_last_unit_cpu __read_mostly; + /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); +static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */ +const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */ + +/* group information, used for vm allocation */ +static int pcpu_nr_groups __read_mostly; +static const unsigned long *pcpu_group_offsets __read_mostly; +static const size_t *pcpu_group_sizes __read_mostly; + /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different @@ -129,9 +147,9 @@ static int pcpu_reserved_chunk_limit; * Synchronization rules. * * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former - * protects allocation/reclaim paths, chunks and chunk->page arrays. - * The latter is a spinlock and protects the index data structures - - * chunk slots, chunks and area maps in chunks. + * protects allocation/reclaim paths, chunks, populated bitmap and + * vmalloc mapping. The latter is a spinlock and protects the index + * data structures - chunk slots, chunks and area maps in chunks. * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. All actual memory @@ -178,31 +196,23 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) static int pcpu_page_idx(unsigned int cpu, int page_idx) { - return cpu * pcpu_unit_pages + page_idx; -} - -static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) -{ - return &chunk->page[pcpu_page_idx(cpu, page_idx)]; + return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { - return (unsigned long)chunk->vm->addr + - (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); + return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + + (page_idx << PAGE_SHIFT); } -static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, - int page_idx) +static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) { - /* - * Any possible cpu id can be used here, so there's no need to - * worry about preemption or cpu hotplug. - */ - return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(), - page_idx) != NULL; + /* must not be used on pre-mapped chunk */ + WARN_ON(chunk->immutable); + + return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); } /* set the pointer to a chunk in a page struct */ @@ -217,6 +227,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) return (struct pcpu_chunk *)page->index; } +static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end) +{ + *rs = find_next_zero_bit(chunk->populated, end, *rs); + *re = find_next_bit(chunk->populated, end, *rs + 1); +} + +static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end) +{ + *rs = find_next_bit(chunk->populated, end, *rs); + *re = find_next_zero_bit(chunk->populated, end, *rs + 1); +} + +/* + * (Un)populated page region iterators. Iterate over (un)populated + * page regions betwen @start and @end in @chunk. @rs and @re should + * be integer variables and will be set to start and end page index of + * the current region. + */ +#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) + +#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) + /** * pcpu_mem_alloc - allocate memory * @size: bytes to allocate @@ -292,10 +330,10 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { - void *first_start = pcpu_first_chunk->vm->addr; + void *first_start = pcpu_first_chunk->base_addr; /* is it in the first chunk? */ - if (addr >= first_start && addr < first_start + pcpu_chunk_size) { + if (addr >= first_start && addr < first_start + pcpu_unit_size) { /* is it in the reserved area? */ if (addr < first_start + pcpu_reserved_chunk_limit) return pcpu_reserved_chunk; @@ -309,7 +347,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * space. Note that any possible cpu id can be used here, so * there's no need to worry about preemption or cpu hotplug. */ - addr += raw_smp_processor_id() * pcpu_unit_size; + addr += pcpu_unit_offsets[raw_smp_processor_id()]; return pcpu_get_page_chunk(vmalloc_to_page(addr)); } @@ -558,125 +596,327 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) } /** - * pcpu_unmap - unmap pages out of a pcpu_chunk + * pcpu_get_pages_and_bitmap - get temp pages array and bitmap * @chunk: chunk of interest - * @page_start: page index of the first page to unmap - * @page_end: page index of the last page to unmap + 1 - * @flush_tlb: whether to flush tlb or not + * @bitmapp: output parameter for bitmap + * @may_alloc: may allocate the array * - * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. - * If @flush is true, vcache is flushed before unmapping and tlb - * after. + * Returns pointer to array of pointers to struct page and bitmap, + * both of which can be indexed with pcpu_page_idx(). The returned + * array is cleared to zero and *@bitmapp is copied from + * @chunk->populated. Note that there is only one array and bitmap + * and access exclusion is the caller's responsibility. + * + * CONTEXT: + * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. + * Otherwise, don't care. + * + * RETURNS: + * Pointer to temp pages array on success, NULL on failure. */ -static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, - bool flush_tlb) +static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, + unsigned long **bitmapp, + bool may_alloc) { - unsigned int last = nr_cpu_ids - 1; - unsigned int cpu; + static struct page **pages; + static unsigned long *bitmap; + size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); + size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * + sizeof(unsigned long); + + if (!pages || !bitmap) { + if (may_alloc && !pages) + pages = pcpu_mem_alloc(pages_size); + if (may_alloc && !bitmap) + bitmap = pcpu_mem_alloc(bitmap_size); + if (!pages || !bitmap) + return NULL; + } - /* unmap must not be done on immutable chunk */ - WARN_ON(chunk->immutable); + memset(pages, 0, pages_size); + bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); - /* - * Each flushing trial can be very expensive, issue flush on - * the whole region at once rather than doing it for each cpu. - * This could be an overkill but is more scalable. - */ - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + *bitmapp = bitmap; + return pages; +} - for_each_possible_cpu(cpu) - unmap_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT); - - /* ditto as flush_cache_vunmap() */ - if (flush_tlb) - flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); +/** + * pcpu_free_pages - free pages which were allocated for @chunk + * @chunk: chunk pages were allocated for + * @pages: array of pages to be freed, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be freed + * @page_end: page index of the last page to be freed + 1 + * + * Free pages [@page_start and @page_end) in @pages for all units. + * The pages were allocated for @chunk. + */ +static void pcpu_free_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page = pages[pcpu_page_idx(cpu, i)]; + + if (page) + __free_page(page); + } + } } /** - * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk - * @chunk: chunk to depopulate - * @off: offset to the area to depopulate - * @size: size of the area to depopulate in bytes - * @flush: whether to flush cache and tlb or not - * - * For each cpu, depopulate and unmap pages [@page_start,@page_end) - * from @chunk. If @flush is true, vcache is flushed before unmapping - * and tlb after. - * - * CONTEXT: - * pcpu_alloc_mutex. + * pcpu_alloc_pages - allocates pages for @chunk + * @chunk: target chunk + * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be allocated + * @page_end: page index of the last page to be allocated + 1 + * + * Allocate pages [@page_start,@page_end) into @pages for all units. + * The allocation is for @chunk. Percpu core doesn't care about the + * content of @pages and will pass it verbatim to pcpu_map_pages(). */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, - bool flush) +static int pcpu_alloc_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int unmap_start = -1; - int uninitialized_var(unmap_end); + const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; unsigned int cpu; int i; - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; + + *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); + if (!*pagep) { + pcpu_free_pages(chunk, pages, populated, + page_start, page_end); + return -ENOMEM; + } + } + } + return 0; +} - if (!*pagep) - continue; +/** + * pcpu_pre_unmap_flush - flush cache prior to unmapping + * @chunk: chunk the regions to be flushed belongs to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages in [@page_start,@page_end) of @chunk are about to be + * unmapped. Flush cache. As each flushing trial can be very + * expensive, issue flush on the whole region at once rather than + * doing it for each cpu. This could be an overkill but is more + * scalable. + */ +static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_cache_vunmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} + +static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) +{ + unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); +} - __free_page(*pagep); +/** + * pcpu_unmap_pages - unmap pages out of a pcpu_chunk + * @chunk: chunk of interest + * @pages: pages array which can be used to pass information to free + * @populated: populated bitmap + * @page_start: page index of the first page to unmap + * @page_end: page index of the last page to unmap + 1 + * + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. + * Corresponding elements in @pages were cleared by the caller and can + * be used to carry information to pcpu_free_pages() which will be + * called after all unmaps are finished. The caller should call + * proper pre/post flush functions. + */ +static void pcpu_unmap_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu; + int i; - /* - * If it's partial depopulation, it might get - * populated or depopulated again. Mark the - * page gone. - */ - *pagep = NULL; + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page; - unmap_start = unmap_start < 0 ? i : unmap_start; - unmap_end = i + 1; + page = pcpu_chunk_page(chunk, cpu, i); + WARN_ON(!page); + pages[pcpu_page_idx(cpu, i)] = page; } + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), + page_end - page_start); } - if (unmap_start >= 0) - pcpu_unmap(chunk, unmap_start, unmap_end, flush); + for (i = page_start; i < page_end; i++) + __clear_bit(i, populated); +} + +/** + * pcpu_post_unmap_tlb_flush - flush TLB after unmapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush + * TLB for the regions. This can be skipped if the area is to be + * returned to vmalloc as vmalloc will handle TLB flushing lazily. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_tlb_kernel_range( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} + +static int __pcpu_map_pages(unsigned long addr, struct page **pages, + int nr_pages) +{ + return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, + PAGE_KERNEL, pages); } /** - * pcpu_map - map pages into a pcpu_chunk + * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest + * @pages: pages array containing pages to be mapped + * @populated: populated bitmap * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * - * For each cpu, map pages [@page_start,@page_end) into @chunk. - * vcache is flushed afterwards. + * For each cpu, map pages [@page_start,@page_end) into @chunk. The + * caller is responsible for calling pcpu_post_map_flush() after all + * mappings are complete. + * + * This function is responsible for setting corresponding bits in + * @chunk->populated bitmap and whatever is necessary for reverse + * lookup (addr -> chunk). */ -static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) +static int pcpu_map_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) { - unsigned int last = nr_cpu_ids - 1; - unsigned int cpu; - int err; - - /* map must not be done on immutable chunk */ - WARN_ON(chunk->immutable); + unsigned int cpu, tcpu; + int i, err; for_each_possible_cpu(cpu) { - err = map_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT, - PAGE_KERNEL, - pcpu_chunk_pagep(chunk, cpu, page_start)); + err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), + &pages[pcpu_page_idx(cpu, page_start)], + page_end - page_start); if (err < 0) - return err; + goto err; + } + + /* mapping successful, link chunk and mark populated */ + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) + pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], + chunk); + __set_bit(i, populated); } - /* flush at once, please read comments in pcpu_unmap() */ - flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); return 0; + +err: + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), + page_end - page_start); + } + return err; +} + +/** + * pcpu_post_map_flush - flush cache after mapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been mapped. Flush + * cache. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_map_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_cache_vmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} + +/** + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk + * @chunk: chunk to depopulate + * @off: offset to the area to depopulate + * @size: size of the area to depopulate in bytes + * @flush: whether to flush cache and tlb or not + * + * For each cpu, depopulate and unmap pages [@page_start,@page_end) + * from @chunk. If @flush is true, vcache is flushed before unmapping + * and tlb after. + * + * CONTEXT: + * pcpu_alloc_mutex. + */ +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + struct page **pages; + unsigned long *populated; + int rs, re; + + /* quick path, check whether it's empty already */ + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + if (rs == page_start && re == page_end) + return; + break; + } + + /* immutable chunks can't be depopulated */ + WARN_ON(chunk->immutable); + + /* + * If control reaches here, there must have been at least one + * successful population attempt so the temp pages array must + * be available now. + */ + pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); + BUG_ON(!pages); + + /* unmap and free */ + pcpu_pre_unmap_flush(chunk, page_start, page_end); + + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); + + /* no need to flush tlb, vmalloc will handle it lazily */ + + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_free_pages(chunk, pages, populated, rs, re); + + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); } /** @@ -693,58 +933,68 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { - const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); - int map_start = -1; - int uninitialized_var(map_end); + int free_end = page_start, unmap_end = page_start; + struct page **pages; + unsigned long *populated; unsigned int cpu; - int i; + int rs, re, rc; - for (i = page_start; i < page_end; i++) { - if (pcpu_chunk_page_occupied(chunk, i)) { - if (map_start >= 0) { - if (pcpu_map(chunk, map_start, map_end)) - goto err; - map_start = -1; - } - continue; - } + /* quick path, check whether all pages are already there */ + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { + if (rs == page_start && re == page_end) + goto clear; + break; + } - map_start = map_start < 0 ? i : map_start; - map_end = i + 1; + /* need to allocate and map pages, this chunk can't be immutable */ + WARN_ON(chunk->immutable); - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); + if (!pages) + return -ENOMEM; - *pagep = alloc_pages_node(cpu_to_node(cpu), - alloc_mask, 0); - if (!*pagep) - goto err; - pcpu_set_page_chunk(*pagep, chunk); - } + /* alloc and map */ + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_free; + free_end = re; } - if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) - goto err; + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_map_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_unmap; + unmap_end = re; + } + pcpu_post_map_flush(chunk, page_start, page_end); + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); +clear: for_each_possible_cpu(cpu) - memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, - size); - + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); return 0; -err: - /* likely under heavy memory pressure, give memory back */ - pcpu_depopulate_chunk(chunk, off, size, true); - return -ENOMEM; + +err_unmap: + pcpu_pre_unmap_flush(chunk, page_start, unmap_end); + pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); +err_free: + pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) + pcpu_free_pages(chunk, pages, populated, rs, re); + return rc; } static void free_pcpu_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; - if (chunk->vm) - free_vm_area(chunk->vm); + if (chunk->vms) + pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups); pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); kfree(chunk); } @@ -760,10 +1010,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; - chunk->page = chunk->page_ar; - chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); - if (!chunk->vm) { + chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, + pcpu_nr_groups, pcpu_atom_size, + GFP_KERNEL); + if (!chunk->vms) { free_pcpu_chunk(chunk); return NULL; } @@ -771,6 +1022,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) INIT_LIST_HEAD(&chunk->list); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; + chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0]; return chunk; } @@ -860,7 +1112,8 @@ area_found: mutex_unlock(&pcpu_alloc_mutex); - return __addr_to_pcpu_ptr(chunk->vm->addr + off); + /* return address relative to base address */ + return __addr_to_pcpu_ptr(chunk->base_addr + off); fail_unlock: spin_unlock_irq(&pcpu_lock); @@ -938,12 +1191,13 @@ static void pcpu_reclaim(struct work_struct *work) } spin_unlock_irq(&pcpu_lock); - mutex_unlock(&pcpu_alloc_mutex); list_for_each_entry_safe(chunk, next, &todo, list) { - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); free_pcpu_chunk(chunk); } + + mutex_unlock(&pcpu_alloc_mutex); } /** @@ -968,7 +1222,7 @@ void free_percpu(void *ptr) spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); - off = addr - chunk->vm->addr; + off = addr - chunk->base_addr; pcpu_free_area(chunk, off); @@ -987,30 +1241,295 @@ void free_percpu(void *ptr) } EXPORT_SYMBOL_GPL(free_percpu); +static inline size_t pcpu_calc_fc_sizes(size_t static_size, + size_t reserved_size, + ssize_t *dyn_sizep) +{ + size_t size_sum; + + size_sum = PFN_ALIGN(static_size + reserved_size + + (*dyn_sizep >= 0 ? *dyn_sizep : 0)); + if (*dyn_sizep != 0) + *dyn_sizep = size_sum - static_size - reserved_size; + + return size_sum; +} + /** - * pcpu_setup_first_chunk - initialize the first percpu chunk - * @get_page_fn: callback to fetch page pointer - * @static_size: the size of static percpu area in bytes + * pcpu_alloc_alloc_info - allocate percpu allocation info + * @nr_groups: the number of groups + * @nr_units: the number of units + * + * Allocate ai which is large enough for @nr_groups groups containing + * @nr_units units. The returned ai's groups[0].cpu_map points to the + * cpu_map array which is long enough for @nr_units and filled with + * NR_CPUS. It's the caller's responsibility to initialize cpu_map + * pointer of other groups. + * + * RETURNS: + * Pointer to the allocated pcpu_alloc_info on success, NULL on + * failure. + */ +struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units) +{ + struct pcpu_alloc_info *ai; + size_t base_size, ai_size; + void *ptr; + int unit; + + base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]), + __alignof__(ai->groups[0].cpu_map[0])); + ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); + + ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); + if (!ptr) + return NULL; + ai = ptr; + ptr += base_size; + + ai->groups[0].cpu_map = ptr; + + for (unit = 0; unit < nr_units; unit++) + ai->groups[0].cpu_map[unit] = NR_CPUS; + + ai->nr_groups = nr_groups; + ai->__ai_size = PFN_ALIGN(ai_size); + + return ai; +} + +/** + * pcpu_free_alloc_info - free percpu allocation info + * @ai: pcpu_alloc_info to free + * + * Free @ai which was allocated by pcpu_alloc_alloc_info(). + */ +void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) +{ + free_bootmem(__pa(ai), ai->__ai_size); +} + +/** + * pcpu_build_alloc_info - build alloc_info considering distances between CPUs * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto - * @base_addr: mapped address, NULL for auto - * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * + * This function determines grouping of units, their mappings to cpus + * and other parameters considering needed percpu size, allocation + * atom size and distances between CPUs. + * + * Groups are always mutliples of atom size and CPUs which are of + * LOCAL_DISTANCE both ways are grouped together and share space for + * units in the same group. The returned configuration is guaranteed + * to have CPUs on different nodes on different groups and >=75% usage + * of allocated virtual address space. + * + * RETURNS: + * On success, pointer to the new allocation_info is returned. On + * failure, ERR_PTR value is returned. + */ +struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; + int group_cnt_max = 0, nr_groups = 1, nr_units = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs, group, unit; + unsigned int cpu, tcpu; + struct pcpu_alloc_info *ai; + unsigned int *cpu_map; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of atom_size and is the smallest + * which can accomodate 4k aligned segments which are equal to + * or larger than min_unit_size. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, atom_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && cpu_distance_fn && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + nr_groups = max(nr_groups, group + 1); + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + group_cnt_max = max(group_cnt_max, group_cnt[group]); + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group < nr_groups; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 25%. The + * greater-than comparison ensures upa==1 always + * passes the following check. + */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + upa = best_upa; + + /* allocate and fill alloc_info */ + for (group = 0; group < nr_groups; group++) + nr_units += roundup(group_cnt[group], upa); + + ai = pcpu_alloc_alloc_info(nr_groups, nr_units); + if (!ai) + return ERR_PTR(-ENOMEM); + cpu_map = ai->groups[0].cpu_map; + + for (group = 0; group < nr_groups; group++) { + ai->groups[group].cpu_map = cpu_map; + cpu_map += roundup(group_cnt[group], upa); + } + + ai->static_size = static_size; + ai->reserved_size = reserved_size; + ai->dyn_size = dyn_size; + ai->unit_size = alloc_size / upa; + ai->atom_size = atom_size; + ai->alloc_size = alloc_size; + + for (group = 0, unit = 0; group_cnt[group]; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + + /* + * Initialize base_offset as if all groups are located + * back-to-back. The caller should update this to + * reflect actual allocation. + */ + gi->base_offset = unit * ai->unit_size; + + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + gi->cpu_map[gi->nr_units++] = cpu; + gi->nr_units = roundup(gi->nr_units, upa); + unit += gi->nr_units; + } + BUG_ON(unit != nr_units); + + return ai; +} + +/** + * pcpu_dump_alloc_info - print out information about pcpu_alloc_info + * @lvl: loglevel + * @ai: allocation info to dump + * + * Print out information about @ai using loglevel @lvl. + */ +static void pcpu_dump_alloc_info(const char *lvl, + const struct pcpu_alloc_info *ai) +{ + int group_width = 1, cpu_width = 1, width; + char empty_str[] = "--------"; + int alloc = 0, alloc_end = 0; + int group, v; + int upa, apl; /* units per alloc, allocs per line */ + + v = ai->nr_groups; + while (v /= 10) + group_width++; + + v = num_possible_cpus(); + while (v /= 10) + cpu_width++; + empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; + + upa = ai->alloc_size / ai->unit_size; + width = upa * (cpu_width + 1) + group_width + 3; + apl = rounddown_pow_of_two(max(60 / width, 1)); + + printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", + lvl, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); + + for (group = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + int unit = 0, unit_end = 0; + + BUG_ON(gi->nr_units % upa); + for (alloc_end += gi->nr_units / upa; + alloc < alloc_end; alloc++) { + if (!(alloc % apl)) { + printk("\n"); + printk("%spcpu-alloc: ", lvl); + } + printk("[%0*d] ", group_width, group); + + for (unit_end += upa; unit < unit_end; unit++) + if (gi->cpu_map[unit] != NR_CPUS) + printk("%0*d ", cpu_width, + gi->cpu_map[unit]); + else + printk("%s ", empty_str); + } + } + printk("\n"); +} + +/** + * pcpu_setup_first_chunk - initialize the first percpu chunk + * @ai: pcpu_alloc_info describing how to percpu area is shaped + * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area - * setup path. The first two parameters are mandatory. The rest are - * optional. - * - * @get_page_fn() should return pointer to percpu page given cpu - * number and page number. It should at least return enough pages to - * cover the static area. The returned pages for static area should - * have been initialized with valid data. If @unit_size is specified, - * it can also return pages after the static area. NULL return - * indicates end of pages for the cpu. Note that @get_page_fn() must - * return the same number of pages for all cpus. - * - * @reserved_size, if non-zero, specifies the amount of bytes to + * setup path. + * + * @ai contains all information necessary to initialize the first + * chunk and prime the dynamic percpu allocator. + * + * @ai->static_size is the size of static percpu area. + * + * @ai->reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved * percpu allocation. This is primarily used to serve module percpu @@ -1018,22 +1537,29 @@ EXPORT_SYMBOL_GPL(free_percpu); * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * - * @dyn_size, if non-negative, determines the number of bytes - * available for dynamic allocation in the first chunk. Specifying - * non-negative value makes percpu leave alone the area beyond - * @static_size + @reserved_size + @dyn_size. + * @ai->dyn_size determines the number of bytes available for dynamic + * allocation in the first chunk. The area between @ai->static_size + + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. * - * @unit_size, if non-negative, specifies unit size and must be - * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @reserved_size + if non-negative, @dyn_size. + * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE + * and equal to or larger than @ai->static_size + @ai->reserved_size + + * @ai->dyn_size. * - * Non-null @base_addr means that the caller already allocated virtual - * region for the first chunk and mapped it. percpu must not mess - * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL - * @populate_pte_fn doesn't make any sense. + * @ai->atom_size is the allocation atom size and used as alignment + * for vm areas. * - * @populate_pte_fn is used to populate the pagetable. NULL means the - * caller already populated the pagetable. + * @ai->alloc_size is the allocation size and always multiple of + * @ai->atom_size. This is larger than @ai->atom_size if + * @ai->unit_size is larger than @ai->atom_size. + * + * @ai->nr_groups and @ai->groups describe virtual memory layout of + * percpu areas. Units which should be colocated are put into the + * same group. Dynamic VM areas will be allocated according to these + * groupings. If @ai->nr_groups is zero, a single group containing + * all units is assumed. + * + * The caller should have mapped the first chunk at @base_addr and + * copied static data to each unit. * * If the first chunk ends up with both reserved and dynamic areas, it * is served by two chunks - one to serve the core static and reserved @@ -1043,49 +1569,83 @@ EXPORT_SYMBOL_GPL(free_percpu); * and available for dynamic allocation like any other chunks. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access. + * 0 on success, -errno on failure. */ -size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size, - void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn) +int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { - static struct vm_struct first_vm; static int smap[2], dmap[2]; - size_t size_sum = static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0); + size_t dyn_size = ai->dyn_size; + size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; + unsigned long *group_offsets; + size_t *group_sizes; + unsigned long *unit_off; unsigned int cpu; - int nr_pages; - int err, i; + int *unit_map; + int group, unit, i; - /* santiy checks */ + /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); - BUG_ON(!static_size); - if (unit_size >= 0) { - BUG_ON(unit_size < size_sum); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); - } else - BUG_ON(base_addr); - BUG_ON(base_addr && populate_pte_fn); - - if (unit_size >= 0) - pcpu_unit_pages = unit_size >> PAGE_SHIFT; - else - pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(size_sum)); + BUG_ON(ai->nr_groups <= 0); + BUG_ON(!ai->static_size); + BUG_ON(!base_addr); + BUG_ON(ai->unit_size < size_sum); + BUG_ON(ai->unit_size & ~PAGE_MASK); + BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + + pcpu_dump_alloc_info(KERN_DEBUG, ai); + + /* process group information and build config tables accordingly */ + group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); + group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); + unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); + unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); + + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + unit_map[cpu] = NR_CPUS; + pcpu_first_unit_cpu = NR_CPUS; + + for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { + const struct pcpu_group_info *gi = &ai->groups[group]; + + group_offsets[group] = gi->base_offset; + group_sizes[group] = gi->nr_units * ai->unit_size; + + for (i = 0; i < gi->nr_units; i++) { + cpu = gi->cpu_map[i]; + if (cpu == NR_CPUS) + continue; - pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; - pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *); + BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu)); + BUG_ON(unit_map[cpu] != NR_CPUS); - if (dyn_size < 0) - dyn_size = pcpu_unit_size - static_size - reserved_size; + unit_map[cpu] = unit + i; + unit_off[cpu] = gi->base_offset + i * ai->unit_size; + + if (pcpu_first_unit_cpu == NR_CPUS) + pcpu_first_unit_cpu = cpu; + } + } + pcpu_last_unit_cpu = cpu; + pcpu_nr_units = unit; + + for_each_possible_cpu(cpu) + BUG_ON(unit_map[cpu] == NR_CPUS); + + pcpu_nr_groups = ai->nr_groups; + pcpu_group_offsets = group_offsets; + pcpu_group_sizes = group_sizes; + pcpu_unit_map = unit_map; + pcpu_unit_offsets = unit_off; + + /* determine basic parameters */ + pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; + pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; + pcpu_atom_size = ai->atom_size; + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); /* * Allocate chunk slots. The additional last slot is for @@ -1105,189 +1665,351 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, */ schunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&schunk->list); - schunk->vm = &first_vm; + schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); - schunk->page = schunk->page_ar; + schunk->immutable = true; + bitmap_fill(schunk->populated, pcpu_unit_pages); - if (reserved_size) { - schunk->free_size = reserved_size; + if (ai->reserved_size) { + schunk->free_size = ai->reserved_size; pcpu_reserved_chunk = schunk; - pcpu_reserved_chunk_limit = static_size + reserved_size; + pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ } schunk->contig_hint = schunk->free_size; - schunk->map[schunk->map_used++] = -static_size; + schunk->map[schunk->map_used++] = -ai->static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; /* init dynamic chunk if necessary */ if (dyn_size) { - dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); + dchunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&dchunk->list); - dchunk->vm = &first_vm; + dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); - dchunk->page = schunk->page_ar; /* share page map with schunk */ + dchunk->immutable = true; + bitmap_fill(dchunk->populated, pcpu_unit_pages); dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; dchunk->map[dchunk->map_used++] = dchunk->free_size; } - /* allocate vm address */ - first_vm.flags = VM_ALLOC; - first_vm.size = pcpu_chunk_size; - - if (!base_addr) - vm_area_register_early(&first_vm, PAGE_SIZE); - else { - /* - * Pages already mapped. No need to remap into - * vmalloc area. In this case the first chunks can't - * be mapped or unmapped by percpu and are marked - * immutable. - */ - first_vm.addr = base_addr; - schunk->immutable = true; - if (dchunk) - dchunk->immutable = true; - } - - /* assign pages */ - nr_pages = -1; - for_each_possible_cpu(cpu) { - for (i = 0; i < pcpu_unit_pages; i++) { - struct page *page = get_page_fn(cpu, i); - - if (!page) - break; - *pcpu_chunk_pagep(schunk, cpu, i) = page; - } - - BUG_ON(i < PFN_UP(static_size)); - - if (nr_pages < 0) - nr_pages = i; - else - BUG_ON(nr_pages != i); - } - - /* map them */ - if (populate_pte_fn) { - for_each_possible_cpu(cpu) - for (i = 0; i < nr_pages; i++) - populate_pte_fn(pcpu_chunk_addr(schunk, - cpu, i)); - - err = pcpu_map(schunk, 0, nr_pages); - if (err) - panic("failed to setup static percpu area, err=%d\n", - err); - } - /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ - pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); - return pcpu_unit_size; + pcpu_base_addr = base_addr; + return 0; } -/* - * Embedding first chunk setup helper. - */ -static void *pcpue_ptr __initdata; -static size_t pcpue_size __initdata; -static size_t pcpue_unit_size __initdata; +const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { + [PCPU_FC_AUTO] = "auto", + [PCPU_FC_EMBED] = "embed", + [PCPU_FC_PAGE] = "page", +}; -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; +enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; - if (off >= pcpue_size) - return NULL; +static int __init percpu_alloc_setup(char *str) +{ + if (0) + /* nada */; +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK + else if (!strcmp(str, "embed")) + pcpu_chosen_fc = PCPU_FC_EMBED; +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + else if (!strcmp(str, "page")) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif + else + pr_warning("PERCPU: unknown allocator %s specified\n", str); - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); + return 0; } +early_param("percpu_alloc", percpu_alloc_setup); +#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ + !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * @alloc_fn: function to allocate percpu page + * @free_fn: funtion to free percpu page * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated - * as a contiguous area using bootmem allocator and used as-is without - * being mapped into vmalloc area. This enables the first chunk to - * piggy back on the linear physical mapping which often uses larger - * page size. + * by calling @alloc_fn and used as-is without being mapped into + * vmalloc area. Allocations are always whole multiples of @atom_size + * aligned to @atom_size. + * + * This enables the first chunk to piggy back on the linear physical + * mapping which often uses larger page size. Please note that this + * can result in very sparse cpu->unit mapping on NUMA machines thus + * requiring large vmalloc address space. Don't use this allocator if + * vmalloc space is not orders of magnitude larger than distances + * between node memory addresses (ie. 32bit NUMA machines). * * When @dyn_size is positive, dynamic area might be larger than - * specified to fill page alignment. Also, when @dyn_size is auto, - * @dyn_size does not fill the whole first chunk but only what's - * necessary for page alignment after static and reserved areas. + * specified to fill page alignment. When @dyn_size is auto, + * @dyn_size is just big enough to fill page alignment after static + * and reserved areas. * * If the needed size is smaller than the minimum or specified unit - * size, the leftover is returned to the bootmem allocator. + * size, the leftover is returned using @free_fn. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. */ -ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size) +int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn) { - size_t chunk_size; - unsigned int cpu; + void *base = (void *)ULONG_MAX; + void **areas = NULL; + struct pcpu_alloc_info *ai; + size_t size_sum, areas_size; + int group, i, rc; + + ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, + cpu_distance_fn); + if (IS_ERR(ai)) + return PTR_ERR(ai); + + size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); + + areas = alloc_bootmem_nopanic(areas_size); + if (!areas) { + rc = -ENOMEM; + goto out_free; + } - /* determine parameters and allocate */ - pcpue_size = PFN_ALIGN(static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0)); - if (dyn_size != 0) - dyn_size = pcpue_size - static_size - reserved_size; - - if (unit_size >= 0) { - BUG_ON(unit_size < pcpue_size); - pcpue_unit_size = unit_size; - } else - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - - chunk_size = pcpue_unit_size * nr_cpu_ids; - - pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); - if (!pcpue_ptr) { - pr_warning("PERCPU: failed to allocate %zu bytes for " - "embedding\n", chunk_size); - return -ENOMEM; + /* allocate, copy and determine base address */ + for (group = 0; group < ai->nr_groups; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + unsigned int cpu = NR_CPUS; + void *ptr; + + for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) + cpu = gi->cpu_map[i]; + BUG_ON(cpu == NR_CPUS); + + /* allocate space for the whole group */ + ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); + if (!ptr) { + rc = -ENOMEM; + goto out_free_areas; + } + areas[group] = ptr; + + base = min(ptr, base); + + for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { + if (gi->cpu_map[i] == NR_CPUS) { + /* unused unit, free whole */ + free_fn(ptr, ai->unit_size); + continue; + } + /* copy and return the unused part */ + memcpy(ptr, __per_cpu_load, ai->static_size); + free_fn(ptr + size_sum, ai->unit_size - size_sum); + } } - /* return the leftover and copy */ - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + /* base address is now known, determine group base offsets */ + for (group = 0; group < ai->nr_groups; group++) + ai->groups[group].base_offset = areas[group] - base; + + pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", + PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, + ai->dyn_size, ai->unit_size); + + rc = pcpu_setup_first_chunk(ai, base); + goto out_free; + +out_free_areas: + for (group = 0; group < ai->nr_groups; group++) + free_fn(areas[group], + ai->groups[group].nr_units * ai->unit_size); +out_free: + pcpu_free_alloc_info(ai); + if (areas) + free_bootmem(__pa(areas), areas_size); + return rc; +} +#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || + !CONFIG_HAVE_SETUP_PER_CPU_AREA */ + +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +/** + * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages + * @reserved_size: the size of reserved percpu area in bytes + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE + * @free_fn: funtion to free percpu page, always called with PAGE_SIZE + * @populate_pte_fn: function to populate pte + * + * This is a helper to ease setting up page-remapped first percpu + * chunk and can be called where pcpu_setup_first_chunk() is expected. + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page into vmalloc area. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) +{ + static struct vm_struct vm; + struct pcpu_alloc_info *ai; + char psize_str[16]; + int unit_pages; + size_t pages_size; + struct page **pages; + int unit, i, j, rc; + + snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); + + ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); + + unit_pages = ai->unit_size >> PAGE_SHIFT; + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * + sizeof(pages[0])); + pages = alloc_bootmem(pages_size); + + /* allocate pages */ + j = 0; + for (unit = 0; unit < num_possible_cpus(); unit++) + for (i = 0; i < unit_pages; i++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; + void *ptr; + + ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); + if (!ptr) { + pr_warning("PERCPU: failed to allocate %s page " + "for cpu%u\n", psize_str, cpu); + goto enomem; + } + pages[j++] = virt_to_page(ptr); + } + + /* allocate vm area, map the pages and copy static data */ + vm.flags = VM_ALLOC; + vm.size = num_possible_cpus() * ai->unit_size; + vm_area_register_early(&vm, PAGE_SIZE); + + for (unit = 0; unit < num_possible_cpus(); unit++) { + unsigned long unit_addr = + (unsigned long)vm.addr + unit * ai->unit_size; + + for (i = 0; i < unit_pages; i++) + populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); + + /* pte already populated, the following shouldn't fail */ + rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], + unit_pages); + if (rc < 0) + panic("failed to map percpu area, err=%d\n", rc); - if (cpu_possible(cpu)) { - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); - memcpy(ptr, __per_cpu_load, static_size); - } else - free_bootmem(__pa(ptr), pcpue_unit_size); + /* + * FIXME: Archs with virtual cache should flush local + * cache for the linear mapping here - something + * equivalent to flush_cache_vmap() on the local cpu. + * flush_cache_vmap() can't be used as most supporting + * data structures are not set up yet. + */ + + /* copy static data */ + memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ - pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", + unit_pages, psize_str, vm.addr, ai->static_size, + ai->reserved_size, ai->dyn_size); + + rc = pcpu_setup_first_chunk(ai, vm.addr); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_fn(page_address(pages[j]), PAGE_SIZE); + rc = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pages), pages_size); + pcpu_free_alloc_info(ai); + return rc; +} +#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ + +/* + * Generic percpu area setup. + * + * The embedding helper is used because its behavior closely resembles + * the original non-dynamic generic percpu area setup. This is + * important because many archs have addressing restrictions and might + * fail if the percpu area is located far away from the previous + * location. As an added bonus, in non-NUMA cases, embedding is + * generally a good idea TLB-wise because percpu area can piggy back + * on the physical linear memory mapping which uses large page + * mappings on applicable archs. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, + size_t align) +{ + return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); +} - return pcpu_setup_first_chunk(pcpue_get_page, static_size, - reserved_size, dyn_size, - pcpue_unit_size, pcpue_ptr, NULL); +static void __init pcpu_dfl_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + +void __init setup_per_cpu_areas(void) +{ + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, + pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); + if (rc < 0) + panic("Failed to initialized percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/quicklist.c b/mm/quicklist.c index e66d07d1b4ff..6eedf7e473d1 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -19,7 +19,7 @@ #include <linux/module.h> #include <linux/quicklist.h> -DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; +DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); #define FRACTION_OF_NODE_MEM 16 diff --git a/mm/slub.c b/mm/slub.c index 417ed843b251..a5789b91d179 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2111,8 +2111,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) */ #define NR_KMEM_CACHE_CPU 100 -static DEFINE_PER_CPU(struct kmem_cache_cpu, - kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; +static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], + kmem_cache_cpu); static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f8189a4b3e13..204b8243d8ab 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -265,6 +265,7 @@ struct vmap_area { static DEFINE_SPINLOCK(vmap_area_lock); static struct rb_root vmap_area_root = RB_ROOT; static LIST_HEAD(vmap_area_list); +static unsigned long vmap_area_pcpu_hole; static struct vmap_area *__find_vmap_area(unsigned long addr) { @@ -431,6 +432,15 @@ static void __free_vmap_area(struct vmap_area *va) RB_CLEAR_NODE(&va->rb_node); list_del_rcu(&va->list); + /* + * Track the highest possible candidate for pcpu area + * allocation. Areas outside of vmalloc area can be returned + * here too, consider only end addresses which fall inside + * vmalloc area proper. + */ + if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) + vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); + call_rcu(&va->rcu_head, rcu_free_va); } @@ -1038,6 +1048,9 @@ void __init vmalloc_init(void) va->va_end = va->va_start + tmp->size; __insert_vmap_area(va); } + + vmap_area_pcpu_hole = VMALLOC_END; + vmap_initialized = true; } @@ -1122,13 +1135,34 @@ EXPORT_SYMBOL_GPL(map_vm_area); DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; +static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, + unsigned long flags, void *caller) +{ + struct vm_struct *tmp, **p; + + vm->flags = flags; + vm->addr = (void *)va->va_start; + vm->size = va->va_end - va->va_start; + vm->caller = caller; + va->private = vm; + va->flags |= VM_VM_AREA; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { + if (tmp->addr >= vm->addr) + break; + } + vm->next = *p; + *p = vm; + write_unlock(&vmlist_lock); +} + static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, void *caller) { static struct vmap_area *va; struct vm_struct *area; - struct vm_struct *tmp, **p; unsigned long align = 1; BUG_ON(in_interrupt()); @@ -1147,7 +1181,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, if (unlikely(!size)) return NULL; - area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); + area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!area)) return NULL; @@ -1162,25 +1196,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - area->flags = flags; - area->addr = (void *)va->va_start; - area->size = size; - area->pages = NULL; - area->nr_pages = 0; - area->phys_addr = 0; - area->caller = caller; - va->private = area; - va->flags |= VM_VM_AREA; - - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { - if (tmp->addr >= area->addr) - break; - } - area->next = *p; - *p = area; - write_unlock(&vmlist_lock); - + insert_vmalloc_vm(area, va, flags, caller); return area; } @@ -1818,6 +1834,286 @@ void free_vm_area(struct vm_struct *area) } EXPORT_SYMBOL_GPL(free_vm_area); +static struct vmap_area *node_to_va(struct rb_node *n) +{ + return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; +} + +/** + * pvm_find_next_prev - find the next and prev vmap_area surrounding @end + * @end: target address + * @pnext: out arg for the next vmap_area + * @pprev: out arg for the previous vmap_area + * + * Returns: %true if either or both of next and prev are found, + * %false if no vmap_area exists + * + * Find vmap_areas end addresses of which enclose @end. ie. if not + * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. + */ +static bool pvm_find_next_prev(unsigned long end, + struct vmap_area **pnext, + struct vmap_area **pprev) +{ + struct rb_node *n = vmap_area_root.rb_node; + struct vmap_area *va = NULL; + + while (n) { + va = rb_entry(n, struct vmap_area, rb_node); + if (end < va->va_end) + n = n->rb_left; + else if (end > va->va_end) + n = n->rb_right; + else + break; + } + + if (!va) + return false; + + if (va->va_end > end) { + *pnext = va; + *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + } else { + *pprev = va; + *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); + } + return true; +} + +/** + * pvm_determine_end - find the highest aligned address between two vmap_areas + * @pnext: in/out arg for the next vmap_area + * @pprev: in/out arg for the previous vmap_area + * @align: alignment + * + * Returns: determined end address + * + * Find the highest aligned address between *@pnext and *@pprev below + * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned + * down address is between the end addresses of the two vmap_areas. + * + * Please note that the address returned by this function may fall + * inside *@pnext vmap_area. The caller is responsible for checking + * that. + */ +static unsigned long pvm_determine_end(struct vmap_area **pnext, + struct vmap_area **pprev, + unsigned long align) +{ + const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + unsigned long addr; + + if (*pnext) + addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); + else + addr = vmalloc_end; + + while (*pprev && (*pprev)->va_end > addr) { + *pnext = *pprev; + *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + } + + return addr; +} + +/** + * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator + * @offsets: array containing offset of each area + * @sizes: array containing size of each area + * @nr_vms: the number of areas to allocate + * @align: alignment, all entries in @offsets and @sizes must be aligned to this + * @gfp_mask: allocation mask + * + * Returns: kmalloc'd vm_struct pointer array pointing to allocated + * vm_structs on success, %NULL on failure + * + * Percpu allocator wants to use congruent vm areas so that it can + * maintain the offsets among percpu areas. This function allocates + * congruent vmalloc areas for it. These areas tend to be scattered + * pretty far, distance between two areas easily going up to + * gigabytes. To avoid interacting with regular vmallocs, these areas + * are allocated from top. + * + * Despite its complicated look, this allocator is rather simple. It + * does everything top-down and scans areas from the end looking for + * matching slot. While scanning, if any of the areas overlaps with + * existing vmap_area, the base address is pulled down to fit the + * area. Scanning is repeated till all the areas fit and then all + * necessary data structres are inserted and the result is returned. + */ +struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, + const size_t *sizes, int nr_vms, + size_t align, gfp_t gfp_mask) +{ + const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); + const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + struct vmap_area **vas, *prev, *next; + struct vm_struct **vms; + int area, area2, last_area, term_area; + unsigned long base, start, end, last_end; + bool purged = false; + + gfp_mask &= GFP_RECLAIM_MASK; + + /* verify parameters and allocate data structures */ + BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); + for (last_area = 0, area = 0; area < nr_vms; area++) { + start = offsets[area]; + end = start + sizes[area]; + + /* is everything aligned properly? */ + BUG_ON(!IS_ALIGNED(offsets[area], align)); + BUG_ON(!IS_ALIGNED(sizes[area], align)); + + /* detect the area with the highest address */ + if (start > offsets[last_area]) + last_area = area; + + for (area2 = 0; area2 < nr_vms; area2++) { + unsigned long start2 = offsets[area2]; + unsigned long end2 = start2 + sizes[area2]; + + if (area2 == area) + continue; + + BUG_ON(start2 >= start && start2 < end); + BUG_ON(end2 <= end && end2 > start); + } + } + last_end = offsets[last_area] + sizes[last_area]; + + if (vmalloc_end - vmalloc_start < last_end) { + WARN_ON(true); + return NULL; + } + + vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); + vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); + if (!vas || !vms) + goto err_free; + + for (area = 0; area < nr_vms; area++) { + vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); + vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); + if (!vas[area] || !vms[area]) + goto err_free; + } +retry: + spin_lock(&vmap_area_lock); + + /* start scanning - we scan from the top, begin with the last area */ + area = term_area = last_area; + start = offsets[area]; + end = start + sizes[area]; + + if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { + base = vmalloc_end - last_end; + goto found; + } + base = pvm_determine_end(&next, &prev, align) - end; + + while (true) { + BUG_ON(next && next->va_end <= base + end); + BUG_ON(prev && prev->va_end > base + end); + + /* + * base might have underflowed, add last_end before + * comparing. + */ + if (base + last_end < vmalloc_start + last_end) { + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = true; + goto retry; + } + goto err_free; + } + + /* + * If next overlaps, move base downwards so that it's + * right below next and then recheck. + */ + if (next && next->va_start < base + end) { + base = pvm_determine_end(&next, &prev, align) - end; + term_area = area; + continue; + } + + /* + * If prev overlaps, shift down next and prev and move + * base so that it's right below new next and then + * recheck. + */ + if (prev && prev->va_end > base + start) { + next = prev; + prev = node_to_va(rb_prev(&next->rb_node)); + base = pvm_determine_end(&next, &prev, align) - end; + term_area = area; + continue; + } + + /* + * This area fits, move on to the previous one. If + * the previous one is the terminal one, we're done. + */ + area = (area + nr_vms - 1) % nr_vms; + if (area == term_area) + break; + start = offsets[area]; + end = start + sizes[area]; + pvm_find_next_prev(base + end, &next, &prev); + } +found: + /* we've found a fitting base, insert all va's */ + for (area = 0; area < nr_vms; area++) { + struct vmap_area *va = vas[area]; + + va->va_start = base + offsets[area]; + va->va_end = va->va_start + sizes[area]; + __insert_vmap_area(va); + } + + vmap_area_pcpu_hole = base + offsets[last_area]; + + spin_unlock(&vmap_area_lock); + + /* insert all vm's */ + for (area = 0; area < nr_vms; area++) + insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, + pcpu_get_vm_areas); + + kfree(vas); + return vms; + +err_free: + for (area = 0; area < nr_vms; area++) { + if (vas) + kfree(vas[area]); + if (vms) + kfree(vms[area]); + } + kfree(vas); + kfree(vms); + return NULL; +} + +/** + * pcpu_free_vm_areas - free vmalloc areas for percpu allocator + * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() + * @nr_vms: the number of allocated areas + * + * Free vm_structs and the array allocated by pcpu_get_vm_areas(). + */ +void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) +{ + int i; + + for (i = 0; i < nr_vms; i++) + free_vm_area(vms[i]); + kfree(vms); +} #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index cd2b97f1b6e1..a6e0e077ac33 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -37,12 +37,13 @@ __initcall(init_syncookies); #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv4_cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch); memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); tmp[0] = (__force u32)saddr; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 8c2513982b61..6b6ae913b5d4 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -74,12 +74,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, return child; } -static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv6_cookie_scratch); static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv6_cookie_scratch); /* * we have 320 bits of information to hash, copy in the remaining diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 8d8488306fe4..d2c904dd6fbc 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -37,7 +37,7 @@ #include "rds.h" #include "ib.h" -DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned; +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); static const char *const rds_ib_stat_names[] = { "ib_connect_raced", diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c index d33ea790484e..5fe67f6a1d80 100644 --- a/net/rds/iw_stats.c +++ b/net/rds/iw_stats.c @@ -37,7 +37,7 @@ #include "rds.h" #include "iw.h" -DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned; +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats); static const char *const rds_iw_stat_names[] = { "iw_connect_raced", diff --git a/net/rds/page.c b/net/rds/page.c index 55c21efdb62e..36790122dfd4 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -39,7 +39,7 @@ struct rds_page_remainder { unsigned long r_offset; }; -DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned; +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); /* * returns 0 on success or -errno on failure. diff --git a/scripts/module-common.lds b/scripts/module-common.lds new file mode 100644 index 000000000000..47a1f9ae0ede --- /dev/null +++ b/scripts/module-common.lds @@ -0,0 +1,8 @@ +/* + * Common module linker script, always used when linking a module. + * Archs are free to supply their own linker scripts. ld will + * combine them automatically. + */ +SECTIONS { + /DISCARD/ : { *(.discard) } +} |