From e487683990972bf9aa4e688434c46ead76748bca Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 20 Jun 2009 23:27:16 -0700 Subject: x86, mce: fix typo in comment in asm/mce.h Fix comment to match the actual declaration. Signed-off-by: Borislav Petkov Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5cdd8d100ec..b50b9e9042c 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -9,7 +9,7 @@ */ #define MCG_BANKCNT_MASK 0xff /* Number of Banks */ -#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ +#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ -- cgit v1.2.3 From a95436e44a76a32dcbe7c8df59701ddde53017c1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 20 Jun 2009 23:28:22 -0700 Subject: x86, mce: use atomic_inc_return() instead of add by 1 Use atomic_inc_return() instead of atomic_add_return() by 1. Signed-off-by: Borislav Petkov Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968b..7da8fec9ca8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -242,7 +242,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) /* * Make sure only one CPU runs in machine check panic */ - if (atomic_add_return(1, &mce_paniced) > 1) + if (atomic_inc_return(&mce_paniced) > 1) wait_for_panic(); barrier(); @@ -705,7 +705,7 @@ static int mce_start(int *no_way_out) * global_nwo should be updated before mce_callin */ smp_wmb(); - order = atomic_add_return(1, &mce_callin); + order = atomic_inc_return(&mce_callin); /* * Wait for everyone. -- cgit v1.2.3 From e74e396204bfcb67570ba4517b08f5918e69afea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Mar 2009 19:07:44 +0900 Subject: percpu: use dynamic percpu allocator as the default percpu allocator This patch makes most !CONFIG_HAVE_SETUP_PER_CPU_AREA archs use dynamic percpu allocator. The first chunk is allocated using embedding helper and 8k is reserved for modules. This ensures that the new allocator behaves almost identically to the original allocator as long as static percpu variables are concerned, so it shouldn't introduce much breakage. s390 and alpha use custom SHIFT_PERCPU_PTR() to work around addressing range limit the addressing model imposes. Unfortunately, this breaks if the address is specified using a variable, so for now, the two archs aren't converted. The following architectures are affected by this change. * sh * arm * cris * mips * sparc(32) * blackfin * avr32 * parisc (broken, under investigation) * m32r * powerpc(32) As this change makes the dynamic allocator the default one, CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is replaced with its invert - CONFIG_HAVE_LEGACY_PER_CPU_AREA, which is added to yet-to-be converted archs. These archs implement their own setup_per_cpu_areas() and the conversion is not trivial. * powerpc(64) * sparc(64) * ia64 * alpha * s390 Boot and batch alloc/free tests on x86_32 with debug code (x86_32 doesn't use default first chunk initialization). 
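For reference, a minimal usage sketch (not part of this patch; the struct and function names are made up) of the allocator interface, which stays the same for callers no matter which first-chunk setup an arch ends up using:

	#include <linux/init.h>
	#include <linux/percpu.h>

	struct hit_stats {
		unsigned long hits;
	};

	static struct hit_stats *stats;	/* one instance per possible CPU */

	static int __init stats_init(void)
	{
		int cpu;

		/* behaves identically under allocpercpu.c and percpu.c */
		stats = alloc_percpu(struct hit_stats);
		if (!stats)
			return -ENOMEM;

		for_each_possible_cpu(cpu)
			per_cpu_ptr(stats, cpu)->hits = 0;
		return 0;
	}

	static void __exit stats_exit(void)
	{
		free_percpu(stats);
	}
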
Compile tested on sparc(32), powerpc(32), arm and alpha. Kyle McMartin reported that this change breaks parisc. The problem is still under investigation and he is okay with pushing this patch forward and fixing parisc later. [ Impact: use dynamic allocator for most archs w/o custom percpu setup ] Signed-off-by: Tejun Heo Acked-by: Rusty Russell Acked-by: David S. Miller Acked-by: Benjamin Herrenschmidt Acked-by: Martin Schwidefsky Reviewed-by: Christoph Lameter Cc: Paul Mundt Cc: Russell King Cc: Mikael Starvik Cc: Ralf Baechle Cc: Bryan Wu Cc: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Hirokazu Takata Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Heiko Carstens Cc: Ingo Molnar --- arch/alpha/Kconfig | 3 +++ arch/ia64/Kconfig | 3 +++ arch/powerpc/Kconfig | 3 +++ arch/s390/Kconfig | 3 +++ arch/sparc/Kconfig | 3 +++ arch/x86/Kconfig | 3 --- include/linux/percpu.h | 12 +++++++++--- init/main.c | 24 ------------------------ kernel/module.c | 6 +++--- mm/Makefile | 2 +- mm/allocpercpu.c | 28 ++++++++++++++++++++++++++++ mm/percpu.c | 40 +++++++++++++++++++++++++++++++++++++++- 12 files changed, 95 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 9fb8aae5c39..05d86407188 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -70,6 +70,9 @@ config AUTO_IRQ_AFFINITY depends on SMP default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 170042b420d..328d2f8b8c3 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -89,6 +89,9 @@ config GENERIC_TIME_VSYSCALL bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index bf6cedfa05d..a774c2acbe6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -46,6 +46,9 @@ config GENERIC_HARDIRQS_NO__DO_IRQ bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool PPC64 + config HAVE_SETUP_PER_CPU_AREA def_bool PPC64 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a14dba0e4d6..f4a3cc62d28 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -75,6 +75,9 @@ config VIRT_CPU_ACCOUNTING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + mainmenu "Linux Kernel Configuration" config S390 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 3f8b6a92eab..7a8698b913f 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,6 +92,9 @@ config AUDIT_ARCH bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y if SPARC64 + config HAVE_SETUP_PER_CPU_AREA def_bool y if SPARC64 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d1430ef6b4f..a48a90076d8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -149,9 +149,6 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y -config HAVE_DYNAMIC_PER_CPU_AREA - def_bool y - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 26fd9d12f05..e5000343dd6 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -34,7 +34,7 @@ #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10) @@ -80,7 +80,7 @@ extern ssize_t __init pcpu_embed_first_chunk( extern void *__alloc_reserved_percpu(size_t 
size, size_t align); -#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ struct percpu_data { void *ptrs[1]; @@ -99,11 +99,15 @@ struct percpu_data { (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ extern void *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void *__pdata); +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +extern void __init setup_per_cpu_areas(void); +#endif + #else /* CONFIG_SMP */ #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) @@ -124,6 +128,8 @@ static inline void free_percpu(void *p) kfree(p); } +static inline void __init setup_per_cpu_areas(void) { } + #endif /* CONFIG_SMP */ #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ diff --git a/init/main.c b/init/main.c index 09131ec090c..602d724afa5 100644 --- a/init/main.c +++ b/init/main.c @@ -357,7 +357,6 @@ static void __init smp_init(void) #define smp_init() do { } while (0) #endif -static inline void setup_per_cpu_areas(void) { } static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } @@ -378,29 +377,6 @@ static void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; - -EXPORT_SYMBOL(__per_cpu_offset); - -static void __init setup_per_cpu_areas(void) -{ - unsigned long size, i; - char *ptr; - unsigned long nr_possible_cpus = num_possible_cpus(); - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); - ptr = alloc_bootmem_pages(size * nr_possible_cpus); - - for_each_possible_cpu(i) { - __per_cpu_offset[i] = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - ptr += size; - } -} -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ - /* Called by boot processor to activate the rest. */ static void __init smp_init(void) { diff --git a/kernel/module.c b/kernel/module.c index 38928fcaff2..f5934954fa9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) @@ -389,7 +389,7 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ /* Number of blocks used and allocated. 
*/ static unsigned int pcpu_num_used, pcpu_num_allocated; @@ -535,7 +535,7 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/mm/Makefile b/mm/Makefile index 5e0bd642669..c77c6487552 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA obj-$(CONFIG_SMP) += percpu.o else obj-$(CONFIG_SMP) += allocpercpu.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index dfdee6a4735..df34ceae0c6 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -5,6 +5,8 @@ */ #include #include +#include +#include #ifndef cache_line_size #define cache_line_size() L1_CACHE_BYTES @@ -147,3 +149,29 @@ void free_percpu(void *__pdata) kfree(__percpu_disguise(__pdata)); } EXPORT_SYMBOL_GPL(free_percpu); + +/* + * Generic percpu area setup. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; + +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + unsigned long size, i; + char *ptr; + unsigned long nr_possible_cpus = num_possible_cpus(); + + /* Copy section for each CPU (we discard the original) */ + size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ptr = alloc_bootmem_pages(size * nr_possible_cpus); + + for_each_possible_cpu(i) { + __per_cpu_offset[i] = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + ptr += size; + } +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/percpu.c b/mm/percpu.c index b70f2acd885..b14984566f5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -43,7 +43,7 @@ * * To use this allocator, arch code should do the followings. * - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be @@ -1275,3 +1275,41 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, reserved_size, dyn_size, pcpue_unit_size, pcpue_ptr, NULL); } + +/* + * Generic percpu area setup. + * + * The embedding helper is used because its behavior closely resembles + * the original non-dynamic generic percpu area setup. This is + * important because many archs have addressing restrictions and might + * fail if the percpu area is located far away from the previous + * location. As an added bonus, in non-NUMA cases, embedding is + * generally a good idea TLB-wise because percpu area can piggy back + * on the physical linear memory mapping which uses large page + * mappings on applicable archs. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + size_t static_size = __per_cpu_end - __per_cpu_start; + ssize_t unit_size; + unsigned long delta; + unsigned int cpu; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. 
+ */ + unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, -1); + if (unit_size < 0) + panic("Failed to initialized percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + cpu * unit_size; +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ -- cgit v1.2.3 From 204fba4aa303ea4a7bb726a539bf4a5b9e3203d0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:45 +0900 Subject: percpu: cleanup percpu array definitions Currently, the following three different ways to define percpu arrays are in use. 1. DEFINE_PER_CPU(elem_type[array_len], array_name); 2. DEFINE_PER_CPU(elem_type, array_name[array_len]); 3. DEFINE_PER_CPU(elem_type, array_name)[array_len]; Unify to #1 which correctly separates the roles of the two parameters and thus allows more flexibility in the way percpu variables are defined. [ Impact: cleanup ] Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Cc: Ingo Molnar Cc: Tony Luck Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Jeremy Fitzhardinge Cc: linux-mm@kvack.org Cc: Christoph Lameter Cc: David S. Miller --- arch/ia64/kernel/smp.c | 2 +- arch/ia64/sn/kernel/setup.c | 2 +- arch/powerpc/mm/stab.c | 2 +- arch/powerpc/platforms/ps3/smp.c | 2 +- arch/x86/kernel/cpu/cpu_debug.c | 4 ++-- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 2 +- drivers/xen/events.c | 4 ++-- mm/quicklist.c | 2 +- mm/slub.c | 4 ++-- net/ipv4/syncookies.c | 2 +- net/ipv6/syncookies.c | 2 +- 12 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index f0c521b0ba4..94cf78ba28f 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -58,7 +58,7 @@ static struct local_tlb_flush_counts { unsigned int count; } __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS]; -static DEFINE_PER_CPU(unsigned short, shadow_flush_counts[NR_CPUS]) ____cacheline_aligned; +static DEFINE_PER_CPU(unsigned short [NR_CPUS], shadow_flush_counts) ____cacheline_aligned; #define IPI_CALL_FUNC 0 #define IPI_CPU_STOP 1 diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index e456f062f24..ece1bf99449 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second); DEFINE_PER_CPU(struct sn_hub_info_s, __sn_hub_info); EXPORT_PER_CPU_SYMBOL(__sn_hub_info); -DEFINE_PER_CPU(short, __sn_cnodeid_to_nasid[MAX_COMPACT_NODES]); +DEFINE_PER_CPU(short [MAX_COMPACT_NODES], __sn_cnodeid_to_nasid); EXPORT_PER_CPU_SYMBOL(__sn_cnodeid_to_nasid); DEFINE_PER_CPU(struct nodepda_s *, __sn_nodepda); diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c index 98cd1dc2ae7..6e9b69c9985 100644 --- a/arch/powerpc/mm/stab.c +++ b/arch/powerpc/mm/stab.c @@ -31,7 +31,7 @@ struct stab_entry { #define NR_STAB_CACHE_ENTRIES 8 static DEFINE_PER_CPU(long, stab_cache_ptr); -static DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]); +static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache); /* * Create a segment table entry for the given esid/vsid pair. 
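(Aside, not part of the patch: the stab.c hunk above is representative. In form #1 the element type and the array length travel together in the first macro argument, and the name argument stays a bare identifier, matching how the macro treats it as the variable name. A hypothetical before/after:

	#define MY_NR_SLOTS 8	/* hypothetical array length */

	/* before, form #2: the length is attached to the name argument */
	static DEFINE_PER_CPU(long, my_cache[MY_NR_SLOTS]);

	/* after, form #1: type[len] first, bare identifier second */
	static DEFINE_PER_CPU(long [MY_NR_SLOTS], my_cache);

The hunks below apply the same transformation to the remaining definitions.)
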
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c index f6e04bcc70e..51ffde40af2 100644 --- a/arch/powerpc/platforms/ps3/smp.c +++ b/arch/powerpc/platforms/ps3/smp.c @@ -37,7 +37,7 @@ */ #define MSG_COUNT 4 -static DEFINE_PER_CPU(unsigned int, ps3_ipi_virqs[MSG_COUNT]); +static DEFINE_PER_CPU(unsigned int [MSG_COUNT], ps3_ipi_virqs); static void do_message_pass(int target, int msg) { diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 6b2a52dd040..dca325c0399 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -30,8 +30,8 @@ #include #include -static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); -static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); +static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); +static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); static DEFINE_PER_CPU(int, cpu_priv_count); static DEFINE_MUTEX(cpu_debug_lock); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ddae21620bd..bd2a2fa8462 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -69,7 +69,7 @@ struct threshold_bank { struct threshold_block *blocks; cpumask_var_t cpus; }; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); +static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); #ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 76dfef23f78..4946288d683 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -862,7 +862,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) x86_pmu_disable_counter(hwc, idx); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 891d2e90753..ab581fa6268 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -47,10 +47,10 @@ static DEFINE_SPINLOCK(irq_mapping_update_lock); /* IRQ <-> VIRQ mapping. */ -static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; +static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ -static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; +static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; /* Interrupt types. 
*/ enum xen_irq_type { diff --git a/mm/quicklist.c b/mm/quicklist.c index e66d07d1b4f..6eedf7e473d 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -19,7 +19,7 @@ #include #include -DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; +DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); #define FRACTION_OF_NODE_MEM 16 diff --git a/mm/slub.c b/mm/slub.c index ce62b770e2f..23bb79acc4b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2086,8 +2086,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) */ #define NR_KMEM_CACHE_CPU 100 -static DEFINE_PER_CPU(struct kmem_cache_cpu, - kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; +static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], + kmem_cache_cpu); static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index cd2b97f1b6e..84d90f2799b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -37,7 +37,7 @@ __initcall(init_syncookies); #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 8c2513982b6..23d0d6db046 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -74,7 +74,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, return child; } -static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) -- cgit v1.2.3 From 245b2e70eabd797932adb263a65da0bab3711753 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:48 +0900 Subject: percpu: clean up percpu variable definitions Percpu variable definition is about to be updated such that all percpu symbols including the static ones must be unique. Update percpu variable definitions accordingly. * as,cfq: rename ioc_count uniquely * cpufreq: rename cpu_dbs_info uniquely * xen: move nesting_count out of xen_evtchn_do_upcall() and rename it * mm: move ratelimits out of balance_dirty_pages_ratelimited_nr() and rename it * ipv4,6: rename cookie_scratch uniquely * x86 perf_counter: rename prev_left to pmc_prev_left, irq_entry to pmc_irq_entry and nmi_entry to pmc_nmi_entry * perf_counter: rename disable_count to perf_disable_count * ftrace: rename test_event_disable to ftrace_test_event_disable * kmemleak: rename test_pointer to kmemleak_test_pointer * mce: rename next_interval to mce_next_interval [ Impact: percpu usage cleanups, no duplicate static percpu var names ] Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Cc: Ivan Kokshaysky Cc: Jens Axboe Cc: Dave Jones Cc: Jeremy Fitzhardinge Cc: linux-mm Cc: David S. 
Miller Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Li Zefan Cc: Catalin Marinas Cc: Andi Kleen --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- block/as-iosched.c | 10 +++++----- block/cfq-iosched.c | 10 +++++----- drivers/cpufreq/cpufreq_conservative.c | 12 ++++++------ drivers/cpufreq/cpufreq_ondemand.c | 15 ++++++++------- drivers/xen/events.c | 9 +++++---- kernel/perf_counter.c | 6 +++--- kernel/trace/trace_events.c | 6 +++--- mm/kmemleak-test.c | 6 +++--- mm/page-writeback.c | 5 +++-- net/ipv4/syncookies.c | 5 +++-- net/ipv6/syncookies.c | 5 +++-- 13 files changed, 58 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968b..cba8cd3e957 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1091,7 +1091,7 @@ void mce_log_therm_throt_event(__u64 status) */ static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); static void mcheck_timer(unsigned long data) @@ -1110,7 +1110,7 @@ static void mcheck_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(next_interval); + n = &__get_cpu_var(mce_next_interval); if (mce_notify_irq()) *n = max(*n/2, HZ/100); else @@ -1311,7 +1311,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) static void mce_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); + int *n = &__get_cpu_var(mce_next_interval); if (mce_ignore_ce) return; @@ -1914,7 +1914,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); + __get_cpu_var(mce_next_interval)); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 4946288d683..5fdf63aaaba 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -862,7 +862,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) x86_pmu_disable_counter(hwc, idx); } -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], prev_left); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. 
@@ -901,7 +901,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (left > x86_pmu.max_period) left = x86_pmu.max_period; - per_cpu(prev_left[idx], smp_processor_id()) = left; + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* * The hw counter starts counting from this counter offset, @@ -1089,7 +1089,7 @@ void perf_counter_print_debug(void) rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); - prev_left = per_cpu(prev_left[idx], cpu); + prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); @@ -1561,8 +1561,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) entry->ip[entry->nr++] = ip; } -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); static void @@ -1709,9 +1709,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_callchain_entry *entry; if (in_nmi()) - entry = &__get_cpu_var(nmi_entry); + entry = &__get_cpu_var(pmc_nmi_entry); else - entry = &__get_cpu_var(irq_entry); + entry = &__get_cpu_var(pmc_irq_entry); entry->nr = 0; diff --git a/block/as-iosched.c b/block/as-iosched.c index 7a12cf6ee1d..ce8ba57c655 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -146,7 +146,7 @@ enum arq_state { #define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) #define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, as_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -161,7 +161,7 @@ static void as_antic_stop(struct as_data *ad); static void free_as_io_context(struct as_io_context *aic) { kfree(aic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(as_ioc_count); if (ioc_gone) { /* * AS scheduler is exiting, grab exit lock and check @@ -169,7 +169,7 @@ static void free_as_io_context(struct as_io_context *aic) * complete ioc_gone and set it back to NULL. 
*/ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -211,7 +211,7 @@ static struct as_io_context *alloc_as_io_context(void) ret->seek_total = 0; ret->seek_samples = 0; ret->seek_mean = 0; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(as_ioc_count); } return ret; @@ -1507,7 +1507,7 @@ static void __exit as_exit(void) ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(as_ioc_count)) wait_for_completion(&all_gone); synchronize_rcu(); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 833ec18eaa6..0f1cc7d3855 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -48,7 +48,7 @@ static int cfq_slice_idle = HZ / 125; static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -1422,7 +1422,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) cic = container_of(head, struct cfq_io_context, rcu_head); kmem_cache_free(cfq_ioc_pool, cic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(cfq_ioc_count); if (ioc_gone) { /* @@ -1431,7 +1431,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) * complete ioc_gone and set it back to NULL */ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -1557,7 +1557,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) INIT_HLIST_NODE(&cic->cic_list); cic->dtor = cfq_free_io_context; cic->exit = cfq_exit_io_context; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(cfq_ioc_count); } return cic; @@ -2658,7 +2658,7 @@ static void __exit cfq_exit(void) * this also protects us from entering cfq_slab_kill() with * pending RCU callbacks */ - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(cfq_ioc_count)) wait_for_completion(&all_gone); cfq_slab_kill(); } diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 7fc58af748b..a7ef465c83b 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -65,7 +65,7 @@ struct cpu_dbs_info_s { int cpu; unsigned int enable:1; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@ -138,7 +138,7 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info, + struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info, freq->cpu); struct cpufreq_policy *policy; @@ -298,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info = &per_cpu(cs_cpu_dbs_info, j); dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) @@ -388,7 +388,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cputime64_t cur_wall_time, cur_idle_time; unsigned int idle_time, 
wall_time; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -528,7 +528,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int j; int rc; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); + this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: @@ -548,7 +548,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 1911d172935..36f292a7bd0 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -73,7 +73,7 @@ struct cpu_dbs_info_s { unsigned int enable:1, sample_type:1; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@ -151,7 +151,8 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy, unsigned int freq_hi, freq_lo; unsigned int index = 0; unsigned int jiffies_total, jiffies_hi, jiffies_lo; - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu); + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, + policy->cpu); if (!dbs_info->freq_table) { dbs_info->freq_lo = 0; @@ -196,7 +197,7 @@ static void ondemand_powersave_bias_init(void) { int i; for_each_online_cpu(i) { - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, i); + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, i); dbs_info->freq_table = cpufreq_frequency_get_table(i); dbs_info->freq_lo = 0; } @@ -297,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info = &per_cpu(od_cpu_dbs_info, j); dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) @@ -391,7 +392,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) unsigned int load, load_freq; int freq_avg; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -548,7 +549,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int j; int rc; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); + this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: @@ -570,7 +571,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --git a/drivers/xen/events.c b/drivers/xen/events.c index ab581fa6268..7d2987e9b1b 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -602,6 +602,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static DEFINE_PER_CPU(unsigned, xed_nesting_count); + /* * Search the CPUs pending events bitmasks. 
For each one found, map * the event number to an irq, and feed it into do_IRQ() for @@ -617,7 +619,6 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - static DEFINE_PER_CPU(unsigned, nesting_count); unsigned count; exit_idle(); @@ -628,7 +629,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) vcpu_info->evtchn_upcall_pending = 0; - if (__get_cpu_var(nesting_count)++) + if (__get_cpu_var(xed_nesting_count)++) goto out; #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ @@ -653,8 +654,8 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) BUG_ON(!irqs_disabled()); - count = __get_cpu_var(nesting_count); - __get_cpu_var(nesting_count) = 0; + count = __get_cpu_var(xed_nesting_count); + __get_cpu_var(xed_nesting_count) = 0; } while(count != 1); out: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1a933a221ea..1fd7a2e7575 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -98,16 +98,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader, void __weak perf_counter_print_debug(void) { } -static DEFINE_PER_CPU(int, disable_count); +static DEFINE_PER_CPU(int, perf_disable_count); void __perf_disable(void) { - __get_cpu_var(disable_count)++; + __get_cpu_var(perf_disable_count)++; } bool __perf_enable(void) { - return !--__get_cpu_var(disable_count); + return !--__get_cpu_var(perf_disable_count); } void perf_disable(void) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index aa08be69a1b..54b1de5074b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1318,7 +1318,7 @@ static __init void event_trace_self_tests(void) #ifdef CONFIG_FUNCTION_TRACER -static DEFINE_PER_CPU(atomic_t, test_event_disable); +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void function_test_events_call(unsigned long ip, unsigned long parent_ip) @@ -1334,7 +1334,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) pc = preempt_count(); resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; @@ -1352,7 +1352,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) trace_nowake_buffer_unlock_commit(event, flags, pc); out: - atomic_dec(&per_cpu(test_event_disable, cpu)); + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); ftrace_preempt_enable(resched); } diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index d5292fc6f52..177a5169bbd 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -36,7 +36,7 @@ struct test_node { }; static LIST_HEAD(test_list); -static DEFINE_PER_CPU(void *, test_pointer); +static DEFINE_PER_CPU(void *, kmemleak_test_pointer); /* * Some very simple testing. 
This function needs to be extended for @@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void) } for_each_possible_cpu(i) { - per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); + per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); pr_info("kmemleak: kmalloc(129) = %p\n", - per_cpu(test_pointer, i)); + per_cpu(kmemleak_test_pointer, i)); } return 0; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7b0dcea4935..2c075dcf03d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -607,6 +607,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) } } +static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -624,7 +626,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; unsigned long ratelimit; unsigned long *p; @@ -637,7 +638,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, * tasks in balance_dirty_pages(). Period. */ preempt_disable(); - p = &__get_cpu_var(ratelimits); + p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { *p = 0; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 84d90f2799b..a6e0e077ac3 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -37,12 +37,13 @@ __initcall(init_syncookies); #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv4_cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch); memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); tmp[0] = (__force u32)saddr; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 23d0d6db046..6b6ae913b5d 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -74,12 +74,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, return child; } -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv6_cookie_scratch); static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv6_cookie_scratch); /* * we have 320 bits of information to hash, copy in the remaining -- cgit v1.2.3 From a4a874a906ae69c35df4b712fadbc35b15665355 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Thu, 18 Jun 2009 07:05:46 +0800 Subject: kmemcheck: remove duplicated #include Remove duplicated #include in arch/x86/mm/kmemcheck/shadow.c. 
Signed-off-by: Huang Weiyi Acked-by: Pekka Enberg Signed-off-by: Vegard Nossum --- arch/x86/mm/kmemcheck/shadow.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c index e773b6bd007..3f66b82076a 100644 --- a/arch/x86/mm/kmemcheck/shadow.c +++ b/arch/x86/mm/kmemcheck/shadow.c @@ -1,7 +1,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From 414f3251aa1b4cbd1e070866971eabc004a7dc20 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 22 Jun 2009 14:31:53 +0200 Subject: kmemcheck: remove useless check This check is a left-over from ancient times. We now have the equivalent check much earlier in both the page fault handler and the debug trap handler (the calls to kmemcheck_active()). Signed-off-by: Vegard Nossum --- arch/x86/mm/kmemcheck/kmemcheck.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index 2c55ed09865..5b99004fb4b 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -225,9 +225,6 @@ void kmemcheck_hide(struct pt_regs *regs) BUG_ON(!irqs_disabled()); - if (data->balance == 0) - return; - if (unlikely(data->balance != 1)) { kmemcheck_show_all(); kmemcheck_error_save_bug(regs); -- cgit v1.2.3 From 788e5abc5441e9046dd91c995c6f1f75bbd144bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:58 +0900 Subject: percpu: drop @unit_size from embed first chunk allocator The only extra feature @unit_size provides is making dead space at the end of the first chunk which doesn't have any valid usecase. Drop the parameter. This will increase consistency with generalized 4k allocator. James Bottomley spotted missing conversion for the default setup_per_cpu_areas() which caused build breakage on all arcsh which use it. 
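The resulting call-site change, sketched with abbreviated arguments (-1 previously selected the automatic unit size):

	/* before: callers could force a unit size or pass -1 for auto */
	ret = pcpu_embed_first_chunk(static_size, reserved_size,
				     dyn_size, -1);

	/* after: unit size is always max(needed size, PCPU_MIN_UNIT_SIZE) */
	ret = pcpu_embed_first_chunk(static_size, reserved_size, dyn_size);
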
[ Impact: drop unused code path ] Signed-off-by: Tejun Heo Cc: James Bottomley Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 2 +- include/linux/percpu.h | 2 +- mm/percpu.c | 18 ++++++------------ 3 files changed, 8 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 29a3eef7cf4..14728206fb5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -342,7 +342,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) return -EINVAL; return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); + reserve - PERCPU_FIRST_CHUNK_RESERVE); } /* diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e5000343dd6..83bff053bd1 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -69,7 +69,7 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size); + ssize_t dyn_size); /* * Use this to get to a cpu's version of the per-cpu object diff --git a/mm/percpu.c b/mm/percpu.c index 19dd83b5cbd..fc6babe6e55 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1207,7 +1207,6 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. @@ -1219,9 +1218,9 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * page size. * * When @dyn_size is positive, dynamic area might be larger than - * specified to fill page alignment. Also, when @dyn_size is auto, - * @dyn_size does not fill the whole first chunk but only what's - * necessary for page alignment after static and reserved areas. + * specified to fill page alignment. When @dyn_size is auto, + * @dyn_size is just big enough to fill page alignment after static + * and reserved areas. * * If the needed size is smaller than the minimum or specified unit * size, the leftover is returned to the bootmem allocator. @@ -1231,7 +1230,7 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * percpu access on success, -errno on failure. */ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size) + ssize_t dyn_size) { size_t chunk_size; unsigned int cpu; @@ -1242,12 +1241,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, if (dyn_size != 0) dyn_size = pcpue_size - static_size - reserved_size; - if (unit_size >= 0) { - BUG_ON(unit_size < pcpue_size); - pcpue_unit_size = unit_size; - } else - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); chunk_size = pcpue_unit_size * num_possible_cpus(); pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, @@ -1304,7 +1298,7 @@ void __init setup_per_cpu_areas(void) * what the legacy allocator did. 
*/ unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE, -1); + PERCPU_DYNAMIC_RESERVE); if (unit_size < 0) panic("Failed to initialized percpu areas."); -- cgit v1.2.3 From d4b95f80399471e4bce5e992700ff7f06ef91f6a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: x86,percpu: generalize 4k first chunk allocator Generalize and move x86 setup_pcpu_4k() into pcpu_4k_first_chunk(). setup_pcpu_4k() now is a simple wrapper around the generalized version. Other than taking size parameters and using arch supplied callbacks to allocate/free memory, pcpu_4k_first_chunk() is identical to the original implementation. This simplifies arch code and will help converting more archs to dynamic percpu allocator. While at it, s/pcpu_populate_pte_fn_t/pcpu_fc_populate_pte_fn_t/ for consistency. [ Impact: code reorganization and generalization ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 78 ++++++++++---------------------------- include/linux/percpu.h | 12 +++++- mm/percpu.c | 85 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 113 insertions(+), 62 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 14728206fb5..ab896b31e80 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -123,6 +123,19 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } +/* + * Helpers for first chunk memory allocation + */ +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size) +{ + return pcpu_alloc_bootmem(cpu, size, size); +} + +static void __init pcpu_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + /* * Large page remap allocator * @@ -346,22 +359,11 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) } /* - * 4k page allocator + * 4k allocator * - * This is the basic allocator. Static percpu area is allocated - * page-by-page and most of initialization is done by the generic - * setup function. + * Boring fallback 4k allocator. This allocator puts more pressure on + * PTE TLBs but other than that behaves nicely on both UMA and NUMA. 
*/ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; - return NULL; -} - static void __init pcpu4k_populate_pte(unsigned long addr) { populate_extra_pte(addr); @@ -369,51 +371,9 @@ static void __init pcpu4k_populate_pte(unsigned long addr) static ssize_t __init setup_pcpu_4k(size_t static_size) { - size_t pages_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - pcpu4k_nr_static_pages = PFN_UP(static_size); - - /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() - * sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); - - /* allocate and copy */ - j = 0; - for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { - void *ptr; - - ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); - goto enomem; - } - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pcpu4k_pages[j++] = virt_to_page(ptr); - } - - /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); - - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, - -1, NULL, pcpu4k_populate_pte); - goto out_free_ar; - -enomem: - while (--j >= 0) - free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); - return ret; + return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpu4k_populate_pte); } /* for explicit first chunk allocator selection */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 83bff053bd1..41b5bfab419 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,18 +59,26 @@ extern void *pcpu_base_addr; typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); -typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); +typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); +typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn); + pcpu_fc_populate_pte_fn_t populate_pte_fn); extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); +extern ssize_t __init pcpu_4k_first_chunk( + size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn); + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. 
Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index fc6babe6e55..27b0f40a3ea 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1037,7 +1037,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn) + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; static int smap[2], dmap[2]; @@ -1270,6 +1270,89 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, pcpue_unit_size, pcpue_ptr, NULL); } +/* + * 4k page first chunk setup helper. + */ +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; + +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +{ + if (pageno < pcpu4k_nr_static_pages) + return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + return NULL; +} + +/** + * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE + * @free_fn: funtion to free percpu page, always called with PAGE_SIZE + * @populate_pte_fn: function to populate pte + * + * This is a helper to ease setting up embedded first percpu chunk and + * can be called where pcpu_setup_first_chunk() is expected. + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page into vmalloc area. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) +{ + size_t pages_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + pcpu4k_nr_static_pages = PFN_UP(static_size); + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() * + sizeof(pcpu4k_pages[0])); + pcpu4k_pages = alloc_bootmem(pages_size); + + /* allocate and copy */ + j = 0; + for_each_possible_cpu(cpu) + for (i = 0; i < pcpu4k_nr_static_pages; i++) { + void *ptr; + + ptr = alloc_fn(cpu, PAGE_SIZE); + if (!ptr) { + pr_warning("PERCPU: failed to allocate " + "4k page for cpu%u\n", cpu); + goto enomem; + } + + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pcpu4k_pages[j++] = virt_to_page(ptr); + } + + /* we're ready, commit */ + pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", + pcpu4k_nr_static_pages, static_size); + + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, + reserved_size, -1, + -1, NULL, populate_pte_fn); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpu4k_pages), pages_size); + return ret; +} + /* * Generic percpu area setup. * -- cgit v1.2.3 From 8c4bfc6e8801616ab2e01c38140b2159b388d2ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: x86,percpu: generalize lpage first chunk allocator Generalize and move x86 setup_pcpu_lpage() into pcpu_lpage_first_chunk(). setup_pcpu_lpage() now is a simple wrapper around the generalized version. 
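As a sketch of the arch-side code this enables (hypothetical: only pcpu_lpage_first_chunk() and the callback signatures come from this patch; MY_LPAGE_SIZE and the my_* names are invented, and the mapping body is elided as arch specific):

	#include <linux/bootmem.h>
	#include <linux/percpu.h>

	static void * __init my_alloc(unsigned int cpu, size_t size)
	{
		/* size is the large page size; align to it so a single
		 * large page mapping of the unit is possible */
		return __alloc_bootmem_nopanic(size, size,
					       __pa(MAX_DMA_ADDRESS));
	}

	static void __init my_free(void *ptr, size_t size)
	{
		free_bootmem(__pa(ptr), size);
	}

	static void __init my_map(void *ptr, size_t size, void *addr)
	{
		/* arch specific: map size bytes at ptr to the vmalloc
		 * address addr with one large page */
	}

	static ssize_t __init my_setup_pcpu_lpage(size_t static_size)
	{
		return pcpu_lpage_first_chunk(static_size,
					      PERCPU_MODULE_RESERVE,
					      -1, MY_LPAGE_SIZE,
					      my_alloc, my_free, my_map);
	}
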
Other than taking size parameters and using arch supplied callbacks to allocate/free/map memory, pcpu_lpage_first_chunk() is identical to the original implementation. This simplifies arch code and will help converting more archs to dynamic percpu allocator. While at it, factor out pcpu_calc_fc_sizes() which is common to pcpu_embed_first_chunk() and pcpu_lpage_first_chunk(). [ Impact: code reorganization and generalization ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/include/asm/percpu.h | 9 -- arch/x86/kernel/setup_percpu.c | 169 +++------------------------------ arch/x86/mm/pageattr.c | 1 + include/linux/percpu.h | 27 ++++++ mm/percpu.c | 209 ++++++++++++++++++++++++++++++++++++++++- 5 files changed, 244 insertions(+), 171 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 103f1ddb0d8..a18c038a307 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -156,15 +156,6 @@ do { \ /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); -#ifdef CONFIG_NEED_MULTIPLE_NODES -void *pcpu_lpage_remapped(void *kaddr); -#else -static inline void *pcpu_lpage_remapped(void *kaddr) -{ - return NULL; -} -#endif - #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ab896b31e80..4f2e0ac9130 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -137,44 +137,21 @@ static void __init pcpu_fc_free(void *ptr, size_t size) } /* - * Large page remap allocator - * - * This allocator uses PMD page as unit. A PMD page is allocated for - * each cpu and each is remapped into vmalloc area using PMD mapping. - * As PMD page is quite large, only part of it is used for the first - * chunk. Unused part is returned to the bootmem allocator. - * - * So, the PMD pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more PMD TLB entry pressure but still is much - * better than only using 4k mappings while still being NUMA friendly. + * Large page remapping allocator */ #ifdef CONFIG_NEED_MULTIPLE_NODES -struct pcpul_ent { - unsigned int cpu; - void *ptr; -}; - -static size_t pcpul_size; -static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; - -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +static void __init pcpul_map(void *ptr, size_t size, void *addr) { - size_t off = (size_t)pageno << PAGE_SHIFT; + pmd_t *pmd, pmd_v; - if (off >= pcpul_size) - return NULL; - - return virt_to_page(pcpul_map[cpu].ptr + off); + pmd = populate_extra_pmd((unsigned long)addr); + pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE); + set_pmd(pmd, pmd_v); } static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { - size_t map_size, dyn_size; - unsigned int cpu; - int i, j; - ssize_t ret; + size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; @@ -198,134 +175,10 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) return -EINVAL; } - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. 
- */ - pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - if (pcpul_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); - pcpul_map = alloc_bootmem(map_size); - - for_each_possible_cpu(cpu) { - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, - PMD_SIZE); - if (!pcpul_map[cpu].ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), - PMD_SIZE - pcpul_size); - - memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); - } - - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&pcpul_vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + - cpu * PMD_SIZE); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), - PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); - } - - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < num_possible_cpus() - 1; i++) - for (j = i + 1; j < num_possible_cpus(); j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - return ret; - -enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; -} - -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu PMD - * mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used PMD - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. - */ -void *pcpu_lpage_remapped(void *kaddr) -{ - void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); - unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; - int left = 0, right = num_possible_cpus() - 1; - int pos; - - /* pcpul in use at all? 
*/ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < pmd_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > pmd_addr) - right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * PMD_SIZE + offset; - } - } - - return NULL; + return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + reserve - PERCPU_FIRST_CHUNK_RESERVE, + PMD_SIZE, + pcpu_fc_alloc, pcpu_fc_free, pcpul_map); } #else static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1b734d7a896..c106f785242 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 41b5bfab419..9f6bfd7d4b9 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -62,6 +62,7 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); +typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, @@ -79,6 +80,32 @@ extern ssize_t __init pcpu_4k_first_chunk( pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); +#ifdef CONFIG_NEED_MULTIPLE_NODES +extern ssize_t __init pcpu_lpage_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn); + +extern void *pcpu_lpage_remapped(void *kaddr); +#else +static inline ssize_t __init pcpu_lpage_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) +{ + return -EINVAL; +} + +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} +#endif + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index f3fe7bc7378..17db527ee2e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1190,6 +1190,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, return pcpu_unit_size; } +static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep) +{ + size_t size_sum; + + size_sum = PFN_ALIGN(static_size + reserved_size + + (*dyn_sizep >= 0 ? *dyn_sizep : 0)); + if (*dyn_sizep != 0) + *dyn_sizep = size_sum - static_size - reserved_size; + + return size_sum; +} + /* * Embedding first chunk setup helper. */ @@ -1241,10 +1254,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, unsigned int cpu; /* determine parameters and allocate */ - pcpue_size = PFN_ALIGN(static_size + reserved_size + - (dyn_size >= 0 ? 
dyn_size : 0)); - if (dyn_size != 0) - dyn_size = pcpue_size - static_size - reserved_size; + pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); chunk_size = pcpue_unit_size * num_possible_cpus(); @@ -1390,6 +1400,197 @@ out_free_ar: return ret; } +/* + * Large page remapping first chunk setup helper + */ +#ifdef CONFIG_NEED_MULTIPLE_NODES +struct pcpul_ent { + unsigned int cpu; + void *ptr; +}; + +static size_t pcpul_size; +static size_t pcpul_unit_size; +static struct pcpul_ent *pcpul_map; +static struct vm_struct pcpul_vm; + +static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpul_size) + return NULL; + + return virt_to_page(pcpul_map[cpu].ptr + off); +} + +/** + * pcpu_lpage_first_chunk - remap the first percpu chunk using large page + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @lpage_size: the size of a large page + * @alloc_fn: function to allocate percpu lpage, always called with lpage_size + * @free_fn: function to free percpu memory, @size <= lpage_size + * @map_fn: function to map percpu lpage, always called with lpage_size + * + * This allocator uses large page as unit. A large page is allocated + * for each cpu and each is remapped into vmalloc area using large + * page mapping. As large page can be quite large, only part of it is + * used for the first chunk. Unused part is returned to the bootmem + * allocator. + * + * So, the large pages are mapped twice - once to the physical mapping + * and to the vmalloc area for the first percpu chunk. The double + * mapping does add one more large TLB entry pressure but still is + * much better than only using 4k mappings while still being NUMA + * friendly. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) +{ + size_t size_sum; + size_t map_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + /* + * Currently supports only single page. Supporting multiple + * pages won't be too difficult if it ever becomes necessary. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + + pcpul_unit_size = lpage_size; + pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + if (pcpul_size > pcpul_unit_size) { + pr_warning("PERCPU: static data is larger than large page, " + "can't use large page\n"); + return -EINVAL; + } + + /* allocate pointer array and alloc large pages */ + map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + pcpul_map = alloc_bootmem(map_size); + + for_each_possible_cpu(cpu) { + void *ptr; + + ptr = alloc_fn(cpu, lpage_size); + if (!ptr) { + pr_warning("PERCPU: failed to allocate large page " + "for cpu%u\n", cpu); + goto enomem; + } + + /* + * Only use pcpul_size bytes and give back the rest. + * + * Ingo: The lpage_size up-rounding bootmem is needed + * to make sure the partial lpage is still fully RAM - + * it's not well-specified to have a incompatible area + * (unmapped RAM, device memory, etc.) in that hole. 
+ */ + free_fn(ptr + pcpul_size, lpage_size - pcpul_size); + + pcpul_map[cpu].cpu = cpu; + pcpul_map[cpu].ptr = ptr; + + memcpy(ptr, __per_cpu_load, static_size); + } + + /* allocate address and map */ + pcpul_vm.flags = VM_ALLOC; + pcpul_vm.size = num_possible_cpus() * pcpul_unit_size; + vm_area_register_early(&pcpul_vm, pcpul_unit_size); + + for_each_possible_cpu(cpu) + map_fn(pcpul_map[cpu].ptr, pcpul_unit_size, + pcpul_vm.addr + cpu * pcpul_unit_size); + + /* we're ready, commit */ + pr_info("PERCPU: Remapped at %p with large pages, static data " + "%zu bytes\n", pcpul_vm.addr, static_size); + + ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, + reserved_size, dyn_size, pcpul_unit_size, + pcpul_vm.addr, NULL); + + /* sort pcpul_map array for pcpu_lpage_remapped() */ + for (i = 0; i < num_possible_cpus() - 1; i++) + for (j = i + 1; j < num_possible_cpus(); j++) + if (pcpul_map[i].ptr > pcpul_map[j].ptr) { + struct pcpul_ent tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + return ret; + +enomem: + for_each_possible_cpu(cpu) + if (pcpul_map[cpu].ptr) + free_fn(pcpul_map[cpu].ptr, pcpul_size); + free_bootmem(__pa(pcpul_map), map_size); + return -ENOMEM; +} + +/** + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area + * @kaddr: the kernel address in question + * + * Determine whether @kaddr falls in the pcpul recycled area. This is + * used by pageattr to detect VM aliases and break up the pcpu large + * page mapping such that the same physical page is not mapped under + * different attributes. + * + * The recycled area is always at the tail of a partially used large + * page. + * + * RETURNS: + * Address of corresponding remapped pcpu address if match is found; + * otherwise, NULL. + */ +void *pcpu_lpage_remapped(void *kaddr) +{ + unsigned long unit_mask = pcpul_unit_size - 1; + void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask); + unsigned long offset = (unsigned long)kaddr & unit_mask; + int left = 0, right = num_possible_cpus() - 1; + int pos; + + /* pcpul in use at all? */ + if (!pcpul_map) + return NULL; + + /* okay, perform binary search */ + while (left <= right) { + pos = (left + right) / 2; + + if (pcpul_map[pos].ptr < lpage_addr) + left = pos + 1; + else if (pcpul_map[pos].ptr > lpage_addr) + right = pos - 1; + else { + /* it shouldn't be in the area for the first chunk */ + WARN_ON(offset < pcpul_size); + + return pcpul_vm.addr + + pcpul_map[pos].cpu * pcpul_unit_size + offset; + } + } + + return NULL; +} +#endif + /* * Generic percpu area setup. * -- cgit v1.2.3 From a530b7958612bafe2027e21359083dba84f0b3b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:11:00 +0900 Subject: percpu: teach large page allocator about NUMA The large page first chunk allocator is primarily used for NUMA machines; however, its NUMA handling is extremely simplistic. Regardless of their proximity, each cpu is put into a separate large page just to return most of the allocated space back, wasting a large amount of vmalloc space and increasing cache footprint. This patch teaches NUMA details to the large page allocator. Given processor proximity information, pcpu_lpage_build_unit_map() will find a fitting cpu -> unit mapping in which cpus within LOCAL_DISTANCE share the same large page and not too much virtual address space is wasted. This greatly reduces the unit and thus chunk size and wastes much less address space for the first chunk.
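The proximity information comes from a small arch callback. A minimal sketch (hypothetical name; the x86 callback added in the diff below is essentially this, built on early_cpu_to_node()):

	static int my_cpu_distance(unsigned int from, unsigned int to)
	{
		/* cpus on the same node are close enough to share a large page */
		return early_cpu_to_node(from) == early_cpu_to_node(to)
			? LOCAL_DISTANCE : REMOTE_DISTANCE;
	}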
For example, on 4/4 NUMA machine, the original code occupied 16MB of virtual space for the first chunk while the new code only uses 4MB - one 2MB page for each node. [ Impact: much better space efficiency on NUMA machines ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Jan Beulich Cc: Andi Kleen Cc: David Miller --- arch/x86/kernel/setup_percpu.c | 72 +++++++-- include/linux/percpu.h | 24 ++- mm/percpu.c | 358 ++++++++++++++++++++++++++++++++--------- 3 files changed, 359 insertions(+), 95 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4f2e0ac9130..7501bb14bd5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -149,36 +149,73 @@ static void __init pcpul_map(void *ptr, size_t size, void *addr) set_pmd(pmd, pmd_v); } +static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) +{ + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; +} + static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; + size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; + size_t unit_map_size, unit_size; + int *unit_map; + int nr_units; + ssize_t ret; + + /* on non-NUMA, embedding is better */ + if (!chosen && !pcpu_need_numa()) + return -EINVAL; + + /* need PSE */ + if (!cpu_has_pse) { + pr_warning("PERCPU: lpage allocator requires PSE\n"); + return -EINVAL; + } + /* allocate and build unit_map */ + unit_map_size = num_possible_cpus() * sizeof(int); + unit_map = alloc_bootmem_nopanic(unit_map_size); + if (!unit_map) { + pr_warning("PERCPU: failed to allocate unit_map\n"); + return -ENOMEM; + } + + ret = pcpu_lpage_build_unit_map(static_size, + PERCPU_FIRST_CHUNK_RESERVE, + &dyn_size, &unit_size, PMD_SIZE, + unit_map, pcpu_lpage_cpu_distance); + if (ret < 0) { + pr_warning("PERCPU: failed to build unit_map\n"); + goto out_free; + } + nr_units = ret; + + /* do the parameters look okay? 
*/ if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = num_possible_cpus() * PMD_SIZE; - - /* on non-NUMA, embedding is better */ - if (!pcpu_need_numa()) - return -EINVAL; + size_t tot_size = nr_units * unit_size; /* don't consume more than 20% of vmalloc area */ if (tot_size > vm_size / 5) { pr_info("PERCPU: too large chunk size %zuMB for " "large page remap\n", tot_size >> 20); - return -EINVAL; + ret = -EINVAL; + goto out_free; } } - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, - PMD_SIZE, - pcpu_fc_alloc, pcpu_fc_free, pcpul_map); + ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, unit_size, PMD_SIZE, + unit_map, nr_units, + pcpu_fc_alloc, pcpu_fc_free, pcpul_map); +out_free: + if (ret < 0) + free_bootmem(__pa(unit_map), unit_map_size); + return ret; } #else static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) @@ -299,7 +336,8 @@ void __init setup_per_cpu_areas(void) /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; + per_cpu_offset(cpu) = + delta + pcpu_unit_map[cpu] * pcpu_unit_size; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 1e0e8878dc2..8ce91af4aa1 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -62,6 +62,7 @@ extern const int *pcpu_unit_map; typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); +typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk( @@ -80,18 +81,37 @@ extern ssize_t __init pcpu_4k_first_chunk( pcpu_fc_populate_pte_fn_t populate_pte_fn); #ifdef CONFIG_NEED_MULTIPLE_NODES +extern int __init pcpu_lpage_build_unit_map( + size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn); + extern ssize_t __init pcpu_lpage_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); extern void *pcpu_lpage_remapped(void *kaddr); #else +static inline int pcpu_lpage_build_unit_map( + size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + return -EINVAL; +} + static inline ssize_t __init pcpu_lpage_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) diff --git a/mm/percpu.c b/mm/percpu.c index 2196fae24f0..b3d0bcff8c7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -59,6 +59,7 @@ #include #include #include +#include 
#include #include #include @@ -1594,75 +1595,259 @@ out_free_ar: * Large page remapping first chunk setup helper */ #ifdef CONFIG_NEED_MULTIPLE_NODES + +/** + * pcpu_lpage_build_unit_map - build unit_map for large page remapping + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_sizep: in/out parameter for dynamic size, -1 for auto + * @unit_sizep: out parameter for unit size + * @lpage_size: the size of a large page + * @unit_map: unit_map to be filled + * @cpu_distance_fn: callback to determine distance between cpus + * + * This function builds the cpu -> unit map and determines other parameters + * considering needed percpu size, large page size and distances + * between CPUs in NUMA. + * + * CPUs which are of LOCAL_DISTANCE both ways are grouped together and + * may share units in the same large page. The returned configuration + * is guaranteed to have CPUs on different nodes on different large + * pages and >=75% usage of allocated virtual address space. + * + * RETURNS: + * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and + * returns the number of units to be allocated. -errno on failure. + */ +int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + int group_cnt_max = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs; + unsigned int cpu, tcpu; + int group, unit; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is a multiple of lpage_size and is the smallest + * which can accommodate 4k aligned segments that are equal to + * or larger than min_unit_size. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, lpage_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + group_cnt_max = max(group_cnt_max, group_cnt[group]); + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group_cnt[group]; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 25%. The + * greater-than comparison ensures upa==1 always + * passes the following check.
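+ * (With u units in use, one per possible cpu, the wastage ratio wasted / (u + wasted) exceeds 25% exactly when wasted > u / 3, hence the num_possible_cpus() / 3 test below.)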
+ */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + *unit_sizep = alloc_size / best_upa; + + /* assign units to cpus accordingly */ + unit = 0; + for (group = 0; group_cnt[group]; group++) { + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + unit_map[cpu] = unit++; + unit = roundup(unit, best_upa); + } + + return unit; /* unit contains aligned number of units */ +} + struct pcpul_ent { - unsigned int cpu; void *ptr; + void *map_addr; }; static size_t pcpul_size; -static size_t pcpul_unit_size; +static size_t pcpul_lpage_size; +static int pcpul_nr_lpages; static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; + +static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, + unsigned int *cpup) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + if (unit_map[cpu] == unit) { + if (cpup) + *cpup = cpu; + return true; + } + + return false; +} + +static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, + size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units) +{ + int width = 1, v = nr_units; + char empty_str[] = "--------"; + int upl, lpl; /* units per lpage, lpage per line */ + unsigned int cpu; + int lpage, unit; + + while (v /= 10) + width++; + empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; + + upl = max_t(int, lpage_size / unit_size, 1); + lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); + + printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, + static_size, reserved_size, dyn_size, unit_size, lpage_size); + + for (lpage = 0, unit = 0; unit < nr_units; unit++) { + if (!(unit % upl)) { + if (!(lpage++ % lpl)) { + printk("\n"); + printk("%spcpu-lpage: ", lvl); + } else + printk("| "); + } + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + printk("%0*d ", width, cpu); + else + printk("%s ", empty_str); + } + printk("\n"); +} /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @dyn_size: free size for dynamic allocation in bytes + * @unit_size: unit size in bytes * @lpage_size: the size of a large page + * @unit_map: cpu -> unit mapping + * @nr_units: the number of units * @alloc_fn: function to allocate percpu lpage, always called with lpage_size * @free_fn: function to free percpu memory, @size <= lpage_size * @map_fn: function to map percpu lpage, always called with lpage_size * - * This allocator uses large page as unit. A large page is allocated - * for each cpu and each is remapped into vmalloc area using large - * page mapping. As large page can be quite large, only part of it is - * used for the first chunk. Unused part is returned to the bootmem - * allocator. - * - * So, the large pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more large TLB entry pressure but still is - * much better than only using 4k mappings while still being NUMA - * friendly. + * This allocator uses large page to build and map the first chunk. + * Unlike other helpers, the caller should always specify @dyn_size + * and @unit_size. 
These parameters along with @unit_map and + * @nr_units can be determined using pcpu_lpage_build_unit_map(). + * This two stage initialization is to allow arch code to evaluate the + * parameters before committing to it. + * + * Large pages are allocated as directed by @unit_map and other + * parameters and mapped to vmalloc space. Unused holes are returned + * to the page allocator. Note that these holes end up being actively + * mapped twice - once to the physical mapping and to the vmalloc area + * for the first percpu chunk. Depending on architecture, this might + * cause problem when changing page attributes of the returned area. + * These double mapped areas can be detected using + * pcpu_lpage_remapped(). * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { - size_t size_sum; + static struct vm_struct vm; + size_t chunk_size = unit_size * nr_units; size_t map_size; unsigned int cpu; - int i, j; ssize_t ret; + int i, j, unit; - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. - */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size, + unit_size, lpage_size, unit_map, nr_units); - pcpul_unit_size = lpage_size; - pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - if (pcpul_size > pcpul_unit_size) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } + BUG_ON(chunk_size % lpage_size); + + pcpul_size = static_size + reserved_size + dyn_size; + pcpul_lpage_size = lpage_size; + pcpul_nr_lpages = chunk_size / lpage_size; /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]); pcpul_map = alloc_bootmem(map_size); - for_each_possible_cpu(cpu) { + /* allocate all pages */ + for (i = 0; i < pcpul_nr_lpages; i++) { + size_t offset = i * lpage_size; + int first_unit = offset / unit_size; + int last_unit = (offset + lpage_size - 1) / unit_size; void *ptr; + /* find out which cpu is mapped to this unit */ + for (unit = first_unit; unit <= last_unit; unit++) + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + goto found; + continue; + found: ptr = alloc_fn(cpu, lpage_size); if (!ptr) { pr_warning("PERCPU: failed to allocate large page " @@ -1670,53 +1855,79 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, goto enomem; } - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The lpage_size up-rounding bootmem is needed - * to make sure the partial lpage is still fully RAM - - * it's not well-specified to have a incompatible area - * (unmapped RAM, device memory, etc.) in that hole. 
- */ - free_fn(ptr + pcpul_size, lpage_size - pcpul_size); - - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = ptr; + pcpul_map[i].ptr = ptr; + } - memcpy(ptr, __per_cpu_load, static_size); + /* return unused holes */ + for (unit = 0; unit < nr_units; unit++) { + size_t start = unit * unit_size; + size_t end = start + unit_size; + size_t off, next; + + /* don't free used part of occupied unit */ + if (pcpul_unit_to_cpu(unit, unit_map, NULL)) + start += pcpul_size; + + /* unit can span more than one page, punch the holes */ + for (off = start; off < end; off = next) { + void *ptr = pcpul_map[off / lpage_size].ptr; + next = min(roundup(off + 1, lpage_size), end); + if (ptr) + free_fn(ptr + off % lpage_size, next - off); + } } - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = num_possible_cpus() * pcpul_unit_size; - vm_area_register_early(&pcpul_vm, pcpul_unit_size); + /* allocate address, map and copy */ + vm.flags = VM_ALLOC; + vm.size = chunk_size; + vm_area_register_early(&vm, unit_size); + + for (i = 0; i < pcpul_nr_lpages; i++) { + if (!pcpul_map[i].ptr) + continue; + pcpul_map[i].map_addr = vm.addr + i * lpage_size; + map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr); + } for_each_possible_cpu(cpu) - map_fn(pcpul_map[cpu].ptr, pcpul_unit_size, - pcpul_vm.addr + cpu * pcpul_unit_size); + memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load, + static_size); /* we're ready, commit */ pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); + "%zu bytes\n", vm.addr, static_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - pcpul_unit_size, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < num_possible_cpus() - 1; i++) - for (j = i + 1; j < num_possible_cpus(); j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } + unit_size, vm.addr, unit_map); + + /* + * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped + * lpages are pushed to the end and trimmed. + */ + for (i = 0; i < pcpul_nr_lpages - 1; i++) + for (j = i + 1; j < pcpul_nr_lpages; j++) { + struct pcpul_ent tmp; + + if (!pcpul_map[j].ptr) + continue; + if (pcpul_map[i].ptr && + pcpul_map[i].ptr < pcpul_map[j].ptr) + continue; + + tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) + pcpul_nr_lpages--; return ret; enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_fn(pcpul_map[cpu].ptr, pcpul_size); + for (i = 0; i < pcpul_nr_lpages; i++) + if (pcpul_map[i].ptr) + free_fn(pcpul_map[i].ptr, lpage_size); free_bootmem(__pa(pcpul_map), map_size); return -ENOMEM; } @@ -1739,10 +1950,10 @@ enomem: */ void *pcpu_lpage_remapped(void *kaddr) { - unsigned long unit_mask = pcpul_unit_size - 1; - void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask); - unsigned long offset = (unsigned long)kaddr & unit_mask; - int left = 0, right = num_possible_cpus() - 1; + unsigned long lpage_mask = pcpul_lpage_size - 1; + void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask); + unsigned long offset = (unsigned long)kaddr & lpage_mask; + int left = 0, right = pcpul_nr_lpages - 1; int pos; /* pcpul in use at all? 
*/ @@ -1757,13 +1968,8 @@ void *pcpu_lpage_remapped(void *kaddr) left = pos + 1; else if (pcpul_map[pos].ptr > lpage_addr) right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * pcpul_unit_size + offset; - } + else + return pcpul_map[pos].map_addr + offset; } return NULL; -- cgit v1.2.3 From 023bf6f1b8bf58dc4da7f0dc1cf4787b0d5297c1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jul 2009 11:27:40 +0900 Subject: linker script: unify usage of discard definition Discarded sections in different archs share some commonality but have considerable differences. This led to the linker script for each arch implementing its own /DISCARD/ definition, which makes maintenance tedious and adding new entries error-prone. This patch makes all linker scripts move discard definitions to the end of the linker script and use the common DISCARDS macro. As ld uses the first matching section definition, archs can include default discarded sections by including them earlier in the linker script. ia64 is notable because it first throws away some ia64-specific subsections and then includes the rest of the sections into the final image, so those sections must be discarded before the inclusion. defconfig compile tested for x86, x86-64, powerpc, powerpc64, ia64, alpha, sparc, sparc64 and s390. Michal Simek tested microblaze. Signed-off-by: Tejun Heo Acked-by: Paul Mundt Acked-by: Mike Frysinger Tested-by: Michal Simek Cc: linux-arch@vger.kernel.org Cc: Michal Simek Cc: microblaze-uclinux@itee.uq.edu.au Cc: Sam Ravnborg Cc: Tony Luck --- arch/alpha/kernel/vmlinux.lds.S | 10 ++-------- arch/avr32/kernel/vmlinux.lds.S | 10 +++------- arch/blackfin/kernel/vmlinux.lds.S | 6 +----- arch/cris/kernel/vmlinux.lds.S | 10 ++-------- arch/frv/kernel/vmlinux.lds.S | 2 +- arch/h8300/kernel/vmlinux.lds.S | 6 ++---- arch/ia64/kernel/vmlinux.lds.S | 17 ++++++++--------- arch/m32r/kernel/vmlinux.lds.S | 11 +++-------- arch/m68k/kernel/vmlinux-std.lds | 11 +++-------- arch/m68k/kernel/vmlinux-sun3.lds | 10 ++-------- arch/m68knommu/kernel/vmlinux.lds.S | 8 +------- arch/microblaze/kernel/vmlinux.lds.S | 2 +- arch/mips/kernel/vmlinux.lds.S | 22 ++++++++++------------ arch/mn10300/kernel/vmlinux.lds.S | 9 +++------ arch/parisc/kernel/vmlinux.lds.S | 9 ++++----- arch/powerpc/kernel/vmlinux.lds.S | 10 +++------- arch/s390/kernel/vmlinux.lds.S | 10 +++------- arch/sh/kernel/vmlinux.lds.S | 11 ++++------- arch/sparc/kernel/vmlinux.lds.S | 9 ++------- arch/um/include/asm/common.lds.S | 5 ----- arch/um/kernel/dyn.lds.S | 2 +- arch/um/kernel/uml.lds.S | 2 +- arch/x86/kernel/vmlinux.lds.S | 11 ++++------- arch/xtensa/kernel/vmlinux.lds.S | 14 ++++---------- include/asm-generic/vmlinux.lds.h | 18 ++++++++++++------ 25 files changed, 80 insertions(+), 155 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 75fe1d6877e..6dc03c35caa 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -134,14 +134,6 @@ SECTIONS __bss_stop = .; _end = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .mdebug 0 : { *(.mdebug) } @@ -151,4 +143,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + + DISCARDS } diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S index b8324608ec0..c4b56654349 100644 --- a/arch/avr32/kernel/vmlinux.lds.S +++ b/arch/avr32/kernel/vmlinux.lds.S @@ 
-124,15 +124,11 @@ SECTIONS _end = .; } + DWARF_DEBUG + /* When something in the kernel is NOT compiled as a module, the module * cleanup code and data are put into these segments. Both can then be * thrown away, as cleanup code is never called unless it's a module. */ - /DISCARD/ : { - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - - DWARF_DEBUG + DISCARDS } diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index 6e8eabd8f0a..d7ffe299b97 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -277,9 +277,5 @@ SECTIONS DWARF_DEBUG - /DISCARD/ : - { - *(.exitcall.exit) - *(.discard) - } + DISCARDS } diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S index a3175ebb38c..6c81836b922 100644 --- a/arch/cris/kernel/vmlinux.lds.S +++ b/arch/cris/kernel/vmlinux.lds.S @@ -140,13 +140,7 @@ SECTIONS _end = .; __end = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - dram_end = dram_start + (CONFIG_ETRAX_DRAM_SIZE - __CONFIG_ETRAX_VMEM_SIZE)*1024*1024; + + DISCARDS } diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index 64b5a5e4d35..7dbf41f68b5 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -178,7 +178,7 @@ SECTIONS .comment 0 : { *(.comment) } - /DISCARD/ : { *(.discard) } + DISCARDS } __kernel_image_size_no_bss = __bss_start - __kernel_image_start; diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S index 03d6c0df33d..662b02ecb86 100644 --- a/arch/h8300/kernel/vmlinux.lds.S +++ b/arch/h8300/kernel/vmlinux.lds.S @@ -152,10 +152,6 @@ SECTIONS __end = . ; __ramstart = .; } - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } .romfs : { *(.romfs*) @@ -166,4 +162,6 @@ SECTIONS COMMAND_START = . - 0x200 ; __ramend = . ; } + + DISCARDS } diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 13d95897587..eb4214d1c5a 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -24,15 +24,14 @@ PHDRS { } SECTIONS { - /* Sections to be discarded */ + /* unwind exit sections must be discarded before the rest of the + sections get included. */ /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) *(.IA_64.unwind.exit.text) *(.IA_64.unwind_info.exit.text) - } + *(.comment) + *(.note) + } v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */ phys_start = _start - LOAD_OFFSET; @@ -317,7 +316,7 @@ SECTIONS .debug_funcnames 0 : { *(.debug_funcnames) } .debug_typenames 0 : { *(.debug_typenames) } .debug_varnames 0 : { *(.debug_varnames) } - /* These must appear regardless of . */ - /DISCARD/ : { *(.comment) } - /DISCARD/ : { *(.note) } + + /* Default discards */ + DISCARDS } diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index 480a49944cf..de5e21cca6a 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -120,14 +120,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - /* Stabs debugging sections. 
*/ .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } @@ -136,4 +128,7 @@ SECTIONS .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds index 905a797ada9..47eac19e8f6 100644 --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -82,14 +82,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - /* Stabs debugging sections. */ .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } @@ -98,4 +90,7 @@ SECTIONS .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds index 47d04be322a..03efaf04d7d 100644 --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -77,14 +77,6 @@ __init_begin = .; _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .crap : { /* Stabs debugging sections. */ *(.stab) @@ -97,4 +89,6 @@ __init_begin = .; *(.note) } + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S index 68111a61a77..2736a5e309c 100644 --- a/arch/m68knommu/kernel/vmlinux.lds.S +++ b/arch/m68knommu/kernel/vmlinux.lds.S @@ -184,13 +184,6 @@ SECTIONS { __init_end = .; } > INIT - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .bss : { . = ALIGN(4); _sbss = . ; @@ -201,5 +194,6 @@ SECTIONS { _end = . ; } > BSS + DISCARDS } diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S index 81bebdcb18f..ec5fa91a48d 100644 --- a/arch/microblaze/kernel/vmlinux.lds.S +++ b/arch/microblaze/kernel/vmlinux.lds.S @@ -163,5 +163,5 @@ SECTIONS { . = ALIGN(4096); _end = .; - /DISCARD/ : { *(.discard) } + DISCARDS } diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 45901609b74..1474c18fb77 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -176,18 +176,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - - /* ABI crap starts here */ - *(.MIPS.options) - *(.options) - *(.pdr) - *(.reginfo) - } - /* These mark the ABI of the kernel for debuggers. */ .mdebug.abi32 : { KEEP(*(.mdebug.abi32)) @@ -213,4 +201,14 @@ SECTIONS *(.gptab.bss) *(.gptab.sbss) } + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { + /* ABI crap starts here */ + *(.MIPS.options) + *(.options) + *(.pdr) + *(.reginfo) + } } diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S index 5609d4962a5..8fcd0f1e21d 100644 --- a/arch/mn10300/kernel/vmlinux.lds.S +++ b/arch/mn10300/kernel/vmlinux.lds.S @@ -115,13 +115,10 @@ SECTIONS . = ALIGN(PAGE_SIZE); pg0 = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_CALL - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index ccf58341845..aea1784edbd 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -237,10 +237,12 @@ SECTIONS /* freed after init ends here */ _end = . 
; + STABS_DEBUG + .note 0 : { *(.note) } + /* Sections to be discarded */ + DISCARDS /DISCARD/ : { - *(.exitcall.exit) - *(.discard) #ifdef CONFIG_64BIT /* temporary hack until binutils is fixed to not emit these * for static binaries @@ -253,7 +255,4 @@ SECTIONS *(.gnu.hash) #endif } - - STABS_DEBUG - .note 0 : { *(.note) } } diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 7fca9355fd3..244e3658983 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -37,13 +37,6 @@ jiffies = jiffies_64 + 4; #endif SECTIONS { - /* Sections to be discarded. */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - EXIT_DATA - } - . = KERNELBASE; /* @@ -299,4 +292,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); _end = . ; PROVIDE32 (end = .); + + /* Sections to be discarded. */ + DISCARDS } diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 98867dfea46..82415c75b99 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -157,14 +157,10 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - /* Debugging sections. */ STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 766976d27b2..0ce254bca92 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -163,17 +163,14 @@ SECTIONS _end = . ; } + STABS_DEBUG + DWARF_DEBUG + /* * When something in the kernel is NOT compiled as a module, the * module cleanup code and data are put into these segments. Both * can then be thrown away, as cleanup code is never called unless * it's a module. */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } - - STABS_DEBUG - DWARF_DEBUG + DISCARDS } diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index d63cf914667..866390feb68 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -171,13 +171,8 @@ SECTIONS } _end = . 
; - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + DISCARDS } diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S index cb0248616d4..37ecc5577a9 100644 --- a/arch/um/include/asm/common.lds.S +++ b/arch/um/include/asm/common.lds.S @@ -123,8 +123,3 @@ __initramfs_end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - } - diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index 2916d6eadff..715a188c047 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -157,5 +157,5 @@ SECTIONS DWARF_DEBUG - /DISCARD/ : { *(.discard) } + DISCARDS } diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 1f8a622cabe..2ebd39765db 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -101,5 +101,5 @@ SECTIONS DWARF_DEBUG - /DISCARD/ : { *(.discard) } + DISCARDS } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 367e8788204..b600c843710 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -387,15 +387,12 @@ SECTIONS _end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.eh_frame) } } diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index b1e24638acd..921b6ff3b64 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -280,16 +280,6 @@ SECTIONS *(.ResetVector.text) } - /* Sections to be discarded */ - /DISCARD/ : - { - *(.exit.literal) - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .xt.lit : { *(.xt.lit) } .xt.prop : { *(.xt.prop) } @@ -322,4 +312,8 @@ SECTIONS *(.xt.lit) *(.gnu.linkonce.p*) } + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.exit.literal) } } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c5c18ac878a..ab8ea9b7741 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -35,13 +35,10 @@ * __bss_stop = .; * _end = .; * - * /DISCARD/ : { - * EXIT_TEXT - * EXIT_DATA - * EXIT_CALL - * } * STABS_DEBUG * DWARF_DEBUG + + * DISCARDS // must be the last * } * * [__init_begin, __init_end] is the init section that may be freed after init @@ -629,11 +626,20 @@ #define INIT_RAM_FS #endif +/* + * Default discarded sections. + * + * Some archs want to discard exit text/data at runtime rather than + * link time due to cross-section references such as alt instructions, + * bug table, eh_frame, etc. DISCARDS must be the last of output + * section definitions so that such archs put those in earlier section + * definitions. + */ #define DISCARDS \ /DISCARD/ : { \ EXIT_TEXT \ EXIT_DATA \ - *(.exitcall.exit) \ + EXIT_CALL \ *(.discard) \ } -- cgit v1.2.3 From c31d96338a6041520ba5f1b6a4a5012ef00686b3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:37 +0200 Subject: x86: mce: Make CONFIG_X86_ANCIENT_MCE dependent on CONFIG_X86_MCE Add a missing dependency for ANCIENT_MCE. It didn't matter in practice because the ANCIENT code wasn't compiled without X86_MCE, but it's better to express that clearly in Kconfig. Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 356d2ec8e2f..5962b872a7a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -823,7 +823,7 @@ config X86_MCE_AMD config X86_ANCIENT_MCE def_bool n - depends on X86_32 + depends on X86_32 && X86_MCE prompt "Support for old Pentium 5 / WinChip machine checks" ---help--- Include support for machine check handling on old Pentium 5 or WinChip -- cgit v1.2.3 From bab9bc6583fe6c1660d6ed36dd14bbb4edfaf393 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:38 +0200 Subject: x86: mce: Update X86_MCE description in x86/Kconfig - Clarify that this config controls thermal throttling reporting too - Clarify the types of errors reported by machine checks - Drop references to ancient CPUs. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5962b872a7a..134a8c0d80d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -774,20 +774,12 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS increased on these systems. config X86_MCE - bool "Machine Check Exception" + bool "Machine Check / overheating reporting" ---help--- - Machine Check Exception support allows the processor to notify the - kernel if it detects a problem (e.g. overheating, component failure). + Machine Check support allows the processor to notify the + kernel if it detects a problem (e.g. overheating, data corruption). The action the kernel takes depends on the severity of the problem, - ranging from a warning message on the console, to halting the machine. - Your processor must be a Pentium or newer to support this - check the - flags in /proc/cpuinfo for mce. Note that some older Pentium systems - have a design flaw which leads to false MCE events - hence MCE is - disabled on all P5 processors, unless explicitly enabled with "mce" - as a boot argument. Similarly, if MCE is built in and creates a - problem on some new non-standard machine, you can boot with "nomce" - to disable it. MCE support simply ignores non-MCE processors like - the 386 and 486, so nearly everyone can say Y here. + ranging from warning messages to halting the machine. config X86_OLD_MCE depends on X86_32 && X86_MCE -- cgit v1.2.3 From 5bb38adcb54cf7192b154368ad62982caa11ca0b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:39 +0200 Subject: x86: mce: Remove old i386 machine check code As announced in feature-removal-schedule.txt, remove CONFIG_X86_OLD_MCE. This patch only removes code. The ancient machine check code for very old systems that are not supported by CONFIG_X86_NEW_MCE is still kept. Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- Documentation/feature-removal-schedule.txt | 10 -- arch/x86/Kconfig | 35 +------ arch/x86/include/asm/mce.h | 11 -- arch/x86/kernel/cpu/mcheck/Makefile | 2 - arch/x86/kernel/cpu/mcheck/k7.c | 116 -------------------- arch/x86/kernel/cpu/mcheck/mce.c | 47 --------- arch/x86/kernel/cpu/mcheck/non-fatal.c | 94 ----------------- arch/x86/kernel/cpu/mcheck/p4.c | 163 ----------------------------- arch/x86/kernel/cpu/mcheck/p6.c | 127 ---------------------- 9 files changed, 2 insertions(+), 603 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mcheck/k7.c delete mode 100644 arch/x86/kernel/cpu/mcheck/non-fatal.c delete mode 100644 arch/x86/kernel/cpu/mcheck/p4.c delete mode 100644 arch/x86/kernel/cpu/mcheck/p6.c (limited to 'arch/x86') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 7129846a278..edb2f0b0761 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -444,13 +444,3 @@ What: CONFIG_RFKILL_INPUT When: 2.6.33 Why: Should be implemented in userspace, policy daemon. Who: Johannes Berg - ----------------------------- - -What: CONFIG_X86_OLD_MCE -When: 2.6.32 -Why: Remove the old legacy 32bit machine check code. This has been - superseded by the newer machine check code from the 64bit port, - but the old version has been kept around for easier testing. Note this - doesn't impact the old P5 and WinChip machine check handlers. -Who: Andi Kleen diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 134a8c0d80d..d986769a7d9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -781,21 +781,10 @@ config X86_MCE The action the kernel takes depends on the severity of the problem, ranging from warning messages to halting the machine. -config X86_OLD_MCE - depends on X86_32 && X86_MCE - bool "Use legacy machine check code (will go away)" - default n - select X86_ANCIENT_MCE - ---help--- - Use the old i386 machine check code. This is merely intended for - testing in a transition period. Try this if you run into any machine - check related software problems, but report the problem to - linux-kernel. When in doubt say no. - config X86_NEW_MCE depends on X86_MCE bool - default y if (!X86_OLD_MCE && X86_32) || X86_64 + default y config X86_MCE_INTEL def_bool y @@ -835,29 +824,9 @@ config X86_MCE_INJECT If you don't know what a machine check is and you don't do kernel QA it is safe to say n. -config X86_MCE_NONFATAL - tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" - depends on X86_OLD_MCE - ---help--- - Enabling this feature starts a timer that triggers every 5 seconds which - will look at the machine check registers to see if anything happened. - Non-fatal problems automatically get corrected (but still logged). - Disable this if you don't want to see these messages. - Seeing the messages this option prints out may be indicative of dying - or out-of-spec (ie, overclocked) hardware. - This option only does something on certain CPUs. - (AMD Athlon/Duron and Intel Pentium 4) - -config X86_MCE_P4THERMAL - bool "check for P4 thermal throttling interrupt." - depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP) - ---help--- - Enabling this feature will cause a message to be printed when the P4 - enters thermal throttling. 
- config X86_THERMAL_VECTOR def_bool y - depends on X86_MCE_P4THERMAL || X86_MCE_INTEL + depends on X86_MCE_INTEL config VM86 bool "Enable VM86 support" if EMBEDDED diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b50b9e9042c..6b8a974e127 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -115,13 +115,6 @@ void mcheck_init(struct cpuinfo_x86 *c); static inline void mcheck_init(struct cpuinfo_x86 *c) {} #endif -#ifdef CONFIG_X86_OLD_MCE -extern int nr_mce_banks; -void amd_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -#endif - #ifdef CONFIG_X86_ANCIENT_MCE void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); @@ -208,11 +201,7 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); void intel_init_thermal(struct cpuinfo_x86 *c); -#ifdef CONFIG_X86_NEW_MCE void mce_log_therm_throt_event(__u64 status); -#else -static inline void mce_log_therm_throt_event(__u64 status) {} -#endif #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 188a1ca5ad2..022a036ce21 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,11 +1,9 @@ obj-y = mce.o obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o -obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o -obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c deleted file mode 100644 index b945d5dbc60..00000000000 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Athlon specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Machine Check Handler For AMD Athlon/Duron: */ -static void k7_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? 
*/ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 1; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - - /* Clear it: */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - - -/* AMD K7 machine check is Intel like: */ -void amd_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - machine_check_vector = k7_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Clear status for MC index 0 separately, we don't touch CTL, - * as some K7 Athlons cause spurious MCEs when its enabled: - */ - if (boot_cpu_data.x86 == 6) { - wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); - i = 1; - } else - i = 0; - - for (; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7da8fec9ca8..5ff6362ecb1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -58,8 +58,6 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = int mce_disabled __read_mostly; -#ifdef CONFIG_X86_NEW_MCE - #define MISC_MCELOG_MINOR 227 #define SPINUNIT 100 /* 100ns */ @@ -1993,51 +1991,6 @@ static __init int mce_init_device(void) device_initcall(mce_init_device); -#else /* CONFIG_X86_OLD_MCE: */ - -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ - -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) -{ - if (mce_disabled) - return; - - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; - - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; - - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); - break; - - default: - break; - } - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); -} - -static int __init mcheck_enable(char *str) -{ - mce_p5_enabled = 1; - return 1; -} -__setup("mce", mcheck_enable); - -#endif /* CONFIG_X86_OLD_MCE */ - /* * Old style boot options parsing. Only for compatibility. 
*/ diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c deleted file mode 100644 index f5f2d6f71fb..00000000000 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Non Fatal Machine Check Exception Reporting - * - * (C) Copyright 2002 Dave Jones. - * - * This file contains routines to check for non-fatal MCEs every 15s - * - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -static int firstbank; - -#define MCE_RATE (15*HZ) /* timer rate is 15s */ - -static void mce_checkregs(void *info) -{ - u32 low, high; - int i; - - for (i = firstbank; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - - if (!(high & (1<<31))) - continue; - - printk(KERN_INFO "MCE: The hardware reports a non fatal, " - "correctable incident occurred on CPU %d.\n", - smp_processor_id()); - - printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - - /* - * Scrub the error so we don't pick it up in MCE_RATE - * seconds time: - */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } -} - -static void mce_work_fn(struct work_struct *work); -static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); - -static void mce_work_fn(struct work_struct *work) -{ - on_each_cpu(mce_checkregs, NULL, 1); - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); -} - -static int __init init_nonfatal_mce_checker(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return -ENODEV; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return -ENODEV; - - /* Some Athlons misbehave when we frob bank 0 */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - firstbank = 1; - else - firstbank = 0; - - /* - * Check for non-fatal errors every MCE_RATE s - */ - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); - printk(KERN_INFO "Machine check exception polling timer started.\n"); - - return 0; -} -module_init(init_nonfatal_mce_checker); - -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c deleted file mode 100644 index 4482aea9aa2..00000000000 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * P4 specific Machine Check Exception Reporting - */ -#include -#include -#include -#include - -#include -#include -#include - -/* as supported by the P4/Xeon family */ -struct intel_mce_extended_msrs { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - u32 edi; - u32 ebp; - u32 esp; - u32 eflags; - u32 eip; - /* u32 *reserved[]; */ -}; - -static int mce_num_extended_msrs; - -/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) -{ - u32 h; - - rdmsr(MSR_IA32_MCG_EAX, r->eax, h); - rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); - rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); - rdmsr(MSR_IA32_MCG_EDX, r->edx, h); - rdmsr(MSR_IA32_MCG_ESI, r->esi, h); - rdmsr(MSR_IA32_MCG_EDI, r->edi, h); - rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); - rdmsr(MSR_IA32_MCG_ESP, r->esp, h); - rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); - rdmsr(MSR_IA32_MCG_EIP, r->eip, h); -} - -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if 
(mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - if (mce_num_extended_msrs > 0) { - struct intel_mce_extended_msrs dbg; - - intel_get_extended_msrs(&dbg); - - printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" - "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" - "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags, - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx, - dbg.esi, dbg.edi, dbg.ebp, dbg.esp); - } - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = addr[0] = '\0'; - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. - */ - for (i = 0; i < nr_mce_banks; i++) { - u32 msr; - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high&(1<<31)) { - /* Clear it */ - wrmsr(msr, 0UL, 0UL); - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -void intel_p4_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - machine_check_vector = intel_machine_check; - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - for (i = 0; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); - - /* Check for P4/Xeon extended MCE MSRs */ - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<9)) {/* MCG_EXT_P */ - mce_num_extended_msrs = (l >> 16) & 0xff; - printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" - " available\n", - smp_processor_id(), mce_num_extended_msrs); - -#ifdef CONFIG_X86_MCE_P4THERMAL - /* Check for P4/Xeon Thermal monitor */ - intel_init_thermal(c); -#endif - } -} diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c deleted file mode 100644 index 01e4f817818..00000000000 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * P6 specific Machine Check Exception Reporting - * (C) Copyright 2002 Alan Cox - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Machine Check Handler For PII/PIII */ -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? 
*/ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error: - */ - for (i = 0; i < nr_mce_banks; i++) { - unsigned int msr; - - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high & (1<<31)) { - /* Clear it: */ - wrmsr(msr, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -/* Set up machine check reporting for processors with Intel style MCE: */ -void intel_p6_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return; - - /* Ok machine check is available */ - machine_check_vector = intel_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Following the example in IA-32 SDM Vol 3: - * - MC0_CTL should not be written - * - Status registers on all banks should be cleared on reset - */ - for (i = 1; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - - for (i = 0; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} -- cgit v1.2.3 From c1ebf835617035b1f08f734247dcb981e17aac6b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:41 +0200 Subject: x86: mce: Rename CONFIG_X86_NEW_MCE to CONFIG_X86_MCE Drop the CONFIG_X86_NEW_MCE symbol and change all references to it to check for CONFIG_X86_MCE directly. No code changes Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 11 +++-------- arch/x86/include/asm/entry_arch.h | 2 +- arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/cpu/mcheck/Makefile | 3 +-- arch/x86/kernel/irq.c | 4 ++-- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/signal.c | 2 +- 7 files changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d986769a7d9..06880ca677f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -781,15 +781,10 @@ config X86_MCE The action the kernel takes depends on the severity of the problem, ranging from warning messages to halting the machine. -config X86_NEW_MCE - depends on X86_MCE - bool - default y - config X86_MCE_INTEL def_bool y prompt "Intel MCE features" - depends on X86_NEW_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC ---help--- Additional support for intel specific MCE features such as the thermal monitor. @@ -797,7 +792,7 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_NEW_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. @@ -817,7 +812,7 @@ config X86_MCE_THRESHOLD default y config X86_MCE_INJECT - depends on X86_NEW_MCE + depends on X86_MCE tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index ff8cbfa0785..5e3f2044f0d 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -61,7 +61,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b3025b43b63..f4227289caf 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) static inline int mce_in_progress(void) { -#if defined(CONFIG_X86_NEW_MCE) +#if defined(CONFIG_X86_MCE) return atomic_read(&mce_entry) > 0; #endif return 0; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 022a036ce21..4ac6d48fe11 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,6 +1,5 @@ -obj-y = mce.o +obj-y = mce.o mce-severity.o -obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b0cdde6932f..74656d1d4e3 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -104,7 +104,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -200,7 +200,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_threshold_count; # endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); sum += per_cpu(mce_poll_count, cpu); #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 696f0e475c2..8a194ad357e 
100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -190,7 +190,7 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif -#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC) alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4c578751e94..cc26ad4c307 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -856,7 +856,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_process(); -- cgit v1.2.3 From 9eda8cb3ac235217e4ffa01cb9cedee1c1550599 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:42 +0200 Subject: x86: mce: Move code in mce.c Now that the X86_OLD_MCE ifdefs are gone, move some code that used to be outside the big ifdef to a more natural place near its user. No code change. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5ff6362ecb1..e16271f01ac 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -45,17 +45,6 @@ #include "mce-internal.h" -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - int mce_disabled __read_mostly; #define MISC_MCELOG_MINOR 227 @@ -1322,6 +1311,17 @@ static void mce_init_timer(void) add_timer(t); } +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off: -- cgit v1.2.3 From cebe182033f156b430952370fb0f9dbe6e89b081 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:43 +0200 Subject: x86: mce: Move per-bank data into a single data structure This addresses one of the leftover review comments. Move the per-bank data into a single structure. This avoids several separate variables and also separate allocation of sysfs objects. I didn't move the CMCI ownership information so far because that would have needed some non-trivial changes in the algorithms. Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-internal.h | 14 ++++ arch/x86/kernel/cpu/mcheck/mce.c | 109 +++++++++++++++--------------- 2 files changed, 67 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 54dcb8ff12e..6bd51e7ba87 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,3 +1,4 @@ +#include #include enum severity_level { @@ -10,6 +11,19 @@ enum severity_level { MCE_PANIC_SEVERITY, }; +#define ATTR_LEN 16 + +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank { + u64 ctl; /* subevents to enable */ + unsigned char init; /* initialise bank? */ + struct sysdev_attribute attr; /* sysdev attribute */ + char attrname[ATTR_LEN]; /* attribute name */ +}; + int mce_severity(struct mce *a, int tolerant, char **msg); extern int mce_ser; + +extern struct mce_bank *mce_banks; + diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e16271f01ac..a04806e01a8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -64,7 +64,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); */ static int tolerant __read_mostly = 1; static int banks __read_mostly; -static u64 *bank __read_mostly; static int rip_msr __read_mostly; static int mce_bootlog __read_mostly = -1; static int monarch_timeout __read_mostly = -1; @@ -74,13 +73,13 @@ int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; +struct mce_bank *mce_banks __read_mostly; + /* User mode helper program triggered by machine check event */ static unsigned long mce_need_notify; static char mce_helper[128]; static char *mce_helper_argv[2] = { mce_helper, NULL }; -static unsigned long dont_init_banks; - static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; @@ -91,11 +90,6 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; -static inline int skip_bank_init(int i) -{ - return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); -} - static DEFINE_PER_CPU(struct work_struct, mce_work); /* Do initial initialization of a struct mce */ @@ -482,7 +476,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); for (i = 0; i < banks; i++) { - if (!bank[i] || !test_bit(i, *b)) + if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; m.misc = 0; @@ -903,7 +897,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); - if (!bank[i]) + if (!mce_banks[i].ctl) continue; m.misc = 0; @@ -1146,6 +1140,21 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); +static int mce_banks_init(void) +{ + int i; + + mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); + if (!mce_banks) + return -ENOMEM; + for (i = 0; i < banks; i++) { + struct mce_bank *b = &mce_banks[i]; + b->ctl = -1ULL; + b->init = 1; + } + return 0; +} + /* * Initialize Machine Checks for a CPU. 
*/ @@ -1169,11 +1178,10 @@ static int mce_cap_init(void) /* Don't support asymmetric configurations today */ WARN_ON(banks != 0 && b != banks); banks = b; - if (!bank) { - bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); - if (!bank) - return -ENOMEM; - memset(bank, 0xff, banks * sizeof(u64)); + if (!mce_banks) { + int err = mce_banks_init(); + if (err) + return err; } /* Use accurate RIP reporting if available. */ @@ -1205,9 +1213,10 @@ static void mce_init(void) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (!b->init) continue; - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_CTL+4*i, b->ctl); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } @@ -1223,7 +1232,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * trips off incorrectly with the IOMMU & 3ware * & Cerberus: */ - clear_bit(10, (unsigned long *)&bank[4]); + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } if (c->x86 <= 17 && mce_bootlog < 0) { /* @@ -1237,7 +1246,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * by default. */ if (c->x86 == 6 && banks > 0) - bank[0] = 0; + mce_banks[0].ctl = 0; } if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1250,8 +1259,8 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A) - __set_bit(0, &dont_init_banks); + if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) + mce_banks[0].init = 0; /* * All newer Intel systems support MCE broadcasting. Enable @@ -1578,7 +1587,8 @@ static int mce_disable(void) int i; for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } return 0; @@ -1654,14 +1664,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static struct sysdev_attribute *bank_attrs; +static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +{ + return container_of(attr, struct mce_bank, attr); +} static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { - u64 b = bank[attr - bank_attrs]; - - return sprintf(buf, "%llx\n", b); + return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, @@ -1672,7 +1683,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - bank[attr - bank_attrs] = new; + attr_to_bank(attr)->ctl = new; mce_restart(); return size; @@ -1816,7 +1827,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) } for (j = 0; j < banks; j++) { err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &bank_attrs[j]); + &mce_banks[j].attr); if (err) goto error2; } @@ -1825,10 +1836,10 @@ static __cpuinit int mce_create_device(unsigned int cpu) return 0; error2: while (--j >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); @@ -1846,7 +1857,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), 
&bank_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); cpumask_clear_cpu(cpu, mce_dev_initialized); @@ -1863,7 +1874,8 @@ static void mce_disable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } } @@ -1879,8 +1891,9 @@ static void mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + struct mce_bank *b = &mce_banks[i]; + if (b->init) + wrmsrl(MSR_IA32_MC0_CTL + i*4, b->ctl); } } @@ -1928,35 +1941,21 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; -static __init int mce_init_banks(void) +static __init void mce_init_banks(void) { int i; - bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, - GFP_KERNEL); - if (!bank_attrs) - return -ENOMEM; - for (i = 0; i < banks; i++) { - struct sysdev_attribute *a = &bank_attrs[i]; + struct mce_bank *b = &mce_banks[i]; + struct sysdev_attribute *a = &b->attr; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); - if (!a->attr.name) - goto nomem; + a->attr.name = b->attrname; + snprintf(b->attrname, ATTR_LEN, "bank%d", i); a->attr.mode = 0644; a->show = show_bank; a->store = set_bank; } - return 0; - -nomem: - while (--i >= 0) - kfree(bank_attrs[i].attr.name); - kfree(bank_attrs); - bank_attrs = NULL; - - return -ENOMEM; } static __init int mce_init_device(void) @@ -1969,9 +1968,7 @@ static __init int mce_init_device(void) zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); - err = mce_init_banks(); - if (err) - return err; + mce_init_banks(); err = sysdev_class_register(&mce_sysclass); if (err) -- cgit v1.2.3 From a2d32bcbc008aa0f9c301a7c6f3494cb23e6af54 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:44 +0200 Subject: x86: mce: macros to compute bank MSRs Instead of open-coded calculations for bank MSRs, hide the indexing of the higher banks' MCE register MSRs in new macros. No semantic changes. Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/msr-index.h | 7 +++++++ arch/x86/kernel/cpu/mcheck/mce.c | 34 +++++++++++++++++----------------- arch/x86/kernel/cpu/mcheck/mce_intel.c | 10 +++++----- 3 files changed, 29 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1692fb5050e..3d1ce094586 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -81,8 +81,15 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) +#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) +#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) +#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) + /* These are consecutive and not in the normal 4er MCE bank block */ #define MSR_IA32_MC0_CTL2 0x00000280 +#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) + #define CMCI_EN (1ULL << 30) #define CMCI_THRESHOLD_MASK 0xffffULL diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a04806e01a8..07139a0578e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -267,11 +267,11 @@ static int msr_to_offset(u32 msr) unsigned bank = __get_cpu_var(injectm.bank); if (msr == rip_msr) return offsetof(struct mce, ip); - if (msr == MSR_IA32_MC0_STATUS + bank*4) + if (msr == MSR_IA32_MCx_STATUS(bank)) return offsetof(struct mce, status); - if (msr == MSR_IA32_MC0_ADDR + bank*4) + if (msr == MSR_IA32_MCx_ADDR(bank)) return offsetof(struct mce, addr); - if (msr == MSR_IA32_MC0_MISC + bank*4) + if (msr == MSR_IA32_MCx_MISC(bank)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); @@ -485,7 +485,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -500,9 +500,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -518,7 +518,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. 
*/ - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } /* @@ -539,7 +539,7 @@ static int mce_no_way_out(struct mce *m, char **msg) int i; for (i = 0; i < banks; i++) { - m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) return 1; } @@ -823,7 +823,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } @@ -904,7 +904,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) m.addr = 0; m.bank = i; - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -945,9 +945,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); /* * Action optional error. Queue address for later processing. @@ -1216,8 +1216,8 @@ static void mce_init(void) struct mce_bank *b = &mce_banks[i]; if (!b->init) continue; - wrmsrl(MSR_IA32_MC0_CTL+4*i, b->ctl); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); + wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } @@ -1589,7 +1589,7 @@ static int mce_disable(void) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } return 0; } @@ -1876,7 +1876,7 @@ static void mce_disable_cpu(void *h) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } } @@ -1893,7 +1893,7 @@ static void mce_reenable_cpu(void *h) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, b->ctl); + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index e1acec0f7a3..889f665fe93 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -90,7 +90,7 @@ static void cmci_discover(int banks, int boot) if (test_bit(i, owned)) continue; - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ if (val & CMCI_EN) { @@ -101,8 +101,8 @@ static void cmci_discover(int banks, int boot) } val |= CMCI_EN | CMCI_THRESHOLD; - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? 
-- the bank supports CMCI */ if (val & CMCI_EN) { @@ -152,9 +152,9 @@ void cmci_clear(void) if (!test_bit(i, __get_cpu_var(mce_banks_owned))) continue; /* Disable CMCI */ - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } spin_unlock_irqrestore(&cmci_discover_lock, flags); -- cgit v1.2.3 From 3ccdccfadbd2548abe38682b587f4ba27eac2fc9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:45 +0200 Subject: x86: mce: Lower maximum number of banks to architecture limit The Intel x86 architecture right now only supports 32 machine check banks; more would bump into other MSRs. So lower the max define to 32. This only affects a few bitmaps; most data structures are dynamically sized anyway. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6b8a974e127..ad753537291 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -130,10 +130,11 @@ void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); /* - * To support more than 128 would need to escape the predefined - * Linux defined extended banks first. + * Maximum banks number. + * This is the limit of the current register layout on + * Intel CPUs. */ -#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) +#define MAX_NR_BANKS 32 #ifdef CONFIG_X86_MCE_INTEL extern int mce_cmci_disabled; -- cgit v1.2.3 From 3162534069597e34dd0ac9eb711be8dc23835ae7 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:30:59 -0700 Subject: x86, intel_txt: Intel TXT boot support This patch adds kernel configuration and boot support for Intel Trusted Execution Technology (Intel TXT). Intel's technology for safer computing, Intel Trusted Execution Technology (Intel TXT), defines platform-level enhancements that provide the building blocks for creating trusted platforms. Intel TXT was formerly known by the code name LaGrande Technology (LT). Intel TXT in Brief: o Provides dynamic root of trust for measurement (DRTM) o Data protection in case of improper shutdown o Measurement and verification of launched environment Intel TXT is part of the vPro(TM) brand and is also available on some non-vPro systems. It is currently available on desktop systems based on the Q35, X38, Q45, and Q43 Express chipsets (e.g. Dell Optiplex 755, HP dc7800, etc.) and mobile systems based on the GM45, PM45, and GS45 Express chipsets. For more information, see http://www.intel.com/technology/security/. This site also has a link to the Intel TXT MLE Developers Manual, which has been updated for the newly released platforms. A much more complete description of how these patches support TXT, how to configure a system for it, etc. is in the Documentation/intel_txt.txt file in this patch. This patch provides the TXT support routines for complete functionality, documentation for TXT support and for the changes to the boot_params structure, and boot detection of a TXT launch. Attempts to shut down (reboot, Sx) the system will result in platform resets; subsequent patches will support these shutdown modes properly. 
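At runtime the whole feature hinges on a single global: the tboot shared-page pointer is non-NULL only when a measured launch was detected at boot. A minimal sketch of how callers are expected to use the helpers this patch adds (tboot_enabled(), tboot_shutdown() and the TB_SHUTDOWN_* codes are all defined in the diff below; the wrapper function itself is illustrative, not part of the patch):

    #include <asm/tboot.h>	/* tboot_enabled(), tboot_shutdown() */

    /* Hypothetical call site: only take the tboot path on a measured launch. */
    static void example_txt_poweroff(void)
    {
    	if (tboot_enabled())			/* true iff the shared page was found */
    		tboot_shutdown(TB_SHUTDOWN_S5);	/* hand S5 shutdown off to tboot */
    }

Because every helper compiles to an empty inline stub when CONFIG_INTEL_TXT is not set (see the #else branch of tboot.h below), such call sites need no #ifdef guards of their own.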
Documentation/intel_txt.txt | 210 +++++++++++++++++++++ Documentation/x86/zero-page.txt | 1 arch/x86/include/asm/bootparam.h | 3 arch/x86/include/asm/fixmap.h | 3 arch/x86/include/asm/tboot.h | 197 ++++++++++++++++++++ arch/x86/kernel/Makefile | 1 arch/x86/kernel/setup.c | 4 arch/x86/kernel/tboot.c | 379 +++++++++++++++++++++++++++++++++++++++ security/Kconfig | 30 +++ 9 files changed, 827 insertions(+), 1 deletion(-) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: Gang Wei Signed-off-by: H. Peter Anvin --- Documentation/intel_txt.txt | 210 ++++++++++++++++++++++ Documentation/x86/zero-page.txt | 1 + arch/x86/include/asm/bootparam.h | 3 +- arch/x86/include/asm/fixmap.h | 3 + arch/x86/include/asm/tboot.h | 197 ++++++++++++++++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/setup.c | 4 + arch/x86/kernel/tboot.c | 379 +++++++++++++++++++++++++++++++++++++++ security/Kconfig | 30 ++++ 9 files changed, 827 insertions(+), 1 deletion(-) create mode 100644 Documentation/intel_txt.txt create mode 100644 arch/x86/include/asm/tboot.h create mode 100644 arch/x86/kernel/tboot.c (limited to 'arch/x86') diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt new file mode 100644 index 00000000000..f40a1f03001 --- /dev/null +++ b/Documentation/intel_txt.txt @@ -0,0 +1,210 @@ +Intel(R) TXT Overview: +===================== + +Intel's technology for safer computing, Intel(R) Trusted Execution +Technology (Intel(R) TXT), defines platform-level enhancements that +provide the building blocks for creating trusted platforms. + +Intel TXT was formerly known by the code name LaGrande Technology (LT). + +Intel TXT in Brief: +o Provides dynamic root of trust for measurement (DRTM) +o Data protection in case of improper shutdown +o Measurement and verification of launched environment + +Intel TXT is part of the vPro(TM) brand and is also available on some +non-vPro systems. It is currently available on desktop systems +based on the Q35, X38, Q45, and Q43 Express chipsets (e.g. Dell +Optiplex 755, HP dc7800, etc.) and mobile systems based on the GM45, +PM45, and GS45 Express chipsets. + +For more information, see http://www.intel.com/technology/security/. +This site also has a link to the Intel TXT MLE Developers Manual, +which has been updated for the newly released platforms. + +Intel TXT has been presented at various events over the past few +years, some of which are: + LinuxTAG 2008: + http://www.linuxtag.org/2008/en/conf/events/vp-donnerstag/ + details.html?talkid=110 + TRUST2008: + http://www.trust2008.eu/downloads/Keynote-Speakers/ + 3_David-Grawrock_The-Front-Door-of-Trusted-Computing.pdf + IDF 2008, Shanghai: + http://inteldeveloperforum.com.edgesuite.net/shanghai_2008/ + aep/PROS003/index.html + IDFs 2006, 2007 (I'm not sure if/where they are online) + +Trusted Boot Project Overview: +============================= + +Trusted Boot (tboot) is an open source, pre-kernel/VMM module that +uses Intel TXT to perform a measured and verified launch of an OS +kernel/VMM. + +It is hosted on SourceForge at http://sourceforge.net/projects/tboot. +The mercurial source repo is available at http://www.bughost.org/ +repos.hg/tboot.hg. + +Tboot currently supports launching Xen (open source VMM/hypervisor +w/ TXT support since v3.2), and now Linux kernels. + + +Value Proposition for Linux or "Why should you care?" 
+===================================================== + +While there are many products and technologies that attempt to +measure or protect the integrity of a running kernel, they all +assume the kernel is "good" to begin with. The Integrity +Measurement Architecture (IMA) and Linux Integrity Module interface +are examples of such solutions. + +To get trust in the initial kernel without using Intel TXT, a +static root of trust must be used. This bases trust in BIOS +starting at system reset and requires measurement of all code +executed between system reset through the completion of the kernel +boot as well as data objects used by that code. In the case of a +Linux kernel, this means all of BIOS, any option ROMs, the +bootloader and the boot config. In practice, this is a lot of +code/data, much of which is subject to change from boot to boot +(e.g. changing NICs may change option ROMs). Without reference +hashes, these measurement changes are difficult to assess or +confirm as benign. This process also does not provide DMA +protection, memory configuration/alias checks and locks, crash +protection, or policy support. + +By using the hardware-based root of trust that Intel TXT provides, +many of these issues can be mitigated. Specifically: many +pre-launch components can be removed from the trust chain, DMA +protection is provided to all launched components, a large number +of platform configuration checks are performed and values locked, +protection is provided for any data in the event of an improper +shutdown, and there is support for policy-based execution/verification. +This provides a more stable measurement and a higher assurance of +system configuration and initial state than would be otherwise +possible. Since the tboot project is open source, source code for +almost all parts of the trust chain is available (excepting SMM and +Intel-provided firmware). + +How Does it Work? +================= + +o Tboot is an executable that is launched by the bootloader as + the "kernel" (the binary the bootloader executes). +o It performs all of the work necessary to determine if the + platform supports Intel TXT and, if so, executes the GETSEC[SENTER] + processor instruction that initiates the dynamic root of trust. + - If tboot determines that the system does not support Intel TXT + or is not configured correctly (e.g. the SINIT AC Module was + incorrect), it will directly launch the kernel with no changes + to any state. + - Tboot will output various information about its progress to the + terminal, serial port, and/or an in-memory log; the output + locations can be configured with a command line switch. +o The GETSEC[SENTER] instruction will return control to tboot and + tboot then verifies certain aspects of the environment (e.g. TPM NV + lock, e820 table does not have invalid entries, etc.). +o It will wake the APs from the special sleep state the GETSEC[SENTER] + instruction had put them in and place them into a wait-for-SIPI + state. + - Because the processors will not respond to an INIT or SIPI when + in the TXT environment, it is necessary to create a small VT-x + guest for the APs. When they run in this guest, they will + simply wait for the INIT-SIPI-SIPI sequence, which will cause + VMEXITs, and then disable VT and jump to the SIPI vector. This + approach seemed like a better choice than having to insert + special code into the kernel's MP wakeup sequence. +o Tboot then applies an (optional) user-defined launch policy to + verify the kernel and initrd. 
+ - This policy is rooted in TPM NV and is described in the tboot + project. The tboot project also contains code for tools to + create and provision the policy. + - Policies are completely under user control and if not present + then any kernel will be launched. + - Policy action is flexible and can include halting on failures + or simply logging them and continuing. +o Tboot adjusts the e820 table provided by the bootloader to reserve + its own location in memory as well as to reserve certain other + TXT-related regions. +o As part of its launch, tboot DMA protects all of RAM (using the + VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on' + in order to remove this blanket protection and use VT-d's + page-level protection. +o Tboot will populate a shared page with some data about itself and + pass this to the Linux kernel as it transfers control. + - The location of the shared page is passed via the boot_params + struct as a physical address. +o The kernel will look for the tboot shared page address and, if it + exists, map it. +o As one of the checks/protections provided by TXT, it makes a copy + of the VT-d DMARs in a DMA-protected region of memory and verifies + them for correctness. The VT-d code will detect if the kernel was + launched with tboot and use this copy instead of the one in the + ACPI table. +o At this point, tboot and TXT are out of the picture until a + shutdown (S<x>) +o In order to put a system into any of the sleep states after a TXT + launch, TXT must first be exited. This is to prevent attacks that + attempt to crash the system to gain control on reboot and steal + data left in memory. + - The kernel will perform all of its sleep preparation and + populate the shared page with the ACPI data needed to put the + platform in the desired sleep state. + - Then the kernel jumps into tboot via the vector specified in the + shared page. + - Tboot will clean up the environment and disable TXT, then use the + kernel-provided ACPI information to actually place the platform + into the desired sleep state. + - In the case of S3, tboot will also register itself as the resume + vector. This is necessary because it must re-establish the + measured environment upon resume. Once the TXT environment + has been restored, it will restore the TPM PCRs and then + transfer control back to the kernel's S3 resume vector. + In order to preserve system integrity across S3, the kernel + provides tboot with a set of memory ranges (kernel + code/data/bss, S3 resume code, and AP trampoline) that tboot + will calculate a MAC (message authentication code) over and then + seal with the TPM. On resume and once the measured environment + has been re-established, tboot will re-calculate the MAC and + verify it against the sealed value. Tboot's policy determines + what happens if the verification fails. + +That's pretty much it for TXT support. + + +Configuring the System: +====================== + +This code works with 32bit, 32bit PAE, and 64bit (x86_64) kernels. + +In BIOS, the user must enable: TPM, TXT, VT-x, VT-d. Not all BIOSes +allow these to be individually enabled/disabled and the screens in +which to find them are BIOS-specific. 
+ +grub.conf needs to be modified as follows: + title Linux 2.6.29-tip w/ tboot + root (hd0,0) + kernel /tboot.gz logging=serial,vga,memory + module /vmlinuz-2.6.29-tip intel_iommu=on ro + root=LABEL=/ rhgb console=ttyS0,115200 3 + module /initrd-2.6.29-tip.img + module /Q35_SINIT_17.BIN + +The kernel option for enabling Intel TXT support is found under the +Security top-level menu and is called "Enable Intel(R) Trusted +Execution Technology (TXT)". It is marked as EXPERIMENTAL and +depends on the generic x86 support (to allow maximum flexibility in +kernel build options), since the tboot code will detect whether the +platform actually supports Intel TXT and thus whether any of the +kernel code is executed. + +The Q35_SINIT_17.BIN file is what Intel TXT refers to as an +Authenticated Code Module. It is specific to the chipset in the +system and can also be found on the Trusted Boot site. It is an +(unencrypted) module signed by Intel that is used as part of the +DRTM process to verify and configure the system. It is signed +because it operates at a higher privilege level in the system than +any other macrocode and its correct operation is critical to the +establishment of the DRTM. The process for determining the correct +SINIT ACM for a system is documented in the SINIT-guide.txt file +that is on the tboot SourceForge site under the SINIT ACM downloads. diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt index 4f913857b8a..feb37e17701 100644 --- a/Documentation/x86/zero-page.txt +++ b/Documentation/x86/zero-page.txt @@ -12,6 +12,7 @@ Offset Proto Name Meaning 000/040 ALL screen_info Text mode or frame buffer information (struct screen_info) 040/014 ALL apm_bios_info APM BIOS information (struct apm_bios_info) +058/008 ALL tboot_addr Physical address of tboot shared page 060/010 ALL ist_info Intel SpeedStep (IST) BIOS support information (struct ist_info) 080/010 ALL hd0_info hd0 disk parameter, OBSOLETE!! diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 1724e8de317..6ca20218dd7 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -85,7 +85,8 @@ struct efi_info { struct boot_params { struct screen_info screen_info; /* 0x000 */ struct apm_bios_info apm_bios_info; /* 0x040 */ - __u8 _pad2[12]; /* 0x054 */ + __u8 _pad2[4]; /* 0x054 */ + __u64 tboot_addr; /* 0x058 */ struct ist_info ist_info; /* 0x060 */ __u8 _pad3[16]; /* 0x070 */ __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 7b2d71df39a..14f9890eb49 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -131,6 +131,9 @@ enum fixed_addresses { #endif #ifdef CONFIG_X86_32 FIX_WP_TEST, +#endif +#ifdef CONFIG_INTEL_TXT + FIX_TBOOT_BASE, #endif __end_of_fixed_addresses }; diff --git a/arch/x86/include/asm/tboot.h b/arch/x86/include/asm/tboot.h new file mode 100644 index 00000000000..b13929d4e5f --- /dev/null +++ b/arch/x86/include/asm/tboot.h @@ -0,0 +1,197 @@ +/* + * tboot.h: shared data structure with tboot and kernel and functions + * used by kernel for runtime support of Intel(R) Trusted + * Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#ifndef _ASM_TBOOT_H +#define _ASM_TBOOT_H + +#include + +/* these must have the values from 0-5 in this order */ +enum { + TB_SHUTDOWN_REBOOT = 0, + TB_SHUTDOWN_S5, + TB_SHUTDOWN_S4, + TB_SHUTDOWN_S3, + TB_SHUTDOWN_HALT, + TB_SHUTDOWN_WFS +}; + +#ifdef CONFIG_INTEL_TXT + +/* used to communicate between tboot and the launched kernel */ + +#define TB_KEY_SIZE 64 /* 512 bits */ + +#define MAX_TB_MAC_REGIONS 32 + +struct tboot_mac_region { + u64 start; /* must be 64 byte -aligned */ + u32 size; /* must be 64 byte -granular */ +} __packed; + +/* GAS - Generic Address Structure (ACPI 2.0+) */ +struct tboot_acpi_generic_address { + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_width; + u64 address; +} __packed; + +/* + * combines Sx info from FADT and FACS tables per ACPI 2.0+ spec + * (http://www.acpi.info/) + */ +struct tboot_acpi_sleep_info { + struct tboot_acpi_generic_address pm1a_cnt_blk; + struct tboot_acpi_generic_address pm1b_cnt_blk; + struct tboot_acpi_generic_address pm1a_evt_blk; + struct tboot_acpi_generic_address pm1b_evt_blk; + u16 pm1a_cnt_val; + u16 pm1b_cnt_val; + u64 wakeup_vector; + u32 vector_width; + u64 kernel_s3_resume_vector; +} __packed; + +/* + * shared memory page used for communication between tboot and kernel + */ +struct tboot { + /* + * version 3+ fields: + */ + + /* TBOOT_UUID */ + u8 uuid[16]; + + /* version number: 5 is current */ + u32 version; + + /* physical addr of tb_log_t log */ + u32 log_addr; + + /* + * physical addr of entry point for tboot shutdown and + * type of shutdown (TB_SHUTDOWN_*) being requested + */ + u32 shutdown_entry; + u32 shutdown_type; + + /* kernel-specified ACPI info for Sx shutdown */ + struct tboot_acpi_sleep_info acpi_sinfo; + + /* tboot location in memory (physical) */ + u32 tboot_base; + u32 tboot_size; + + /* memory regions (phys addrs) for tboot to MAC on S3 */ + u8 num_mac_regions; + struct tboot_mac_region mac_regions[MAX_TB_MAC_REGIONS]; + + + /* + * version 4+ fields: + */ + + /* symmetric key for use by kernel; will be encrypted on S3 */ + u8 s3_key[TB_KEY_SIZE]; + + + /* + * version 5+ fields: + */ + + /* used to 4byte-align num_in_wfs */ + u8 reserved_align[3]; + + /* number of processors in wait-for-SIPI */ + u32 num_in_wfs; +} __packed; + +/* + * UUID for tboot data struct to facilitate matching + * defined as {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} by tboot, which is + * represented as {} in the char array used here + */ +#define TBOOT_UUID {0xff, 0x8d, 0x3c, 0x66, 0xb3, 0xe8, 0x82, 0x4b, 0xbf,\ + 0xaa, 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8} + +extern struct tboot *tboot; + +static inline int tboot_enabled(void) +{ + return tboot != NULL; +} + +extern void tboot_probe(void); +extern void tboot_create_trampoline(void); +extern void tboot_shutdown(u32 shutdown_type); +extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); +extern int tboot_wait_for_aps(int num_aps); +extern struct acpi_table_header *tboot_get_dmar_table( + struct acpi_table_header *dmar_tbl); +extern int tboot_force_iommu(void); + +#else /* 
CONFIG_INTEL_TXT */ + +static inline int tboot_enabled(void) +{ + return 0; +} + +static inline void tboot_probe(void) +{ +} + +static inline void tboot_create_trampoline(void) +{ +} + +static inline void tboot_shutdown(u32 shutdown_type) +{ +} + +static inline void tboot_sleep(u8 sleep_state, u32 pm1a_control, + u32 pm1b_control) +{ +} + +static inline int tboot_wait_for_aps(int num_aps) +{ + return 0; +} + +static inline struct acpi_table_header *tboot_get_dmar_table( + struct acpi_table_header *dmar_tbl) +{ + return dmar_tbl; +} + +static inline int tboot_force_iommu(void) +{ + return 0; +} + +#endif /* !CONFIG_INTEL_TXT */ + +#endif /* _ASM_TBOOT_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7..832cb838cb4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o +obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index de2cab13284..80d6e9e3248 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -145,6 +145,8 @@ struct boot_params __initdata boot_params; struct boot_params boot_params; #endif +#include + /* * Machine setup.. */ @@ -964,6 +966,8 @@ void __init setup_arch(char **cmdline_p) paravirt_pagetable_setup_done(swapper_pg_dir); paravirt_post_allocator_init(); + tboot_probe(); + #ifdef CONFIG_X86_64 map_vsyscall(); #endif diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c new file mode 100644 index 00000000000..263591afd29 --- /dev/null +++ b/arch/x86/kernel/tboot.c @@ -0,0 +1,379 @@ +/* + * tboot.c: main implementation of helper functions used by kernel for + * runtime support of Intel(R) Trusted Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acpi/realmode/wakeup.h" + +/* Global pointer to shared data; NULL means no measured launch. */ +struct tboot *tboot __read_mostly; + +/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ +#define AP_WAIT_TIMEOUT 1 + +#undef pr_fmt +#define pr_fmt(fmt) "tboot: " fmt + +static u8 tboot_uuid[16] __initdata = TBOOT_UUID; + +void __init tboot_probe(void) +{ + /* Look for valid page-aligned address for shared page. 
*/ + if (!boot_params.tboot_addr) + return; + /* + * also verify that it is mapped as we expect it before calling + * set_fixmap(), to reduce chance of garbage value causing crash + */ + if (!e820_any_mapped(boot_params.tboot_addr, + boot_params.tboot_addr, E820_RESERVED)) { + pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n"); + return; + } + + /* only a natively booted kernel should be using TXT */ + if (paravirt_enabled()) { + pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); + return; + } + + /* Map and check for tboot UUID. */ + set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); + tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warning("tboot at 0x%llx is invalid\n", + boot_params.tboot_addr); + tboot = NULL; + return; + } + if (tboot->version < 5) { + pr_warning("tboot version is invalid: %u\n", tboot->version); + tboot = NULL; + return; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); +} + +static pgd_t *tboot_pg_dir; +static struct mm_struct tboot_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .cpu_vm_mask = CPU_MASK_ALL, +}; + +static inline void switch_to_tboot_pt(void) +{ + write_cr3(virt_to_phys(tboot_pg_dir)); +} + +static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(&tboot_mm, vaddr); + pud = pud_alloc(&tboot_mm, pgd, vaddr); + if (!pud) + return -1; + pmd = pmd_alloc(&tboot_mm, pud, vaddr); + if (!pmd) + return -1; + pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + if (!pte) + return -1; + set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); + pte_unmap(pte); + return 0; +} + +static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn, + unsigned long nr) +{ + /* Reuse the original kernel mapping */ + tboot_pg_dir = pgd_alloc(&tboot_mm); + if (!tboot_pg_dir) + return -1; + + for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) { + if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC)) + return -1; + } + + return 0; +} + +void tboot_create_trampoline(void) +{ + u32 map_base, map_size; + + if (!tboot_enabled()) + return; + + /* Create identity map for tboot shutdown code. 
*/ + map_base = PFN_DOWN(tboot->tboot_base); + map_size = PFN_UP(tboot->tboot_size); + if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", map_base, map_size); +} + +static void set_mac_regions(void) +{ + tboot->num_mac_regions = 3; + /* S3 resume code */ + tboot->mac_regions[0].start = PFN_PHYS(PFN_DOWN(acpi_wakeup_address)); + tboot->mac_regions[0].size = PFN_UP(WAKEUP_SIZE) << PAGE_SHIFT; + /* AP trampoline code */ + tboot->mac_regions[1].start = + PFN_PHYS(PFN_DOWN(virt_to_phys(trampoline_base))); + tboot->mac_regions[1].size = PFN_UP(TRAMPOLINE_SIZE) << PAGE_SHIFT; + /* kernel code + data + bss */ + tboot->mac_regions[2].start = PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); + tboot->mac_regions[2].size = PFN_PHYS(PFN_UP(virt_to_phys(&_end))) - + PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); +} + +void tboot_shutdown(u32 shutdown_type) +{ + void (*shutdown)(void); + + if (!tboot_enabled()) + return; + + /* + * if we're being called before the 1:1 mapping is set up then just + * return and let the normal shutdown happen; this should only be + * due to very early panic() + */ + if (!tboot_pg_dir) + return; + + /* if this is S3 then set regions to MAC */ + if (shutdown_type == TB_SHUTDOWN_S3) + set_mac_regions(); + + tboot->shutdown_type = shutdown_type; + + switch_to_tboot_pt(); + + shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry; + shutdown(); + + /* should not reach here */ + while (1) + halt(); +} + +static void tboot_copy_fadt(const struct acpi_table_fadt *fadt) +{ +#define TB_COPY_GAS(tbg, g) \ + tbg.space_id = g.space_id; \ + tbg.bit_width = g.bit_width; \ + tbg.bit_offset = g.bit_offset; \ + tbg.access_width = g.access_width; \ + tbg.address = g.address; + + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block); + + /* + * We need phys addr of waking vector, but can't use virt_to_phys() on + * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys + * addr. + */ + tboot->acpi_sinfo.wakeup_vector = fadt->facs + + offsetof(struct acpi_table_facs, firmware_waking_vector); +} + +void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) +{ + static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = { + /* S0,1,2: */ -1, -1, -1, + /* S3: */ TB_SHUTDOWN_S3, + /* S4: */ TB_SHUTDOWN_S4, + /* S5: */ TB_SHUTDOWN_S5 }; + + if (!tboot_enabled()) + return; + + tboot_copy_fadt(&acpi_gbl_FADT); + tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control; + tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; + /* we always use the 32b wakeup vector */ + tboot->acpi_sinfo.vector_width = 32; + tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + + if (sleep_state >= ACPI_S_STATE_COUNT || + acpi_shutdown_map[sleep_state] == -1) { + pr_warning("unsupported sleep state 0x%x\n", sleep_state); + return; + } + + tboot_shutdown(acpi_shutdown_map[sleep_state]); +} + +int tboot_wait_for_aps(int num_aps) +{ + unsigned long timeout; + + if (!tboot_enabled()) + return 0; + + timeout = jiffies + AP_WAIT_TIMEOUT*HZ; + while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps && + time_before(jiffies, timeout)) + cpu_relax(); + + return time_before(jiffies, timeout) ? 
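/* 0: all APs reached wait-for-SIPI in time, 1: timed out */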
0 : 1; +} + +/* + * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE) + */ + +#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000 +#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000 + +/* # pages for each config regs space - used by fixmap */ +#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \ + TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT) + +/* offsets from pub/priv config space */ +#define TXTCR_HEAP_BASE 0x0300 +#define TXTCR_HEAP_SIZE 0x0308 + +#define SHA1_SIZE 20 + +struct sha1_hash { + u8 hash[SHA1_SIZE]; +}; + +struct sinit_mle_data { + u32 version; /* currently 6 */ + struct sha1_hash bios_acm_id; + u32 edx_senter_flags; + u64 mseg_valid; + struct sha1_hash sinit_hash; + struct sha1_hash mle_hash; + struct sha1_hash stm_hash; + struct sha1_hash lcp_policy_hash; + u32 lcp_policy_control; + u32 rlp_wakeup_addr; + u32 reserved; + u32 num_mdrs; + u32 mdrs_off; + u32 num_vtd_dmars; + u32 vtd_dmars_off; +} __packed; + +struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl) +{ + void *heap_base, *heap_ptr, *config; + + if (!tboot_enabled()) + return dmar_tbl; + + /* + * ACPI tables may not be DMA protected by tboot, so use DMAR copy + * SINIT saved in SinitMleData in TXT heap (which is DMA protected) + */ + + /* map config space in order to get heap addr */ + config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES * + PAGE_SIZE); + if (!config) + return NULL; + + /* now map TXT heap */ + heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE), + *(u64 *)(config + TXTCR_HEAP_SIZE)); + iounmap(config); + if (!heap_base) + return NULL; + + /* walk heap to SinitMleData */ + /* skip BiosData */ + heap_ptr = heap_base + *(u64 *)heap_base; + /* skip OsMleData */ + heap_ptr += *(u64 *)heap_ptr; + /* skip OsSinitData */ + heap_ptr += *(u64 *)heap_ptr; + /* now points to SinitMleDataSize; set to SinitMleData */ + heap_ptr += sizeof(u64); + /* get addr of DMAR table */ + dmar_tbl = (struct acpi_table_header *)(heap_ptr + + ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off - + sizeof(u64)); + + /* don't unmap heap because dmar.c needs access to this */ + + return dmar_tbl; +} + +int tboot_force_iommu(void) +{ + if (!tboot_enabled()) + return 0; + + if (no_iommu || swiotlb || dmar_disabled) + pr_warning("Forcing Intel-IOMMU to enabled\n"); + + dmar_disabled = 0; +#ifdef CONFIG_SWIOTLB + swiotlb = 0; +#endif + no_iommu = 0; + + return 1; +} diff --git a/security/Kconfig b/security/Kconfig index d23c839038f..edc7cbdc012 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -113,6 +113,36 @@ config SECURITY_ROOTPLUG If you are unsure how to answer this question, answer N. +config INTEL_TXT + bool "Enable Intel(R) Trusted Execution Technology (Intel(R) TXT)" + depends on EXPERIMENTAL && X86 && DMAR && ACPI + help + This option enables support for booting the kernel with the + Trusted Boot (tboot) module. This will utilize + Intel(R) Trusted Execution Technology to perform a measured launch + of the kernel. If the system does not support Intel(R) TXT, this + will have no effect. + + Intel TXT will provide higher assurance of system configuration and + initial state as well as data reset protection. This is used to + create a robust initial kernel measurement and verification, which + helps to ensure that kernel security mechanisms are functioning + correctly. This level of protection requires a root of trust outside + of the kernel itself.
+ + Intel TXT also helps solve real end user concerns about having + confidence that their hardware is running the VMM or kernel that + it was configured with, especially since they may be responsible for + providing such assurances to VMs and services running on it. + + See for more information + about Intel(R) TXT. + See for more information about tboot. + See Documentation/intel_txt.txt for a description of how to enable + Intel TXT support in a kernel boot. + + If you are unsure as to whether this is required, answer N. + source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig -- cgit v1.2.3 From 840c2baf2d4cdf35ecc3b7fcbba7740f97de30a4 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:31:02 -0700 Subject: x86, intel_txt: Intel TXT reboot/halt shutdown support Support for graceful handling of kernel reboots after an Intel(R) TXT launch. Without this patch, attempting to reboot or halt the system will cause the TXT hardware to lock memory upon system restart because the secrets-in-memory flag that was set on launch was never cleared. This will in turn cause BIOS to execute a TXT Authenticated Code Module (ACM) that will scrub all of memory and then unlock it. Depending on the amount of memory in the system and its type, this may take some time. This patch creates a 1:1 address mapping to the tboot module and then calls back into tboot so that it may properly and securely clean up system state and clear the secrets-in-memory flag. When it has completed these steps, the tboot module will reboot or halt the system. arch/x86/kernel/reboot.c | 8 ++++++++ init/main.c | 3 +++ 2 files changed, 11 insertions(+) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: H. Peter Anvin --- arch/x86/kernel/reboot.c | 8 ++++++++ init/main.c | 3 +++ 2 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d2d1ce8170f..9de01c5d979 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -24,6 +24,8 @@ # include #endif +#include + /* * Power off function, if any */ @@ -460,6 +462,8 @@ static void native_machine_emergency_restart(void) if (reboot_emergency) emergency_vmx_disable_all(); + tboot_shutdown(TB_SHUTDOWN_REBOOT); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -586,6 +590,8 @@ static void native_machine_halt(void) /* stop other cpus and apics */ machine_shutdown(); + tboot_shutdown(TB_SHUTDOWN_HALT); + /* stop this cpu */ stop_this_cpu(NULL); } @@ -597,6 +603,8 @@ static void native_machine_power_off(void) machine_shutdown(); pm_power_off(); } + /* a fallback in case there is no PM info available */ + tboot_shutdown(TB_SHUTDOWN_HALT); } struct machine_ops machine_ops = { diff --git a/init/main.c b/init/main.c index 2c5ade79eb8..56ada27c4f4 100644 --- a/init/main.c +++ b/init/main.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include @@ -715,6 +716,8 @@ asmlinkage void __init start_kernel(void) ftrace_init(); + tboot_create_trampoline(); + /* Do the rest non-__init'ed, we're now alive */ rest_init(); } -- cgit v1.2.3 From 86886e55b273f565935491816c7c96b82469d4f8 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:31:07 -0700 Subject: x86, intel_txt: Intel TXT Sx shutdown support Support for graceful handling of sleep states (S3/S4/S5) after an Intel(R) TXT launch.
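For reference, tboot_sleep() reduces the handoff to a bounds-checked table lookup from ACPI sleep state to tboot shutdown type. A stand-alone sketch of that mapping (user-space C; the TB_SHUTDOWN_* values here are illustrative placeholders, not the real tboot ABI constants):

#include <stdio.h>

#define ACPI_S_STATE_COUNT 6

/* Placeholder values for illustration only; the real constants come
 * from the tboot shared-page ABI in asm/tboot.h. */
enum { TB_SHUTDOWN_S3 = 1, TB_SHUTDOWN_S4 = 2, TB_SHUTDOWN_S5 = 3 };

/* Mirrors the acpi_shutdown_map lookup in tboot_sleep(): S0-S2 have no
 * tboot shutdown type, and out-of-range states are rejected. */
static int sleep_to_shutdown_type(unsigned int sleep_state)
{
	static const int map[ACPI_S_STATE_COUNT] = {
		-1, -1, -1,			/* S0, S1, S2 */
		TB_SHUTDOWN_S3, TB_SHUTDOWN_S4, TB_SHUTDOWN_S5,
	};

	if (sleep_state >= ACPI_S_STATE_COUNT || map[sleep_state] == -1)
		return -1;	/* unsupported: caller warns and bails out */
	return map[sleep_state];
}

int main(void)
{
	for (unsigned int s = 0; s <= 6; s++)
		printf("S%u -> shutdown type %d\n", s,
		       sleep_to_shutdown_type(s));
	return 0;
}

The same guard is what keeps a bogus sleep_state from indexing past the table in the kernel version.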
Without this patch, attempting to place the system in one of the ACPI sleep states (S3/S4/S5) will cause the TXT hardware to treat this as an attack and will cause a system reset, with memory locked. Not only may the subsequent memory scrub take some time, but the platform will be unable to enter the requested power state. This patch calls back into the tboot so that it may properly and securely clean up system state and clear the secrets-in-memory flag, after which it will place the system into the requested sleep state using ACPI information passed by the kernel. arch/x86/kernel/smpboot.c | 2 ++ drivers/acpi/acpica/hwsleep.c | 3 +++ kernel/cpu.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: H. Peter Anvin --- arch/x86/kernel/smpboot.c | 2 ++ drivers/acpi/acpica/hwsleep.c | 3 +++ kernel/cpu.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..61cc40887c4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -1317,6 +1318,7 @@ void play_dead_common(void) void native_play_dead(void) { play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); wbinvd_halt(); } diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index db307a356f0..8c01dd3724e 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -45,6 +45,7 @@ #include #include "accommon.h" #include "actables.h" +#include #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwsleep") @@ -342,6 +343,8 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) ACPI_FLUSH_CPU_CACHE(); + tboot_sleep(sleep_state, pm1a_control, pm1b_control); + /* Write #2: Write both SLP_TYP + SLP_EN */ status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control); diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4a..ff071e022a8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ @@ -376,7 +377,7 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error; + int cpu, first_cpu, error, num_cpus = 0; error = stop_machine_create(); if (error) @@ -391,6 +392,7 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; + num_cpus++; error = _cpu_down(cpu, 1); if (!error) { cpumask_set_cpu(cpu, frozen_cpus); @@ -401,6 +403,9 @@ int disable_nonboot_cpus(void) break; } } + /* ensure all CPUs have gone into wait-for-SIPI */ + error |= tboot_wait_for_aps(num_cpus); + if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ -- cgit v1.2.3 From 94699b04eddd4b247d871930431d6fa1a46c175e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:52:54 +0200 Subject: x86, mce: don't log boot MCEs on Pentium M (model == 13) CPUs On my legacy Pentium M laptop (Acer Extensa 2900) I get bogus MCE on a cold boot with CONFIG_X86_NEW_MCE enabled, i.e. (after decoding it with mcelog): MCE 0 HARDWARE ERROR. This is *NOT* a software problem! 
Please contact your hardware vendor CPU 0 BANK 1 MCG status: MCi status: Error overflow Uncorrected error Error enabled Processor context corrupt MCA: Data CACHE Level-1 UNKNOWN Error STATUS f200000000000195 MCGSTATUS 0 [ The other STATUS values observed: f2000000000001b5 (... UNKNOWN error) and f200000000000115 (... READ Error). To verify that this is not a CONFIG_X86_NEW_MCE bug I also modified the CONFIG_X86_OLD_MCE code (which doesn't log any MCEs) to dump content of STATUS MSR before it is cleared during initialization. ] Since the bogus MCE results in a kernel taint (which in turn disables lockdep support) don't log boot MCEs on Pentium M (model == 13) CPUs by default ("mce=bootlog" boot parameter can be used to get the old behavior). Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 07139a0578e..7bd19c7f531 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1269,6 +1269,10 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && monarch_timeout < 0) monarch_timeout = USEC_PER_SEC; + + /* There are also broken BIOSes on some Pentium M systems. */ + if (c->x86 == 6 && c->x86_model == 13 && mce_bootlog < 0) + mce_bootlog = 0; } if (monarch_timeout < 0) monarch_timeout = 0; -- cgit v1.2.3 From e3346fc48204d780f92527d06df8bf6f28d603ec Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:55:09 +0200 Subject: x86, mce: fix "mce" boot option handling for CONFIG_X86_NEW_MCE "mce argument mce ignored. Please use /sys" message shouldn't be printed when using "mce" boot option. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7bd19c7f531..75919440a18 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1549,8 +1549,10 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_enable(char *str) { - if (*str == 0) + if (*str == 0) { enable_p5_mce(); + return 1; + } if (*str == '=') str++; if (!strcmp(str, "off")) -- cgit v1.2.3 From 419d6162c0c0103fa2f44f6691dff9cac14c650d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:56:00 +0200 Subject: x86, mce: add missing __cpuinit tags mce_cap_init() and mce_cpu_quirks() can be tagged with __cpuinit. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 75919440a18..1ce6db1f878 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1158,7 +1158,7 @@ static int mce_banks_init(void) /* * Initialize Machine Checks for a CPU.
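* (mce_cap_init() reads MSR_IA32_MCG_CAP to size the per-CPU bank array, so it only runs on the CPU bring-up path.)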
*/ -static int mce_cap_init(void) +static int __cpuinit mce_cap_init(void) { unsigned b; u64 cap; @@ -1222,7 +1222,7 @@ static void mce_init(void) } /* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { -- cgit v1.2.3 From d0c87d1f61704ed589fc0788bedd753632340e98 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:56:37 +0200 Subject: x86, mce: remove never executed code fseverities_coverage is never NULL in err_out code path. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index ff0807f9705..51f7c725dab 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -209,8 +209,6 @@ static int __init severities_debugfs_init(void) return 0; err_out: - if (fseverities_coverage) - debugfs_remove(fseverities_coverage); if (dmce) debugfs_remove(dmce); return -ENOMEM; -- cgit v1.2.3 From f3a0867b12e0cf1512c0bd0665f2339fc75ed2a8 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 29 Jul 2009 00:04:59 +0200 Subject: x86, mce: fix reporting of Thermal Monitoring mechanism enabled Early Pentium M models use different method for enabling TM2 (per paragraph 13.5.2.3 of the "Intel 64 and IA-32 Architectures Software Developer's Manual Volume 3A: System Programming Guide, Part 1"). Tested on the affected Pentium M variant (model == 13). Signed-off-by: Bartlomiej Zolnierkiewicz Cc: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/msr-index.h | 4 ++++ arch/x86/kernel/cpu/mcheck/therm_throt.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3d1ce094586..cbec06deb68 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -222,6 +222,10 @@ #define THERM_STATUS_PROCHOT (1 << 0) +#define MSR_THERM2_CTL 0x0000019d + +#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16) + #define MSR_IA32_MISC_ENABLE 0x000001a0 /* MISC_ENABLE bits: architectural */ diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index bff8dd191dd..15f2bc07bb6 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -253,9 +253,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG @@ -264,6 +261,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } + /* early Pentium M models use different method for enabling TM2 */ + if (cpu_has(c, X86_FEATURE_TM2)) { + if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { + rdmsr(MSR_THERM2_CTL, l, h); + if (l & MSR_THERM2_CTL_TM_SELECT) + tm2 = 1; + } else if (l & MSR_IA32_MISC_ENABLE_TM2) + tm2 = 1; + } + /* We'll mask the thermal vector in the lapic till we're ready: */ h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); -- cgit v1.2.3 From 6a12235c7d2d75c7d94b9afcaaecd422ff845ce0 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 29 Jul 2009 10:25:58 +0100 Subject: agp: kill phys_to_gart() and gart_to_phys() There seems to be no reason for these -- they're a 1:1 mapping on all platforms. Signed-off-by: David Woodhouse --- arch/alpha/include/asm/agp.h | 4 ---- arch/ia64/include/asm/agp.h | 4 ---- arch/parisc/include/asm/agp.h | 4 ---- arch/powerpc/include/asm/agp.h | 4 ---- arch/sparc/include/asm/agp.h | 4 ---- arch/x86/include/asm/agp.h | 4 ---- drivers/char/agp/agp.h | 3 --- drivers/char/agp/ali-agp.c | 4 ++-- drivers/char/agp/amd-k7-agp.c | 8 ++++---- drivers/char/agp/amd64-agp.c | 6 +++--- drivers/char/agp/ati-agp.c | 6 +++--- drivers/char/agp/backend.c | 2 +- drivers/char/agp/efficeon-agp.c | 4 ++-- drivers/char/agp/generic.c | 6 +++--- drivers/char/agp/hp-agp.c | 4 ++-- drivers/char/agp/i460-agp.c | 4 ++-- drivers/char/agp/intel-agp.c | 7 +++---- drivers/char/agp/nvidia-agp.c | 2 +- drivers/char/agp/sgi-agp.c | 2 +- drivers/char/agp/sworks-agp.c | 8 ++++---- drivers/char/agp/uninorth-agp.c | 2 +- 21 files changed, 32 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/agp.h b/arch/alpha/include/asm/agp.h index 26c17913529..a94d48b8677 100644 --- a/arch/alpha/include/asm/agp.h +++ b/arch/alpha/include/asm/agp.h @@ -9,10 +9,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. 
*/ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/ia64/include/asm/agp.h b/arch/ia64/include/asm/agp.h index c11fdd8ab4d..01d09c401c5 100644 --- a/arch/ia64/include/asm/agp.h +++ b/arch/ia64/include/asm/agp.h @@ -17,10 +17,6 @@ #define unmap_page_from_agp(page) /* nothing */ #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/parisc/include/asm/agp.h b/arch/parisc/include/asm/agp.h index 9651660da63..d226ffa8fc1 100644 --- a/arch/parisc/include/asm/agp.h +++ b/arch/parisc/include/asm/agp.h @@ -11,10 +11,6 @@ #define unmap_page_from_agp(page) /* nothing */ #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/powerpc/include/asm/agp.h b/arch/powerpc/include/asm/agp.h index 86455c4c31e..416e12c2d50 100644 --- a/arch/powerpc/include/asm/agp.h +++ b/arch/powerpc/include/asm/agp.h @@ -8,10 +8,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/sparc/include/asm/agp.h b/arch/sparc/include/asm/agp.h index c2456870b05..70f52c1661b 100644 --- a/arch/sparc/include/asm/agp.h +++ b/arch/sparc/include/asm/agp.h @@ -7,10 +7,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index 9825cd64c9b..eec2a70d437 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -22,10 +22,6 @@ */ #define flush_agp_cache() wbinvd() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. 
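The table itself is ordinary kernel memory, which is why the phys_to_gart()/gart_to_phys() wrappers removed above could go: the conversion was the identity mapping on every architecture.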
*/ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h index 4c6e5079d87..d6f36c004d9 100644 --- a/drivers/char/agp/agp.h +++ b/drivers/char/agp/agp.h @@ -318,9 +318,6 @@ void agp3_generic_cleanup(void); #define AGP_GENERIC_SIZES_ENTRIES 11 extern const struct aper_size_info_16 agp3_generic_sizes[]; -#define virt_to_gart(x) (phys_to_gart(virt_to_phys(x))) -#define gart_to_virt(x) (phys_to_virt(gart_to_phys(x))) - extern int agp_off; extern int agp_try_unsupported_boot; diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c index 201ef3ffd48..d2ce68f27e4 100644 --- a/drivers/char/agp/ali-agp.c +++ b/drivers/char/agp/ali-agp.c @@ -152,7 +152,7 @@ static struct page *m1541_alloc_page(struct agp_bridge_data *bridge) pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | - phys_to_gart(page_to_phys(page))) | ALI_CACHE_FLUSH_EN )); + page_to_phys(page)) | ALI_CACHE_FLUSH_EN )); return page; } @@ -180,7 +180,7 @@ static void m1541_destroy_page(struct page *page, int flags) pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | - phys_to_gart(page_to_phys(page))) | ALI_CACHE_FLUSH_EN)); + page_to_phys(page)) | ALI_CACHE_FLUSH_EN)); } agp_generic_destroy_page(page, flags); } diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c index 542a87895ae..73dbf40c874 100644 --- a/drivers/char/agp/amd-k7-agp.c +++ b/drivers/char/agp/amd-k7-agp.c @@ -44,7 +44,7 @@ static int amd_create_page_map(struct amd_page_map *page_map) #ifndef CONFIG_X86 SetPageReserved(virt_to_page(page_map->real)); global_cache_flush(); - page_map->remapped = ioremap_nocache(virt_to_gart(page_map->real), + page_map->remapped = ioremap_nocache(virt_to_phys(page_map->real), PAGE_SIZE); if (page_map->remapped == NULL) { ClearPageReserved(virt_to_page(page_map->real)); @@ -160,7 +160,7 @@ static int amd_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); /* Get the address for the gart region. * This is a bus address even on the alpha, b/c its @@ -173,7 +173,7 @@ static int amd_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) { - writel(virt_to_gart(amd_irongate_private.gatt_pages[i]->real) | 1, + writel(virt_to_phys(amd_irongate_private.gatt_pages[i]->real) | 1, page_dir.remapped+GET_PAGE_DIR_OFF(addr)); readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr)); /* PCI Posting. */ } @@ -325,7 +325,7 @@ static int amd_insert_memory(struct agp_memory *mem, off_t pg_start, int type) addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = GET_GATT(addr); writel(agp_generic_mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); readl(cur_gatt+GET_GATT_OFF(addr)); /* PCI Posting. 
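The dummy readl() flushes the GATT update out of the chipset's posted-write buffers before the next entry is written.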
*/ diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index e85a5b3e952..2fb2e6cc322 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -79,7 +79,7 @@ static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { tmp = agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mask_type); BUG_ON(tmp & 0xffffff0000000ffcULL); @@ -178,7 +178,7 @@ static const struct aper_size_info_32 amd_8151_sizes[7] = static int amd_8151_configure(void) { - unsigned long gatt_bus = virt_to_gart(agp_bridge->gatt_table_real); + unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); int i; /* Configure AGP regs in each x86-64 host bridge. */ @@ -558,7 +558,7 @@ static void __devexit agp_amd64_remove(struct pci_dev *pdev) { struct agp_bridge_data *bridge = pci_get_drvdata(pdev); - release_mem_region(virt_to_gart(bridge->gatt_table_real), + release_mem_region(virt_to_phys(bridge->gatt_table_real), amd64_aperture_sizes[bridge->aperture_size_idx].size); agp_remove_bridge(bridge); agp_put_bridge(bridge); diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c index 59ebd60c1b6..3b2ecbe86eb 100644 --- a/drivers/char/agp/ati-agp.c +++ b/drivers/char/agp/ati-agp.c @@ -302,7 +302,7 @@ static int ati_insert_memory(struct agp_memory * mem, addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = GET_GATT(addr); writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); } @@ -360,7 +360,7 @@ static int ati_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *) page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); /* Write out the size register */ current_size = A_SIZE_LVL2(agp_bridge->current_size); @@ -390,7 +390,7 @@ static int ati_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) { - writel(virt_to_gart(ati_generic_private.gatt_pages[i]->real) | 1, + writel(virt_to_phys(ati_generic_private.gatt_pages[i]->real) | 1, page_dir.remapped+GET_PAGE_DIR_OFF(addr)); readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr)); /* PCI Posting. */ } diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 343f102090a..ad87753f6de 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -159,7 +159,7 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) goto err_out_nounmap; } } else { - bridge->scratch_page_dma = phys_to_gart(page_to_phys(page)); + bridge->scratch_page_dma = page_to_phys(page); } bridge->scratch_page = bridge->driver->mask_memory(bridge, diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c index 35d50f2861b..793f39ea961 100644 --- a/drivers/char/agp/efficeon-agp.c +++ b/drivers/char/agp/efficeon-agp.c @@ -67,7 +67,7 @@ static const struct gatt_mask efficeon_generic_masks[] = /* This function does the same thing as mask_memory() for this chipset... 
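(it simply folds the GATT present bit, bit 0, into the page's physical address)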
*/ static inline unsigned long efficeon_mask_memory(struct page *page) { - unsigned long addr = phys_to_gart(page_to_phys(page)); + unsigned long addr = page_to_phys(page); return addr | 0x00000001; } @@ -226,7 +226,7 @@ static int efficeon_create_gatt_table(struct agp_bridge_data *bridge) efficeon_private.l1_table[index] = page; - value = virt_to_gart((unsigned long *)page) | pati | present | index; + value = virt_to_phys((unsigned long *)page) | pati | present | index; pci_write_config_dword(agp_bridge->dev, EFFICEON_ATTPAGE, value); diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index 28f0208c66a..c50543966eb 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -988,7 +988,7 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge) set_memory_uc((unsigned long)table, 1 << page_order); bridge->gatt_table = (void *)table; #else - bridge->gatt_table = ioremap_nocache(virt_to_gart(table), + bridge->gatt_table = ioremap_nocache(virt_to_phys(table), (PAGE_SIZE * (1 << page_order))); bridge->driver->cache_flush(); #endif @@ -1001,7 +1001,7 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge) return -ENOMEM; } - bridge->gatt_bus_addr = virt_to_gart(bridge->gatt_table_real); + bridge->gatt_bus_addr = virt_to_phys(bridge->gatt_table_real); /* AK: bogus, should encode addresses > 4GB */ for (i = 0; i < num_entries; i++) { @@ -1142,7 +1142,7 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type) for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(bridge->driver->mask_memory(bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mask_type), bridge->gatt_table+j); } diff --git a/drivers/char/agp/hp-agp.c b/drivers/char/agp/hp-agp.c index 64dbf4b1cf2..501e293e5ad 100644 --- a/drivers/char/agp/hp-agp.c +++ b/drivers/char/agp/hp-agp.c @@ -107,7 +107,7 @@ static int __init hp_zx1_ioc_shared(void) hp->gart_size = HP_ZX1_GART_SIZE; hp->gatt_entries = hp->gart_size / hp->io_page_size; - hp->io_pdir = gart_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE)); + hp->io_pdir = phys_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE)); hp->gatt = &hp->io_pdir[HP_ZX1_IOVA_TO_PDIR(hp->gart_base)]; if (hp->gatt[0] != HP_ZX1_SBA_IOMMU_COOKIE) { @@ -246,7 +246,7 @@ hp_zx1_configure (void) agp_bridge->mode = readl(hp->lba_regs+hp->lba_cap_offset+PCI_AGP_STATUS); if (hp->io_pdir_owner) { - writel(virt_to_gart(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE); + writel(virt_to_phys(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE); readl(hp->ioc_regs+HP_ZX1_PDIR_BASE); writel(hp->io_tlb_ps, hp->ioc_regs+HP_ZX1_TCNFG); readl(hp->ioc_regs+HP_ZX1_TCNFG); diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c index 54191f86053..e763d3312ce 100644 --- a/drivers/char/agp/i460-agp.c +++ b/drivers/char/agp/i460-agp.c @@ -325,7 +325,7 @@ static int i460_insert_memory_small_io_page (struct agp_memory *mem, io_page_size = 1UL << I460_IO_PAGE_SHIFT; for (i = 0, j = io_pg_start; i < mem->page_count; i++) { - paddr = phys_to_gart(page_to_phys(mem->pages[i])); + paddr = page_to_phys(mem->pages[i]); for (k = 0; k < I460_IOPAGES_PER_KPAGE; k++, j++, paddr += io_page_size) WR_GATT(j, i460_mask_memory(agp_bridge, paddr, mem->type)); } @@ -382,7 +382,7 @@ static int i460_alloc_large_page (struct lp_desc *lp) return -ENOMEM; } - lp->paddr = phys_to_gart(page_to_phys(lp->page)); + lp->paddr = page_to_phys(lp->page); lp->refcount = 0; atomic_add(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp); 
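/* one large I/O page is charged as I460_KPAGES_PER_IOPAGE kernel pages */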
return 0; diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index d8c80d8be5e..aa8889e8afc 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -288,7 +288,7 @@ static void intel_agp_insert_sg_entries(struct agp_memory *mem, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + page_to_phys(mem->pages[i]), mask_type), intel_private.gtt+j); } @@ -470,8 +470,7 @@ static int intel_i810_insert_entries(struct agp_memory *mem, off_t pg_start, global_cache_flush(); for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), - mask_type), + page_to_phys(mem->pages[i]), mask_type), intel_private.registers+I810_PTE_BASE+(j*4)); } readl(intel_private.registers+I810_PTE_BASE+((j-1)*4)); @@ -977,7 +976,7 @@ static int intel_i830_insert_entries(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + page_to_phys(mem->pages[i]), mask_type), intel_private.registers+I810_PTE_BASE+(j*4)); } readl(intel_private.registers+I810_PTE_BASE+((j-1)*4)); diff --git a/drivers/char/agp/nvidia-agp.c b/drivers/char/agp/nvidia-agp.c index cedacee30ec..7e36d2b4f9d 100644 --- a/drivers/char/agp/nvidia-agp.c +++ b/drivers/char/agp/nvidia-agp.c @@ -225,7 +225,7 @@ static int nvidia_insert_memory(struct agp_memory *mem, off_t pg_start, int type } for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + page_to_phys(mem->pages[i]), mask_type), agp_bridge->gatt_table+nvidia_private.pg_offset+j); } diff --git a/drivers/char/agp/sgi-agp.c b/drivers/char/agp/sgi-agp.c index 0d47fa84740..0d426ae39c8 100644 --- a/drivers/char/agp/sgi-agp.c +++ b/drivers/char/agp/sgi-agp.c @@ -190,7 +190,7 @@ static int sgi_tioca_insert_memory(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { table[j] = bridge->driver->mask_memory(bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type); } diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c index 07259952fc3..13acaaf64ed 100644 --- a/drivers/char/agp/sworks-agp.c +++ b/drivers/char/agp/sworks-agp.c @@ -155,7 +155,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) /* Create a fake scratch directory */ for (i = 0; i < 1024; i++) { writel(agp_bridge->scratch_page, serverworks_private.scratch_dir.remapped+i); - writel(virt_to_gart(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); + writel(virt_to_phys(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); } retval = serverworks_create_gatt_pages(value->num_entries / 1024); @@ -167,7 +167,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); /* Get the address for the gart region. 
* This is a bus address even on the alpha, b/c its @@ -179,7 +179,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++) - writel(virt_to_gart(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); + writel(virt_to_phys(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); return 0; } @@ -350,7 +350,7 @@ static int serverworks_insert_memory(struct agp_memory *mem, addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = SVRWRKS_GET_GATT(addr); writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mem->type), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); } serverworks_tlbflush(mem); diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c index f192c3b9ad4..2e993112ab8 100644 --- a/drivers/char/agp/uninorth-agp.c +++ b/drivers/char/agp/uninorth-agp.c @@ -431,7 +431,7 @@ static int uninorth_create_gatt_table(struct agp_bridge_data *bridge) bridge->gatt_table_real = (u32 *) table; bridge->gatt_table = (u32 *)table; - bridge->gatt_bus_addr = virt_to_gart(table); + bridge->gatt_bus_addr = virt_to_phys(table); for (i = 0; i < num_entries; i++) bridge->gatt_table[i] = 0; -- cgit v1.2.3 From cfc65dd57967f2e0c7b3a8b73e6d12470b1cf1c1 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 30 Jul 2009 16:15:18 -0600 Subject: iommu=pt is a valid early param This avoids a "Malformed early option 'iommu'" warning on boot when trying to use pass-through mode. Signed-off-by: Alex Williamson Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-dma.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1a041bcf506..ae13e34f724 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -212,10 +212,8 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "soft", 4)) swiotlb = 1; #endif - if (!strncmp(p, "pt", 2)) { + if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; - return 1; - } gart_parse_options(p); -- cgit v1.2.3 From 19943b0e30b05d42e494ae6fef78156ebc8c637e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 4 Aug 2009 16:19:20 +0100 Subject: intel-iommu: Unify hardware and software passthrough support This makes the hardware passthrough mode work a lot more like the software version, so that the behaviour of a kernel with 'iommu=pt' is the same whether the hardware supports passthrough or not. In particular: - We use a single si_domain for the pass-through devices. - 32-bit devices can be taken out of the pass-through domain so that they don't have to use swiotlb. - Devices will work again after being removed from a KVM guest. - A potential oops on OOM (in init_context_pass_through()) is fixed. 
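The per-device consequence is easiest to see in isolation: with iommu=pt every identity-mapped device now lands in si_domain, and the only thing hardware pass-through support changes is the context-entry type. A minimal sketch of that decision (simplified user-space model using the driver's CONTEXT_TT_* names; not the actual driver API):

#include <stdio.h>

enum context_tt {
	CONTEXT_TT_MULTI_LEVEL,		/* normal page-table walk */
	CONTEXT_TT_PASS_THROUGH,	/* hardware pass-through */
};

/* Models the choice made in iommu_prepare_static_identity_mapping():
 * hardware that advertises ecap pass-through skips translation
 * entirely; everything else gets a software identity map through
 * ordinary multi-level page tables. */
static enum context_tt identity_context(int hw_pass_through)
{
	return hw_pass_through ? CONTEXT_TT_PASS_THROUGH
			       : CONTEXT_TT_MULTI_LEVEL;
}

int main(void)
{
	printf("ecap PT present -> %s\n",
	       identity_context(1) == CONTEXT_TT_PASS_THROUGH ?
	       "hardware pass-through context" : "software identity map");
	printf("ecap PT absent  -> %s\n",
	       identity_context(0) == CONTEXT_TT_PASS_THROUGH ?
	       "hardware pass-through context" : "software identity map");
	return 0;
}

Either way the device can later be pulled out of si_domain, for example for a 32-bit DMA mask or KVM assignment, without special-casing the hardware mode.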
Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-swiotlb.c | 5 +- drivers/pci/intel-iommu.c | 174 ++++++++++++++++++------------------------ 2 files changed, 76 insertions(+), 103 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 6af96ee4420..1e66b18f45c 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -71,9 +71,8 @@ void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || - iommu_pass_through) - swiotlb = 1; + if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) + swiotlb = 1; #endif if (swiotlb_force) swiotlb = 1; diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 097d5da2fae..147b3b960b6 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -251,7 +251,8 @@ static inline int first_pte_in_page(struct dma_pte *pte) * 2. It maps to each iommu if successful. * 3. Each iommu mapps to this domain if successful. */ -struct dmar_domain *si_domain; +static struct dmar_domain *si_domain; +static int hw_pass_through = 1; /* devices under the same p2p bridge are owned in one domain */ #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0) @@ -1948,14 +1949,24 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev, struct dmar_domain *domain; int ret; - printk(KERN_INFO - "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n", - pci_name(pdev), start, end); - domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); if (!domain) return -ENOMEM; + /* For _hardware_ passthrough, don't bother. But for software + passthrough, we do it anyway -- it may indicate a memory + range which is reserved in E820, so which didn't get set + up to start with in si_domain */ + if (domain == si_domain && hw_pass_through) { + printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n", + pci_name(pdev), start, end); + return 0; + } + + printk(KERN_INFO + "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n", + pci_name(pdev), start, end); + ret = iommu_domain_identity_map(domain, start, end); if (ret) goto error; @@ -2006,23 +2017,6 @@ static inline void iommu_prepare_isa(void) } #endif /* !CONFIG_DMAR_FLPY_WA */ -/* Initialize each context entry as pass through.*/ -static int __init init_context_pass_through(void) -{ - struct pci_dev *pdev = NULL; - struct dmar_domain *domain; - int ret; - - for_each_pci_dev(pdev) { - domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); - ret = domain_context_mapping(domain, pdev, - CONTEXT_TT_PASS_THROUGH); - if (ret) - return ret; - } - return 0; -} - static int md_domain_init(struct dmar_domain *domain, int guest_width); static int __init si_domain_work_fn(unsigned long start_pfn, @@ -2037,7 +2031,7 @@ static int __init si_domain_work_fn(unsigned long start_pfn, } -static int si_domain_init(void) +static int si_domain_init(int hw) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; @@ -2064,6 +2058,9 @@ static int si_domain_init(void) si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY; + if (hw) + return 0; + for_each_online_node(nid) { work_with_active_regions(nid, si_domain_work_fn, &ret); if (ret) @@ -2155,24 +2152,26 @@ static int iommu_should_identity_map(struct pci_dev *pdev, int startup) return 1; } -static int iommu_prepare_static_identity_mapping(void) +static int iommu_prepare_static_identity_mapping(int hw) { struct pci_dev *pdev = NULL; 
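/* hw selects real pass-through context entries; without it si_domain provides a software identity map */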
int ret; - ret = si_domain_init(); + ret = si_domain_init(hw); if (ret) return -EFAULT; for_each_pci_dev(pdev) { if (iommu_should_identity_map(pdev, 1)) { - printk(KERN_INFO "IOMMU: identity mapping for device %s\n", - pci_name(pdev)); + printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n", + hw ? "hardware" : "software", pci_name(pdev)); ret = domain_context_mapping(si_domain, pdev, + hw ? CONTEXT_TT_PASS_THROUGH : CONTEXT_TT_MULTI_LEVEL); if (ret) return ret; + ret = domain_add_dev_info(si_domain, pdev); if (ret) return ret; @@ -2189,14 +2188,6 @@ int __init init_dmars(void) struct pci_dev *pdev; struct intel_iommu *iommu; int i, ret; - int pass_through = 1; - - /* - * In case pass through can not be enabled, iommu tries to use identity - * mapping. - */ - if (iommu_pass_through) - iommu_identity_mapping = 1; /* * for each drhd @@ -2250,14 +2241,8 @@ int __init init_dmars(void) goto error; } if (!ecap_pass_through(iommu->ecap)) - pass_through = 0; + hw_pass_through = 0; } - if (iommu_pass_through) - if (!pass_through) { - printk(KERN_INFO - "Pass Through is not supported by hardware.\n"); - iommu_pass_through = 0; - } /* * Start from the sane iommu hardware state. @@ -2312,64 +2297,57 @@ int __init init_dmars(void) } } + if (iommu_pass_through) + iommu_identity_mapping = 1; +#ifdef CONFIG_DMAR_BROKEN_GFX_WA + else + iommu_identity_mapping = 2; +#endif /* - * If pass through is set and enabled, context entries of all pci - * devices are intialized by pass through translation type. + * If pass through is not set or not enabled, setup context entries for + * identity mappings for rmrr, gfx, and isa and may fall back to static + * identity mapping if iommu_identity_mapping is set. */ - if (iommu_pass_through) { - ret = init_context_pass_through(); + if (iommu_identity_mapping) { + ret = iommu_prepare_static_identity_mapping(hw_pass_through); if (ret) { - printk(KERN_ERR "IOMMU: Pass through init failed.\n"); - iommu_pass_through = 0; + printk(KERN_CRIT "Failed to setup IOMMU pass-through\n"); + goto error; } } - /* - * If pass through is not set or not enabled, setup context entries for - * identity mappings for rmrr, gfx, and isa and may fall back to static - * identity mapping if iommu_identity_mapping is set. + * For each rmrr + * for each dev attached to rmrr + * do + * locate drhd for dev, alloc domain for dev + * allocate free domain + * allocate page table entries for rmrr + * if context not allocated for bus + * allocate and init context + * set present in root table for this bus + * init context with domain, translation etc + * endfor + * endfor */ - if (!iommu_pass_through) { -#ifdef CONFIG_DMAR_BROKEN_GFX_WA - if (!iommu_identity_mapping) - iommu_identity_mapping = 2; -#endif - if (iommu_identity_mapping) - iommu_prepare_static_identity_mapping(); - /* - * For each rmrr - * for each dev attached to rmrr - * do - * locate drhd for dev, alloc domain for dev - * allocate free domain - * allocate page table entries for rmrr - * if context not allocated for bus - * allocate and init context - * set present in root table for this bus - * init context with domain, translation etc - * endfor - * endfor - */ - printk(KERN_INFO "IOMMU: Setting RMRR:\n"); - for_each_rmrr_units(rmrr) { - for (i = 0; i < rmrr->devices_cnt; i++) { - pdev = rmrr->devices[i]; - /* - * some BIOS lists non-exist devices in DMAR - * table. 
- */ - if (!pdev) - continue; - ret = iommu_prepare_rmrr_dev(rmrr, pdev); - if (ret) - printk(KERN_ERR - "IOMMU: mapping reserved region failed\n"); - } + printk(KERN_INFO "IOMMU: Setting RMRR:\n"); + for_each_rmrr_units(rmrr) { + for (i = 0; i < rmrr->devices_cnt; i++) { + pdev = rmrr->devices[i]; + /* + * some BIOS lists non-exist devices in DMAR + * table. + */ + if (!pdev) + continue; + ret = iommu_prepare_rmrr_dev(rmrr, pdev); + if (ret) + printk(KERN_ERR + "IOMMU: mapping reserved region failed\n"); } - - iommu_prepare_isa(); } + iommu_prepare_isa(); + /* * for each drhd * enable fault log @@ -2536,7 +2514,10 @@ static int iommu_no_mapping(struct device *dev) ret = domain_add_dev_info(si_domain, pdev); if (ret) return 0; - ret = domain_context_mapping(si_domain, pdev, CONTEXT_TT_MULTI_LEVEL); + ret = domain_context_mapping(si_domain, pdev, + hw_pass_through ? + CONTEXT_TT_PASS_THROUGH : + CONTEXT_TT_MULTI_LEVEL); if (!ret) { printk(KERN_INFO "64bit %s uses identity mapping\n", pci_name(pdev)); @@ -3202,7 +3183,7 @@ int __init intel_iommu_init(void) * Check the need for DMA-remapping initialization now. * Above initialization will also be used by Interrupt-remapping. */ - if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled) + if (no_iommu || swiotlb || dmar_disabled) return -ENODEV; iommu_init_mempool(); @@ -3222,14 +3203,7 @@ int __init intel_iommu_init(void) init_timer(&unmap_timer); force_iommu = 1; - - if (!iommu_pass_through) { - printk(KERN_INFO - "Multi-level page-table translation for DMAR.\n"); - dma_ops = &intel_dma_ops; - } else - printk(KERN_INFO - "DMAR: Pass through translation for DMAR.\n"); + dma_ops = &intel_dma_ops; init_iommu_sysfs(); -- cgit v1.2.3 From 5b7e88edc6193f36941bccbfd5ed9ed5fe27d2e1 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:40 +0800 Subject: x86, mce: Support specifying context for software mce injection The cpu context is specified via the new mce.inject_flags fields. This allows more realistic machine check testing in different situations. "RANDOM" context is implemented via NMI broadcasting to add randomization to testing. AK: Fix NMI broadcasting check. Fix 32-bit building. Some race fixes. Move to module. Various changes ChangeLog: v3: - Re-based on latest x86-tip.git/mce4 - Fix 32-bit building v2: - Re-base on latest x86-tip.git/mce3 Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/mce.h | 11 ++- arch/x86/kernel/cpu/mcheck/mce-inject.c | 156 ++++++++++++++++++++++++++------ 2 files changed, 135 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ad753537291..8945be9ad2b 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -38,6 +38,13 @@ #define MCM_ADDR_MEM 3 /* memory address */ #define MCM_ADDR_GENERIC 7 /* generic */ +#define MCJ_CTX_MASK 3 +#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) +#define MCJ_CTX_RANDOM 0 /* inject context: random */ +#define MCJ_CTX_PROCESS 1 /* inject context: process */ +#define MCJ_CTX_IRQ 2 /* inject context: IRQ */ +#define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ + /* Fields are zero when not available */ struct mce { __u64 status; @@ -48,8 +55,8 @@ struct mce { __u64 tsc; /* cpu time stamp counter */ __u64 time; /* wall time_t when error was detected */ __u8 cpuvendor; /* cpu vendor as encoded in system.h */ - __u8 pad1; - __u16 pad2; + __u8 inject_flags; /* software inject flags */ + __u16 pad; __u32 cpuid; /* CPUID 1 EAX */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index a3a235a53f0..ad5d92790eb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -18,7 +18,12 @@ #include #include #include +#include +#include +#include +#include #include +#include /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) @@ -39,44 +44,141 @@ static void inject_mce(struct mce *m) i->finished = 1; } -struct delayed_mce { - struct timer_list timer; - struct mce m; +static void raise_corrected(struct mce *m) +{ + unsigned long flags; + mce_banks_t b; + + memset(&b, 0xff, sizeof(mce_banks_t)); + local_irq_save(flags); + machine_check_poll(0, &b); + local_irq_restore(flags); + m->finished = 0; +} + +static void raise_uncorrected(struct mce *m, struct pt_regs *pregs) +{ + struct pt_regs regs; + unsigned long flags; + + if (!pregs) { + memset(®s, 0, sizeof(struct pt_regs)); + regs.ip = m->ip; + regs.cs = m->cs; + pregs = ®s; + } + /* in mcheck exeception handler, irq will be disabled */ + local_irq_save(flags); + do_machine_check(pregs, 0); + local_irq_restore(flags); + m->finished = 0; +} + +static cpumask_t mce_inject_cpumask; + +static int mce_raise_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) + return NOTIFY_DONE; + cpu_clear(cpu, mce_inject_cpumask); + if (m->status & MCI_STATUS_UC) + raise_uncorrected(m, args->regs); + else if (m->status) + raise_corrected(m); + return NOTIFY_STOP; +} + +static struct notifier_block mce_raise_nb = { + .notifier_call = mce_raise_notify, + .priority = 1000, }; /* Inject mce on current CPU */ -static void raise_mce(unsigned long data) +static int raise_local(struct mce *m) { - struct delayed_mce *dm = (struct delayed_mce *)data; - struct mce *m = &dm->m; + int context = MCJ_CTX(m->inject_flags); + int ret = 0; int cpu = m->extcpu; - inject_mce(m); if (m->status & MCI_STATUS_UC) { - struct pt_regs regs; - memset(®s, 0, sizeof(struct pt_regs)); - regs.ip = m->ip; - regs.cs = m->cs; printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); - do_machine_check(®s, 0); + 
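/* deliver the fake #MC in whatever context user space requested via MCJ_CTX_* */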
switch (context) { + case MCJ_CTX_IRQ: + /* + * Could do more to fake interrupts like + * calling irq_enter, but the necessary + * machinery isn't exported currently. + */ + /*FALL THROUGH*/ + case MCJ_CTX_PROCESS: + raise_uncorrected(m, NULL); + break; + default: + printk(KERN_INFO "Invalid MCE context\n"); + ret = -EINVAL; + } printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); - } else { - mce_banks_t b; - memset(&b, 0xff, sizeof(mce_banks_t)); + } else if (m->status) { printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); - machine_check_poll(0, &b); + raise_corrected(m); mce_notify_irq(); - printk(KERN_INFO "Finished machine check poll on CPU %d\n", - cpu); - } - kfree(dm); + printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); + } else + m->finished = 0; + + return ret; +} + +static void raise_mce(struct mce *m) +{ + int context = MCJ_CTX(m->inject_flags); + + inject_mce(m); + + if (context == MCJ_CTX_RANDOM) + return; + +#ifdef CONFIG_X86_LOCAL_APIC + if (m->inject_flags & MCJ_NMI_BROADCAST) { + unsigned long start; + int cpu; + get_online_cpus(); + mce_inject_cpumask = cpu_online_map; + cpu_clear(get_cpu(), mce_inject_cpumask); + for_each_online_cpu(cpu) { + struct mce *mcpu = &per_cpu(injectm, cpu); + if (!mcpu->finished || + MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) + cpu_clear(cpu, mce_inject_cpumask); + } + if (!cpus_empty(mce_inject_cpumask)) + apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); + start = jiffies; + while (!cpus_empty(mce_inject_cpumask)) { + if (!time_before(jiffies, start + 2*HZ)) { + printk(KERN_ERR + "Timeout waiting for mce inject NMI %lx\n", + *cpus_addr(mce_inject_cpumask)); + break; + } + cpu_relax(); + } + raise_local(m); + put_cpu(); + put_online_cpus(); + } else +#endif + raise_local(m); } /* Error injection interface */ static ssize_t mce_write(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off) { - struct delayed_mce *dm; struct mce m; if (!capable(CAP_SYS_ADMIN)) @@ -96,19 +198,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) return -EINVAL; - dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); - if (!dm) - return -ENOMEM; - /* * Need to give user space some time to set everything up, * so do it a jiffie or two later everywhere. - * Should we use a hrtimer here for better synchronization? */ - memcpy(&dm->m, &m, sizeof(struct mce)); - setup_timer(&dm->timer, raise_mce, (unsigned long)dm); - dm->timer.expires = jiffies + 2; - add_timer_on(&dm->timer, m.extcpu); + schedule_timeout(2); + raise_mce(&m); return usize; } @@ -116,6 +211,7 @@ static int inject_init(void) { printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; + register_die_notifier(&mce_raise_nb); return 0; } -- cgit v1.2.3 From 0dcc66851f1091af421416c28a9458836885f522 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:41 +0800 Subject: x86, mce: Support specifying raise mode for software MCE injection Raise mode include raising as exception or raising as poll, it is specified via the mce.inject_flags field. This can be used to specify raise mode of UCNA, which is UC error but raised not as exception. And this can be used to test the filter code of poll handler or exception handler too. For example, enforce a poll raise mode for a fatal MCE. ChangeLog: v2: - Re-base on latest x86-tip.git/mce3 Signed-off-by: Huang Ying Signed-off-by: H. 
ChangeLog: v2: - Re-base on latest x86-tip.git/mce3 Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce-inject.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 8945be9ad2b..b608a64c581 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -44,6 +44,7 @@ #define MCJ_CTX_PROCESS 1 /* inject context: process */ #define MCJ_CTX_IRQ 2 /* inject context: IRQ */ #define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ +#define MCJ_EXCEPTION 8 /* raise as exception */ /* Fields are zero when not available */ struct mce { diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index ad5d92790eb..7029f0e2aca 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -44,7 +44,7 @@ static void inject_mce(struct mce *m) i->finished = 1; } -static void raise_corrected(struct mce *m) +static void raise_poll(struct mce *m) { unsigned long flags; mce_banks_t b; @@ -56,7 +56,7 @@ static void raise_corrected(struct mce *m) m->finished = 0; } -static void raise_uncorrected(struct mce *m, struct pt_regs *pregs) +static void raise_exception(struct mce *m, struct pt_regs *pregs) { struct pt_regs regs; unsigned long flags; @@ -85,10 +85,10 @@ static int mce_raise_notify(struct notifier_block *self, if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) return NOTIFY_DONE; cpu_clear(cpu, mce_inject_cpumask); - if (m->status & MCI_STATUS_UC) - raise_uncorrected(m, args->regs); + if (m->inject_flags & MCJ_EXCEPTION) + raise_exception(m, args->regs); else if (m->status) - raise_corrected(m); + raise_poll(m); return NOTIFY_STOP; } @@ -104,7 +104,7 @@ static int raise_local(struct mce *m) int ret = 0; int cpu = m->extcpu; - if (m->status & MCI_STATUS_UC) { + if (m->inject_flags & MCJ_EXCEPTION) { printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); switch (context) { case MCJ_CTX_IRQ: @@ -115,7 +115,7 @@ static int raise_local(struct mce *m) */ /*FALL THROUGH*/ case MCJ_CTX_PROCESS: - raise_uncorrected(m, NULL); + raise_exception(m, NULL); break; default: printk(KERN_INFO "Invalid MCE context\n"); @@ -124,7 +124,7 @@ static int raise_local(struct mce *m) printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); } else if (m->status) { printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); - raise_corrected(m); + raise_poll(m); mce_notify_irq(); printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); } else -- cgit v1.2.3 From 5be9ed251f58881dfc3dd6742a81ff9ad1a7bb04 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:42 +0800 Subject: x86, mce: Move debugfs mce dir creation to mce.c More debugfs files under the mce dir will be created in mce.c. ChangeLog: v5: - Rebased on x86-tip.git/mce Signed-off-by: Huang Ying Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-internal.h | 1 + arch/x86/kernel/cpu/mcheck/mce-severity.c | 4 +--- arch/x86/kernel/cpu/mcheck/mce.c | 13 +++++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 6bd51e7ba87..32996f9fab6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -22,6 +22,7 @@ struct mce_bank { }; int mce_severity(struct mce *a, int tolerant, char **msg); +struct dentry *mce_get_debugfs_dir(void); extern int mce_ser; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 51f7c725dab..bc35a073d15 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -197,7 +197,7 @@ static int __init severities_debugfs_init(void) { struct dentry *dmce = NULL, *fseverities_coverage = NULL; - dmce = debugfs_create_dir("mce", NULL); + dmce = mce_get_debugfs_dir(); if (dmce == NULL) goto err_out; fseverities_coverage = debugfs_create_file("severities-coverage", @@ -209,8 +209,6 @@ static int __init severities_debugfs_init(void) return 0; err_out: - if (dmce) - debugfs_remove(dmce); return -ENOMEM; } late_initcall(severities_debugfs_init); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1ce6db1f878..9c7419e459d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -2003,3 +2004,15 @@ static int __init mcheck_disable(char *str) return 1; } __setup("nomce", mcheck_disable); + +#ifdef CONFIG_DEBUG_FS +struct dentry *mce_get_debugfs_dir(void) +{ + static struct dentry *dmce; + + if (!dmce) + dmce = debugfs_create_dir("mce", NULL); + + return dmce; +} +#endif -- cgit v1.2.3 From bf783f9f7d33576815bc89f9f1856a7309ea2f17 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:43 +0800 Subject: x86, mce: Fake panic support for MCE testing If "fake panic" mode is turned on, just log the panic message instead of actually panicking. This is used for testing only, so that the test suite can check for the correct panic message and do regression testing for cases where MCE would panic. This patch is based on x86-tip.git/mce. ChangeLog: v5: - Rebased on x86-tip.git/mce v4: - Move config file from sysfs to debugfs Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 75 ++++++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9c7419e459d..54bd1b2fb4c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -204,6 +204,9 @@ static void print_mce_tail(void) static atomic_t mce_paniced; +static int fake_panic; +static atomic_t mce_fake_paniced; + /* Panic in progress.
Enable interrupts and wait for final IPI */ static void wait_for_panic(void) { @@ -221,15 +224,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp) { int i; - /* - * Make sure only one CPU runs in machine check panic - */ - if (atomic_inc_return(&mce_paniced) > 1) - wait_for_panic(); - barrier(); + if (!fake_panic) { + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_inc_return(&mce_paniced) > 1) + wait_for_panic(); + barrier(); - bust_spinlocks(1); - console_verbose(); + bust_spinlocks(1); + console_verbose(); + } else { + /* Don't log too much for fake panic */ + if (atomic_inc_return(&mce_fake_paniced) > 1) + return; + } print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -256,9 +265,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp) print_mce_tail(); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); - if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; - panic(msg); + if (!fake_panic) { + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; + panic(msg); + } else + printk(KERN_EMERG "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -2015,4 +2027,45 @@ struct dentry *mce_get_debugfs_dir(void) return dmce; } + +static void mce_reset(void) +{ + cpu_missing = 0; + atomic_set(&mce_fake_paniced, 0); + atomic_set(&mce_executing, 0); + atomic_set(&mce_callin, 0); + atomic_set(&global_nwo, 0); +} + +static int fake_panic_get(void *data, u64 *val) +{ + *val = fake_panic; + return 0; +} + +static int fake_panic_set(void *data, u64 val) +{ + mce_reset(); + fake_panic = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, + fake_panic_set, "%llu\n"); + +static int __init mce_debugfs_init(void) +{ + struct dentry *dmce, *ffake_panic; + + dmce = mce_get_debugfs_dir(); + if (!dmce) + return -ENOMEM; + ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, + &fake_panic_fops); + if (!ffake_panic) + return -ENOMEM; + + return 0; +} +late_initcall(mce_debugfs_init); #endif -- cgit v1.2.3 From 81e2d7b30d718824434725a4a24d5864a71b1d30 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 12 Aug 2009 05:45:34 -0700 Subject: x86, intel_txt: tboot.c needs arch/x86/kernel/tboot.c needs . In most configurations that ends up getting implicitly included, but not in all, causing build failures in some configurations. Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: Joseph Cihula Cc: Shane Wang --- arch/x86/kernel/tboot.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 263591afd29..1ab80120894 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 00ae4064b1445524752575dd84df227c0687c99d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:49 +0900 Subject: percpu: rename 4k first chunk allocator to page Page size isn't always 4k depending on arch and configuration. Rename 4k first chunk allocator to page. 
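With the rename, a command line that used to select the fallback allocator explicitly changes accordingly (illustrative boot parameter usage; the parameter itself is documented in the kernel-parameters.txt hunk below):

	percpu_alloc=4k		(before)
	percpu_alloc=page	(after)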
Signed-off-by: Tejun Heo Cc: David Howells --- Documentation/kernel-parameters.txt | 2 +- arch/x86/kernel/setup_percpu.c | 23 ++++++++++++----------- include/linux/percpu.h | 2 +- mm/percpu.c | 25 ++++++++++++++----------- 4 files changed, 28 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7936b801fe6..12e9eb77ee0 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1920,7 +1920,7 @@ and is between 256 and 4096 characters. It is defined in the file See arch/parisc/kernel/pdc_chassis.c percpu_alloc= [X86] Select which percpu first chunk allocator to use. - Allowed values are one of "lpage", "embed" and "4k". + Allowed values are one of "lpage", "embed" and "page". See comments in arch/x86/kernel/setup_percpu.c for details on each allocator. This parameter is primarily for debugging and performance comparison. diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index a26ff61e2fb..1e17711c29d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -249,21 +249,22 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) } /* - * 4k allocator + * Page allocator * - * Boring fallback 4k allocator. This allocator puts more pressure on - * PTE TLBs but other than that behaves nicely on both UMA and NUMA. + * Boring fallback 4k page allocator. This allocator puts more + * pressure on PTE TLBs but other than that behaves nicely on both UMA + * and NUMA. */ -static void __init pcpu4k_populate_pte(unsigned long addr) +static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_4k(size_t static_size) +static ssize_t __init setup_pcpu_page(size_t static_size) { - return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - pcpu_fc_alloc, pcpu_fc_free, - pcpu4k_populate_pte); + return pcpu_page_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); } /* for explicit first chunk allocator selection */ @@ -307,7 +308,7 @@ void __init setup_per_cpu_areas(void) */ ret = -EINVAL; if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "4k")) { + if (strcmp(pcpu_chosen_alloc, "page")) { if (!strcmp(pcpu_chosen_alloc, "lpage")) ret = setup_pcpu_lpage(static_size, true); else if (!strcmp(pcpu_chosen_alloc, "embed")) @@ -317,7 +318,7 @@ void __init setup_per_cpu_areas(void) "specified\n", pcpu_chosen_alloc); if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " - "falling back to 4k\n", + "falling back to page size\n", pcpu_chosen_alloc, ret); } } else { @@ -326,7 +327,7 @@ void __init setup_per_cpu_areas(void) ret = setup_pcpu_embed(static_size, false); } if (ret < 0) - ret = setup_pcpu_4k(static_size); + ret = setup_pcpu_page(static_size); if (ret < 0) panic("cannot allocate static percpu area (%zu bytes, err=%zd)", static_size, ret); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e134c822963..7989f61b03f 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -74,7 +74,7 @@ extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); -extern ssize_t __init pcpu_4k_first_chunk( +extern ssize_t __init pcpu_page_first_chunk( size_t static_size, size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, diff --git a/mm/percpu.c b/mm/percpu.c index cbddcbdab68..6feac793490 
100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1497,15 +1497,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } /** - * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages + * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE * @free_fn: funtion to free percpu page, always called with PAGE_SIZE * @populate_pte_fn: function to populate pte * - * This is a helper to ease setting up embedded first percpu chunk and - * can be called where pcpu_setup_first_chunk() is expected. + * This is a helper to ease setting up page-remapped first percpu + * chunk and can be called where pcpu_setup_first_chunk() is expected. * * This is the basic allocator. Static percpu area is allocated * page-by-page into vmalloc area. @@ -1514,12 +1514,13 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn) +ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; + char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; @@ -1527,6 +1528,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, int i, j; ssize_t ret; + snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); + unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, PCPU_MIN_UNIT_SIZE)); @@ -1542,8 +1545,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, ptr = alloc_fn(cpu, PAGE_SIZE); if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); + pr_warning("PERCPU: failed to allocate %s page " + "for cpu%u\n", psize_str, cpu); goto enomem; } pages[j++] = virt_to_page(ptr); @@ -1580,8 +1583,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, } /* we're ready, commit */ - pr_info("PERCPU: %d 4k pages/cpu @%p s%zu r%zu\n", - unit_pages, vm.addr, static_size, reserved_size); + pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu\n", + unit_pages, psize_str, vm.addr, static_size, reserved_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, unit_pages << PAGE_SHIFT, vm.addr, NULL); -- cgit v1.2.3 From 08fc45806103e59a37418e84719b878f9bb32540 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:49 +0900 Subject: percpu: build first chunk allocators selectively There's no need to build unused first chunk allocators in. Define CONFIG_NEED_PER_CPU_*_FIRST_CHUNK and let archs enable them selectively. 
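For example, an arch whose setup_per_cpu_areas() can use the embed and page allocators would enable just those from its Kconfig, along the lines of the x86 hunk below (a sketch; which symbols an arch actually selects depends on its first chunk setup):

config NEED_PER_CPU_EMBED_FIRST_CHUNK
	def_bool y

config NEED_PER_CPU_PAGE_FIRST_CHUNK
	def_bool y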
Signed-off-by: Tejun Heo --- arch/x86/Kconfig | 10 ++++++++++ include/linux/percpu.h | 27 +++++---------------------- mm/percpu.c | 19 +++++++++++-------- 3 files changed, 26 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e06b2eeff9f..f7ac2721551 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -150,6 +150,16 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y +config NEED_PER_CPU_EMBED_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_PAGE_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_LPAGE_FIRST_CHUNK + def_bool y + depends on NEED_MULTIPLE_NODES + config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 7989f61b03f..e26788e0da4 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -70,17 +70,21 @@ extern size_t __init pcpu_setup_first_chunk( ssize_t dyn_size, size_t unit_size, void *base_addr, const int *unit_map); +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern ssize_t __init pcpu_page_first_chunk( size_t static_size, size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); +#endif -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern int __init pcpu_lpage_build_unit_map( size_t static_size, size_t reserved_size, ssize_t *dyn_sizep, size_t *unit_sizep, @@ -98,27 +102,6 @@ extern ssize_t __init pcpu_lpage_first_chunk( extern void *pcpu_lpage_remapped(void *kaddr); #else -static inline int pcpu_lpage_build_unit_map( - size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) -{ - return -EINVAL; -} - -static inline ssize_t __init pcpu_lpage_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) -{ - return -EINVAL; -} - static inline void *pcpu_lpage_remapped(void *kaddr) { return NULL; diff --git a/mm/percpu.c b/mm/percpu.c index 6feac793490..7971997de31 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1414,8 +1414,9 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, return pcpu_unit_size; } -static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep) +static inline size_t pcpu_calc_fc_sizes(size_t static_size, + size_t reserved_size, + ssize_t *dyn_sizep) { size_t size_sum; @@ -1427,6 +1428,8 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, return size_sum; } +#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ + !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @static_size: the size of static percpu area in bytes @@ -1495,7 +1498,10 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, unit_size, base, NULL); } +#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || + !CONFIG_HAVE_SETUP_PER_CPU_AREA */ +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * 
@static_size: the size of static percpu area in bytes @@ -1598,12 +1604,9 @@ out_free_ar: free_bootmem(__pa(pages), pages_size); return ret; } +#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ -/* - * Large page remapping first chunk setup helper - */ -#ifdef CONFIG_NEED_MULTIPLE_NODES - +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** * pcpu_lpage_build_unit_map - build unit_map for large page remapping * @static_size: the size of static percpu area in bytes @@ -1982,7 +1985,7 @@ void *pcpu_lpage_remapped(void *kaddr) return NULL; } -#endif +#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */ /* * Generic percpu area setup. -- cgit v1.2.3 From f58dc01ba2ca9fe3ab2ba4ca43d9c8a735cf62d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: percpu: generalize first chunk allocator selection Now that all first chunk allocators are in mm/percpu.c, it makes sense to generalize the percpu_alloc kernel parameter. Define PCPU_FC_* and set pcpu_chosen_fc using early_param() in mm/percpu.c. Arch code can use the set value to determine which first chunk allocator to use. Signed-off-by: Tejun Heo --- Documentation/kernel-parameters.txt | 11 ++++++----- arch/x86/kernel/setup_percpu.c | 24 ++++++------------------ include/linux/percpu.h | 12 ++++++++++++ mm/percpu.c | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 12e9eb77ee0..dee9ce2e6cf 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1919,11 +1919,12 @@ and is between 256 and 4096 characters. It is defined in the file Format: { 0 | 1 } See arch/parisc/kernel/pdc_chassis.c - percpu_alloc= [X86] Select which percpu first chunk allocator to use. - Allowed values are one of "lpage", "embed" and "page". - See comments in arch/x86/kernel/setup_percpu.c for - details on each allocator. This parameter is primarily - for debugging and performance comparison. + percpu_alloc= Select which percpu first chunk allocator to use. + Currently supported values are "embed", "page" and + "lpage". Archs may support subset or none of the + selections. See comments in mm/percpu.c for details + on each allocator. This parameter is primarily for + debugging and performance comparison. pf. [PARIDE] See Documentation/blockdev/paride.txt. diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 1e17711c29d..b961d99e641 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -267,16 +267,6 @@ static ssize_t __init setup_pcpu_page(size_t static_size) pcpup_populate_pte); } -/* for explicit first chunk allocator selection */ -static char pcpu_chosen_alloc[16] __initdata; - -static int __init percpu_alloc_setup(char *str) -{ - strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); - return 0; -} -early_param("percpu_alloc", percpu_alloc_setup); - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -307,19 +297,17 @@ void __init setup_per_cpu_areas(void) * each allocator for details.
*/ ret = -EINVAL; - if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "page")) { - if (!strcmp(pcpu_chosen_alloc, "lpage")) + if (pcpu_chosen_fc != PCPU_FC_AUTO) { + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + if (pcpu_chosen_fc == PCPU_FC_LPAGE) ret = setup_pcpu_lpage(static_size, true); - else if (!strcmp(pcpu_chosen_alloc, "embed")) - ret = setup_pcpu_embed(static_size, true); else - pr_warning("PERCPU: unknown allocator %s " - "specified\n", pcpu_chosen_alloc); + ret = setup_pcpu_embed(static_size, true); + if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " "falling back to page size\n", - pcpu_chosen_alloc, ret); + pcpu_fc_names[pcpu_chosen_fc], ret); } } else { ret = setup_pcpu_lpage(static_size, false); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e26788e0da4..9be05cbe5ee 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,6 +59,18 @@ extern void *pcpu_base_addr; extern const int *pcpu_unit_map; +enum pcpu_fc { + PCPU_FC_AUTO, + PCPU_FC_EMBED, + PCPU_FC_PAGE, + PCPU_FC_LPAGE, + + PCPU_FC_NR, +}; +extern const char *pcpu_fc_names[PCPU_FC_NR]; + +extern enum pcpu_fc pcpu_chosen_fc; + typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); diff --git a/mm/percpu.c b/mm/percpu.c index 7971997de31..7fb40bb1555 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1414,6 +1414,38 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, return pcpu_unit_size; } +const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { + [PCPU_FC_AUTO] = "auto", + [PCPU_FC_EMBED] = "embed", + [PCPU_FC_PAGE] = "page", + [PCPU_FC_LPAGE] = "lpage", +}; + +enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; + +static int __init percpu_alloc_setup(char *str) +{ + if (0) + /* nada */; +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK + else if (!strcmp(str, "embed")) + pcpu_chosen_fc = PCPU_FC_EMBED; +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + else if (!strcmp(str, "page")) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK + else if (!strcmp(str, "lpage")) + pcpu_chosen_fc = PCPU_FC_LPAGE; +#endif + else + pr_warning("PERCPU: unknown allocator %s specified\n", str); + + return 0; +} +early_param("percpu_alloc", percpu_alloc_setup); + static inline size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, ssize_t *dyn_sizep) -- cgit v1.2.3 From 9a7737691e90d3cce0e5248f91826c50e5aa3fcf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: percpu: drop @static_size from first chunk allocators First chunk allocators assume percpu areas have been linked using one of PERCPU_*() macros and depend on __per_cpu_load symbol defined by those macros, so there isn't much point in passing in static area size explicitly when it can be easily calculated from __per_cpu_start and __per_cpu_end. Drop @static_size from all percpu first chunk allocators and helpers. 
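Each allocator now derives the value internally instead, as in the hunks below:

	const size_t static_size = __per_cpu_end - __per_cpu_start;

so arch callers such as setup_pcpu_embed() no longer need to compute it or pass it around.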
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 34 +++++++++++++++------------------- include/linux/percpu.h | 18 ++++++++---------- mm/percpu.c | 29 +++++++++++++---------------- 3 files changed, 36 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index b961d99e641..8aad486c688 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -157,7 +157,7 @@ static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; @@ -184,8 +184,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) return -ENOMEM; } - ret = pcpu_lpage_build_unit_map(static_size, - PERCPU_FIRST_CHUNK_RESERVE, + ret = pcpu_lpage_build_unit_map(PERCPU_FIRST_CHUNK_RESERVE, &dyn_size, &unit_size, PMD_SIZE, unit_map, pcpu_lpage_cpu_distance); if (ret < 0) { @@ -208,9 +207,8 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) } } - ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - dyn_size, unit_size, PMD_SIZE, - unit_map, nr_units, + ret = pcpu_lpage_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + unit_size, PMD_SIZE, unit_map, nr_units, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); out_free: if (ret < 0) @@ -218,7 +216,7 @@ out_free: return ret; } #else -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_lpage(bool chosen) { return -EINVAL; } @@ -232,7 +230,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) * mapping so that it can use PMD mapping without additional TLB * pressure. 
*/ -static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_embed(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -244,7 +242,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) if (!chosen && (!cpu_has_pse || pcpu_need_numa())) return -EINVAL; - return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, reserve - PERCPU_FIRST_CHUNK_RESERVE); } @@ -260,9 +258,9 @@ static void __init pcpup_populate_pte(unsigned long addr) populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_page(size_t static_size) +static ssize_t __init setup_pcpu_page(void) { - return pcpu_page_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, pcpu_fc_alloc, pcpu_fc_free, pcpup_populate_pte); } @@ -282,7 +280,6 @@ static inline void setup_percpu_segment(int cpu) void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; unsigned int cpu; unsigned long delta; size_t pcpu_unit_size; @@ -300,9 +297,9 @@ void __init setup_per_cpu_areas(void) if (pcpu_chosen_fc != PCPU_FC_AUTO) { if (pcpu_chosen_fc != PCPU_FC_PAGE) { if (pcpu_chosen_fc == PCPU_FC_LPAGE) - ret = setup_pcpu_lpage(static_size, true); + ret = setup_pcpu_lpage(true); else - ret = setup_pcpu_embed(static_size, true); + ret = setup_pcpu_embed(true); if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " @@ -310,15 +307,14 @@ void __init setup_per_cpu_areas(void) pcpu_fc_names[pcpu_chosen_fc], ret); } } else { - ret = setup_pcpu_lpage(static_size, false); + ret = setup_pcpu_lpage(false); if (ret < 0) - ret = setup_pcpu_embed(static_size, false); + ret = setup_pcpu_embed(false); } if (ret < 0) - ret = setup_pcpu_page(static_size); + ret = setup_pcpu_page(); if (ret < 0) - panic("cannot allocate static percpu area (%zu bytes, err=%zd)", - static_size, ret); + panic("cannot initialize percpu area (err=%zd)", ret); pcpu_unit_size = ret; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 9be05cbe5ee..be2fc8fb9b6 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -84,13 +84,12 @@ extern size_t __init pcpu_setup_first_chunk( #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( - size_t static_size, size_t reserved_size, - ssize_t dyn_size); + size_t reserved_size, ssize_t dyn_size); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern ssize_t __init pcpu_page_first_chunk( - size_t static_size, size_t reserved_size, + size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); @@ -98,16 +97,15 @@ extern ssize_t __init pcpu_page_first_chunk( #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern int __init pcpu_lpage_build_unit_map( - size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, + size_t reserved_size, ssize_t *dyn_sizep, + size_t *unit_sizep, size_t lpage_size, + int *unit_map, pcpu_fc_cpu_distance_fn_t cpu_distance_fn); extern ssize_t __init pcpu_lpage_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, + size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, 
pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 7fb40bb1555..e2ac58a39bb 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1464,7 +1464,6 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @@ -1489,9 +1488,9 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size) +ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { + const size_t static_size = __per_cpu_end - __per_cpu_start; size_t size_sum, unit_size, chunk_size; void *base; unsigned int cpu; @@ -1536,7 +1535,6 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE * @free_fn: funtion to free percpu page, always called with PAGE_SIZE @@ -1552,12 +1550,13 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size, +ssize_t __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; + const size_t static_size = __per_cpu_end - __per_cpu_start; char psize_str[16]; int unit_pages; size_t pages_size; @@ -1641,7 +1640,6 @@ out_free_ar: #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** * pcpu_lpage_build_unit_map - build unit_map for large page remapping - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_sizep: in/out parameter for dynamic size, -1 for auto * @unit_sizep: out parameter for unit size @@ -1661,13 +1659,14 @@ out_free_ar: * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and * returns the number of units to be allocated. -errno on failure. 
*/ -int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, +int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, + size_t *unit_sizep, size_t lpage_size, + int *unit_map, pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; int group_cnt_max = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ @@ -1819,7 +1818,6 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes * @unit_size: unit size in bytes @@ -1850,15 +1848,15 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, +ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; + const size_t static_size = __per_cpu_end - __per_cpu_start; size_t chunk_size = unit_size * nr_units; size_t map_size; unsigned int cpu; @@ -2037,7 +2035,6 @@ EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; ssize_t unit_size; unsigned long delta; unsigned int cpu; @@ -2046,7 +2043,7 @@ void __init setup_per_cpu_areas(void) * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ - unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, + unit_size = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE); if (unit_size < 0) panic("Failed to initialized percpu areas."); -- cgit v1.2.3 From 3cbc85652767c38b252c8de55f9fd180b29e4c0d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: percpu: add @align to pcpu_fc_alloc_fn_t pcpu_fc_alloc_fn_t is about to see more interesting usage, add @align parameter. 
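Concretely, the callback type changes from

	typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);

to

	typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size,
					     size_t align);

and the existing call sites pass their previously implicit alignment (PAGE_SIZE or lpage_size) explicitly, as the hunks below show.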
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 4 ++-- include/linux/percpu.h | 3 ++- mm/percpu.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8aad486c688..660cde13314 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -126,9 +126,9 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, /* * Helpers for first chunk memory allocation */ -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return pcpu_alloc_bootmem(cpu, size, size); + return pcpu_alloc_bootmem(cpu, size, align); } static void __init pcpu_fc_free(void *ptr, size_t size) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 0cfdd14d096..d385dbcf190 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -71,7 +71,8 @@ extern const char *pcpu_fc_names[PCPU_FC_NR]; extern enum pcpu_fc pcpu_chosen_fc; -typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, + size_t align); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); diff --git a/mm/percpu.c b/mm/percpu.c index 287f59cc5fb..3316e3aac7e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1578,7 +1578,7 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, for (i = 0; i < unit_pages; i++) { void *ptr; - ptr = alloc_fn(cpu, PAGE_SIZE); + ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { pr_warning("PERCPU: failed to allocate %s page " "for cpu%u\n", psize_str, cpu); @@ -1888,7 +1888,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, goto found; continue; found: - ptr = alloc_fn(cpu, lpage_size); + ptr = alloc_fn(cpu, lpage_size, lpage_size); if (!ptr) { pr_warning("PERCPU: failed to allocate large page " "for cpu%u\n", cpu); -- cgit v1.2.3 From fd1e8a1fe2b54df6c185b4fa65f181f50b9c4d4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:51 +0900 Subject: percpu: introduce pcpu_alloc_info and pcpu_group_info Till now, the non-linear cpu->unit map was expressed using an integer array which maps each cpu to a unit and was used only by the lpage allocator. Although how many units have been placed in a single contiguous area (group) is known while building the unit_map, the information is lost when the result is recorded into the unit_map array. For the lpage allocator, as all allocations are done by lpages and whether two adjacent lpages are in the same group or not is irrelevant, this didn't cause any problem. Non-linear cpu->unit mapping will be used for sparse embedding, and this grouping information is necessary for that. This patch introduces pcpu_alloc_info, which contains all the information necessary for initializing the percpu allocator. pcpu_alloc_info contains an array of pcpu_group_info which describes how units are grouped and mapped to cpus. pcpu_group_info also has a base_offset field to specify its offset from the chunk's base address. pcpu_build_alloc_info() initializes this field as if all groups are allocated back-to-back, as is currently done, but it will be used to sparsely place groups.
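In outline, a first chunk setup built on the new interfaces looks like this (a sketch using only functions introduced or updated by this patch; the variable names and error handling are illustrative):

	struct pcpu_alloc_info *ai;
	size_t unit_size;

	/* group cpus by distance and size the units */
	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
				   cpu_distance_fn);
	if (IS_ERR(ai))
		return PTR_ERR(ai);

	/* ... allocate and map each group's units at base_addr ... */

	/* hand the described, mapped first chunk to the allocator */
	unit_size = pcpu_setup_first_chunk(ai, base_addr);
	pcpu_free_alloc_info(ai);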
pcpu_alloc_info is a rather complex data structure which contains a flexible array which in turn points to nested cpu_map arrays. * pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to help dealing with pcpu_alloc_info. * pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info, generalized and renamed to pcpu_build_alloc_info(). @cpu_distance_fn may be NULL indicating that all cpus are of LOCAL_DISTANCE. * pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info, generalized and renamed to pcpu_dump_alloc_info(). It now also prints which group each alloc unit belongs to. * pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the separate parameters. All first chunk allocators are updated to use pcpu_build_alloc_info() to build alloc_info and call pcpu_setup_first_chunk() with it. This has the side effect of packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4. * x86 setup_pcpu_lpage() is updated to deal with alloc_info. * sparc64 setup_per_cpu_areas() is updated to build alloc_info. Although the changes made by this patch are pretty pervasive, it doesn't cause any behavior difference other than packing of sparse cpus. It mostly changes how information is passed among initialization functions and makes room for more flexibility. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: David Miller --- arch/sparc/kernel/smp_64.c | 24 +- arch/x86/kernel/setup_percpu.c | 38 ++- include/linux/percpu.h | 42 +++- mm/percpu.c | 529 +++++++++++++++++++++++++---------------- 4 files changed, 389 insertions(+), 244 deletions(-) (limited to 'arch/x86') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 9856d866b77..a42a4a744d1 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1475,17 +1475,29 @@ static void __init pcpu_map_range(unsigned long start, unsigned long end, void __init setup_per_cpu_areas(void) { - size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start; static struct vm_struct vm; + struct pcpu_alloc_info *ai; unsigned long delta, cpu; size_t size_sum, pcpu_unit_size; size_t ptrs_size; void **ptrs; - size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + ai = pcpu_alloc_alloc_info(1, nr_cpu_ids); + + ai->static_size = __per_cpu_end - __per_cpu_start; + ai->reserved_size = PERCPU_MODULE_RESERVE; + + size_sum = PFN_ALIGN(ai->static_size + ai->reserved_size + PERCPU_DYNAMIC_RESERVE); - dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE; + ai->dyn_size = size_sum - ai->static_size - ai->reserved_size; + ai->unit_size = PCPU_CHUNK_SIZE; + ai->atom_size = PCPU_CHUNK_SIZE; + ai->alloc_size = PCPU_CHUNK_SIZE; + ai->groups[0].nr_units = nr_cpu_ids; + + for_each_possible_cpu(cpu) + ai->groups[0].cpu_map[cpu] = cpu; ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0])); ptrs = alloc_bootmem(ptrs_size); @@ -1497,7 +1509,7 @@ void __init setup_per_cpu_areas(void) free_bootmem(__pa(ptrs[cpu] + size_sum), PCPU_CHUNK_SIZE - size_sum); - memcpy(ptrs[cpu], __per_cpu_load, static_size); + memcpy(ptrs[cpu], __per_cpu_load, ai->static_size); } /* allocate address and map */ @@ -1514,9 +1526,7 @@ void __init setup_per_cpu_areas(void) pcpu_map_range(start, end, virt_to_page(ptrs[cpu])); } - pcpu_unit_size = pcpu_setup_first_chunk(static_size, - PERCPU_MODULE_RESERVE, dyn_size, - PCPU_CHUNK_SIZE, vm.addr, NULL); + pcpu_unit_size = pcpu_setup_first_chunk(ai, vm.addr); free_bootmem(__pa(ptrs), ptrs_size); diff --git 
a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 660cde13314..db5f9c49fec 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -161,9 +161,7 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; - size_t unit_map_size, unit_size; - int *unit_map; - int nr_units; + struct pcpu_alloc_info *ai; ssize_t ret; /* on non-NUMA, embedding is better */ @@ -177,26 +175,22 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) } /* allocate and build unit_map */ - unit_map_size = nr_cpu_ids * sizeof(int); - unit_map = alloc_bootmem_nopanic(unit_map_size); - if (!unit_map) { - pr_warning("PERCPU: failed to allocate unit_map\n"); - return -ENOMEM; + ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + PMD_SIZE, pcpu_lpage_cpu_distance); + if (IS_ERR(ai)) { + pr_warning("PERCPU: failed to build unit_map (%ld)\n", + PTR_ERR(ai)); + return PTR_ERR(ai); } - ret = pcpu_lpage_build_unit_map(PERCPU_FIRST_CHUNK_RESERVE, - &dyn_size, &unit_size, PMD_SIZE, - unit_map, pcpu_lpage_cpu_distance); - if (ret < 0) { - pr_warning("PERCPU: failed to build unit_map\n"); - goto out_free; - } - nr_units = ret; - /* do the parameters look okay? */ if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = nr_units * unit_size; + size_t tot_size = 0; + int group; + + for (group = 0; group < ai->nr_groups; group++) + tot_size += ai->unit_size * ai->groups[group].nr_units; /* don't consume more than 20% of vmalloc area */ if (tot_size > vm_size / 5) { @@ -207,12 +201,10 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) } } - ret = pcpu_lpage_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - unit_size, PMD_SIZE, unit_map, nr_units, - pcpu_fc_alloc, pcpu_fc_free, pcpul_map); + ret = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, + pcpul_map); out_free: - if (ret < 0) - free_bootmem(__pa(unit_map), unit_map_size); + pcpu_free_alloc_info(ai); return ret; } #else diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 570fb18de2b..77b86be8ce4 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,6 +59,25 @@ extern void *pcpu_base_addr; extern const int *pcpu_unit_map; +struct pcpu_group_info { + int nr_units; /* aligned # of units */ + unsigned long base_offset; /* base address offset */ + unsigned int *cpu_map; /* unit->cpu map, empty + * entries contain NR_CPUS */ +}; + +struct pcpu_alloc_info { + size_t static_size; + size_t reserved_size; + size_t dyn_size; + size_t unit_size; + size_t atom_size; + size_t alloc_size; + size_t __ai_size; /* internal, don't use */ + int nr_groups; /* 0 if grouping unnecessary */ + struct pcpu_group_info groups[]; +}; + enum pcpu_fc { PCPU_FC_AUTO, PCPU_FC_EMBED, @@ -78,18 +97,17 @@ typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern int __init pcpu_lpage_build_unit_map( - size_t reserved_size, ssize_t *dyn_sizep, - size_t *unit_sizep, size_t lpage_size, - int *unit_map, +extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units); +extern void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai); + +extern struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, ssize_t dyn_size, 
+ size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn); -#endif -extern size_t __init pcpu_setup_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - void *base_addr, const int *unit_map); +extern size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( @@ -106,9 +124,7 @@ extern ssize_t __init pcpu_page_first_chunk( #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern ssize_t __init pcpu_lpage_first_chunk( - size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units, + const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 2b9c4b2a2fc..99f7fa68272 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -58,6 +58,7 @@ #include #include +#include #include #include #include @@ -1245,53 +1246,108 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, return size_sum; } -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** - * pcpu_lpage_build_unit_map - build unit_map for large page remapping + * pcpu_alloc_alloc_info - allocate percpu allocation info + * @nr_groups: the number of groups + * @nr_units: the number of units + * + * Allocate ai which is large enough for @nr_groups groups containing + * @nr_units units. The returned ai's groups[0].cpu_map points to the + * cpu_map array which is long enough for @nr_units and filled with + * NR_CPUS. It's the caller's responsibility to initialize cpu_map + * pointer of other groups. + * + * RETURNS: + * Pointer to the allocated pcpu_alloc_info on success, NULL on + * failure. + */ +struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units) +{ + struct pcpu_alloc_info *ai; + size_t base_size, ai_size; + void *ptr; + int unit; + + base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]), + __alignof__(ai->groups[0].cpu_map[0])); + ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); + + ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); + if (!ptr) + return NULL; + ai = ptr; + ptr += base_size; + + ai->groups[0].cpu_map = ptr; + + for (unit = 0; unit < nr_units; unit++) + ai->groups[0].cpu_map[unit] = NR_CPUS; + + ai->nr_groups = nr_groups; + ai->__ai_size = PFN_ALIGN(ai_size); + + return ai; +} + +/** + * pcpu_free_alloc_info - free percpu allocation info + * @ai: pcpu_alloc_info to free + * + * Free @ai which was allocated by pcpu_alloc_alloc_info(). + */ +void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) +{ + free_bootmem(__pa(ai), ai->__ai_size); +} + +/** + * pcpu_build_alloc_info - build alloc_info considering distances between CPUs * @reserved_size: the size of reserved percpu area in bytes - * @dyn_sizep: in/out parameter for dynamic size, -1 for auto - * @unit_sizep: out parameter for unit size - * @unit_map: unit_map to be filled - * @cpu_distance_fn: callback to determine distance between cpus + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional * - * This function builds cpu -> unit map and determine other parameters - * considering needed percpu size, large page size and distances - * between CPUs in NUMA. 
+ * This function determines grouping of units, their mappings to cpus + * and other parameters considering needed percpu size, allocation + * atom size and distances between CPUs. * - * CPUs which are of LOCAL_DISTANCE both ways are grouped together and - * may share units in the same large page. The returned configuration - * is guaranteed to have CPUs on different nodes on different large - * pages and >=75% usage of allocated virtual address space. + * Groups are always mutliples of atom size and CPUs which are of + * LOCAL_DISTANCE both ways are grouped together and share space for + * units in the same group. The returned configuration is guaranteed + * to have CPUs on different nodes on different groups and >=75% usage + * of allocated virtual address space. * * RETURNS: - * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and - * returns the number of units to be allocated. -errno on failure. + * On success, pointer to the new allocation_info is returned. On + * failure, ERR_PTR value is returned. */ -int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, - size_t *unit_sizep, size_t lpage_size, - int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; const size_t static_size = __per_cpu_end - __per_cpu_start; - int group_cnt_max = 0; + int group_cnt_max = 0, nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ - int last_allocs; + int last_allocs, group, unit; unsigned int cpu, tcpu; - int group, unit; + struct pcpu_alloc_info *ai; + unsigned int *cpu_map; /* * Determine min_unit_size, alloc_size and max_upa such that - * alloc_size is multiple of lpage_size and is the smallest + * alloc_size is multiple of atom_size and is the smallest * which can accomodate 4k aligned segments which are equal to * or larger than min_unit_size. 
*/ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - alloc_size = roundup(min_unit_size, lpage_size); + alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) upa--; @@ -1304,10 +1360,11 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, for_each_possible_cpu(tcpu) { if (cpu == tcpu) break; - if (group_map[tcpu] == group && + if (group_map[tcpu] == group && cpu_distance_fn && (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { group++; + nr_groups = max(nr_groups, group + 1); goto next_group; } } @@ -1328,7 +1385,7 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) continue; - for (group = 0; group_cnt[group]; group++) { + for (group = 0; group < nr_groups; group++) { int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); allocs += this_allocs; wasted += this_allocs * upa - group_cnt[group]; @@ -1348,75 +1405,122 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, last_allocs = allocs; best_upa = upa; } - *unit_sizep = alloc_size / best_upa; + upa = best_upa; + + /* allocate and fill alloc_info */ + for (group = 0; group < nr_groups; group++) + nr_units += roundup(group_cnt[group], upa); + + ai = pcpu_alloc_alloc_info(nr_groups, nr_units); + if (!ai) + return ERR_PTR(-ENOMEM); + cpu_map = ai->groups[0].cpu_map; + + for (group = 0; group < nr_groups; group++) { + ai->groups[group].cpu_map = cpu_map; + cpu_map += roundup(group_cnt[group], upa); + } + + ai->static_size = static_size; + ai->reserved_size = reserved_size; + ai->dyn_size = dyn_size; + ai->unit_size = alloc_size / upa; + ai->atom_size = atom_size; + ai->alloc_size = alloc_size; + + for (group = 0, unit = 0; group_cnt[group]; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + + /* + * Initialize base_offset as if all groups are located + * back-to-back. The caller should update this to + * reflect actual allocation. + */ + gi->base_offset = unit * ai->unit_size; - /* assign units to cpus accordingly */ - unit = 0; - for (group = 0; group_cnt[group]; group++) { for_each_possible_cpu(cpu) if (group_map[cpu] == group) - unit_map[cpu] = unit++; - unit = roundup(unit, best_upa); + gi->cpu_map[gi->nr_units++] = cpu; + gi->nr_units = roundup(gi->nr_units, upa); + unit += gi->nr_units; } + BUG_ON(unit != nr_units); - return unit; /* unit contains aligned number of units */ + return ai; } -static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, - unsigned int *cpup); - -static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, - size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units) +/** + * pcpu_dump_alloc_info - print out information about pcpu_alloc_info + * @lvl: loglevel + * @ai: allocation info to dump + * + * Print out information about @ai using loglevel @lvl. 
+ */ +static void pcpu_dump_alloc_info(const char *lvl, + const struct pcpu_alloc_info *ai) { - int width = 1, v = nr_units; + int group_width = 1, cpu_width = 1, width; char empty_str[] = "--------"; - int upl, lpl; /* units per lpage, lpage per line */ - unsigned int cpu; - int lpage, unit; + int alloc = 0, alloc_end = 0; + int group, v; + int upa, apl; /* units per alloc, allocs per line */ + + v = ai->nr_groups; + while (v /= 10) + group_width++; + v = num_possible_cpus(); while (v /= 10) - width++; - empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; + cpu_width++; + empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; - upl = max_t(int, lpage_size / unit_size, 1); - lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); + upa = ai->alloc_size / ai->unit_size; + width = upa * (cpu_width + 1) + group_width + 3; + apl = rounddown_pow_of_two(max(60 / width, 1)); - printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, - static_size, reserved_size, dyn_size, unit_size, lpage_size); + printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", + lvl, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); - for (lpage = 0, unit = 0; unit < nr_units; unit++) { - if (!(unit % upl)) { - if (!(lpage++ % lpl)) { + for (group = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + int unit = 0, unit_end = 0; + + BUG_ON(gi->nr_units % upa); + for (alloc_end += gi->nr_units / upa; + alloc < alloc_end; alloc++) { + if (!(alloc % apl)) { printk("\n"); - printk("%spcpu-lpage: ", lvl); - } else - printk("| "); + printk("%spcpu-alloc: ", lvl); + } + printk("[%0*d] ", group_width, group); + + for (unit_end += upa; unit < unit_end; unit++) + if (gi->cpu_map[unit] != NR_CPUS) + printk("%0*d ", cpu_width, + gi->cpu_map[unit]); + else + printk("%s ", empty_str); } - if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) - printk("%0*d ", width, cpu); - else - printk("%s ", empty_str); } printk("\n"); } -#endif /** * pcpu_setup_first_chunk - initialize the first percpu chunk - * @static_size: the size of static percpu area in bytes - * @reserved_size: the size of reserved percpu area in bytes, 0 for none - * @dyn_size: free size for dynamic allocation in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE + * @ai: pcpu_alloc_info describing how to percpu area is shaped * @base_addr: mapped address - * @unit_map: cpu -> unit map, NULL for sequential mapping * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area * setup path. * - * @reserved_size, if non-zero, specifies the amount of bytes to + * @ai contains all information necessary to initialize the first + * chunk and prime the dynamic percpu allocator. + * + * @ai->static_size is the size of static percpu area. + * + * @ai->reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved * percpu allocation. This is primarily used to serve module percpu @@ -1424,13 +1528,26 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * - * @dyn_size determines the number of bytes available for dynamic - * allocation in the first chunk. 
The area between @static_size + - * @reserved_size + @dyn_size and @unit_size is unused. + * @ai->dyn_size determines the number of bytes available for dynamic + * allocation in the first chunk. The area between @ai->static_size + + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. * - * @unit_size specifies unit size and must be aligned to PAGE_SIZE and - * equal to or larger than @static_size + @reserved_size + if - * non-negative, @dyn_size. + * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE + * and equal to or larger than @ai->static_size + @ai->reserved_size + + * @ai->dyn_size. + * + * @ai->atom_size is the allocation atom size and used as alignment + * for vm areas. + * + * @ai->alloc_size is the allocation size and always multiple of + * @ai->atom_size. This is larger than @ai->atom_size if + * @ai->unit_size is larger than @ai->atom_size. + * + * @ai->nr_groups and @ai->groups describe virtual memory layout of + * percpu areas. Units which should be colocated are put into the + * same group. Dynamic VM areas will be allocated according to these + * groupings. If @ai->nr_groups is zero, a single group containing + * all units is assumed. * * The caller should have mapped the first chunk at @base_addr and * copied static data to each unit. @@ -1446,70 +1563,63 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access. */ -size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - void *base_addr, const int *unit_map) +size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { static struct vm_struct first_vm; static int smap[2], dmap[2]; - size_t size_sum = static_size + reserved_size + dyn_size; + size_t dyn_size = ai->dyn_size; + size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; - unsigned int cpu, tcpu; - int i; + unsigned int cpu; + int *unit_map; + int group, unit, i; /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); - BUG_ON(!static_size); + BUG_ON(ai->nr_groups <= 0); + BUG_ON(!ai->static_size); BUG_ON(!base_addr); - BUG_ON(unit_size < size_sum); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + BUG_ON(ai->unit_size < size_sum); + BUG_ON(ai->unit_size & ~PAGE_MASK); + BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + + pcpu_dump_alloc_info(KERN_DEBUG, ai); /* determine number of units and verify and initialize pcpu_unit_map */ - if (unit_map) { - int first_unit = INT_MAX, last_unit = INT_MIN; - - for_each_possible_cpu(cpu) { - int unit = unit_map[cpu]; - - BUG_ON(unit < 0); - for_each_possible_cpu(tcpu) { - if (tcpu == cpu) - break; - /* the mapping should be one-to-one */ - BUG_ON(unit_map[tcpu] == unit); - } + unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); - if (unit < first_unit) { - pcpu_first_unit_cpu = cpu; - first_unit = unit; - } - if (unit > last_unit) { - pcpu_last_unit_cpu = cpu; - last_unit = unit; - } - } - pcpu_nr_units = last_unit + 1; - pcpu_unit_map = unit_map; - } else { - int *identity_map; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + unit_map[cpu] = NR_CPUS; + pcpu_first_unit_cpu = NR_CPUS; - /* #units == #cpus, identity mapped */ - identity_map = alloc_bootmem(nr_cpu_ids * - sizeof(identity_map[0])); + for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) 
{ + const struct pcpu_group_info *gi = &ai->groups[group]; - for_each_possible_cpu(cpu) - identity_map[cpu] = cpu; + for (i = 0; i < gi->nr_units; i++) { + cpu = gi->cpu_map[i]; + if (cpu == NR_CPUS) + continue; - pcpu_first_unit_cpu = 0; - pcpu_last_unit_cpu = pcpu_nr_units - 1; - pcpu_nr_units = nr_cpu_ids; - pcpu_unit_map = identity_map; + BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu)); + BUG_ON(unit_map[cpu] != NR_CPUS); + + unit_map[cpu] = unit + i; + if (pcpu_first_unit_cpu == NR_CPUS) + pcpu_first_unit_cpu = cpu; + } } + pcpu_last_unit_cpu = cpu; + pcpu_nr_units = unit; + + for_each_possible_cpu(cpu) + BUG_ON(unit_map[cpu] == NR_CPUS); + + pcpu_unit_map = unit_map; /* determine basic parameters */ - pcpu_unit_pages = unit_size >> PAGE_SHIFT; + pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + @@ -1543,17 +1653,17 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, schunk->immutable = true; bitmap_fill(schunk->populated, pcpu_unit_pages); - if (reserved_size) { - schunk->free_size = reserved_size; + if (ai->reserved_size) { + schunk->free_size = ai->reserved_size; pcpu_reserved_chunk = schunk; - pcpu_reserved_chunk_limit = static_size + reserved_size; + pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ } schunk->contig_hint = schunk->free_size; - schunk->map[schunk->map_used++] = -static_size; + schunk->map[schunk->map_used++] = -ai->static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; @@ -1643,44 +1753,47 @@ early_param("percpu_alloc", percpu_alloc_setup); */ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { - const size_t static_size = __per_cpu_end - __per_cpu_start; - size_t size_sum, unit_size, chunk_size; + struct pcpu_alloc_info *ai; + size_t size_sum, chunk_size; void *base; - unsigned int cpu; + int unit; + ssize_t ret; - /* determine parameters and allocate */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); - unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - chunk_size = unit_size * nr_cpu_ids; + size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + chunk_size = ai->unit_size * num_possible_cpus(); base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!base) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); - return -ENOMEM; + ret = -ENOMEM; + goto out_free_ai; } /* return the leftover and copy */ - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - void *ptr = base + cpu * unit_size; - - if (cpu_possible(cpu)) { - free_bootmem(__pa(ptr + size_sum), - unit_size - size_sum); - memcpy(ptr, __per_cpu_load, static_size); - } else - free_bootmem(__pa(ptr), unit_size); + for (unit = 0; unit < num_possible_cpus(); unit++) { + void *ptr = base + unit * ai->unit_size; + + free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum); + memcpy(ptr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", - PFN_DOWN(size_sum), base, static_size, reserved_size, dyn_size, - unit_size); + 
PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, + ai->dyn_size, ai->unit_size); - return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, base, NULL); + ret = pcpu_setup_first_chunk(ai, base); +out_free_ai: + pcpu_free_alloc_info(ai); + return ret; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || !CONFIG_HAVE_SETUP_PER_CPU_AREA */ @@ -1709,31 +1822,34 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; - const size_t static_size = __per_cpu_end - __per_cpu_start; - ssize_t dyn_size = -1; - size_t size_sum, unit_size; + struct pcpu_alloc_info *ai; char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; - unsigned int cpu; - int i, j; + int unit, i, j; ssize_t ret; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); - unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - unit_pages = unit_size >> PAGE_SHIFT; + ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); + + unit_pages = ai->unit_size >> PAGE_SHIFT; /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0])); + pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * + sizeof(pages[0])); pages = alloc_bootmem(pages_size); /* allocate pages */ j = 0; - for_each_possible_cpu(cpu) + for (unit = 0; unit < num_possible_cpus(); unit++) for (i = 0; i < unit_pages; i++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); @@ -1747,18 +1863,18 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; - vm.size = nr_cpu_ids * unit_size; + vm.size = num_possible_cpus() * ai->unit_size; vm_area_register_early(&vm, PAGE_SIZE); - for_each_possible_cpu(cpu) { + for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned long unit_addr = - (unsigned long)vm.addr + cpu * unit_size; + (unsigned long)vm.addr + unit * ai->unit_size; for (i = 0; i < unit_pages; i++) populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ - ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages], + ret = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], unit_pages); if (ret < 0) panic("failed to map percpu area, err=%zd\n", ret); @@ -1772,16 +1888,15 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, */ /* copy static data */ - memcpy((void *)unit_addr, __per_cpu_load, static_size); + memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", - unit_pages, psize_str, vm.addr, static_size, reserved_size, - dyn_size); + unit_pages, psize_str, vm.addr, ai->static_size, + ai->reserved_size, ai->dyn_size); - ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, vm.addr, NULL); + ret = pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: @@ -1790,6 +1905,7 @@ enomem: ret = -ENOMEM; out_free_ar: free_bootmem(__pa(pages), pages_size); + pcpu_free_alloc_info(ai); return ret; } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ @@ -1805,38 +1921,50 @@ static size_t pcpul_lpage_size; static int pcpul_nr_lpages; static 
struct pcpul_ent *pcpul_map; -static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, +static bool __init pcpul_unit_to_cpu(int unit, const struct pcpu_alloc_info *ai, unsigned int *cpup) { - unsigned int cpu; + int group, cunit; - for_each_possible_cpu(cpu) - if (unit_map[cpu] == unit) { + for (group = 0, cunit = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + + if (unit < cunit + gi->nr_units) { if (cpup) - *cpup = cpu; + *cpup = gi->cpu_map[unit - cunit]; return true; } + cunit += gi->nr_units; + } return false; } +static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) +{ + int group, unit, i; + + for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { + const struct pcpu_group_info *gi = &ai->groups[group]; + + for (i = 0; i < gi->nr_units; i++) + if (gi->cpu_map[i] == cpu) + return unit + i; + } + BUG(); +} + /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes - * @unit_size: unit size in bytes - * @lpage_size: the size of a large page - * @unit_map: cpu -> unit mapping - * @nr_units: the number of units + * @ai: pcpu_alloc_info * @alloc_fn: function to allocate percpu lpage, always called with lpage_size * @free_fn: function to free percpu memory, @size <= lpage_size * @map_fn: function to map percpu lpage, always called with lpage_size * * This allocator uses large page to build and map the first chunk. - * Unlike other helpers, the caller should always specify @dyn_size - * and @unit_size. These parameters along with @unit_map and - * @nr_units can be determined using pcpu_lpage_build_unit_map(). - * This two stage initialization is to allow arch code to evaluate the + * Unlike other helpers, the caller should provide fully initialized + * @ai. This can be done using pcpu_build_alloc_info(). This two + * stage initialization is to allow arch code to evaluate the * parameters before committing to it. * * Large pages are allocated as directed by @unit_map and other @@ -1852,27 +1980,26 @@ static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. 
*/ -ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units, +ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; - const size_t static_size = __per_cpu_end - __per_cpu_start; - size_t chunk_size = unit_size * nr_units; - size_t map_size; + const size_t lpage_size = ai->atom_size; + size_t chunk_size, map_size; unsigned int cpu; ssize_t ret; - int i, j, unit; + int i, j, unit, nr_units; - pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size, - unit_size, lpage_size, unit_map, nr_units); + nr_units = 0; + for (i = 0; i < ai->nr_groups; i++) + nr_units += ai->groups[i].nr_units; + chunk_size = ai->unit_size * nr_units; BUG_ON(chunk_size % lpage_size); - pcpul_size = static_size + reserved_size + dyn_size; + pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size; pcpul_lpage_size = lpage_size; pcpul_nr_lpages = chunk_size / lpage_size; @@ -1883,13 +2010,13 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* allocate all pages */ for (i = 0; i < pcpul_nr_lpages; i++) { size_t offset = i * lpage_size; - int first_unit = offset / unit_size; - int last_unit = (offset + lpage_size - 1) / unit_size; + int first_unit = offset / ai->unit_size; + int last_unit = (offset + lpage_size - 1) / ai->unit_size; void *ptr; /* find out which cpu is mapped to this unit */ for (unit = first_unit; unit <= last_unit; unit++) - if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + if (pcpul_unit_to_cpu(unit, ai, &cpu)) goto found; continue; found: @@ -1905,12 +2032,12 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* return unused holes */ for (unit = 0; unit < nr_units; unit++) { - size_t start = unit * unit_size; - size_t end = start + unit_size; + size_t start = unit * ai->unit_size; + size_t end = start + ai->unit_size; size_t off, next; /* don't free used part of occupied unit */ - if (pcpul_unit_to_cpu(unit, unit_map, NULL)) + if (pcpul_unit_to_cpu(unit, ai, NULL)) start += pcpul_size; /* unit can span more than one page, punch the holes */ @@ -1925,7 +2052,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* allocate address, map and copy */ vm.flags = VM_ALLOC; vm.size = chunk_size; - vm_area_register_early(&vm, unit_size); + vm_area_register_early(&vm, ai->unit_size); for (i = 0; i < pcpul_nr_lpages; i++) { if (!pcpul_map[i].ptr) @@ -1935,15 +2062,15 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, } for_each_possible_cpu(cpu) - memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load, - static_size); + memcpy(vm.addr + pcpul_cpu_to_unit(cpu, ai) * ai->unit_size, + __per_cpu_load, ai->static_size); /* we're ready, commit */ pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n", - vm.addr, static_size, reserved_size, dyn_size, unit_size); + vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size); - ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, vm.addr, unit_map); + ret = pcpu_setup_first_chunk(ai, vm.addr); /* * Sort pcpul_map array for pcpu_lpage_remapped(). 
Unmapped -- cgit v1.2.3 From fb435d5233f8b6f9b93c11d6304d8e98fed03234 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:51 +0900 Subject: percpu: add pcpu_unit_offsets[] Currently units are mapped sequentially into address space. This patch adds pcpu_unit_offsets[] which allows units to be mapped to arbitrary offsets from the chunk base address. This is necessary to allow sparse embedding, which may need to allocate address ranges and memory areas which aren't aligned to unit size but to allocation atom size (page or large page size). This also simplifies things a bit by removing the need to calculate offset from unit number. With this change, there's no need for the arch code to know pcpu_unit_size. Update pcpu_setup_first_chunk() and first chunk allocators to return a regular 0 or -errno return code instead of unit size or -errno. Signed-off-by: Tejun Heo Cc: David S. Miller --- arch/sparc/kernel/smp_64.c | 12 +++--- arch/x86/kernel/setup_percpu.c | 51 ++++++++++------------- include/linux/percpu.h | 16 ++++--- mm/percpu.c | 95 +++++++++++++++++++++--------------------- 4 files changed, 84 insertions(+), 90 deletions(-) (limited to 'arch/x86') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index a42a4a744d1..b03fd362c62 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1478,9 +1478,10 @@ void __init setup_per_cpu_areas(void) static struct vm_struct vm; struct pcpu_alloc_info *ai; unsigned long delta, cpu; - size_t size_sum, pcpu_unit_size; + size_t size_sum; size_t ptrs_size; void **ptrs; + int rc; ai = pcpu_alloc_alloc_info(1, nr_cpu_ids); @@ -1526,14 +1527,15 @@ void __init setup_per_cpu_areas(void) pcpu_map_range(start, end, virt_to_page(ptrs[cpu])); } - pcpu_unit_size = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); + if (rc) + panic("failed to setup percpu first chunk (%d)", rc); free_bootmem(__pa(ptrs), ptrs_size); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; - for_each_possible_cpu(cpu) { - __per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; - } + for_each_possible_cpu(cpu) + __per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; /* Setup %g5 for the boot cpu.
*/ __local_per_cpu_offset = __per_cpu_offset(smp_processor_id()); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index db5f9c49fec..9becc5d4b51 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -157,12 +157,12 @@ static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } -static ssize_t __init setup_pcpu_lpage(bool chosen) +static int __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; struct pcpu_alloc_info *ai; - ssize_t ret; + int rc; /* on non-NUMA, embedding is better */ if (!chosen && !pcpu_need_numa()) @@ -196,19 +196,18 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) if (tot_size > vm_size / 5) { pr_info("PERCPU: too large chunk size %zuMB for " "large page remap\n", tot_size >> 20); - ret = -EINVAL; + rc = -EINVAL; goto out_free; } } - ret = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, - pcpul_map); + rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); out_free: pcpu_free_alloc_info(ai); - return ret; + return rc; } #else -static ssize_t __init setup_pcpu_lpage(bool chosen) +static int __init setup_pcpu_lpage(bool chosen) { return -EINVAL; } @@ -222,7 +221,7 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) * mapping so that it can use PMD mapping without additional TLB * pressure. */ -static ssize_t __init setup_pcpu_embed(bool chosen) +static int __init setup_pcpu_embed(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -250,7 +249,7 @@ static void __init pcpup_populate_pte(unsigned long addr) populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_page(void) +static int __init setup_pcpu_page(void) { return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, pcpu_fc_alloc, pcpu_fc_free, @@ -274,8 +273,7 @@ void __init setup_per_cpu_areas(void) { unsigned int cpu; unsigned long delta; - size_t pcpu_unit_size; - ssize_t ret; + int rc; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); @@ -285,36 +283,33 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. 
*/ - ret = -EINVAL; + rc = -EINVAL; if (pcpu_chosen_fc != PCPU_FC_AUTO) { if (pcpu_chosen_fc != PCPU_FC_PAGE) { if (pcpu_chosen_fc == PCPU_FC_LPAGE) - ret = setup_pcpu_lpage(true); + rc = setup_pcpu_lpage(true); else - ret = setup_pcpu_embed(true); + rc = setup_pcpu_embed(true); - if (ret < 0) - pr_warning("PERCPU: %s allocator failed (%zd), " + if (rc < 0) + pr_warning("PERCPU: %s allocator failed (%d), " "falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], ret); + pcpu_fc_names[pcpu_chosen_fc], rc); } } else { - ret = setup_pcpu_lpage(false); - if (ret < 0) - ret = setup_pcpu_embed(false); + rc = setup_pcpu_lpage(false); + if (rc < 0) + rc = setup_pcpu_embed(false); } - if (ret < 0) - ret = setup_pcpu_page(); - if (ret < 0) - panic("cannot initialize percpu area (err=%zd)", ret); - - pcpu_unit_size = ret; + if (rc < 0) + rc = setup_pcpu_page(); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = - delta + pcpu_unit_map[cpu] * pcpu_unit_size; + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 77b86be8ce4..a7ec840f596 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -57,7 +57,7 @@ #endif extern void *pcpu_base_addr; -extern const int *pcpu_unit_map; +extern const unsigned long *pcpu_unit_offsets; struct pcpu_group_info { int nr_units; /* aligned # of units */ @@ -106,25 +106,23 @@ extern struct pcpu_alloc_info * __init pcpu_build_alloc_info( size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn); -extern size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, - void *base_addr); +extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK -extern ssize_t __init pcpu_embed_first_chunk( - size_t reserved_size, ssize_t dyn_size); +extern int __init pcpu_embed_first_chunk(size_t reserved_size, + ssize_t dyn_size); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK -extern ssize_t __init pcpu_page_first_chunk( - size_t reserved_size, +extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern ssize_t __init pcpu_lpage_first_chunk( - const struct pcpu_alloc_info *ai, +extern int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 99f7fa68272..653b02c4020 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -117,8 +117,8 @@ static unsigned int pcpu_last_unit_cpu __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); -/* cpu -> unit map */ -const int *pcpu_unit_map __read_mostly; +static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */ +const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */ /* * The first chunk which always exists. 
Note that unlike other @@ -196,8 +196,8 @@ static int pcpu_page_idx(unsigned int cpu, int page_idx) static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { - return (unsigned long)chunk->vm->addr + - (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); + return (unsigned long)chunk->vm->addr + pcpu_unit_offsets[cpu] + + (page_idx << PAGE_SHIFT); } static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, @@ -341,7 +341,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * space. Note that any possible cpu id can be used here, so * there's no need to worry about preemption or cpu hotplug. */ - addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size; + addr += pcpu_unit_offsets[smp_processor_id()]; return pcpu_get_page_chunk(vmalloc_to_page(addr)); } @@ -1560,17 +1560,17 @@ static void pcpu_dump_alloc_info(const char *lvl, * and available for dynamic allocation like any other chunks. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access. + * 0 on success, -errno on failure. */ -size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, - void *base_addr) +int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { static struct vm_struct first_vm; static int smap[2], dmap[2]; size_t dyn_size = ai->dyn_size; size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; + unsigned long *unit_off; unsigned int cpu; int *unit_map; int group, unit, i; @@ -1587,8 +1587,9 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_dump_alloc_info(KERN_DEBUG, ai); - /* determine number of units and verify and initialize pcpu_unit_map */ + /* determine number of units and initialize unit_map and base */ unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); + unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = NR_CPUS; @@ -1606,6 +1607,8 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, BUG_ON(unit_map[cpu] != NR_CPUS); unit_map[cpu] = unit + i; + unit_off[cpu] = gi->base_offset + i * ai->unit_size; + if (pcpu_first_unit_cpu == NR_CPUS) pcpu_first_unit_cpu = cpu; } @@ -1617,6 +1620,7 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, BUG_ON(unit_map[cpu] == NR_CPUS); pcpu_unit_map = unit_map; + pcpu_unit_offsets = unit_off; /* determine basic parameters */ pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; @@ -1688,7 +1692,7 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* we're done */ pcpu_base_addr = schunk->vm->addr; - return pcpu_unit_size; + return 0; } const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { @@ -1748,16 +1752,15 @@ early_param("percpu_alloc", percpu_alloc_setup); * size, the leftover is returned to the bootmem allocator. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. 
*/ -ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) +int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { struct pcpu_alloc_info *ai; size_t size_sum, chunk_size; void *base; int unit; - ssize_t ret; + int rc; ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); if (IS_ERR(ai)) @@ -1773,7 +1776,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) if (!base) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); - ret = -ENOMEM; + rc = -ENOMEM; goto out_free_ai; } @@ -1790,10 +1793,10 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); - ret = pcpu_setup_first_chunk(ai, base); + rc = pcpu_setup_first_chunk(ai, base); out_free_ai: pcpu_free_alloc_info(ai); - return ret; + return rc; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || !CONFIG_HAVE_SETUP_PER_CPU_AREA */ @@ -1813,13 +1816,12 @@ out_free_ai: * page-by-page into vmalloc area. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. */ -ssize_t __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn) +int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; struct pcpu_alloc_info *ai; @@ -1827,8 +1829,7 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, int unit_pages; size_t pages_size; struct page **pages; - int unit, i, j; - ssize_t ret; + int unit, i, j, rc; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); @@ -1874,10 +1875,10 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ - ret = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], - unit_pages); - if (ret < 0) - panic("failed to map percpu area, err=%zd\n", ret); + rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], + unit_pages); + if (rc < 0) + panic("failed to map percpu area, err=%d\n", rc); /* * FIXME: Archs with virtual cache should flush local @@ -1896,17 +1897,17 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, unit_pages, psize_str, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size); - ret = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: while (--j >= 0) free_fn(page_address(pages[j]), PAGE_SIZE); - ret = -ENOMEM; + rc = -ENOMEM; out_free_ar: free_bootmem(__pa(pages), pages_size); pcpu_free_alloc_info(ai); - return ret; + return rc; } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ @@ -1977,20 +1978,18 @@ static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) * pcpu_lpage_remapped(). * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. 
*/ -ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) +int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; const size_t lpage_size = ai->atom_size; size_t chunk_size, map_size; unsigned int cpu; - ssize_t ret; - int i, j, unit, nr_units; + int i, j, unit, nr_units, rc; nr_units = 0; for (i = 0; i < ai->nr_groups; i++) @@ -2070,7 +2069,7 @@ ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); - ret = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); /* * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped @@ -2094,7 +2093,7 @@ ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) pcpul_nr_lpages--; - return ret; + return rc; enomem: for (i = 0; i < pcpul_nr_lpages; i++) @@ -2166,21 +2165,21 @@ EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { - ssize_t unit_size; unsigned long delta; unsigned int cpu; + int rc; /* * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ - unit_size = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE); - if (unit_size < 0) + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE); + if (rc < 0) panic("Failed to initialized percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) - __per_cpu_offset[cpu] = delta + cpu * unit_size; + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ -- cgit v1.2.3 From c8826dd538602d730ed2c18c6753f1bbfa6c4933 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:52 +0900 Subject: percpu: update embedding first chunk allocator to handle sparse units Now that percpu core can handle very sparse units, given that vmalloc space is large enough, embedding first chunk allocator can use any memory to build the first chunk. This patch teaches pcpu_embed_first_chunk() about distances between cpus and to use alloc/free callbacks to allocate node-specific areas for each group and use them for the first chunk. This brings the benefits of embedding allocator to NUMA configurations - no extra TLB pressure with the flexibility of unified dynamic allocator and no need to restructure arch code to build memory layout suitable for percpu. With units put into atom_size aligned groups according to cpu distances, using large page for dynamic chunks is also easily possible with falling back to regular pages if large allocation fails. Embedding allocator users are converted to specify NULL cpu_distance_fn, so this patch doesn't cause any visible behavior difference. Following patches will convert them.
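For illustration, a NUMA-aware caller of the extended interface might look roughly like this (a sketch only; the my_* helpers are hypothetical stand-ins for arch code, while pcpu_embed_first_chunk() and the callback typedefs are the ones introduced by this series):

static int __init my_cpu_distance(unsigned int from, unsigned int to)
{
	/* CPUs on the same node are close; everything else is remote */
	return cpu_to_node(from) == cpu_to_node(to) ?
		LOCAL_DISTANCE : REMOTE_DISTANCE;
}

static void * __init my_pcpu_alloc(unsigned int cpu, size_t size,
				   size_t align)
{
	/* node-local bootmem for the group containing @cpu (sketch) */
	return __alloc_bootmem_node_nopanic(NODE_DATA(cpu_to_node(cpu)),
					    size, align,
					    __pa(MAX_DMA_ADDRESS));
}

static void __init my_pcpu_free(void *ptr, size_t size)
{
	free_bootmem(__pa(ptr), size);
}

void __init my_setup_per_cpu_areas(void)
{
	int rc;

	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
				    my_cpu_distance, my_pcpu_alloc,
				    my_pcpu_free);
	if (rc < 0)
		panic("percpu: embed first chunk failed (%d)", rc);
}
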
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 4 +- include/linux/percpu.h | 7 ++- mm/percpu.c | 113 +++++++++++++++++++++++++++++++---------- 3 files changed, 93 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9becc5d4b51..67f6314de9f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -234,7 +234,9 @@ static int __init setup_pcpu_embed(bool chosen) return -EINVAL; return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE); + reserve - PERCPU_FIRST_CHUNK_RESERVE, + PAGE_SIZE, NULL, pcpu_fc_alloc, + pcpu_fc_free); } /* diff --git a/include/linux/percpu.h b/include/linux/percpu.h index a7ec840f596..25359932740 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -110,8 +110,11 @@ extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK -extern int __init pcpu_embed_first_chunk(size_t reserved_size, - ssize_t dyn_size); +extern int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK diff --git a/mm/percpu.c b/mm/percpu.c index cc9c4c64606..c2826d05505 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1747,15 +1747,25 @@ early_param("percpu_alloc", percpu_alloc_setup); * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * @alloc_fn: function to allocate percpu page + * @free_fn: funtion to free percpu page * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated - * as a contiguous area using bootmem allocator and used as-is without - * being mapped into vmalloc area. This enables the first chunk to - * piggy back on the linear physical mapping which often uses larger - * page size. + * by calling @alloc_fn and used as-is without being mapped into + * vmalloc area. Allocations are always whole multiples of @atom_size + * aligned to @atom_size. + * + * This enables the first chunk to piggy back on the linear physical + * mapping which often uses larger page size. Please note that this + * can result in very sparse cpu->unit mapping on NUMA machines thus + * requiring large vmalloc address space. Don't use this allocator if + * vmalloc space is not orders of magnitude larger than distances + * between node memory addresses (ie. 32bit NUMA machines). * * When @dyn_size is positive, dynamic area might be larger than * specified to fill page alignment. When @dyn_size is auto, @@ -1763,53 +1773,88 @@ early_param("percpu_alloc", percpu_alloc_setup); * and reserved areas. * * If the needed size is smaller than the minimum or specified unit - * size, the leftover is returned to the bootmem allocator. + * size, the leftover is returned using @free_fn. * * RETURNS: * 0 on success, -errno on failure. 
*/ -int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) +int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn) { + void *base = (void *)ULONG_MAX; + void **areas = NULL; struct pcpu_alloc_info *ai; - size_t size_sum, chunk_size; - void *base; - int unit; - int rc; + size_t size_sum, areas_size; + int group, i, rc; - ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); + ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, + cpu_distance_fn); if (IS_ERR(ai)) return PTR_ERR(ai); - BUG_ON(ai->nr_groups != 1); - BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; - chunk_size = ai->unit_size * num_possible_cpus(); + areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); - if (!base) { - pr_warning("PERCPU: failed to allocate %zu bytes for " - "embedding\n", chunk_size); + areas = alloc_bootmem_nopanic(areas_size); + if (!areas) { rc = -ENOMEM; - goto out_free_ai; + goto out_free; } - /* return the leftover and copy */ - for (unit = 0; unit < num_possible_cpus(); unit++) { - void *ptr = base + unit * ai->unit_size; + /* allocate, copy and determine base address */ + for (group = 0; group < ai->nr_groups; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + unsigned int cpu = NR_CPUS; + void *ptr; + + for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) + cpu = gi->cpu_map[i]; + BUG_ON(cpu == NR_CPUS); + + /* allocate space for the whole group */ + ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); + if (!ptr) { + rc = -ENOMEM; + goto out_free_areas; + } + areas[group] = ptr; - free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum); - memcpy(ptr, __per_cpu_load, ai->static_size); + base = min(ptr, base); + + for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { + if (gi->cpu_map[i] == NR_CPUS) { + /* unused unit, free whole */ + free_fn(ptr, ai->unit_size); + continue; + } + /* copy and return the unused part */ + memcpy(ptr, __per_cpu_load, ai->static_size); + free_fn(ptr + size_sum, ai->unit_size - size_sum); + } } - /* we're ready, commit */ + /* base address is now known, determine group base offsets */ + for (group = 0; group < ai->nr_groups; group++) + ai->groups[group].base_offset = areas[group] - base; + pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); rc = pcpu_setup_first_chunk(ai, base); -out_free_ai: + goto out_free; + +out_free_areas: + for (group = 0; group < ai->nr_groups; group++) + free_fn(areas[group], + ai->groups[group].nr_units * ai->unit_size); +out_free: pcpu_free_alloc_info(ai); + if (areas) + free_bootmem(__pa(areas), areas_size); return rc; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || @@ -2177,6 +2222,17 @@ void *pcpu_lpage_remapped(void *kaddr) unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); +static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, + size_t align) +{ + return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); +} + +static void __init pcpu_dfl_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + void __init setup_per_cpu_areas(void) { unsigned long delta; @@ -2188,7 +2244,8 @@ void 
__init setup_per_cpu_areas(void) * what the legacy allocator did. */ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE); + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, + pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); if (rc < 0) panic("Failed to initialized percpu areas."); -- cgit v1.2.3 From 4518e6a0c038b98be4c480e6f4481e8676bd15dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:52 +0900 Subject: x86,percpu: use embedding for 64bit NUMA and page for 32bit NUMA Embedding percpu first chunk allocator can now handle very sparse unit mapping. Use embedding allocator instead of lpage for 64bit NUMA. This removes extra TLB pressure and the need to do complex and fragile dancing when changing page attributes. For 32bit, using very sparse unit mapping isn't a good idea because the vmalloc space is very constrained. 32bit NUMA machines aren't exactly the focus of optimization and it isn't very clear whether lpage performs better than page. Use page first chunk allocator for 32bit NUMAs. As this leaves setup_pcpu_*() functions pretty much empty, fold them into setup_per_cpu_areas(). Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Andi Kleen --- arch/x86/Kconfig | 4 -- arch/x86/kernel/setup_percpu.c | 155 ++++++++--------------------------------- 2 files changed, 28 insertions(+), 131 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f7ac2721551..869d7d30144 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -156,10 +156,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK config NEED_PER_CPU_PAGE_FIRST_CHUNK def_bool y -config NEED_PER_CPU_LPAGE_FIRST_CHUNK - def_bool y - depends on NEED_MULTIPLE_NODES - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 67f6314de9f..d559af913e1 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset); #define PERCPU_FIRST_CHUNK_RESERVE 0 #endif +#ifdef CONFIG_X86_32 /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void) #endif return false; } +#endif /** * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu @@ -136,128 +138,23 @@ static void __init pcpu_fc_free(void *ptr, size_t size) free_bootmem(__pa(ptr), size); } -/* - * Large page remapping allocator - */ -#ifdef CONFIG_NEED_MULTIPLE_NODES -static void __init pcpul_map(void *ptr, size_t size, void *addr) -{ - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)addr); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); -} - -static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { +#ifdef CONFIG_NEED_MULTIPLE_NODES if (early_cpu_to_node(from) == early_cpu_to_node(to)) return LOCAL_DISTANCE; else return REMOTE_DISTANCE; -} - -static int __init setup_pcpu_lpage(bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; - size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; - struct pcpu_alloc_info *ai; - int rc; - - /* on non-NUMA, embedding is better */ - if (!chosen && !pcpu_need_numa()) - return -EINVAL; - - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - /* allocate and build unit_map */ - ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - 
PMD_SIZE, pcpu_lpage_cpu_distance); - if (IS_ERR(ai)) { - pr_warning("PERCPU: failed to build unit_map (%ld)\n", - PTR_ERR(ai)); - return PTR_ERR(ai); - } - - /* do the parameters look okay? */ - if (!chosen) { - size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = 0; - int group; - - for (group = 0; group < ai->nr_groups; group++) - tot_size += ai->unit_size * ai->groups[group].nr_units; - - /* don't consume more than 20% of vmalloc area */ - if (tot_size > vm_size / 5) { - pr_info("PERCPU: too large chunk size %zuMB for " - "large page remap\n", tot_size >> 20); - rc = -EINVAL; - goto out_free; - } - } - - rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); -out_free: - pcpu_free_alloc_info(ai); - return rc; -} #else -static int __init setup_pcpu_lpage(bool chosen) -{ - return -EINVAL; -} + return LOCAL_DISTANCE; #endif - -/* - * Embedding allocator - * - * The first chunk is sized to just contain the static area plus - * module and dynamic reserves and embedded into linear physical - * mapping so that it can use PMD mapping without additional TLB - * pressure. - */ -static int __init setup_pcpu_embed(bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, embedding allocation doesn't play well with - * NUMA. - */ - if (!chosen && (!cpu_has_pse || pcpu_need_numa())) - return -EINVAL; - - return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, - PAGE_SIZE, NULL, pcpu_fc_alloc, - pcpu_fc_free); } -/* - * Page allocator - * - * Boring fallback 4k page allocator. This allocator puts more - * pressure on PTE TLBs but other than that behaves nicely on both UMA - * and NUMA. - */ static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static int __init setup_pcpu_page(void) -{ - return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - pcpu_fc_alloc, pcpu_fc_free, - pcpup_populate_pte); -} - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -281,30 +178,34 @@ void __init setup_per_cpu_areas(void) NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* - * Allocate percpu area. If PSE is supported, try to make use - * of large page mappings. Please read comments on top of - * each allocator for details. + * Allocate percpu area. Embedding allocator is our favorite; + * however, on NUMA configurations, it can result in very + * sparse unit mapping and vmalloc area isn't spacious enough + * on 32bit. Use page in that case. */ +#ifdef CONFIG_X86_32 + if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif rc = -EINVAL; - if (pcpu_chosen_fc != PCPU_FC_AUTO) { - if (pcpu_chosen_fc != PCPU_FC_PAGE) { - if (pcpu_chosen_fc == PCPU_FC_LPAGE) - rc = setup_pcpu_lpage(true); - else - rc = setup_pcpu_embed(true); - - if (rc < 0) - pr_warning("PERCPU: %s allocator failed (%d), " - "falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], rc); - } - } else { - rc = setup_pcpu_lpage(false); + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + const size_t atom_size = cpu_has_pse ? 
PMD_SIZE : PAGE_SIZE; + const size_t dyn_size = PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + + rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, atom_size, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - rc = setup_pcpu_embed(false); + pr_warning("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) - rc = setup_pcpu_page(); + rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); -- cgit v1.2.3 From e933a73f48e3b2d40cfa56d81e2646f194b5a66a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:53 +0900 Subject: percpu: kill lpage first chunk allocator With x86 converted to embedding allocator, lpage doesn't have any user left. Kill it along with cpa handling code. Signed-off-by: Tejun Heo Cc: Jan Beulich --- Documentation/kernel-parameters.txt | 10 +- arch/x86/mm/pageattr.c | 20 +-- include/linux/percpu.h | 16 --- mm/percpu.c | 241 ------------------------------------ 4 files changed, 6 insertions(+), 281 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index dee9ce2e6cf..e710093e3d3 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1920,11 +1920,11 @@ and is between 256 and 4096 characters. It is defined in the file See arch/parisc/kernel/pdc_chassis.c percpu_alloc= Select which percpu first chunk allocator to use. - Currently supported values are "embed", "page" and - "lpage". Archs may support subset or none of the - selections. See comments in mm/percpu.c for details - on each allocator. This parameter is primarily for - debugging and performance comparison. + Currently supported values are "embed" and "page". + Archs may support subset or none of the selections. + See comments in mm/percpu.c for details on each + allocator. This parameter is primarily for debugging + and performance comparison. pf. [PARIDE] See Documentation/blockdev/paride.txt. diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index dce282f6570..f53cfc7f963 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -687,7 +687,7 @@ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); - unsigned long vaddr, remapped; + unsigned long vaddr; int ret; if (cpa->pfn >= max_pfn_mapped) @@ -745,24 +745,6 @@ static int cpa_process_alias(struct cpa_data *cpa) } #endif - /* - * If the PMD page was partially used for per-cpu remapping, - * the recycled area needs to be split and modified. Because - * the area is always proper subset of a PMD page - * cpa->numpages is guaranteed to be 1 for these areas, so - * there's no need to loop over and check for further remaps. 
- */ - remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); - if (remapped) { - WARN_ON(cpa->numpages > 1); - alias_cpa = *cpa; - alias_cpa.vaddr = &remapped; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); - if (ret) - return ret; - } - return 0; } diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 25359932740..878836ca999 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -82,7 +82,6 @@ enum pcpu_fc { PCPU_FC_AUTO, PCPU_FC_EMBED, PCPU_FC_PAGE, - PCPU_FC_LPAGE, PCPU_FC_NR, }; @@ -95,7 +94,6 @@ typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); -typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, int nr_units); @@ -124,20 +122,6 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn); - -extern void *pcpu_lpage_remapped(void *kaddr); -#else -static inline void *pcpu_lpage_remapped(void *kaddr) -{ - return NULL; -} -#endif - /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index c2826d05505..77933928107 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1713,7 +1713,6 @@ const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", [PCPU_FC_PAGE] = "page", - [PCPU_FC_LPAGE] = "lpage", }; enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; @@ -1729,10 +1728,6 @@ static int __init percpu_alloc_setup(char *str) #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK else if (!strcmp(str, "page")) pcpu_chosen_fc = PCPU_FC_PAGE; -#endif -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK - else if (!strcmp(str, "lpage")) - pcpu_chosen_fc = PCPU_FC_LPAGE; #endif else pr_warning("PERCPU: unknown allocator %s specified\n", str); @@ -1970,242 +1965,6 @@ out_free_ar: } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -struct pcpul_ent { - void *ptr; - void *map_addr; -}; - -static size_t pcpul_size; -static size_t pcpul_lpage_size; -static int pcpul_nr_lpages; -static struct pcpul_ent *pcpul_map; - -static bool __init pcpul_unit_to_cpu(int unit, const struct pcpu_alloc_info *ai, - unsigned int *cpup) -{ - int group, cunit; - - for (group = 0, cunit = 0; group < ai->nr_groups; group++) { - const struct pcpu_group_info *gi = &ai->groups[group]; - - if (unit < cunit + gi->nr_units) { - if (cpup) - *cpup = gi->cpu_map[unit - cunit]; - return true; - } - cunit += gi->nr_units; - } - - return false; -} - -static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) -{ - int group, unit, i; - - for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { - const struct pcpu_group_info *gi = &ai->groups[group]; - - for (i = 0; i < gi->nr_units; i++) - if (gi->cpu_map[i] == cpu) - return unit + i; - } - BUG(); -} - -/** - * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @ai: pcpu_alloc_info - * @alloc_fn: 
function to allocate percpu lpage, always called with lpage_size - * @free_fn: function to free percpu memory, @size <= lpage_size - * @map_fn: function to map percpu lpage, always called with lpage_size - * - * This allocator uses large page to build and map the first chunk. - * Unlike other helpers, the caller should provide fully initialized - * @ai. This can be done using pcpu_build_alloc_info(). This two - * stage initialization is to allow arch code to evaluate the - * parameters before committing to it. - * - * Large pages are allocated as directed by @unit_map and other - * parameters and mapped to vmalloc space. Unused holes are returned - * to the page allocator. Note that these holes end up being actively - * mapped twice - once to the physical mapping and to the vmalloc area - * for the first percpu chunk. Depending on architecture, this might - * cause problem when changing page attributes of the returned area. - * These double mapped areas can be detected using - * pcpu_lpage_remapped(). - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) -{ - static struct vm_struct vm; - const size_t lpage_size = ai->atom_size; - size_t chunk_size, map_size; - unsigned int cpu; - int i, j, unit, nr_units, rc; - - nr_units = 0; - for (i = 0; i < ai->nr_groups; i++) - nr_units += ai->groups[i].nr_units; - - chunk_size = ai->unit_size * nr_units; - BUG_ON(chunk_size % lpage_size); - - pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size; - pcpul_lpage_size = lpage_size; - pcpul_nr_lpages = chunk_size / lpage_size; - - /* allocate pointer array and alloc large pages */ - map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]); - pcpul_map = alloc_bootmem(map_size); - - /* allocate all pages */ - for (i = 0; i < pcpul_nr_lpages; i++) { - size_t offset = i * lpage_size; - int first_unit = offset / ai->unit_size; - int last_unit = (offset + lpage_size - 1) / ai->unit_size; - void *ptr; - - /* find out which cpu is mapped to this unit */ - for (unit = first_unit; unit <= last_unit; unit++) - if (pcpul_unit_to_cpu(unit, ai, &cpu)) - goto found; - continue; - found: - ptr = alloc_fn(cpu, lpage_size, lpage_size); - if (!ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - pcpul_map[i].ptr = ptr; - } - - /* return unused holes */ - for (unit = 0; unit < nr_units; unit++) { - size_t start = unit * ai->unit_size; - size_t end = start + ai->unit_size; - size_t off, next; - - /* don't free used part of occupied unit */ - if (pcpul_unit_to_cpu(unit, ai, NULL)) - start += pcpul_size; - - /* unit can span more than one page, punch the holes */ - for (off = start; off < end; off = next) { - void *ptr = pcpul_map[off / lpage_size].ptr; - next = min(roundup(off + 1, lpage_size), end); - if (ptr) - free_fn(ptr + off % lpage_size, next - off); - } - } - - /* allocate address, map and copy */ - vm.flags = VM_ALLOC; - vm.size = chunk_size; - vm_area_register_early(&vm, ai->unit_size); - - for (i = 0; i < pcpul_nr_lpages; i++) { - if (!pcpul_map[i].ptr) - continue; - pcpul_map[i].map_addr = vm.addr + i * lpage_size; - map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr); - } - - for_each_possible_cpu(cpu) - memcpy(vm.addr + pcpul_cpu_to_unit(cpu, ai) * ai->unit_size, - __per_cpu_load, ai->static_size); - - /* we're ready, commit */ - pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n", - 
vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, - ai->unit_size); - - rc = pcpu_setup_first_chunk(ai, vm.addr); - - /* - * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped - * lpages are pushed to the end and trimmed. - */ - for (i = 0; i < pcpul_nr_lpages - 1; i++) - for (j = i + 1; j < pcpul_nr_lpages; j++) { - struct pcpul_ent tmp; - - if (!pcpul_map[j].ptr) - continue; - if (pcpul_map[i].ptr && - pcpul_map[i].ptr < pcpul_map[j].ptr) - continue; - - tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) - pcpul_nr_lpages--; - - return rc; - -enomem: - for (i = 0; i < pcpul_nr_lpages; i++) - if (pcpul_map[i].ptr) - free_fn(pcpul_map[i].ptr, lpage_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; -} - -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu large - * page mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used large - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. - */ -void *pcpu_lpage_remapped(void *kaddr) -{ - unsigned long lpage_mask = pcpul_lpage_size - 1; - void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask); - unsigned long offset = (unsigned long)kaddr & lpage_mask; - int left = 0, right = pcpul_nr_lpages - 1; - int pos; - - /* pcpul in use at all? */ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < lpage_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > lpage_addr) - right = pos - 1; - else - return pcpul_map[pos].map_addr + offset; - } - - return NULL; -} -#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */ - /* * Generic percpu area setup. * -- cgit v1.2.3 From 58c41d28259c246dbc11358d85d332dc20ccd57b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 14 Aug 2009 12:14:19 -0700 Subject: x86, intel_txt: Factor out the code for S3 setup S3 sleep requires special setup in tboot. However, the data structures needed to do such setup are only available if CONFIG_ACPI_SLEEP is enabled. Abstract them out as much as possible, so we can have a single tboot_setup_sleep() which either is a proper implementation or a stub which simply calls BUG(). Signed-off-by: H. 
Peter Anvin Acked-by: Shane Wang Cc: Joseph Cihula --- arch/x86/kernel/tboot.c | 53 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 1ab80120894..a183beffe39 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -164,25 +165,51 @@ void tboot_create_trampoline(void) map_base = PFN_DOWN(tboot->tboot_base); map_size = PFN_UP(tboot->tboot_size); if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) - panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", map_base, map_size); + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", + map_base, map_size); } -static void set_mac_regions(void) +#ifdef CONFIG_ACPI_SLEEP + +static void add_mac_region(phys_addr_t start, unsigned long size) { - tboot->num_mac_regions = 3; + struct tboot_mac_region *mr; + phys_addr_t end = start + size; + + if (start && size) { + mr = &tboot->mac_regions[tboot->num_mac_regions++]; + mr->start = round_down(start, PAGE_SIZE); + mr->size = round_up(end, PAGE_SIZE) - mr->start; + } +} + +static int tboot_setup_sleep(void) +{ + tboot->num_mac_regions = 0; + /* S3 resume code */ - tboot->mac_regions[0].start = PFN_PHYS(PFN_DOWN(acpi_wakeup_address)); - tboot->mac_regions[0].size = PFN_UP(WAKEUP_SIZE) << PAGE_SHIFT; + add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); /* AP trampoline code */ - tboot->mac_regions[1].start = - PFN_PHYS(PFN_DOWN(virt_to_phys(trampoline_base))); - tboot->mac_regions[1].size = PFN_UP(TRAMPOLINE_SIZE) << PAGE_SHIFT; + add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); /* kernel code + data + bss */ - tboot->mac_regions[2].start = PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); - tboot->mac_regions[2].size = PFN_PHYS(PFN_UP(virt_to_phys(&_end))) - - PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); + add_mac_region(virt_to_phys(_text), _end - _text); + + tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + + return 0; } +#else /* no CONFIG_ACPI_SLEEP */ + +static int tboot_setup_sleep(void) +{ + /* S3 shutdown requested, but S3 not supported by the kernel... */ + BUG(); + return -1; +} + +#endif + void tboot_shutdown(u32 shutdown_type) { void (*shutdown)(void); @@ -200,7 +227,8 @@ void tboot_shutdown(u32 shutdown_type) /* if this is S3 then set regions to MAC */ if (shutdown_type == TB_SHUTDOWN_S3) - set_mac_regions(); + if (tboot_setup_sleep()) + return; tboot->shutdown_type = shutdown_type; @@ -253,7 +281,6 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; /* we always use the 32b wakeup vector */ tboot->acpi_sinfo.vector_width = 32; - tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; if (sleep_state >= ACPI_S_STATE_COUNT || acpi_shutdown_map[sleep_state] == -1) { -- cgit v1.2.3 From 1be396794897f80bfc8774719ba60309a9e3d374 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:20 +0200 Subject: timekeeping: Move reset of cycle_last for tsc clocksource to tsc change_clocksource resets the cycle_last value to zero then sets it to a value read from the clocksource. The reset to zero is required only for the TSC clocksource to make the read_tsc function work after a resume. The reason is that the TSC read function uses cycle_last to detect backwards going TSCs. 
In the resume case, cycle_last contains the TSC value from the last update before the suspend. On resume the TSC starts counting from 0 again and would trip over the cycle_last comparison. This is subtle and surprising. Move the reset to a resume function in the tsc code. Signed-off-by: Martin Schwidefsky Acked-by: Thomas Gleixner Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134808.142191175@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 6 ++++++ kernel/time/timekeeping.c | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 71f4368b357..968425422c4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -744,10 +744,16 @@ static cycle_t __vsyscall_fn vread_tsc(void) } #endif +static void resume_tsc(void) +{ + clocksource_tsc.cycle_last = 0; +} + static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, + .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS | diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 016a2591d71..b5673016089 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -295,7 +295,6 @@ static void change_clocksource(void) if (old->disable) old->disable(old); - clock->cycle_last = 0; clock->cycle_last = clock->read(clock); clock->error = 0; clock->xtime_nsec = 0; -- cgit v1.2.3 From d4f587c67fc39e0030ddd718675e252e208da4d7 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:31 +0200 Subject: timekeeping: Increase granularity of read_persistent_clock() The persistent clock of some architectures (e.g. s390) has a finer granularity than seconds. To reduce the delta between the host clock and the guest clock in a virtualized system, change the read_persistent_clock function to return a struct timespec.
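As an illustration (a sketch, not part of the patch), an architecture whose persistent clock only has one-second granularity converts to the new interface by filling in the nanoseconds with zero; read_my_rtc_seconds() below is a hypothetical stand-in for the arch-specific RTC read:

void read_persistent_clock(struct timespec *ts)
{
	/* read_my_rtc_seconds() is hypothetical; a real port reads its RTC here */
	ts->tv_sec = read_my_rtc_seconds();
	ts->tv_nsec = 0;	/* no sub-second resolution available */
}

Architectures with a finer persistent clock, such as s390 below, can fill in tv_nsec as well.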
Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134811.013873340@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/m68knommu/kernel/time.c | 5 ++-- arch/mips/dec/time.c | 5 ++-- arch/mips/lasat/ds1603.c | 5 ++-- arch/mips/lasat/sysctl.c | 8 ++++-- arch/mips/lemote/lm2e/setup.c | 5 ++-- arch/mips/mti-malta/malta-time.c | 5 ++-- arch/mips/pmc-sierra/yosemite/setup.c | 5 ++-- arch/mips/sibyte/swarm/setup.c | 15 +++++++--- arch/mips/sni/time.c | 5 ++-- arch/powerpc/kernel/time.c | 7 +++-- arch/s390/kernel/time.c | 22 +++------------ arch/sh/kernel/time.c | 6 ++-- arch/x86/kernel/rtc.c | 5 ++-- arch/xtensa/kernel/time.c | 5 ++-- include/linux/time.h | 2 +- kernel/time/timekeeping.c | 52 +++++++++++++++++++---------------- 16 files changed, 83 insertions(+), 74 deletions(-) (limited to 'arch/x86') diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c index d182b2f7221..68432248515 100644 --- a/arch/m68knommu/kernel/time.c +++ b/arch/m68knommu/kernel/time.c @@ -72,9 +72,10 @@ static unsigned long read_rtc_mmss(void) return mktime(year, mon, day, hour, min, sec);; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return read_rtc_mmss(); + ts->tv_sec = read_rtc_mmss(); + ts->tv_nsec = 0; } int update_persistent_clock(struct timespec now) diff --git a/arch/mips/dec/time.c b/arch/mips/dec/time.c index 463136e6685..02f505f23c3 100644 --- a/arch/mips/dec/time.c +++ b/arch/mips/dec/time.c @@ -18,7 +18,7 @@ #include #include -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned int year, mon, day, hour, min, sec, real_year; unsigned long flags; @@ -53,7 +53,8 @@ unsigned long read_persistent_clock(void) year += real_year - 72 + 2000; - return mktime(year, mon, day, hour, min, sec); + ts->tv_sec = mktime(year, mon, day, hour, min, sec); + ts->tv_nsec = 0; } /* diff --git a/arch/mips/lasat/ds1603.c b/arch/mips/lasat/ds1603.c index 52cb1436a12..c6fd96ff118 100644 --- a/arch/mips/lasat/ds1603.c +++ b/arch/mips/lasat/ds1603.c @@ -135,7 +135,7 @@ static void rtc_end_op(void) lasat_ndelay(1000); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned long word; unsigned long flags; @@ -147,7 +147,8 @@ unsigned long read_persistent_clock(void) rtc_end_op(); spin_unlock_irqrestore(&rtc_lock, flags); - return word; + ts->tv_sec = word; + ts->tv_nsec = 0; } int rtc_mips_set_mmss(unsigned long time) diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index 8f88886feb1..3f04d4c406b 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -92,10 +92,12 @@ static int rtctmp; int proc_dolasatrtc(ctl_table *table, int write, struct file *filp, void *buffer, size_t *lenp, loff_t *ppos) { + struct timespec ts; int r; if (!write) { - rtctmp = read_persistent_clock(); + read_persistent_clock(&ts); + rtctmp = ts.tv_sec; /* check for time < 0 and set to 0 */ if (rtctmp < 0) rtctmp = 0; @@ -134,9 +136,11 @@ int sysctl_lasat_rtc(ctl_table *table, void *oldval, size_t *oldlenp, void *newval, size_t newlen) { + struct timespec ts; int r; - rtctmp = read_persistent_clock(); + read_persistent_clock(&ts); + rtctmp = ts.tv_sec; if (rtctmp < 0) rtctmp = 0; r = sysctl_intvec(table, oldval, oldlenp, newval, newlen); diff --git a/arch/mips/lemote/lm2e/setup.c b/arch/mips/lemote/lm2e/setup.c index ebd6ceaef2f..24b355df612 100644 --- a/arch/mips/lemote/lm2e/setup.c +++ 
b/arch/mips/lemote/lm2e/setup.c @@ -54,9 +54,10 @@ void __init plat_time_init(void) mips_hpt_frequency = cpu_clock_freq / 2; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return mc146818_get_cmos_time(); + ts->tv_sec = mc146818_get_cmos_time(); + ts->tv_nsec = 0; } void (*__wbflush)(void); diff --git a/arch/mips/mti-malta/malta-time.c b/arch/mips/mti-malta/malta-time.c index 0b97d47691f..3c6f190aa61 100644 --- a/arch/mips/mti-malta/malta-time.c +++ b/arch/mips/mti-malta/malta-time.c @@ -100,9 +100,10 @@ static unsigned int __init estimate_cpu_frequency(void) return count; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return mc146818_get_cmos_time(); + ts->tv_sec = mc146818_get_cmos_time(); + ts->tv_nsec = 0; } static void __init plat_perf_setup(void) diff --git a/arch/mips/pmc-sierra/yosemite/setup.c b/arch/mips/pmc-sierra/yosemite/setup.c index 2d3c0dca275..3498ac9c35a 100644 --- a/arch/mips/pmc-sierra/yosemite/setup.c +++ b/arch/mips/pmc-sierra/yosemite/setup.c @@ -70,7 +70,7 @@ void __init bus_error_init(void) } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned int year, month, day, hour, min, sec; unsigned long flags; @@ -92,7 +92,8 @@ unsigned long read_persistent_clock(void) m48t37_base->control = 0x00; spin_unlock_irqrestore(&rtc_lock, flags); - return mktime(year, month, day, hour, min, sec); + ts->tv_sec = mktime(year, month, day, hour, min, sec); + ts->tv_nsec = 0; } int rtc_mips_set_time(unsigned long tim) diff --git a/arch/mips/sibyte/swarm/setup.c b/arch/mips/sibyte/swarm/setup.c index 672e45d495a..623ffc933c4 100644 --- a/arch/mips/sibyte/swarm/setup.c +++ b/arch/mips/sibyte/swarm/setup.c @@ -87,19 +87,26 @@ enum swarm_rtc_type { enum swarm_rtc_type swarm_rtc_type; -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { + unsigned long sec; + switch (swarm_rtc_type) { case RTC_XICOR: - return xicor_get_time(); + sec = xicor_get_time(); + break; case RTC_M4LT81: - return m41t81_get_time(); + sec = m41t81_get_time(); + break; case RTC_NONE: default: - return mktime(2000, 1, 1, 0, 0, 0); + sec = mktime(2000, 1, 1, 0, 0, 0); + break; } + ts->tv_sec = sec; + ts->tv_nsec = 0; } int rtc_mips_set_time(unsigned long sec) diff --git a/arch/mips/sni/time.c b/arch/mips/sni/time.c index 0d9ec1a5c24..62df6a598e0 100644 --- a/arch/mips/sni/time.c +++ b/arch/mips/sni/time.c @@ -182,7 +182,8 @@ void __init plat_time_init(void) setup_pit_timer(); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return -1; + ts->tv_sec = -1; + ts->tv_nsec = 0; } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index eae4511ceea..ad63f30fe3d 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -769,7 +769,7 @@ int update_persistent_clock(struct timespec now) return ppc_md.set_rtc_time(&tm); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { struct rtc_time tm; static int first = 1; @@ -787,8 +787,9 @@ unsigned long read_persistent_clock(void) if (!ppc_md.get_rtc_time) return 0; ppc_md.get_rtc_time(&tm); - return mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec); + ts->tv_sec = mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec); + ts->tv_nsec = 0; } /* clocksource code */ diff --git a/arch/s390/kernel/time.c
b/arch/s390/kernel/time.c index e76c2e7a8b9..a94ec48587b 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -182,12 +182,9 @@ static void timing_alert_interrupt(__u16 code) static void etr_reset(void); static void stp_reset(void); -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - struct timespec ts; - - tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, &ts); - return ts.tv_sec; + tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, ts); } static cycle_t read_tod_clock(struct clocksource *cs) @@ -248,7 +245,6 @@ void __init time_init(void) { struct timespec ts; unsigned long flags; - cycle_t now; /* Reset time synchronization interfaces. */ etr_reset(); @@ -266,20 +262,10 @@ void __init time_init(void) panic("Could not register TOD clock source"); /* - * The TOD clock is an accurate clock. The xtime should be - * initialized in a way that the difference between TOD and - * xtime is reasonably small. Too bad that timekeeping_init - * sets xtime.tv_nsec to zero. In addition the clock source - * change from the jiffies clock source to the TOD clock - * source add another error of up to 1/HZ second. The same - * function sets wall_to_monotonic to a value that is too - * small for /proc/uptime to be accurate. - * Reset xtime and wall_to_monotonic to sane values. + * Reset wall_to_monotonic to the initial timestamp created + * in head.S to get a precise value in /proc/uptime. */ write_seqlock_irqsave(&xtime_lock, flags); - now = get_clock(); - tod_to_timeval(now - TOD_UNIX_EPOCH, &xtime); - clocksource_tod.cycle_last = now; tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, &ts); set_normalized_timespec(&wall_to_monotonic, -ts.tv_sec, -ts.tv_nsec); write_sequnlock_irqrestore(&xtime_lock, flags); diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index 9b352a1e3fb..3f4706aa975 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -39,11 +39,9 @@ void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time; int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time; #ifdef CONFIG_GENERIC_CMOS_UPDATE -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - struct timespec tv; - rtc_sh_get_time(&tv); - return tv.tv_sec; + rtc_sh_get_time(ts); } int update_persistent_clock(struct timespec now) diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 5d465b207e7..bf67dcb4a44 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -178,7 +178,7 @@ static int set_rtc_mmss(unsigned long nowtime) } /* not static: needed by APM */ -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned long retval, flags; @@ -186,7 +186,8 @@ unsigned long read_persistent_clock(void) retval = get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); - return retval; + ts->tv_sec = retval; + ts->tv_nsec = 0; } int update_persistent_clock(struct timespec now) diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index 8848120d291..19085ff0484 100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -59,9 +59,8 @@ static struct irqaction timer_irqaction = { void __init time_init(void) { - xtime.tv_nsec = 0; - xtime.tv_sec = read_persistent_clock(); - + /* FIXME: xtime&wall_to_monotonic are set in timekeeping_init.
*/ + read_persistent_clock(&xtime); set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); diff --git a/include/linux/time.h b/include/linux/time.h index e7c84455888..53a3216f0d1 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -101,7 +101,7 @@ extern struct timespec xtime; extern struct timespec wall_to_monotonic; extern seqlock_t xtime_lock; -extern unsigned long read_persistent_clock(void); +extern void read_persistent_clock(struct timespec *ts); extern int update_persistent_clock(struct timespec now); extern int no_sync_cmos_clock __read_mostly; void timekeeping_init(void); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 41579e7fcf9..f1a21ce491e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -154,7 +154,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); */ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static unsigned long total_sleep_time; /* seconds */ +static struct timespec total_sleep_time; /* * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. @@ -487,17 +487,18 @@ int timekeeping_valid_for_hres(void) } /** - * read_persistent_clock - Return time in seconds from the persistent clock. + * read_persistent_clock - Return time from the persistent clock. * * Weak dummy function for arches that do not yet support it. - * Returns seconds from epoch using the battery backed persistent clock. - * Returns zero if unsupported. + * Reads the time from the battery backed persistent clock. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ -unsigned long __attribute__((weak)) read_persistent_clock(void) +void __attribute__((weak)) read_persistent_clock(struct timespec *ts) { - return 0; + ts->tv_sec = 0; + ts->tv_nsec = 0; } /* @@ -507,7 +508,9 @@ void __init timekeeping_init(void) { struct clocksource *clock; unsigned long flags; - unsigned long sec = read_persistent_clock(); + struct timespec now; + + read_persistent_clock(&now); write_seqlock_irqsave(&xtime_lock, flags); @@ -518,19 +521,20 @@ void __init timekeeping_init(void) clock->enable(clock); timekeeper_setup_internals(clock); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; + xtime.tv_sec = now.tv_sec; + xtime.tv_nsec = now.tv_nsec; raw_time.tv_sec = 0; raw_time.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); update_xtime_cache(0); - total_sleep_time = 0; + total_sleep_time.tv_sec = 0; + total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); } /* time in seconds when suspend began */ -static unsigned long timekeeping_suspend_time; +static struct timespec timekeeping_suspend_time; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. 
@@ -543,18 +547,19 @@ static int timekeeping_resume(struct sys_device *dev) { unsigned long flags; - unsigned long now = read_persistent_clock(); + struct timespec ts; + + read_persistent_clock(&ts); clocksource_resume(); write_seqlock_irqsave(&xtime_lock, flags); - if (now && (now > timekeeping_suspend_time)) { - unsigned long sleep_length = now - timekeeping_suspend_time; - - xtime.tv_sec += sleep_length; - wall_to_monotonic.tv_sec -= sleep_length; - total_sleep_time += sleep_length; + if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { + ts = timespec_sub(ts, timekeeping_suspend_time); + xtime = timespec_add_safe(xtime, ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); + total_sleep_time = timespec_add_safe(total_sleep_time, ts); } update_xtime_cache(0); /* re-base the last cycle value */ @@ -577,7 +582,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; - timekeeping_suspend_time = read_persistent_clock(); + read_persistent_clock(&timekeeping_suspend_time); write_seqlock_irqsave(&xtime_lock, flags); timekeeping_forward_now(); @@ -801,9 +806,10 @@ void update_wall_time(void) */ void getboottime(struct timespec *ts) { - set_normalized_timespec(ts, - - (wall_to_monotonic.tv_sec + total_sleep_time), - - wall_to_monotonic.tv_nsec); + struct timespec boottime; + + boottime = timespec_add_safe(wall_to_monotonic, total_sleep_time); + set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } /** @@ -812,7 +818,7 @@ void getboottime(struct timespec *ts) */ void monotonic_to_bootbased(struct timespec *ts) { - ts->tv_sec += total_sleep_time; + *ts = timespec_add_safe(*ts, total_sleep_time); } unsigned long get_seconds(void) -- cgit v1.2.3 From 62a3207b8cf3de35368cdc3822b30b82d59eea95 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 17 Aug 2009 11:16:16 -0700 Subject: x86, intel_txt: Handle ACPI_SLEEP without X86_TRAMPOLINE On 32 bits, we can have CONFIG_ACPI_SLEEP set without implying CONFIG_X86_TRAMPOLINE. In that case, we simply do not need to mark the trampoline as a MAC region. Signed-off-by: H. Peter Anvin Cc: Shane Wang Cc: Joseph Cihula --- arch/x86/kernel/tboot.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a183beffe39..c2e760ca7b0 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -189,8 +189,12 @@ static int tboot_setup_sleep(void) /* S3 resume code */ add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); + +#ifdef CONFIG_X86_TRAMPOLINE /* AP trampoline code */ add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); +#endif + /* kernel code + data + bss */ add_mac_region(virt_to_phys(_text), _end - _text); -- cgit v1.2.3 From 8126dec32738421afa362114337331337b4be17f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 20 Aug 2009 20:23:11 +0800 Subject: x86: Fix system crash when loading with "reservetop" parameter The system dies if the kernel is booted with the "reservetop" parameter: in the present code, the "reservetop" parameter is parsed after early_ioremap_init(), and some functions still use early_ioremap() after that point. The problem is that the "reservetop" parameter can modify 'FIXADDR_TOP', so a virtual address obtained from early_ioremap() is based on the old 'FIXADDR_TOP' while the page mapping is based on the new 'FIXADDR_TOP'. This causes a page fault, and since the IDT is not prepared yet, the system is dead.
So this patch puts parse_early_param() in front of early_ioremap_init(). Signed-off-by: Xiao Guangrong Cc: yinghai@kernel.org Cc: Andrew Morton LKML-Reference: <4A8D402F.4080805@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63f32d220ef..02643cc3bf2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -711,6 +711,11 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + parse_early_param(); + /* VMI may relocate the fixmap; do this before touching ioremap area */ vmi_init(); @@ -793,11 +798,6 @@ void __init setup_arch(char **cmdline_p) #endif #endif - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - - parse_early_param(); - #ifdef CONFIG_X86_64 check_efer(); #endif -- cgit v1.2.3 From 3e0e1e9c5a327d4dba8490d83ef55c0564e6e8a7 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 21 Aug 2009 04:34:45 -0400 Subject: x86: Fix an incorrect argument of reserve_bootmem() This line looks suspicious, because if this is true, then the 'flags' parameter of function reserve_bootmem_generic() will be unused when !CONFIG_NUMA. I don't think this is what we want. Signed-off-by: WANG Cong Cc: Yinghai Lu Cc: akpm@linux-foundation.org LKML-Reference: <20090821083709.5098.52505.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6176fe8f29e..ea56b8cbb6a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -796,7 +796,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, return ret; #else - reserve_bootmem(phys, len, BOOTMEM_DEFAULT); + reserve_bootmem(phys, len, flags); #endif if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { -- cgit v1.2.3 From 8cab02dc3c58a12235c6d463ce684dded9696848 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 18:19:45 +0200 Subject: x86: Do not unregister PIT clocksource on PIT oneshot setup/shutdown This basically reverts commit 1a0c009ac (x86: unregister PIT clocksource when PIT is disabled) because the problem that patch tried to address has been solved by commit 3f68535ada (clocksource: sanity check sysfs clocksource changes). The problem addressed by the original patch is that the PIT could be selected as clocksource after the system switched the PIT off or set the PIT into one shot mode, which would result in complete timekeeping wreckage. Now with the sysfs sanity check in place, the PIT cannot be selected again when the system is in oneshot mode. The system will not switch to one shot mode as long as the PIT is installed, because the PIT is not suitable for one shot. The shutdown case, which happens when the lapic timer is installed, is covered by the fact that init_pit_clocksource() is called after the lapic timer takes over and then does not install the PIT clocksource at all. We should have done the sanity checks back then, but ... This also solves the locking problem which was reported vs. the clocksource rework.
LKML-Reference: Cc: Martin Schwidefsky Cc: john stultz Signed-off-by: Thomas Gleixner --- arch/x86/kernel/i8253.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 5cf36c053ac..23c167925a5 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -19,12 +19,6 @@ DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -#ifdef CONFIG_X86_32 -static void pit_disable_clocksource(void); -#else -static inline void pit_disable_clocksource(void) { } -#endif - /* * HPET replaces the PIT, when enabled. So we need to know, which of * the two timers is used @@ -57,12 +51,10 @@ static void init_pit_timer(enum clock_event_mode mode, outb_pit(0, PIT_CH0); outb_pit(0, PIT_CH0); } - pit_disable_clocksource(); break; case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ - pit_disable_clocksource(); outb_pit(0x38, PIT_MODE); break; @@ -200,17 +192,6 @@ static struct clocksource pit_cs = { .shift = 20, }; -static void pit_disable_clocksource(void) -{ - /* - * Use mult to check whether it is registered or not - */ - if (pit_cs.mult) { - clocksource_unregister(&pit_cs); - pit_cs.mult = 0; - } -} - static int __init init_pit_clocksource(void) { /* -- cgit v1.2.3 From da15cfdae03351c689736f8d142618592e3cebc3 Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 19 Aug 2009 19:13:34 -0700 Subject: time: Introduce CLOCK_REALTIME_COARSE After talking with some application writers who want very fast, but not fine-grained timestamps, I decided to try to implement new clock_ids for clock_gettime(): CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE, which return the time at the last tick. This is very fast as we don't have to access any hardware (which can be very painful if you're using something like the acpi_pm clocksource), and we can even use the vdso clock_gettime() method to avoid the syscall. The only trade-off is that you only get low-res, tick-grained time resolution. This isn't a new idea; I know Ingo has a patch in the -rt tree that made the vsyscall gettimeofday() return coarse-grained time when the vsyscall64 sysctl was set to 2. However, that affects all applications on a system. With this method, applications can choose the proper speed/granularity trade-off for themselves.
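To make the new clock ids concrete, here is a minimal userspace sketch (an illustration assuming a kernel with this patch and headers exporting CLOCK_REALTIME_COARSE, not code from the patch itself); clock_gettime() and clock_getres() are the standard POSIX calls, and older glibc needs -lrt at link time:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts, res;

	/* Tick-granularity wall time; no clocksource hardware is touched. */
	clock_gettime(CLOCK_REALTIME_COARSE, &ts);
	/* Resolution is reported as KTIME_LOW_RES, i.e. one tick (1/HZ). */
	clock_getres(CLOCK_REALTIME_COARSE, &res);

	printf("coarse time %ld.%09ld, resolution %ld ns\n",
	       (long)ts.tv_sec, ts.tv_nsec, res.tv_nsec);
	return 0;
}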
Signed-off-by: John Stultz Cc: Andi Kleen Cc: nikolag@ca.ibm.com Cc: Darren Hart Cc: arjan@infradead.org Cc: jonathan@jonmasters.org LKML-Reference: <1250734414.6897.5.camel@localhost.localdomain> Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/vgtod.h | 1 + arch/x86/kernel/vsyscall_64.c | 1 + arch/x86/vdso/vclock_gettime.c | 39 ++++++++++++++++++++++++++++++++++++--- include/linux/time.h | 4 ++++ kernel/posix-timers.c | 35 +++++++++++++++++++++++++++++++++++ kernel/time/timekeeping.c | 21 +++++++++++++++++++++ 6 files changed, 98 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index dc27a69e5d2..3d61e204826 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -21,6 +21,7 @@ struct vsyscall_gtod_data { u32 shift; } clock; struct timespec wall_to_monotonic; + struct timespec wall_time_coarse; }; extern struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 25ee06a80aa..cf53a78e2dc 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 6a40b78b46a..ee55754cc3c 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -86,14 +86,47 @@ notrace static noinline int do_monotonic(struct timespec *ts) return 0; } +notrace static noinline int do_realtime_coarse(struct timespec *ts) +{ + unsigned long seq; + do { + seq = read_seqbegin(>od->lock); + ts->tv_sec = gtod->wall_time_coarse.tv_sec; + ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; + } while (unlikely(read_seqretry(>od->lock, seq))); + return 0; +} + +notrace static noinline int do_monotonic_coarse(struct timespec *ts) +{ + unsigned long seq, ns, secs; + do { + seq = read_seqbegin(>od->lock); + secs = gtod->wall_time_coarse.tv_sec; + ns = gtod->wall_time_coarse.tv_nsec; + secs += gtod->wall_to_monotonic.tv_sec; + ns += gtod->wall_to_monotonic.tv_nsec; + } while (unlikely(read_seqretry(>od->lock, seq))); + vset_normalized_timespec(ts, secs, ns); + return 0; +} + notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { - if (likely(gtod->sysctl_enabled && gtod->clock.vread)) + if (likely(gtod->sysctl_enabled)) switch (clock) { case CLOCK_REALTIME: - return do_realtime(ts); + if (likely(gtod->clock.vread)) + return do_realtime(ts); + break; case CLOCK_MONOTONIC: - return do_monotonic(ts); + if (likely(gtod->clock.vread)) + return do_monotonic(ts); + break; + case CLOCK_REALTIME_COARSE: + return do_realtime_coarse(ts); + case CLOCK_MONOTONIC_COARSE: + return do_monotonic_coarse(ts); } return vdso_fallback_gettime(clock, ts); } diff --git a/include/linux/time.h b/include/linux/time.h index f505988398e..256232f7e5e 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -110,6 +110,8 @@ extern int timekeeping_suspended; unsigned long get_seconds(void); struct timespec current_kernel_time(void); +struct timespec __current_kernel_time(void); /* does not hold xtime_lock */ +struct timespec 
get_monotonic_coarse(void); #define CURRENT_TIME (current_kernel_time()) #define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 }) @@ -243,6 +245,8 @@ struct itimerval { #define CLOCK_PROCESS_CPUTIME_ID 2 #define CLOCK_THREAD_CPUTIME_ID 3 #define CLOCK_MONOTONIC_RAW 4 +#define CLOCK_REALTIME_COARSE 5 +#define CLOCK_MONOTONIC_COARSE 6 /* * The IDs of various hardware clocks: diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d089d052c4a..495440779ce 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) return 0; } + +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +{ + *tp = current_kernel_time(); + return 0; +} + +static int posix_get_monotonic_coarse(clockid_t which_clock, + struct timespec *tp) +{ + *tp = get_monotonic_coarse(); + return 0; +} + +int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +{ + *tp = ktime_to_timespec(KTIME_LOW_RES); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -262,10 +281,26 @@ static __init int init_posix_timers(void) .timer_create = no_timer_create, .nsleep = no_nsleep, }; + struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; + struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 15e06defca5..03cbeb34d14 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -847,6 +847,10 @@ unsigned long get_seconds(void) } EXPORT_SYMBOL(get_seconds); +struct timespec __current_kernel_time(void) +{ + return xtime_cache; +} struct timespec current_kernel_time(void) { @@ -862,3 +866,20 @@ struct timespec current_kernel_time(void) return now; } EXPORT_SYMBOL(current_kernel_time); + +struct timespec get_monotonic_coarse(void) +{ + struct timespec now, mono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + + now = xtime_cache; + mono = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, + now.tv_nsec + mono.tv_nsec); + return now; +} -- cgit v1.2.3 From d0af9eed5aa91b6b7b5049cae69e5ea956fd85c3 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 19 Aug 2009 18:05:36 -0700 Subject: x86, pat/mtrr: Rendezvous all the cpus for MTRR/PAT init SDM Vol 3a section titled "MTRR considerations in MP systems" specifies the need for synchronizing the logical cpu's while initializing/updating MTRR. Currently Linux kernel does the synchronization of all cpu's only when a single MTRR register is programmed/updated. 
During an AP online (during boot/cpu-online/resume), where we initialize all the MTRR/PAT registers, we don't follow this synchronization algorithm. This can lead to scenarios where, during a dynamic cpu online, that logical cpu is initializing MTRR/PAT with cache disabled (cr0.cd=1) etc. while the other logical HT sibling continues to run (also with cache disabled because of cr0.cd=1 on its sibling). Starting from Westmere, VMX transitions with cr0.cd=1 don't work properly (because of some VMX performance optimizations) and the above scenario (with one logical cpu doing VMX activity and another logical cpu coming online) can result in a system crash. Fix the MTRR initialization by doing a rendezvous of all the cpus. During boot and resume, we delay the MTRR/PAT init for APs till all the logical cpus come online, and the rendezvous process at the end of AP bringup will initialize the MTRR/PAT for all APs. For dynamic single cpu online, we synchronize all the logical cpus and do the MTRR/PAT init on the AP that is coming online. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mtrr.h | 7 +++++++ arch/x86/kernel/cpu/mtrr/main.c | 46 +++++++++++++++++++++++++++++++++-------- arch/x86/kernel/smpboot.c | 14 +++++++++++++ arch/x86/power/cpu.c | 2 +- kernel/cpu.c | 14 +++++++++++++ 5 files changed, 73 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index a51ada8467d..d5366ec5cb8 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -121,8 +121,12 @@ extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); +extern void set_mtrr_aps_delayed_init(void); +extern void mtrr_aps_init(void); +extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); +extern u32 mtrr_aps_delayed_init; # else static inline u8 mtrr_type_lookup(u64 addr, u64 end) { @@ -161,6 +165,9 @@ static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) +#define set_mtrr_aps_delayed_init() do {} while (0) +#define mtrr_aps_init() do {} while (0) +#define mtrr_bp_restore() do {} while (0) # endif #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7af0f88a416..7339be0aa58 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -58,6 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; +u32 mtrr_aps_delayed_init; static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; @@ -163,7 +164,10 @@ static void ipi_handler(void *info) if (data->smp_reg != ~0U) { mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - } else { + } else if (mtrr_aps_delayed_init) { + /* + * Initialize the MTRRs in addition to the synchronisation.
+ */ mtrr_if->set_all(); } @@ -265,6 +269,8 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ */ if (reg != ~0U) mtrr_if->set(reg, base, size, type); + else if (!mtrr_aps_delayed_init) + mtrr_if->set_all(); /* Wait for the others */ while (atomic_read(&data.count)) @@ -721,9 +727,7 @@ void __init mtrr_bp_init(void) void mtrr_ap_init(void) { - unsigned long flags; - - if (!mtrr_if || !use_intel()) + if (!use_intel() || mtrr_aps_delayed_init) return; /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries @@ -738,11 +742,7 @@ void mtrr_ap_init(void) * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug * lock to prevent mtrr entry changes */ - local_irq_save(flags); - - mtrr_if->set_all(); - - local_irq_restore(flags); + set_mtrr(~0U, 0, 0, 0); } /** @@ -753,6 +753,34 @@ void mtrr_save_state(void) smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); } +void set_mtrr_aps_delayed_init(void) +{ + if (!use_intel()) + return; + + mtrr_aps_delayed_init = 1; +} + +/* + * MTRR initialization for all AP's + */ +void mtrr_aps_init(void) +{ + if (!use_intel()) + return; + + set_mtrr(~0U, 0, 0, 0); + mtrr_aps_delayed_init = 0; +} + +void mtrr_bp_restore(void) +{ + if (!use_intel()) + return; + + mtrr_if->set_all(); +} + static int __init mtrr_init_finialize(void) { if (!mtrr_if) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..d720b7e0cf3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1116,9 +1116,22 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (is_uv_system()) uv_system_init(); + + set_mtrr_aps_delayed_init(); out: preempt_enable(); } + +void arch_enable_nonboot_cpus_begin(void) +{ + set_mtrr_aps_delayed_init(); +} + +void arch_enable_nonboot_cpus_end(void) +{ + mtrr_aps_init(); +} + /* * Early setup to make printk work. */ @@ -1140,6 +1153,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif check_nmi_watchdog(); + mtrr_aps_init(); } static int __initdata setup_possible_cpus = -1; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index b3d20b9cac6..417c9f5b4af 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -242,7 +242,7 @@ static void __restore_processor_state(struct saved_context *ctxt) fix_processor_context(); do_fpu_end(); - mtrr_ap_init(); + mtrr_bp_restore(); #ifdef CONFIG_X86_OLD_MCE mcheck_init(&boot_cpu_data); diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4a..f5f9485b8c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -413,6 +413,14 @@ int disable_nonboot_cpus(void) return error; } +void __weak arch_enable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_enable_nonboot_cpus_end(void) +{ +} + void __ref enable_nonboot_cpus(void) { int cpu, error; @@ -424,6 +432,9 @@ void __ref enable_nonboot_cpus(void) goto out; printk("Enabling non-boot CPUs ...\n"); + + arch_enable_nonboot_cpus_begin(); + for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { @@ -432,6 +443,9 @@ void __ref enable_nonboot_cpus(void) } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } + + arch_enable_nonboot_cpus_end(); + cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); -- cgit v1.2.3 From 5400743db5a06a4e6e298725a2044c40edcb27b9 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Fri, 21 Aug 2009 17:00:02 -0700 Subject: x86, mtrr: make mtrr_aps_delayed_init static bool mtr_aps_delayed_init was declared u32 and made global, but it only ever takes boolean values and is only ever used in arch/x86/kernel/cpu/mtrr/main.c. Declare it "static bool" and remove external references. Signed-off-by: H. Peter Anvin Cc: Suresh Siddha --- arch/x86/include/asm/mtrr.h | 1 - arch/x86/kernel/cpu/mtrr/main.c | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index d5366ec5cb8..4365ffdb461 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -126,7 +126,6 @@ extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); -extern u32 mtrr_aps_delayed_init; # else static inline u8 mtrr_type_lookup(u64 addr, u64 end) { diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7339be0aa58..84e83de5457 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -58,7 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; -u32 mtrr_aps_delayed_init; +static bool mtrr_aps_delayed_init; static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; @@ -758,7 +758,7 @@ void set_mtrr_aps_delayed_init(void) if (!use_intel()) return; - mtrr_aps_delayed_init = 1; + mtrr_aps_delayed_init = true; } /* @@ -770,7 +770,7 @@ void mtrr_aps_init(void) return; set_mtrr(~0U, 0, 0, 0); - mtrr_aps_delayed_init = 0; + mtrr_aps_delayed_init = false; } void mtrr_bp_restore(void) -- cgit v1.2.3 From 680b6cfd3cee30a7d997d49430fb73af84523853 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 26 Aug 2009 16:20:36 +0900 Subject: x86, mce: CE in last bank prevents panic by unknown MCE If MCE handler is called but none of mces_seen have machine check event which might signal the MCE (i.e. event higher than MCE_KEEP_SEVERITY), panic with "Machine check from unknown source" will be taken since the MCE is assumed to be signaled from external agent or so. Usually mces_seen never point MCE_KEEP_SEVERITY event such as CE. But it can happen because initial value of mces_seen is accidentally modified by mce_no_way_out() - in case if mce_no_way_out() run through all banks and the last bank has the CE, mces_seen points the CE and the "panic by unknown" will not be taken. This patch fixes this undesired behavior, and clarifies the logic. Signed-off-by: Hidetoshi Seto Cc: H. Peter Anvin Cc: Andi Kleen Cc: Jin Dongming LKML-Reference: <4A94E244.3020301@jp.fujitsu.com> Signed-off-by: Ingo Molnar Reported-by: Jin Dongming --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 54bd1b2fb4c..325559d1aa5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -612,7 +612,7 @@ out: * This way we prevent any potential data corruption in a unrecoverable case * and also makes sure always all CPU's errors are examined. * - * Also this detects the case of an machine check event coming from outer + * Also this detects the case of a machine check event coming from outer * space (not detected by any CPUs) In this case some external agent wants * us to shut down, so panic too. 
* @@ -665,7 +665,7 @@ static void mce_reign(void) * No machine check event found. Must be some external * source or one CPU is hung. Panic. */ - if (!m && tolerant < 3) + if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) mce_panic("Machine check from unknown source", NULL, NULL); /* @@ -889,11 +889,11 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - no_way_out = mce_no_way_out(&m, &msg); - final = &__get_cpu_var(mces_seen); *final = m; + no_way_out = mce_no_way_out(&m, &msg); + barrier(); /* -- cgit v1.2.3 From 5fc517466dd3d0fc6d2a5180ca6792e60344d8be Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:32 -0700 Subject: x86, pat: Keep identity maps consistent with mmaps even when pat_disabled Make reserve_memtype internally take care of pat disabled case and fallback to default return values. Remove the specific pat_disabled checks in track_* routines. Change kernel_map_sync_memtype to sync identity map even when pat_disabled. This change ensures that, even for pat_disabled case, we take care of keeping identity map in sync. Before this patch, in pat disabled case, ioremap() keeps the identity maps in sync and other APIs like pci and /dev/mem mmap don't, which is not a very consistent behavior. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e6718bb2806..d5af2792d2f 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -339,6 +339,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (new_type) { if (req_type == -1) *new_type = _PAGE_CACHE_WB; + else if (req_type == _PAGE_CACHE_WC) + *new_type = _PAGE_CACHE_UC_MINUS; else *new_type = req_type & _PAGE_CACHE_MASK; } @@ -577,7 +579,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) { unsigned long id_sz; - if (!pat_enabled || base >= __pa(high_memory)) + if (base >= __pa(high_memory)) return 0; id_sz = (__pa(high_memory) < base + size) ? @@ -677,9 +679,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - if (!pat_enabled) - return 0; - /* * For now, only handle remap_pfn_range() vmas where * is_linear_pfn_mapping() == TRUE. Handling of @@ -715,9 +714,6 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - if (!pat_enabled) - return 0; - /* * For now, only handle remap_pfn_range() vmas where * is_linear_pfn_mapping() == TRUE. Handling of @@ -743,9 +739,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - if (!pat_enabled) - return; - /* * For now, only handle remap_pfn_range() vmas where * is_linear_pfn_mapping() == TRUE. Handling of -- cgit v1.2.3 From 279e669b3fc0068cc3509e8e53036999e1e86588 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:33 -0700 Subject: x86, pat: ioremap to follow same PAT restrictions as other PAT users ioremap has this hard-coded check for new type and requested type. That check differs from other PAT users like /dev/mem mmap, remap_pfn_range in only one condition where requested type is UC_MINUS and new type is WC. 
Under that condition, ioremap fails. But other PAT interfaces succeed with a WC mapping. Change to make ioremap be in sync with other PAT APIs and use the same macro as others. Also changes the error print to KERN_ERR instead of pr_debug. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 8a450930834..aeaea8c5b2f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -228,24 +228,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, retval = reserve_memtype(phys_addr, (u64)phys_addr + size, prot_val, &new_prot_val); if (retval) { - pr_debug("Warning: reserve_memtype returned %d\n", retval); + printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); return NULL; } if (prot_val != new_prot_val) { - /* - * Do not fallback to certain memory types with certain - * requested type: - * - request is uc-, return cannot be write-back - * - request is uc-, return cannot be write-combine - * - request is write-combine, return cannot be write-back - */ - if ((prot_val == _PAGE_CACHE_UC_MINUS && - (new_prot_val == _PAGE_CACHE_WB || - new_prot_val == _PAGE_CACHE_WC)) || - (prot_val == _PAGE_CACHE_WC && - new_prot_val == _PAGE_CACHE_WB)) { - pr_debug( + if (!is_new_memtype_allowed(prot_val, new_prot_val)) { + printk(KERN_ERR "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", (unsigned long long)phys_addr, (unsigned long long)(phys_addr + size), -- cgit v1.2.3 From 9fd126bc742f74a95d2ba610247712ff05da02fe Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:34 -0700 Subject: x86, pat: New i/f for driver to request memtype for IO regions Add new routines to request memtype for IO regions. This will currently be a backend for io_mapping_* routines. But, it can also be made available to drivers directly in future, in case it is needed. reserve interface reserves the memory, makes sure we have a compatible memory type available and keeps the identity map in sync when needed. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pat.h | 5 +++++ arch/x86/mm/pat.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 7af14e512f9..e2c1668dde7 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -19,4 +19,9 @@ extern int free_memtype(u64 start, u64 end); extern int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flag); +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type); + +void io_free_memtype(resource_size_t start, resource_size_t end); + #endif /* _ASM_X86_PAT_H */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index d5af2792d2f..82d097ce309 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -498,6 +498,55 @@ int free_memtype(u64 start, u64 end) } +/** + * io_reserve_memtype - Request a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + * @type: A pointer to memtype, with requested type. 
On success, requested + * or any other compatible type that was available for the region is returned + * + * On success, returns 0 + * On failure, returns non-zero + */ +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type) +{ + unsigned long req_type = *type; + unsigned long new_type; + int ret; + + WARN_ON_ONCE(iomem_map_sanity_check(start, end - start)); + + ret = reserve_memtype(start, end, req_type, &new_type); + if (ret) + goto out_err; + + if (!is_new_memtype_allowed(req_type, new_type)) + goto out_free; + + if (kernel_map_sync_memtype(start, end - start, new_type) < 0) + goto out_free; + + *type = new_type; + return 0; + +out_free: + free_memtype(start, end); + ret = -EBUSY; +out_err: + return ret; +} + +/** + * io_free_memtype - Release a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + */ +void io_free_memtype(resource_size_t start, resource_size_t end) +{ + free_memtype(start, end); +} + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { -- cgit v1.2.3 From 9e36fda0b359d2a6ae039c3d7e71a04502a77898 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:35 -0700 Subject: x86, pat: Add PAT reserve free to io_mapping* APIs io_mapping_* interfaces were added, mainly for graphics drivers. Make this interface go through the PAT reserve/free, instead of hardcoding WC mapping. This makes sure that there are no aliases due to unconditional WC setting. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/iomap.h | 9 ++++++--- arch/x86/mm/iomap_32.c | 27 +++++++++++++++++++++++++-- include/linux/io-mapping.h | 17 ++++++++++++----- 3 files changed, 43 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 0e9fe1d9d97..f35eb45d657 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -26,13 +26,16 @@ #include #include -int -is_io_mapping_possible(resource_size_t base, unsigned long size); - void * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); void iounmap_atomic(void *kvaddr, enum km_type type); +int +iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); + +void +iomap_free(resource_size_t base, unsigned long size); + #endif /* _ASM_X86_IOMAP_H */ diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index fe6f84ca121..84e236ce76b 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -21,7 +21,7 @@ #include #include -int is_io_mapping_possible(resource_size_t base, unsigned long size) +static int is_io_mapping_possible(resource_size_t base, unsigned long size) { #if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) /* There is no way to map greater than 1 << 32 address without PAE */ @@ -30,7 +30,30 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size) #endif return 1; } -EXPORT_SYMBOL_GPL(is_io_mapping_possible); + +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) +{ + unsigned long flag = _PAGE_CACHE_WC; + int ret; + + if (!is_io_mapping_possible(base, size)) + return -EINVAL; + + ret = io_reserve_memtype(base, base + size, &flag); + if (ret) + return ret; + + *prot = __pgprot(__PAGE_KERNEL | flag); + return 0; +} +EXPORT_SYMBOL_GPL(iomap_create_wc); + +void +iomap_free(resource_size_t 
base, unsigned long size) +{ + io_free_memtype(base, base + size); +} +EXPORT_SYMBOL_GPL(iomap_free); void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) { diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 0adb0f91568..97eb928b492 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -49,23 +49,30 @@ static inline struct io_mapping * io_mapping_create_wc(resource_size_t base, unsigned long size) { struct io_mapping *iomap; - - if (!is_io_mapping_possible(base, size)) - return NULL; + pgprot_t prot; iomap = kmalloc(sizeof(*iomap), GFP_KERNEL); if (!iomap) - return NULL; + goto out_err; + + if (iomap_create_wc(base, size, &prot)) + goto out_free; iomap->base = base; iomap->size = size; - iomap->prot = pgprot_writecombine(__pgprot(__PAGE_KERNEL)); + iomap->prot = prot; return iomap; + +out_free: + kfree(iomap); +out_err: + return NULL; } static inline void io_mapping_free(struct io_mapping *mapping) { + iomap_free(mapping->base, mapping->size); kfree(mapping); } -- cgit v1.2.3 From 335ef896d4c6639849d79367f0fef9abc06d121b Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:36 -0700 Subject: x86, pat: Add rbtree to do quick lookup in memtype tracking PAT memtype tracking uses a linear linked list to keep track of IO (non-RAM) regions and their memtypes. The code used a last_accessed pointer as a cache to speed up the lookup. As per discussions with H. Peter Anvin a while back, having an rbtree here will avoid bad performance in pathological cases where we may end up with a huge linked list. This may not add any noticeable performance speedup in the normal case as the number of entries in the PAT memtype list tends to be in the ~20-30 range. The patch removes the "cached_entry" logic as with the rbtree we have a more generic way of speeding up the lookup. With this patch, we use the rbtree to do the quick lookup. We still use a linked list as the memtype ranges tracked can be of different sizes and can overlap in different ways. We also keep track of usage counts with the linked list. Example: Multiple ioremaps with different sizes uncached-minus @ 0xfffff00000-0xfffff04000 uncached-minus @ 0xfffff02000-0xfffff03000 And one userlevel mmap and the thread forks a new process uncached-minus @ 0xbf453000-0xbf454000 uncached-minus @ 0xbf453000-0xbf454000 Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 106 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 82d097ce309..c90f2420f56 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -148,11 +149,10 @@ static char *cattr_name(unsigned long flags) * areas). All the aliases have the same cache attributes of course. * Zero attributes are represented as holes. * - * Currently the data structure is a list because the number of mappings - * are expected to be relatively small. If this should be a problem - * it could be changed to a rbtree or similar. + * The data structure is a list that is also organized as an rbtree + * sorted on the start address of memtype range. * - * memtype_lock protects the whole list. + * memtype_lock protects both the linear list and rbtree. 
*/ struct memtype { @@ -160,11 +160,53 @@ struct memtype { u64 end; unsigned long type; struct list_head nd; + struct rb_node rb; }; +static struct rb_root memtype_rbroot = RB_ROOT; static LIST_HEAD(memtype_list); static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ +static struct memtype *memtype_rb_search(struct rb_root *root, u64 start) +{ + struct rb_node *node = root->rb_node; + struct memtype *last_lower = NULL; + + while (node) { + struct memtype *data = container_of(node, struct memtype, rb); + + if (data->start < start) { + last_lower = data; + node = node->rb_right; + } else if (data->start > start) { + node = node->rb_left; + } else + return data; + } + + /* Will return NULL if there is no entry with its start <= start */ + return last_lower; +} + +static void memtype_rb_insert(struct rb_root *root, struct memtype *data) +{ + struct rb_node **new = &(root->rb_node); + struct rb_node *parent = NULL; + + while (*new) { + struct memtype *this = container_of(*new, struct memtype, rb); + + parent = *new; + if (data->start <= this->start) + new = &((*new)->rb_left); + else if (data->start > this->start) + new = &((*new)->rb_right); + } + + rb_link_node(&data->rb, parent, new); + rb_insert_color(&data->rb, root); +} + /* * Does intersection of PAT memory type and MTRR memory type and returns * the resulting memory type as PAT understands it. @@ -218,9 +260,6 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) return -EBUSY; } -static struct memtype *cached_entry; -static u64 cached_start; - static int pat_pagerange_is_ram(unsigned long start, unsigned long end) { int ram_page = 0, not_rampage = 0; @@ -382,17 +421,19 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, spin_lock(&memtype_lock); - if (cached_entry && start >= cached_start) - entry = cached_entry; - else + entry = memtype_rb_search(&memtype_rbroot, new->start); + if (likely(entry != NULL)) { + /* To work correctly with list_for_each_entry_continue */ + entry = list_entry(entry->nd.prev, struct memtype, nd); + } else { entry = list_entry(&memtype_list, struct memtype, nd); + } /* Search for existing mapping that overlaps the current range */ where = NULL; list_for_each_entry_continue(entry, &memtype_list, nd) { if (end <= entry->start) { where = entry->nd.prev; - cached_entry = list_entry(where, struct memtype, nd); break; } else if (start <= entry->start) { /* end > entry->start */ err = chk_conflict(new, entry, new_type); @@ -400,8 +441,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); where = entry->nd.prev; - cached_entry = list_entry(where, - struct memtype, nd); } break; } else if (start < entry->end) { /* start > entry->start */ @@ -409,8 +448,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!err) { dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); - cached_entry = list_entry(entry->nd.prev, - struct memtype, nd); /* * Move to right position in the linked @@ -438,13 +475,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, return err; } - cached_start = start; - if (where) list_add(&new->nd, where); else list_add_tail(&new->nd, &memtype_list); + memtype_rb_insert(&memtype_rbroot, new); + spin_unlock(&memtype_lock); dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", @@ -456,7 +493,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, int free_memtype(u64 start, u64 end) { - struct memtype 
*entry; + struct memtype *entry, *saved_entry; int err = -EINVAL; int is_range_ram; @@ -474,17 +511,46 @@ int free_memtype(u64 start, u64 end) return -EINVAL; spin_lock(&memtype_lock); + + entry = memtype_rb_search(&memtype_rbroot, start); + if (unlikely(entry == NULL)) + goto unlock_ret; + + /* + * Saved entry points to an entry with start same or less than what + * we searched for. Now go through the list in both directions to look + * for the entry that matches with both start and end, with list stored + * in sorted start address + */ + saved_entry = entry; list_for_each_entry(entry, &memtype_list, nd) { if (entry->start == start && entry->end == end) { - if (cached_entry == entry || cached_start == start) - cached_entry = NULL; + rb_erase(&entry->rb, &memtype_rbroot); + list_del(&entry->nd); + kfree(entry); + err = 0; + break; + } else if (entry->start > start) { + break; + } + } + + if (!err) + goto unlock_ret; + entry = saved_entry; + list_for_each_entry_reverse(entry, &memtype_list, nd) { + if (entry->start == start && entry->end == end) { + rb_erase(&entry->rb, &memtype_rbroot); list_del(&entry->nd); kfree(entry); err = 0; break; + } else if (entry->start < start) { + break; } } +unlock_ret: spin_unlock(&memtype_lock); if (err) { -- cgit v1.2.3 From 46cf98cdaef5471926010b5bddf84c44ec177fdd Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:37 -0700 Subject: x86, pat: Generalize the use of page flag PG_uncached Only IA64 was using PG_uncached as of now. We now intend to use this bit in x86 as well, to keep track of memory type of those addresses that have page struct for them. So, generalize the use of that bit across ia64 and x86. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/ia64/Kconfig | 4 ++++ arch/x86/Kconfig | 4 ++++ include/linux/page-flags.h | 4 ++-- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 170042b420d..e6246119932 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -112,6 +112,10 @@ config IA64_UNCACHED_ALLOCATOR bool select GENERIC_ALLOCATOR +config ARCH_USES_PG_UNCACHED + def_bool y + depends on IA64_UNCACHED_ALLOCATOR + config AUDIT_ARCH bool default y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c07f7220590..8e159538219 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1414,6 +1414,10 @@ config X86_PAT If unsure, say Y. 
+config ARCH_USES_PG_UNCACHED + def_bool y + depends on X86_PAT + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e2e5ce54359..2b87acfc5f8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -99,7 +99,7 @@ enum pageflags { #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT PG_mlocked, /* Page is vma mlocked */ #endif -#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR +#ifdef CONFIG_ARCH_USES_PG_UNCACHED PG_uncached, /* Page has been mapped as uncached */ #endif __NR_PAGEFLAGS, @@ -257,7 +257,7 @@ PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked) #endif -#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR +#ifdef CONFIG_ARCH_USES_PG_UNCACHED PAGEFLAG(Uncached, uncached) #else PAGEFLAG_FALSE(Uncached) -- cgit v1.2.3 From f58417409603d62f2eb23db4d2cf6853d84a1698 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:38 -0700 Subject: x86, pat: Use page flags to track memtypes of RAM pages Change reserve_ram_pages_type and free_ram_pages_type to use 2 page flags to track UC_MINUS, WC, WB and default types. Previous RAM tracking just tracked WB or NonWB, which was not complete and did not allow tracking of RAM fully and there was no way to get the actual type reserved by looking at the page flags. We use the memtype_lock spinlock for atomicity in dealing with memtype tracking in struct page. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cacheflush.h | 54 ++++++++++++++++++++++- arch/x86/mm/pat.c | 91 +++++++++++++++++++++------------------ 2 files changed, 102 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e55dfc1ad45..b54f6afe7ec 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -43,8 +43,58 @@ static inline void copy_from_user_page(struct vm_area_struct *vma, memcpy(dst, src, len); } -#define PG_non_WB PG_arch_1 -PAGEFLAG(NonWB, non_WB) +#define PG_WC PG_arch_1 +PAGEFLAG(WC, WC) + +#ifdef CONFIG_X86_PAT +/* + * X86 PAT uses page flags WC and Uncached together to keep track of + * memory type of pages that have backing page struct. X86 PAT supports 3 + * different memory types, _PAGE_CACHE_WB, _PAGE_CACHE_WC and + * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not + * been changed from its default (value of -1 used to denote this). + * Note we do not support _PAGE_CACHE_UC here. + * + * Caller must hold memtype_lock for atomicity. 
+ */ +static inline unsigned long get_page_memtype(struct page *pg) +{ + if (!PageUncached(pg) && !PageWC(pg)) + return -1; + else if (!PageUncached(pg) && PageWC(pg)) + return _PAGE_CACHE_WC; + else if (PageUncached(pg) && !PageWC(pg)) + return _PAGE_CACHE_UC_MINUS; + else + return _PAGE_CACHE_WB; +} + +static inline void set_page_memtype(struct page *pg, unsigned long memtype) +{ + switch (memtype) { + case _PAGE_CACHE_WC: + ClearPageUncached(pg); + SetPageWC(pg); + break; + case _PAGE_CACHE_UC_MINUS: + SetPageUncached(pg); + ClearPageWC(pg); + break; + case _PAGE_CACHE_WB: + SetPageUncached(pg); + SetPageWC(pg); + break; + default: + case -1: + ClearPageUncached(pg); + ClearPageWC(pg); + break; + } +} +#else +static inline unsigned long get_page_memtype(struct page *pg) { return -1; } +static inline void set_page_memtype(struct page *pg, unsigned long memtype) { } +#endif /* * The set_memory_* API can be used to change various attributes of a virtual diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index c90f2420f56..1a9d0f07593 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -288,63 +288,61 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end) } /* - * For RAM pages, mark the pages as non WB memory type using - * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or - * set_memory_wc() on a RAM page at a time before marking it as WB again. - * This is ok, because only one driver will be owning the page and - * doing set_memory_*() calls. + * For RAM pages, we use page flags to mark the pages with appropriate type. + * Here we do two pass: + * - Find the memtype of all the pages in the range, look for any conflicts + * - In case of no conflicts, set the new memtype for pages in the range * - * For now, we use PageNonWB to track that the RAM page is being mapped - * as non WB. In future, we will have to use one more flag - * (or some other mechanism in page_struct) to distinguish between - * UC and WC mapping. + * Caller must hold memtype_lock for atomicity. 
*/ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, unsigned long *new_type) { struct page *page; - u64 pfn, end_pfn; + u64 pfn; + + if (req_type == _PAGE_CACHE_UC) { + /* We do not support strong UC */ + WARN_ON_ONCE(1); + req_type = _PAGE_CACHE_UC_MINUS; + } for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { - page = pfn_to_page(pfn); - if (page_mapped(page) || PageNonWB(page)) - goto out; + unsigned long type; - SetPageNonWB(page); + page = pfn_to_page(pfn); + type = get_page_memtype(page); + if (type != -1) { + printk(KERN_INFO "reserve_ram_pages_type failed " + "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", + start, end, type, req_type); + if (new_type) + *new_type = type; + + return -EBUSY; + } } - return 0; -out: - end_pfn = pfn; - for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { + if (new_type) + *new_type = req_type; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - ClearPageNonWB(page); + set_page_memtype(page, req_type); } - - return -EINVAL; + return 0; } static int free_ram_pages_type(u64 start, u64 end) { struct page *page; - u64 pfn, end_pfn; + u64 pfn; for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - if (page_mapped(page) || !PageNonWB(page)) - goto out; - - ClearPageNonWB(page); + set_page_memtype(page, -1); } return 0; - -out: - end_pfn = pfn; - for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { - page = pfn_to_page(pfn); - SetPageNonWB(page); - } - return -EINVAL; } /* @@ -405,11 +403,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, *new_type = actual_type; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) - return reserve_ram_pages_type(start, end, req_type, - new_type); - else if (is_range_ram < 0) + if (is_range_ram == 1) { + + spin_lock(&memtype_lock); + err = reserve_ram_pages_type(start, end, req_type, new_type); + spin_unlock(&memtype_lock); + + return err; + } else if (is_range_ram < 0) { return -EINVAL; + } new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) @@ -505,10 +508,16 @@ int free_memtype(u64 start, u64 end) return 0; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) - return free_ram_pages_type(start, end); - else if (is_range_ram < 0) + if (is_range_ram == 1) { + + spin_lock(&memtype_lock); + err = free_ram_pages_type(start, end); + spin_unlock(&memtype_lock); + + return err; + } else if (is_range_ram < 0) { return -EINVAL; + } spin_lock(&memtype_lock); -- cgit v1.2.3 From 637b86e75f4c255a4446bc0b67ce9d914b9d2d42 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:39 -0700 Subject: x86, pat: Add lookup_memtype to get the current memtype of a paddr Add a new routine lookup_memtype() to get the current memtype based on the PAT reserves and frees. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/pat.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1a9d0f07593..71aa6f7246c 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -573,6 +573,51 @@ unlock_ret: } +/** + * lookup_memtype - Looksup the memory type for a physical address + * @paddr: physical address of which memory type needs to be looked up + * + * Only to be called when PAT is enabled + * + * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or + * _PAGE_CACHE_UC + */ +static unsigned long lookup_memtype(u64 paddr) +{ + int rettype = _PAGE_CACHE_WB; + struct memtype *entry; + + if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) + return rettype; + + if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { + struct page *page; + spin_lock(&memtype_lock); + page = pfn_to_page(paddr >> PAGE_SHIFT); + rettype = get_page_memtype(page); + spin_unlock(&memtype_lock); + /* + * -1 from get_page_memtype() implies RAM page is in its + * default state and not reserved, and hence of type WB + */ + if (rettype == -1) + rettype = _PAGE_CACHE_WB; + + return rettype; + } + + spin_lock(&memtype_lock); + + entry = memtype_rb_search(&memtype_rbroot, paddr); + if (entry != NULL) + rettype = entry->type; + else + rettype = _PAGE_CACHE_UC_MINUS; + + spin_unlock(&memtype_lock); + return rettype; +} + /** * io_reserve_memtype - Request a memory type mapping for a region of memory * @start: start (physical address) of the region -- cgit v1.2.3 From 1087637616dd5e96d834164ea462aed6159d039b Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:40 -0700 Subject: x86, pat: Lookup the protection from memtype list on vm_insert_pfn() Lookup the reserved memtype during vm_insert_pfn and use that memtype for the new mapping. This takes care of handling of the vm_insert_pfn() interface in track_pfn_vma*/untrack_pfn_vma. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 71aa6f7246c..b629f75f73d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -848,11 +848,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. - */ if (is_linear_pfn_mapping(vma)) { /* * reserve the whole chunk covered by vma. We need the @@ -880,20 +875,24 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long size) { + unsigned long flags; resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - /- * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. 
- */ if (is_linear_pfn_mapping(vma)) { /* reserve the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; return reserve_pfn_range(paddr, vma_size, prot, 0); } + if (!pat_enabled) + return 0; + + /* for vm_insert_pfn and friends, we set prot based on lookup */ + flags = lookup_memtype(pfn << PAGE_SHIFT); + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + return 0; } @@ -908,11 +907,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. - */ if (is_linear_pfn_mapping(vma)) { /* free the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; -- cgit v1.2.3 From d886c73cd4cf02a71e1650cbcb6176799d78aac1 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:41 -0700 Subject: x86, pat: Sanity check remap_pfn_range for RAM region Add sanity check for remap_pfn_range of RAM regions using lookup_memtype(). Previously, we did not have anyway to get the type of RAM memory regions as they were tracked using a single bit in page_struct (WB, nonWB). Now we can get the actual type from page struct (WB, WC, UC_MINUS) and make sure the requester gets that type. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index b629f75f73d..a6cace0694a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -783,11 +783,29 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, is_ram = pat_pagerange_is_ram(paddr, paddr + size); /* - * reserve_pfn_range() doesn't support RAM pages. Maintain the current - * behavior with RAM pages by returning success. + * reserve_pfn_range() for RAM pages. We do not refcount to keep + * track of number of mappings of RAM pages. We can assert that + * the type requested matches the type of first page in the range. */ - if (is_ram != 0) + if (is_ram) { + if (!pat_enabled) + return 0; + + flags = lookup_memtype(paddr); + if (want_flags != flags) { + printk(KERN_WARNING + "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } return 0; + } ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); if (ret) -- cgit v1.2.3 From 57844a8f8e29802f37ad9a0f94eb11d6ae358603 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 14:48:38 +0200 Subject: x86: Add x86_init infrastructure The upcoming Moorestown support brings the embedded world to x86. The setup code of x86 has already a couple of hooks which are either x86_quirks or paravirt ops. Some of those setup hooks are pretty convoluted like the timer setup and the tsc calibration code. But there are other places which could do with a cleanup. 
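The direction is the plain C ops-table pattern: a struct of function pointers preloaded with harmless defaults, which early platform code may overwrite. A minimal standalone sketch of the idea, with hypothetical names (not the kernel's actual interfaces):

	#include <stdio.h>

	struct init_ops {
		void (*probe_roms)(void);	/* default: do nothing */
	};

	static void noop(void) { }

	static void board_probe_roms(void) { printf("probing ROMs\n"); }

	static struct init_ops init_sketch = { .probe_roms = noop };

	int main(void)
	{
		/* early platform code overrides the default... */
		init_sketch.probe_roms = board_probe_roms;
		/* ...and generic code always calls through the table, no #ifdef */
		init_sketch.probe_roms();
		return 0;
	}
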
Instead of having inline functions/macros which are modified at compile time I decided to introduce x86_init ops which are unconditional in the code and make it clear that they can be changed either during compile time or in the early boot process. The function pointers are initialized by default functions which can be noops so that the pointer can be called unconditionally in the most cases. This also allows us to remove 32bit/64bit, paravirt and other #ifdeffery. paravirt guests are just a hardware platform in the setup code, so we should treat them as such and not hide all behind multiple layers of indirection and compile time dependencies. It's more obvious that x86_init.timers.timer_init() is a function pointer than the late_time_init = choose_time_init() obscurity. It's also way simpler to grep for x86_init.timers.timer_init and find all the places which modify that function pointer instead of analyzing weak functions, macros and paravirt indirections. Note. This is not a general paravirt_ops replacement. It just will move setup related hooks which are potentially useful for other platform setup purposes as well out of the paravirt domain. Add the base infrastructure without any functionality. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 2 ++ arch/x86/include/asm/x86_init.h | 15 +++++++++++++++ arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/x86_init.c | 17 +++++++++++++++++ 4 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/x86_init.h create mode 100644 arch/x86/kernel/x86_init.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 4093d1ed6db..741e2956f3c 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -7,6 +7,8 @@ #ifndef __ASSEMBLY__ +#include + /* * Any setup quirks to be performed? */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h new file mode 100644 index 00000000000..14d11071675 --- /dev/null +++ b/arch/x86/include/asm/x86_init.h @@ -0,0 +1,15 @@ +#ifndef _ASM_X86_PLATFORM_H +#define _ASM_X86_PLATFORM_H + +/** + * struct x86_init_ops - functions for platform specific setup + * + */ +struct x86_init_ops { +}; + +extern struct x86_init_ops x86_init; + +extern void x86_init_noop(void); + +#endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7..313ed6fca9b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -32,7 +32,7 @@ GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y += setup.o i8259.o irqinit.o +obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c new file mode 100644 index 00000000000..82d510c9c99 --- /dev/null +++ b/arch/x86/kernel/x86_init.c @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2009 Thomas Gleixner + * + * For licencing details see kernel-base/COPYING + */ +#include + +#include + +void __cpuinit x86_init_noop(void) { } + +/* + * The platform setup functions are preset with the default functions + * for standard PC hardware. 
+ */ +struct __initdata x86_init_ops x86_init = { +}; -- cgit v1.2.3 From f7cf5a5b8c0e59eac8d30b62271cb0fa52e53ebc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 14:43:56 +0200 Subject: x86: Add probe_roms to x86_init probe_roms is only used on 32bit. Add it to the x86_init ops and remove the #ifdefs. Default initializer is x86_init_noop() which is overridden in the 32bit boot code. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/x86_init.h | 10 ++++++++++ arch/x86/kernel/head32.c | 3 +++ arch/x86/kernel/setup.c | 4 +--- arch/x86/kernel/x86_init.c | 4 ++++ 4 files changed, 18 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 14d11071675..75e9e68d635 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,11 +1,21 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +/** + * struct x86_init_resources - platform specific resource related ops + * @probe_roms: probe BIOS roms + * + */ +struct x86_init_resources { + void (*probe_roms)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * */ struct x86_init_ops { + struct x86_init_resources resources; }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3f8579f8d42..4049353152c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,6 +29,9 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif + /* Initilize 32bit specific setup functions */ + x86_init.resources.probe_roms = probe_roms; + reserve_ebda_region(); /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63f32d220ef..5796eb158d4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -835,9 +835,7 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor(&boot_cpu_data); -#ifdef CONFIG_X86_32 - probe_roms(); -#endif + x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ insert_resource(&iomem_resource, &code_resource); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 82d510c9c99..88883f8006c 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,4 +14,8 @@ void __cpuinit x86_init_noop(void) { } * for standard PC hardware. */ struct __initdata x86_init_ops x86_init = { + + .resources = { + .probe_roms = x86_init_noop, + }, }; -- cgit v1.2.3 From 8fee697d990c54976c8dc167270633299e2515d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 14:55:50 +0200 Subject: x86: Add request_standard_resources to x86_init The 32bit and the 64bit code are slightly different in the reservation of standard resources. Also the upcoming Moorestown support needs its own version of that. Add it to x86_init_ops and initialize it with the 64bit default. 32bit overrides it in early boot. 
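A board with nonstandard fixed resources would then supply its own variant along these lines; a hypothetical sketch (the board name, the flash window address and the myboard_* helpers are invented for illustration), mirroring the i386_reserve_resources() override in the diff:

	static struct resource myboard_flash_resource = {
		.name	= "Board flash",
		.start	= 0xfff00000,	/* assumed window, illustration only */
		.end	= 0xffffffff,
		.flags	= IORESOURCE_BUSY | IORESOURCE_MEM,
	};

	static void __init myboard_reserve_resources(void)
	{
		request_resource(&iomem_resource, &myboard_flash_resource);
		reserve_standard_io_resources();
	}

	static void __init myboard_early_setup(void)
	{
		x86_init.resources.reserve_resources = myboard_reserve_resources;
	}
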
Now moorestown can add its own override w/o sprinkling the code with more #ifdefs Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 3 +++ arch/x86/include/asm/x86_init.h | 3 +++ arch/x86/kernel/head32.c | 1 + arch/x86/kernel/setup.c | 28 ++++++++++++++++------------ arch/x86/kernel/x86_init.c | 3 ++- 5 files changed, 25 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 741e2956f3c..19769ac6061 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -88,6 +88,9 @@ extern unsigned long saved_video_mode; #define paravirt_post_allocator_init() do {} while (0) #endif +extern void reserve_standard_io_resources(void); +extern void i386_reserve_resources(void); + #ifndef _SETUP /* diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 75e9e68d635..d0d9be25ed9 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -4,10 +4,13 @@ /** * struct x86_init_resources - platform specific resource related ops * @probe_roms: probe BIOS roms + * @reserve_resources: reserve the standard resources for the + * platform * */ struct x86_init_resources { void (*probe_roms)(void); + void (*reserve_resources)(void); }; /** diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 4049353152c..d91c37c0206 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,6 +31,7 @@ void __init i386_start_kernel(void) #endif /* Initilize 32bit specific setup functions */ x86_init.resources.probe_roms = probe_roms; + x86_init.resources.reserve_resources = i386_reserve_resources; reserve_ebda_region(); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5796eb158d4..c2a8090e831 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -171,13 +171,6 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; /* common cpu data for all cpus */ @@ -605,7 +598,7 @@ static struct resource standard_io_resources[] = { .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -static void __init reserve_standard_io_resources(void) +void __init reserve_standard_io_resources(void) { int i; @@ -1013,10 +1006,7 @@ void __init setup_arch(char **cmdline_p) e820_reserve_resources(); e820_mark_nosave_regions(max_low_pfn); -#ifdef CONFIG_X86_32 - request_resource(&iomem_resource, &video_ram_resource); -#endif - reserve_standard_io_resources(); + x86_init.resources.reserve_resources(); e820_setup_gap(); @@ -1102,4 +1092,18 @@ void __init x86_quirk_time_init(void) irq0.mask = cpumask_of_cpu(0); setup_irq(0, &irq0); } + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +void __init i386_reserve_resources(void) +{ + request_resource(&iomem_resource, &video_ram_resource); + reserve_standard_io_resources(); } + #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 88883f8006c..68c093b67ec 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -5,7 +5,7 @@ */ #include -#include +#include void __cpuinit x86_init_noop(void) { } @@ -17,5 +17,6 @@ 
struct __initdata x86_init_ops x86_init = { .resources = { .probe_roms = x86_init_noop, + .reserve_resources = reserve_standard_io_resources, }, }; -- cgit v1.2.3 From 816c25e7d4fb6fd40022a376e8b7f45b1edf5a89 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 14:36:27 +0200 Subject: x86: Add reserve_ebda_region to x86_init_ops reserve_ebda_region needs to be called before start_kernel. Moorestown needs to override it. Make it a x86_init_ops function and initialize it with the default reserve_ebda_region. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/head32.c | 4 ++-- arch/x86/kernel/head64.c | 3 +-- arch/x86/kernel/x86_init.c | 2 ++ 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index d0d9be25ed9..8a971cb3dd3 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -6,11 +6,13 @@ * @probe_roms: probe BIOS roms * @reserve_resources: reserve the standard resources for the * platform + * @reserve_ebda_region: reserve the extended bios data area * */ struct x86_init_resources { void (*probe_roms)(void); void (*reserve_resources)(void); + void (*reserve_ebda_region)(void); }; /** diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d91c37c0206..921a23b6c14 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include void __init i386_start_kernel(void) @@ -33,7 +33,7 @@ void __init i386_start_kernel(void) x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; - reserve_ebda_region(); + x86_init.resources.reserve_ebda_region(); /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 70eaa852c73..cead8149c3d 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -23,7 +23,6 @@ #include #include #include -#include #include static void __init zap_identity_mappings(void) @@ -112,7 +111,7 @@ void __init x86_64_start_reservations(char *real_mode_data) } #endif - reserve_ebda_region(); + x86_init.resources.reserve_ebda_region(); /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 68c093b67ec..1fff49a6858 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -5,6 +5,7 @@ */ #include +#include #include void __cpuinit x86_init_noop(void) { } @@ -18,5 +19,6 @@ struct __initdata x86_init_ops x86_init = { .resources = { .probe_roms = x86_init_noop, .reserve_resources = reserve_standard_io_resources, + .reserve_ebda_region = reserve_ebda_region, }, }; -- cgit v1.2.3 From 6b18ae3e2ff62daa9f181401759161dd8de0aadf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 10:19:54 +0200 Subject: x86: Move memory_setup to x86_init_ops memory_setup is overridden by x86_quirks and by paravirts with weak functions and quirks. Unify the whole mess and make it an unconditional x86_init_ops function which defaults to the standard function and can be overridden by the early platform code. 
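For a platform that knows its memory layout up front, an override then boils down to feeding the generic e820 code a fixed map and returning its origin string. A hypothetical sketch (the platform name and the 128M size are assumptions; lguest and Xen below install their real counterparts the same way):

	static char * __init myplat_memory_setup(void)
	{
		/* assumed fixed map: 128M of RAM starting at 0 */
		e820_add_region(0, 128 << 20, E820_RAM);
		return "MYPLAT fixed map";
	}

	static void __init myplat_early_setup(void)
	{
		x86_init.resources.memory_setup = myplat_memory_setup;
	}
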
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/e820.h | 2 -- arch/x86/include/asm/paravirt_types.h | 1 - arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/numaq_32.c | 1 - arch/x86/kernel/e820.c | 19 +------------------ arch/x86/kernel/paravirt.c | 6 ------ arch/x86/kernel/visws_quirks.c | 3 ++- arch/x86/kernel/x86_init.c | 2 ++ arch/x86/lguest/boot.c | 3 ++- arch/x86/xen/enlighten.c | 3 ++- 11 files changed, 11 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 7ecba4d8508..40b4e614fe7 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -126,8 +126,6 @@ extern void e820_reserve_resources(void); extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); -extern char *machine_specific_memory_setup(void); -extern char *memory_setup(void); #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 2b3371bae29..6d668968b6b 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -81,7 +81,6 @@ struct pv_init_ops { /* Basic arch-specific setup */ void (*arch_setup)(void); - char *(*memory_setup)(void); void (*post_allocator_init)(void); /* Print a banner to identify the environment */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 19769ac6061..9cba9d6ca88 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -22,7 +22,6 @@ struct x86_quirks { int (*arch_pre_intr_init)(void); int (*arch_intr_init)(void); int (*arch_trap_init)(void); - char * (*arch_memory_setup)(void); int (*mach_get_smp_config)(unsigned int early); int (*mach_find_smp_config)(unsigned int reserve); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 8a971cb3dd3..6c084f2a6c3 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -7,12 +7,14 @@ * @reserve_resources: reserve the standard resources for the * platform * @reserve_ebda_region: reserve the extended bios data area + * @memory_setup: platform specific memory setup * */ struct x86_init_resources { void (*probe_roms)(void); void (*reserve_resources)(void); void (*reserve_ebda_region)(void); + char *(*memory_setup)(void); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index ca96e68f0d2..403c062f69e 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -260,7 +260,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, .arch_pre_intr_init = NULL, - .arch_memory_setup = NULL, .arch_intr_init = NULL, .arch_trap_init = NULL, .mach_get_smp_config = NULL, diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 5cb5725b2ba..0d804b907e8 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1455,28 +1455,11 @@ char *__init default_machine_specific_memory_setup(void) return who; } -char *__init __attribute__((weak)) machine_specific_memory_setup(void) -{ - if (x86_quirks->arch_memory_setup) { - char *who = x86_quirks->arch_memory_setup(); - - if (who) - return who; - } - return default_machine_specific_memory_setup(); -} - -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -char * __init __attribute__((weak)) memory_setup(void) -{ - 
return machine_specific_memory_setup(); -} - void __init setup_memory_map(void) { char *who; - who = memory_setup(); + who = x86_init.resources.memory_setup(); memcpy(&e820_saved, &e820, sizeof(struct e820map)); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map(who); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 70ec9b951d7..532c9a2626c 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -60,11 +60,6 @@ static void __init default_banner(void) pv_info.name); } -char *memory_setup(void) -{ - return pv_init_ops.memory_setup(); -} - /* Simple instruction patching code. */ #define DEF_NATIVE(ops, name, code) \ extern const char start_##ops##_##name[], end_##ops##_##name[]; \ @@ -322,7 +317,6 @@ struct pv_init_ops pv_init_ops = { .patch = native_patch, .banner = default_banner, .arch_setup = paravirt_nop, - .memory_setup = machine_specific_memory_setup, }; struct pv_time_ops pv_time_ops = { diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 31ffc24eec4..97c670df1ae 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -239,7 +239,6 @@ static int visws_trap_init(void); static struct x86_quirks visws_x86_quirks __initdata = { .arch_time_init = visws_time_init, .arch_pre_intr_init = visws_pre_intr_init, - .arch_memory_setup = visws_memory_setup, .arch_intr_init = NULL, .arch_trap_init = visws_trap_init, .mach_get_smp_config = visws_get_smp_config, @@ -263,6 +262,8 @@ void __init visws_early_detect(void) */ x86_quirks = &visws_x86_quirks; + x86_init.resources.memory_setup = visws_memory_setup; + /* * Install reboot quirks: */ diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 1fff49a6858..1965bff3489 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -7,6 +7,7 @@ #include #include +#include void __cpuinit x86_init_noop(void) { } @@ -20,5 +21,6 @@ struct __initdata x86_init_ops x86_init = { .probe_roms = x86_init_noop, .reserve_resources = reserve_standard_io_resources, .reserve_ebda_region = reserve_ebda_region, + .memory_setup = default_machine_specific_memory_setup, }, }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index d677fa9ca65..11445c176de 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1270,7 +1270,6 @@ __init void lguest_init(void) pv_irq_ops.safe_halt = lguest_safe_halt; /* Setup operations */ - pv_init_ops.memory_setup = lguest_memory_setup; pv_init_ops.patch = lguest_patch; /* Intercepts of various CPU instructions */ @@ -1325,6 +1324,8 @@ __init void lguest_init(void) pv_time_ops.time_init = lguest_time_init; pv_time_ops.get_tsc_khz = lguest_tsc_khz; + x86_init.resources.memory_setup = lguest_memory_setup; + /* * Now is a good time to look at the implementations of these functions * before returning to the rest of lguest_init(). diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e90540a46a0..50b20c64f0b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -841,7 +841,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, .banner = xen_banner, - .memory_setup = xen_memory_setup, .arch_setup = xen_arch_setup, .post_allocator_init = xen_post_allocator_init, }; @@ -982,6 +981,8 @@ asmlinkage void __init xen_start_kernel(void) pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; + x86_init.resources.memory_setup = xen_memory_setup; + #ifdef CONFIG_X86_64 /* * Setup percpu state. 
We only need to do this for 64-bit -- cgit v1.2.3 From f4848472cd99487e182b64fb2a5d0e4fedbe86ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 12:05:01 +0200 Subject: x86: Sanitize smp_record and move it to x86_init_ops The x86 quirkification introduced an extra ugly hackery with a variable pointer in the mpparse code. If the pointer is initialized then it is dereferenced and the variable set to 0 or incremented. Create a x86_init_ops function and let the affected numaq code hold the function. Default init is a setup noop. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 12 +++++++++++- arch/x86/kernel/apic/numaq_32.c | 19 ++++++++++++++++--- arch/x86/kernel/mpparse.c | 6 ++---- arch/x86/kernel/x86_init.c | 5 +++++ 5 files changed, 34 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 9cba9d6ca88..bbf2dfd59b4 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -25,7 +25,6 @@ struct x86_quirks { int (*mach_get_smp_config)(unsigned int early); int (*mach_find_smp_config)(unsigned int reserve); - int *mpc_record; int (*mpc_apic_id)(struct mpc_cpu *m); void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); void (*mpc_oem_pci_bus)(struct mpc_bus *m); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 6c084f2a6c3..10b297b1881 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,6 +1,14 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +/** + * struct x86_init_mpparse - platform specific mpparse ops + * @mpc_record: platform specific mpc record accounting + */ +struct x86_init_mpparse { + void (*mpc_record)(unsigned int mode); +}; + /** * struct x86_init_resources - platform specific resource related ops * @probe_roms: probe BIOS roms @@ -22,11 +30,13 @@ struct x86_init_resources { * */ struct x86_init_ops { - struct x86_init_resources resources; + struct x86_init_resources resources; + struct x86_init_mpparse mpparse; }; extern struct x86_init_ops x86_init; extern void x86_init_noop(void); +extern void x86_init_uint_noop(unsigned int unused); #endif diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 403c062f69e..b5f0b1dc7dd 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -66,7 +66,6 @@ struct mpc_trans { unsigned short trans_reserved; }; -/* x86_quirks member */ static int mpc_record; static struct mpc_trans *translation_table[MAX_MPC_ENTRY]; @@ -177,6 +176,19 @@ static void mpc_oem_pci_bus(struct mpc_bus *m) quad_local_to_mp_bus_id[quad][local] = m->busid; } +/* + * Called from mpparse code. 
+ * mode = 0: prescan + * mode = 1: one mpc entry scanned + */ +static void numaq_mpc_record(unsigned int mode) +{ + if (!mode) + mpc_record = 0; + else + mpc_record++; +} + static void __init MP_translation_info(struct mpc_trans *m) { printk(KERN_INFO @@ -264,7 +276,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_trap_init = NULL, .mach_get_smp_config = NULL, .mach_find_smp_config = NULL, - .mpc_record = &mpc_record, .mpc_apic_id = mpc_apic_id, .mpc_oem_bus_info = mpc_oem_bus_info, .mpc_oem_pci_bus = mpc_oem_pci_bus, @@ -285,8 +296,10 @@ static __init void early_check_numaq(void) if (smp_found_config) early_get_smp_config(); - if (found_numaq) + if (found_numaq) { x86_quirks = &numaq_x86_quirks; + x86_init.mpparse.mpc_record = numaq_mpc_record; + } } int __init get_memcfg_numaq(void) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 651c93b2886..b2179fdf0ff 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -320,8 +320,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) /* * Now process the configuration blocks. */ - if (x86_quirks->mpc_record) - *x86_quirks->mpc_record = 0; + x86_init.mpparse.mpc_record(0); while (count < mpc->length) { switch (*mpt) { @@ -353,8 +352,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) count = mpc->length; break; } - if (x86_quirks->mpc_record) - (*x86_quirks->mpc_record)++; + x86_init.mpparse.mpc_record(1); } #ifdef CONFIG_X86_BIGSMP diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 1965bff3489..83bd5db376b 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -10,6 +10,7 @@ #include void __cpuinit x86_init_noop(void) { } +void __init x86_init_uint_noop(unsigned int unused) { } /* * The platform setup functions are preset with the default functions @@ -23,4 +24,8 @@ struct __initdata x86_init_ops x86_init = { .reserve_ebda_region = reserve_ebda_region, .memory_setup = default_machine_specific_memory_setup, }, + + .mpparse = { + .mpc_record = x86_init_uint_noop, + }, }; -- cgit v1.2.3 From de93410310952fb7b705f784ef22493c8362dbe8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 09:27:29 +0200 Subject: x86: Move ioapic_ids_setup to x86_init_ops 32bit and also the numaq code have special requirements on the ioapic_id setup. 
Convert it to a x86_init_ops function and get rid of the quirks and #ifdefs Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 3 ++- arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/io_apic.c | 11 ++++------- arch/x86/kernel/apic/numaq_32.c | 8 +------- arch/x86/kernel/head32.c | 3 +++ arch/x86/kernel/x86_init.c | 1 + 7 files changed, 13 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 330ee807f89..2b8aeb89933 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -177,12 +177,13 @@ extern int setup_ioapic_entry(int apic, int irq, int polarity, int vector, int pin); extern void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e); +extern void setup_ioapic_ids_from_mpc(void); #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 +#define setup_ioapic_ids_from_mpc x86_init_noop static const int timer_through_8259 = 0; static inline void ioapic_init_mappings(void) { } static inline void ioapic_insert_resources(void) { } - static inline void probe_nr_irqs_gsi(void) { } #endif diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index bbf2dfd59b4..cc8b4b0550e 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -30,7 +30,6 @@ struct x86_quirks { void (*mpc_oem_pci_bus)(struct mpc_bus *m); void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable, unsigned short oemsize); - int (*setup_ioapic_ids)(void); }; extern void x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 10b297b1881..65985730b37 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -4,9 +4,11 @@ /** * struct x86_init_mpparse - platform specific mpparse ops * @mpc_record: platform specific mpc record accounting + * @setup_ioapic_ids: platform specific ioapic id override */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); + void (*setup_ioapic_ids)(void); }; /** diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d2ed6c5ddc8..5f4687187ce 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2014,7 +2014,7 @@ void disable_IO_APIC(void) * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 */ -static void __init setup_ioapic_ids_from_mpc(void) +void __init setup_ioapic_ids_from_mpc(void) { union IO_APIC_reg_00 reg_00; physid_mask_t phys_id_present_map; @@ -2023,9 +2023,8 @@ static void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; - if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) + if (acpi_ioapic) return; - /* * Don't check I/O APIC IDs for xAPIC systems. They have * no meaning without the serial APIC bus. @@ -3061,10 +3060,8 @@ void __init setup_IO_APIC(void) /* * Set up IO-APIC IRQ routing. 
*/ -#ifdef CONFIG_X86_32 - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); -#endif + x86_init.mpparse.setup_ioapic_ids(); + sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index b5f0b1dc7dd..f3717659265 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -262,12 +262,6 @@ static void __init } } -static int __init numaq_setup_ioapic_ids(void) -{ - /* so can skip it */ - return 1; -} - static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, @@ -280,7 +274,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mpc_oem_bus_info = mpc_oem_bus_info, .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, - .setup_ioapic_ids = numaq_setup_ioapic_ids, }; static __init void early_check_numaq(void) @@ -299,6 +292,7 @@ static __init void early_check_numaq(void) if (found_numaq) { x86_quirks = &numaq_x86_quirks; x86_init.mpparse.mpc_record = numaq_mpc_record; + x86_init.mpparse.setup_ioapic_ids = x86_init_noop; } } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 921a23b6c14..a21398fac4f 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include void __init i386_start_kernel(void) { @@ -32,6 +34,7 @@ void __init i386_start_kernel(void) /* Initilize 32bit specific setup functions */ x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; x86_init.resources.reserve_ebda_region(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 83bd5db376b..f4a32b3ab02 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -27,5 +27,6 @@ struct __initdata x86_init_ops x86_init = { .mpparse = { .mpc_record = x86_init_uint_noop, + .setup_ioapic_ids = x86_init_noop, }, }; -- cgit v1.2.3 From fd6c6661492226bb82f422157c535ac573cbecbd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 10:41:58 +0200 Subject: x86: Move mpc_apic_id to x86_init_ops The mpc_apic_id setup is handled by a x86_quirk. Make it a x86_init_ops function with a default implementation. 
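The default implementation just hands back the apicid field of the MP table entry; a platform with clustered APICs could translate it instead. A hypothetical sketch (myplat_cluster() is an invented board helper, loosely modeled on the numaq quad translation):

	static int __init myplat_mpc_apic_id(struct mpc_cpu *m)
	{
		/* fold an assumed board specific cluster id into the apic id */
		return (m->apicid & 0x0f) | (myplat_cluster(m) << 4);
	}

	static void __init myplat_early_setup(void)
	{
		x86_init.mpparse.mpc_apic_id = myplat_mpc_apic_id;
	}
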
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 2 ++ arch/x86/include/asm/setup.h | 2 -- arch/x86/include/asm/x86_init.h | 4 ++++ arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/mpparse.c | 10 ++++++---- arch/x86/kernel/x86_init.c | 2 ++ 6 files changed, 15 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index e2a1bb6d71e..03c6a92bfd4 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -62,10 +62,12 @@ extern void get_smp_config(void); extern void find_smp_config(void); extern void early_reserve_e820_mpc_new(void); extern int enable_update_mptable; +extern int default_mpc_apic_id(struct mpc_cpu *m); #else static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } #define enable_update_mptable 0 +#define default_mpc_apic_id NULL #endif void __cpuinit generic_processor_info(int apicid, int version); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index cc8b4b0550e..7c7f44f3e4d 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -12,7 +12,6 @@ /* * Any setup quirks to be performed? */ -struct mpc_cpu; struct mpc_bus; struct mpc_oemtable; @@ -25,7 +24,6 @@ struct x86_quirks { int (*mach_get_smp_config)(unsigned int early); int (*mach_find_smp_config)(unsigned int reserve); - int (*mpc_apic_id)(struct mpc_cpu *m); void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); void (*mpc_oem_pci_bus)(struct mpc_bus *m); void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable, diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 65985730b37..f2be2a78018 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,14 +1,18 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +struct mpc_cpu; + /** * struct x86_init_mpparse - platform specific mpparse ops * @mpc_record: platform specific mpc record accounting * @setup_ioapic_ids: platform specific ioapic id override + * @mpc_apic_id: platform specific mpc apic id assignment */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); void (*setup_ioapic_ids)(void); + int (*mpc_apic_id)(struct mpc_cpu *m); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index f3717659265..222413f7e79 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -270,7 +270,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_trap_init = NULL, .mach_get_smp_config = NULL, .mach_find_smp_config = NULL, - .mpc_apic_id = mpc_apic_id, .mpc_oem_bus_info = mpc_oem_bus_info, .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, @@ -293,6 +292,7 @@ static __init void early_check_numaq(void) x86_quirks = &numaq_x86_quirks; x86_init.mpparse.mpc_record = numaq_mpc_record; x86_init.mpparse.setup_ioapic_ids = x86_init_noop; + x86_init.mpparse.mpc_apic_id = mpc_apic_id; } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b2179fdf0ff..04560860a72 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -45,6 +45,11 @@ static int __init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } +int __init default_mpc_apic_id(struct mpc_cpu *m) +{ + return m->apicid; +} + static void __init MP_processor_info(struct mpc_cpu *m) { int apicid; @@ -55,10 +60,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - if 
(x86_quirks->mpc_apic_id) - apicid = x86_quirks->mpc_apic_id(m); - else - apicid = m->apicid; + apicid = x86_init.mpparse.mpc_apic_id(m); if (m->cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index f4a32b3ab02..08749f2612f 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -28,5 +29,6 @@ struct __initdata x86_init_ops x86_init = { .mpparse = { .mpc_record = x86_init_uint_noop, .setup_ioapic_ids = x86_init_noop, + .mpc_apic_id = default_mpc_apic_id, }, }; -- cgit v1.2.3 From 72302142e165313ee58af81bd76708c12b58d7ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 12:18:32 +0200 Subject: x86: Move smp_read_mpc_oem to x86_init_ops. Move smp_read_mpc_oem from quirks to x86_init. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 2 ++ arch/x86/include/asm/setup.h | 3 --- arch/x86/include/asm/x86_init.h | 3 +++ arch/x86/kernel/apic/numaq_32.c | 6 +++--- arch/x86/kernel/mpparse.c | 8 ++++---- arch/x86/kernel/x86_init.c | 1 + 6 files changed, 13 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 03c6a92bfd4..5de8e92be2d 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -63,11 +63,13 @@ extern void find_smp_config(void); extern void early_reserve_e820_mpc_new(void); extern int enable_update_mptable; extern int default_mpc_apic_id(struct mpc_cpu *m); +extern void default_smp_read_mpc_oem(struct mpc_table *mpc); #else static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } #define enable_update_mptable 0 #define default_mpc_apic_id NULL +#define default_smp_read_mpc_oem NULL #endif void __cpuinit generic_processor_info(int apicid, int version); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 7c7f44f3e4d..adb5d44d990 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -13,7 +13,6 @@ * Any setup quirks to be performed? 
*/ struct mpc_bus; -struct mpc_oemtable; struct x86_quirks { int (*arch_pre_time_init)(void); @@ -26,8 +25,6 @@ struct x86_quirks { void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); void (*mpc_oem_pci_bus)(struct mpc_bus *m); - void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable, - unsigned short oemsize); }; extern void x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f2be2a78018..fc0eef2f5fd 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -2,17 +2,20 @@ #define _ASM_X86_PLATFORM_H struct mpc_cpu; +struct mpc_table; /** * struct x86_init_mpparse - platform specific mpparse ops * @mpc_record: platform specific mpc record accounting * @setup_ioapic_ids: platform specific ioapic id override * @mpc_apic_id: platform specific mpc apic id assignment + * @smp_read_mpc_oem: platform specific oem mpc table setup */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); void (*setup_ioapic_ids)(void); int (*mpc_apic_id)(struct mpc_cpu *m); + void (*smp_read_mpc_oem)(struct mpc_table *mpc); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 222413f7e79..1bd3b0ed240 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -218,9 +218,9 @@ static int __init mpf_checksum(unsigned char *mp, int len) /* * Read/parse the MPC oem tables */ -static void __init - smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize) +static void __init smp_read_mpc_oem(struct mpc_table *mpc) { + struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr; int count = sizeof(*oemtable); /* the header size */ unsigned char *oemptr = ((unsigned char *)oemtable) + count; @@ -272,7 +272,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mach_find_smp_config = NULL, .mpc_oem_bus_info = mpc_oem_bus_info, .mpc_oem_pci_bus = mpc_oem_pci_bus, - .smp_read_mpc_oem = smp_read_mpc_oem, }; static __init void early_check_numaq(void) @@ -293,6 +292,7 @@ static __init void early_check_numaq(void) x86_init.mpparse.mpc_record = numaq_mpc_record; x86_init.mpparse.setup_ioapic_ids = x86_init_noop; x86_init.mpparse.mpc_apic_id = mpc_apic_id; + x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 04560860a72..45abdf63edc 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -293,6 +293,8 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) 1, mpc, mpc->length, 1); } +void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -314,10 +316,8 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (early) return 1; - if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) { - struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr; - x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize); - } + if (mpc->oemptr) + x86_init.mpparse.smp_read_mpc_oem(mpc); /* * Now process the configuration blocks. 
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 08749f2612f..fb5d93c077d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -30,5 +30,6 @@ struct __initdata x86_init_ops x86_init = { .mpc_record = x86_init_uint_noop, .setup_ioapic_ids = x86_init_noop, .mpc_apic_id = default_mpc_apic_id, + .smp_read_mpc_oem = default_smp_read_mpc_oem, }, }; -- cgit v1.2.3 From 52fdb5684660f9fd7129f7bbbe279a02893bacb8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 12:45:33 +0200 Subject: x86: Move mpc_oem_pci_bus to x86_init_ops Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 3 +++ arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/mpparse.c | 4 ++-- 4 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index adb5d44d990..fd2267baba1 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -24,7 +24,6 @@ struct x86_quirks { int (*mach_find_smp_config)(unsigned int reserve); void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); - void (*mpc_oem_pci_bus)(struct mpc_bus *m); }; extern void x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index fc0eef2f5fd..404e2d2b06d 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +struct mpc_bus; struct mpc_cpu; struct mpc_table; @@ -10,12 +11,14 @@ struct mpc_table; * @setup_ioapic_ids: platform specific ioapic id override * @mpc_apic_id: platform specific mpc apic id assignment * @smp_read_mpc_oem: platform specific oem mpc table setup + * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL) */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); void (*setup_ioapic_ids)(void); int (*mpc_apic_id)(struct mpc_cpu *m); void (*smp_read_mpc_oem)(struct mpc_table *mpc); + void (*mpc_oem_pci_bus)(struct mpc_bus *m); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 1bd3b0ed240..feebe8eed7d 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -271,7 +271,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mach_get_smp_config = NULL, .mach_find_smp_config = NULL, .mpc_oem_bus_info = mpc_oem_bus_info, - .mpc_oem_pci_bus = mpc_oem_pci_bus, }; static __init void early_check_numaq(void) @@ -293,6 +292,7 @@ static __init void early_check_numaq(void) x86_init.mpparse.setup_ioapic_ids = x86_init_noop; x86_init.mpparse.mpc_apic_id = mpc_apic_id; x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; + x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 45abdf63edc..72e1140723c 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -98,8 +98,8 @@ static void __init MP_bus_info(struct mpc_bus *m) mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { - if (x86_quirks->mpc_oem_pci_bus) - x86_quirks->mpc_oem_pci_bus(m); + if (x86_init.mpparse.mpc_oem_pci_bus) + x86_init.mpparse.mpc_oem_pci_bus(m); clear_bit(m->busid, mp_bus_not_pci); #if defined(CONFIG_EISA) || defined(CONFIG_MCA) -- cgit v1.2.3 From 90e1c6969d8711edb888a00ec54c74370f125c8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 12:34:47 
+0200 Subject: x86: Move oem_bus_info to x86_init_ops Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 6 ++++++ arch/x86/include/asm/setup.h | 3 --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/mpparse.c | 14 ++++++++------ arch/x86/kernel/x86_init.c | 1 + 6 files changed, 18 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 5de8e92be2d..e3c579efde7 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -64,12 +64,18 @@ extern void early_reserve_e820_mpc_new(void); extern int enable_update_mptable; extern int default_mpc_apic_id(struct mpc_cpu *m); extern void default_smp_read_mpc_oem(struct mpc_table *mpc); +# ifdef CONFIG_X86_IO_APIC +extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str); +# else +# define default_mpc_oem_bus_info NULL +# endif #else static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } #define enable_update_mptable 0 #define default_mpc_apic_id NULL #define default_smp_read_mpc_oem NULL +#define default_mpc_oem_bus_info NULL #endif void __cpuinit generic_processor_info(int apicid, int version); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index fd2267baba1..6121a8ac7b0 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -12,7 +12,6 @@ /* * Any setup quirks to be performed? */ -struct mpc_bus; struct x86_quirks { int (*arch_pre_time_init)(void); @@ -22,8 +21,6 @@ struct x86_quirks { int (*arch_trap_init)(void); int (*mach_get_smp_config)(unsigned int early); int (*mach_find_smp_config)(unsigned int reserve); - - void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); }; extern void x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 404e2d2b06d..2833a873a90 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -12,6 +12,7 @@ struct mpc_table; * @mpc_apic_id: platform specific mpc apic id assignment * @smp_read_mpc_oem: platform specific oem mpc table setup * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL) + * @mpc_oem_bus_info: platform specific mpc bus info */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); @@ -19,6 +20,7 @@ struct x86_init_mpparse { int (*mpc_apic_id)(struct mpc_cpu *m); void (*smp_read_mpc_oem)(struct mpc_table *mpc); void (*mpc_oem_pci_bus)(struct mpc_bus *m); + void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index feebe8eed7d..700273dca68 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -270,7 +270,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_trap_init = NULL, .mach_get_smp_config = NULL, .mach_find_smp_config = NULL, - .mpc_oem_bus_info = mpc_oem_bus_info, }; static __init void early_check_numaq(void) @@ -293,6 +292,7 @@ static __init void early_check_numaq(void) x86_init.mpparse.mpc_apic_id = mpc_apic_id; x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; + x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 72e1140723c..a42f23f1dc7 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -72,16 +72,18 @@ static void __init 
MP_processor_info(struct mpc_cpu *m) } #ifdef CONFIG_X86_IO_APIC -static void __init MP_bus_info(struct mpc_bus *m) +void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) { - char str[7]; memcpy(str, m->bustype, 6); str[6] = 0; + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} + +static void __init MP_bus_info(struct mpc_bus *m) +{ + char str[7]; - if (x86_quirks->mpc_oem_bus_info) - x86_quirks->mpc_oem_bus_info(m, str); - else - apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); + x86_init.mpparse.mpc_oem_bus_info(m, str); #if MAX_MP_BUSSES < 256 if (m->busid >= MAX_MP_BUSSES) { diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index fb5d93c077d..27685edc546 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -31,5 +31,6 @@ struct __initdata x86_init_ops x86_init = { .setup_ioapic_ids = x86_init_noop, .mpc_apic_id = default_mpc_apic_id, .smp_read_mpc_oem = default_smp_read_mpc_oem, + .mpc_oem_bus_info = default_mpc_oem_bus_info, }, }; -- cgit v1.2.3 From 7285dd7fd375763bfb8ab1ac9cf3f1206f503c16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 28 Aug 2009 20:25:24 +0200 Subject: clocksource: Resolve cpu hotplug dead lock with TSC unstable Martin Schwidefsky analyzed it: To register a clocksource the clocksource_mutex is acquired and if necessary timekeeping_notify is called to install the clocksource as the timekeeper clock. timekeeping_notify uses stop_machine which needs to take cpu_add_remove_lock mutex. Starting a new cpu is done with the cpu_add_remove_lock mutex held. native_cpu_up checks the tsc of the new cpu and if the tsc is no good clocksource_change_rating is called. Which needs the clocksource_mutex and the deadlock is complete. The solution is to replace the TSC via the clocksource watchdog mechanism. Mark the TSC as unstable and schedule the watchdog work so it gets removed in the watchdog thread context. 
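The fix is an instance of a general escape from lock-order inversion: a context that already holds lock A must not acquire lock B when another path takes B before A, so it records the state change under a cheap spinlock and leaves the B-protected work to a worker that holds nothing else. A rough user-space model of that deferral follows; the names merely stand in for clocksource_mutex, watchdog_lock and the watchdog work, and the polling loop stands in for the kernel's work queue.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t heavy_lock = PTHREAD_MUTEX_INITIALIZER; /* ~clocksource_mutex */
static pthread_mutex_t flag_lock  = PTHREAD_MUTEX_INITIALIZER; /* ~watchdog_lock */
static int unstable;

/* ~clocksource_mark_unstable(): safe while other locks are held */
static void mark_unstable(void)
{
	pthread_mutex_lock(&flag_lock);
	unstable = 1;				/* record state, defer the rest */
	pthread_mutex_unlock(&flag_lock);
}

/* ~the watchdog worker: the only context taking the heavy lock */
static void *worker(void *arg)
{
	(void)arg;
	for (;;) {
		int u;

		pthread_mutex_lock(&flag_lock);
		u = unstable;
		pthread_mutex_unlock(&flag_lock);
		if (u) {
			pthread_mutex_lock(&heavy_lock);
			puts("demoting clocksource");	/* ~change_rating */
			pthread_mutex_unlock(&heavy_lock);
			return NULL;
		}
		usleep(1000);
	}
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	mark_unstable();	/* called from "cpu bringup" without deadlock */
	pthread_join(t, NULL);
	return 0;
}

Build with -pthread; the point is only that mark_unstable() never touches heavy_lock itself.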
Signed-off-by: Thomas Gleixner LKML-Reference: Cc: Martin Schwidefsky Cc: John Stultz --- arch/x86/kernel/tsc.c | 8 +++++--- include/linux/clocksource.h | 1 + kernel/time/clocksource.c | 33 ++++++++++++++++++++++++++++++--- 3 files changed, 36 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 968425422c4..fc3672a303d 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -767,12 +767,14 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - printk("Marking TSC unstable due to %s\n", reason); + printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else + clocksource_mark_unstable(&clocksource_tsc); + else { + clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; clocksource_tsc.rating = 0; + } } } diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 9ea40ff26f0..83d2fbd81b9 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -277,6 +277,7 @@ extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_resume(void); extern struct clocksource * __init __weak clocksource_default_clock(void); +extern void clocksource_mark_unstable(struct clocksource *cs); #ifdef CONFIG_GENERIC_TIME_VSYSCALL extern void update_vsyscall(struct timespec *ts, struct clocksource *c); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e0c86ad6e9f..a0af4ffcb6e 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -149,15 +149,42 @@ static void clocksource_watchdog_work(struct work_struct *work) kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); } -static void clocksource_unstable(struct clocksource *cs, int64_t delta) +static void __clocksource_unstable(struct clocksource *cs) { - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", - cs->name, delta); cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; schedule_work(&watchdog_work); } +static void clocksource_unstable(struct clocksource *cs, int64_t delta) +{ + printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", + cs->name, delta); + __clocksource_unstable(cs); +} + +/** + * clocksource_mark_unstable - mark clocksource unstable via watchdog + * @cs: clocksource to be marked unstable + * + * This function is called instead of clocksource_change_rating from + * cpu hotplug code to avoid a deadlock between the clocksource mutex + * and the cpu hotplug mutex. It defers the update of the clocksource + * to the watchdog thread. 
+ */ +void clocksource_mark_unstable(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { + if (list_empty(&cs->wd_list)) + list_add(&cs->wd_list, &watchdog_list); + __clocksource_unstable(cs); + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + static void clocksource_watchdog(unsigned long data) { struct clocksource *cs; -- cgit v1.2.3 From a192a9580bcc41692be1f36b77c3b681827f566a Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 28 Jul 2009 16:45:54 -0400 Subject: ACPI: Move definition of PREFIX from acpi_bus.h to internal.h Linux/ACPI core files using internal.h all use PREFIX "ACPI: ", however, not all ACPI drivers use/want it -- and they should not have to #undef PREFIX to define their own. Add GPL comment to internal.h while we are there. This does not change any actual console output, aside from a whitespace fix. Signed-off-by: Len Brown --- arch/x86/pci/mmconfig-shared.c | 2 ++ drivers/acpi/ac.c | 2 ++ drivers/acpi/battery.c | 2 ++ drivers/acpi/blacklist.c | 2 ++ drivers/acpi/button.c | 2 ++ drivers/acpi/cm_sbs.c | 2 ++ drivers/acpi/container.c | 2 ++ drivers/acpi/dock.c | 2 ++ drivers/acpi/ec.c | 1 - drivers/acpi/event.c | 2 ++ drivers/acpi/fan.c | 2 ++ drivers/acpi/glue.c | 2 ++ drivers/acpi/internal.h | 22 +++++++++++++++++++++- drivers/acpi/numa.c | 2 ++ drivers/acpi/pci_irq.c | 2 ++ drivers/acpi/pci_link.c | 2 ++ drivers/acpi/pci_root.c | 2 ++ drivers/acpi/power.c | 2 ++ drivers/acpi/processor_core.c | 2 ++ drivers/acpi/processor_idle.c | 2 ++ drivers/acpi/processor_perflib.c | 2 ++ drivers/acpi/processor_thermal.c | 2 ++ drivers/acpi/processor_throttling.c | 2 ++ drivers/acpi/sbs.c | 2 ++ drivers/acpi/sbshc.c | 2 ++ drivers/acpi/system.c | 2 ++ drivers/acpi/thermal.c | 2 ++ drivers/acpi/utils.c | 2 ++ drivers/acpi/video.c | 2 ++ drivers/acpi/video_detect.c | 2 ++ drivers/pci/dmar.c | 3 +-- drivers/platform/x86/fujitsu-laptop.c | 4 ++-- drivers/platform/x86/wmi.c | 1 - include/acpi/acpi_bus.h | 2 -- 34 files changed, 80 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 712443ec6d4..81d3466765c 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -18,6 +18,8 @@ #include #include +#define PREFIX "ACPI: " + /* aperture is up to 256MB but BIOS may reserve less */ #define MMCONFIG_APER_MIN (2 * 1024*1024) #define MMCONFIG_APER_MAX (256 * 1024*1024) diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index 0df8fcb687d..98b9690b015 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -37,6 +37,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_AC_CLASS "ac_adapter" #define ACPI_AC_DEVICE_NAME "AC Adapter" #define ACPI_AC_FILE_STATE "state" diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 58b4517ce71..f8c3d1bb696 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -45,6 +45,8 @@ #include #endif +#define PREFIX "ACPI: " + #define ACPI_BATTERY_VALUE_UNKNOWN 0xFFFFFFFF #define ACPI_BATTERY_CLASS "battery" diff --git a/drivers/acpi/blacklist.c b/drivers/acpi/blacklist.c index f6baa77deef..19152ea2b10 100644 --- a/drivers/acpi/blacklist.c +++ b/drivers/acpi/blacklist.c @@ -34,6 +34,8 @@ #include #include +#include "internal.h" + enum acpi_blacklist_predicates { all_versions, less_than_or_equal, diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index 9195deba9d9..d295bdccc09 100644 --- a/drivers/acpi/button.c +++
b/drivers/acpi/button.c @@ -33,6 +33,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_BUTTON_CLASS "button" #define ACPI_BUTTON_FILE_INFO "info" #define ACPI_BUTTON_FILE_STATE "state" diff --git a/drivers/acpi/cm_sbs.c b/drivers/acpi/cm_sbs.c index 332fe4b2170..6c9ee68e46f 100644 --- a/drivers/acpi/cm_sbs.c +++ b/drivers/acpi/cm_sbs.c @@ -28,6 +28,8 @@ #include #include +#define PREFIX "ACPI: " + ACPI_MODULE_NAME("cm_sbs"); #define ACPI_AC_CLASS "ac_adapter" #define ACPI_BATTERY_CLASS "battery" diff --git a/drivers/acpi/container.c b/drivers/acpi/container.c index fe0cdf83641..5f2c3c00a31 100644 --- a/drivers/acpi/container.c +++ b/drivers/acpi/container.c @@ -35,6 +35,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_CONTAINER_DEVICE_NAME "ACPI container device" #define ACPI_CONTAINER_CLASS "container" diff --git a/drivers/acpi/dock.c b/drivers/acpi/dock.c index efb959d6c8a..9a855669ff1 100644 --- a/drivers/acpi/dock.c +++ b/drivers/acpi/dock.c @@ -33,6 +33,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_DOCK_DRIVER_DESCRIPTION "ACPI Dock Station Driver" ACPI_MODULE_NAME("dock"); diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 391f331674c..5180f0f1dd0 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -47,7 +47,6 @@ #define ACPI_EC_DEVICE_NAME "Embedded Controller" #define ACPI_EC_FILE_INFO "info" -#undef PREFIX #define PREFIX "ACPI: EC: " /* EC status register */ diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c index aeb7e5fb4a0..c511071bfd7 100644 --- a/drivers/acpi/event.c +++ b/drivers/acpi/event.c @@ -14,6 +14,8 @@ #include #include +#include "internal.h" + #define _COMPONENT ACPI_SYSTEM_COMPONENT ACPI_MODULE_NAME("event"); diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index 53698ea0837..f419849a0d3 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -34,6 +34,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_FAN_CLASS "fan" #define ACPI_FAN_FILE_STATE "state" diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index a8a5c29958c..dc36a448de4 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -12,6 +12,8 @@ #include #include +#include "internal.h" + #define ACPI_GLUE_DEBUG 0 #if ACPI_GLUE_DEBUG #define DBG(x...) printk(PREFIX x) diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 11a69b53004..074cf8682d5 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -1,4 +1,24 @@ -/* For use by Linux/ACPI infrastructure, not drivers */ +/* + * acpi/internal.h + * For use by Linux/ACPI infrastructure, not drivers + * + * Copyright (c) 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#define PREFIX "ACPI: " int init_acpi_device_notify(void); int acpi_scan_init(void); diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index d440ccd27d9..202dd0c976a 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -30,6 +30,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_NUMA 0x80000000 #define _COMPONENT ACPI_NUMA ACPI_MODULE_NAME("numa"); diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index b794eb88ab9..843699ed93f 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -40,6 +40,8 @@ #include #include +#define PREFIX "ACPI: " + #define _COMPONENT ACPI_PCI_COMPONENT ACPI_MODULE_NAME("pci_irq"); diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index 16e0f9d3d17..394ae89409c 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -43,6 +43,8 @@ #include #include +#define PREFIX "ACPI: " + #define _COMPONENT ACPI_PCI_COMPONENT ACPI_MODULE_NAME("pci_link"); #define ACPI_PCI_LINK_CLASS "pci_irq_routing" diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 55b5b90c2a4..dee916707a7 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -36,6 +36,8 @@ #include #include +#define PREFIX "ACPI: " + #define _COMPONENT ACPI_PCI_COMPONENT ACPI_MODULE_NAME("pci_root"); #define ACPI_PCI_ROOT_CLASS "pci_bridge" diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index d74365d4a6e..e86603f37de 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -44,6 +44,8 @@ #include #include +#define PREFIX "ACPI: " + #define _COMPONENT ACPI_POWER_COMPONENT ACPI_MODULE_NAME("power"); #define ACPI_POWER_CLASS "power_resource" diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 2cc4b303387..b4a1ab297e7 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -59,6 +59,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_PROCESSOR_CLASS "processor" #define ACPI_PROCESSOR_DEVICE_NAME "Processor" #define ACPI_PROCESSOR_FILE_INFO "info" diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 66393d5c4c7..22aab1fc9b4 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -60,6 +60,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_PROCESSOR_CLASS "processor" #define _COMPONENT ACPI_PROCESSOR_COMPONENT ACPI_MODULE_NAME("processor_idle"); diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 60e543d3234..11088cf1031 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -39,6 +39,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_PROCESSOR_CLASS "processor" #define ACPI_PROCESSOR_FILE_PERFORMANCE "performance" #define _COMPONENT ACPI_PROCESSOR_COMPONENT diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c index 31adda1099e..3e3181c0efc 100644 --- a/drivers/acpi/processor_thermal.c +++ b/drivers/acpi/processor_thermal.c @@ -40,6 +40,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_PROCESSOR_CLASS "processor" #define _COMPONENT ACPI_PROCESSOR_COMPONENT ACPI_MODULE_NAME("processor_thermal"); diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index ae39797aab5..b366b9c13d4 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -41,6 +41,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_PROCESSOR_CLASS "processor" #define _COMPONENT ACPI_PROCESSOR_COMPONENT 
ACPI_MODULE_NAME("processor_throttling"); diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index 4b214b74eba..52b9db8afc2 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -46,6 +46,8 @@ #include "sbshc.h" +#define PREFIX "ACPI: " + #define ACPI_SBS_CLASS "sbs" #define ACPI_AC_CLASS "ac_adapter" #define ACPI_BATTERY_CLASS "battery" diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c index 0619734895b..d9339806df4 100644 --- a/drivers/acpi/sbshc.c +++ b/drivers/acpi/sbshc.c @@ -15,6 +15,8 @@ #include #include "sbshc.h" +#define PREFIX "ACPI: " + #define ACPI_SMB_HC_CLASS "smbus_host_controller" #define ACPI_SMB_HC_DEVICE_NAME "ACPI SMBus HC" diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c index 9c61ab2177c..d11282975f3 100644 --- a/drivers/acpi/system.c +++ b/drivers/acpi/system.c @@ -31,6 +31,8 @@ #include +#define PREFIX "ACPI: " + #define _COMPONENT ACPI_SYSTEM_COMPONENT ACPI_MODULE_NAME("system"); diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 564ea142428..65f67815902 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -47,6 +47,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_THERMAL_CLASS "thermal_zone" #define ACPI_THERMAL_DEVICE_NAME "Thermal Zone" #define ACPI_THERMAL_FILE_STATE "state" diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index f844941089b..811fec10462 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -30,6 +30,8 @@ #include #include +#include "internal.h" + #define _COMPONENT ACPI_BUS_COMPONENT ACPI_MODULE_NAME("utils"); diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 8851315ce85..a0fa3946b50 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -44,6 +44,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_VIDEO_CLASS "video" #define ACPI_VIDEO_BUS_NAME "Video Bus" #define ACPI_VIDEO_DEVICE_NAME "Video Device" diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index 7cd2b63435e..7032f25da9b 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -38,6 +38,8 @@ #include #include +#define PREFIX "ACPI: " + ACPI_MODULE_NAME("video"); #define _COMPONENT ACPI_VIDEO_COMPONENT diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 7b287cb38b7..998f02d2ba4 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -34,8 +34,7 @@ #include #include -#undef PREFIX -#define PREFIX "DMAR:" +#define PREFIX "DMAR: " /* No locks are needed as DMA remapping hardware unit * list is constructed at boot time and hotplug of diff --git a/drivers/platform/x86/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c index 218b9a16ac3..eabddc9c192 100644 --- a/drivers/platform/x86/fujitsu-laptop.c +++ b/drivers/platform/x86/fujitsu-laptop.c @@ -700,7 +700,7 @@ static int acpi_fujitsu_add(struct acpi_device *device) goto end; } - printk(KERN_INFO PREFIX "%s [%s] (%s)\n", + printk(KERN_INFO "ACPI: %s [%s] (%s)\n", acpi_device_name(device), acpi_device_bid(device), !device->power.state ? "on" : "off"); @@ -874,7 +874,7 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) goto end; } - printk(KERN_INFO PREFIX "%s [%s] (%s)\n", + printk(KERN_INFO "ACPI: %s [%s] (%s)\n", acpi_device_name(device), acpi_device_bid(device), !device->power.state ? 
"on" : "off"); diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index f215a591919..177f8d767df 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -42,7 +42,6 @@ MODULE_LICENSE("GPL"); #define ACPI_WMI_CLASS "wmi" -#undef PREFIX #define PREFIX "ACPI: WMI: " static DEFINE_MUTEX(wmi_data_lock); diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index c65e4ce6c3a..f485107ddc4 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -30,8 +30,6 @@ #include -#define PREFIX "ACPI: " - /* TBD: Make dynamic */ #define ACPI_MAX_HANDLES 10 struct acpi_handle_list { -- cgit v1.2.3 From f4a2d5840e9f0e48d1a787b66e7346087a756029 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 28 Jul 2009 16:48:02 -0400 Subject: ACPI, PCI: Change PREFIX to "PCI" from "ACPI" in mmconfig-shared.c Signed-off-by: Len Brown Acked-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 81d3466765c..b707a0141d3 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -18,7 +18,7 @@ #include #include -#define PREFIX "ACPI: " +#define PREFIX "PCI: " /* aperture is up to 256MB but BIOS may reserve less */ #define MMCONFIG_APER_MIN (2 * 1024*1024) -- cgit v1.2.3 From e55a5999ffcf72dc4d43d73618957964cb87065a Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 28 Jul 2009 17:41:53 +0800 Subject: ACPI: Handle CONFIG_ACPI=n better from linux/acpi.h linux/acpi.h is the top level header for interfacing with the ACPI sub-system, so acpi_disabled should be up there instead of down in asm/acpi.h -- particularly since asm/acpi.h doesn't exist for all architectures. Same story for acpi_table_parse(), which is a top-level API to Linux/ACPI. This is necessary for building some code that used to always depend on CONFIG_ACPI=y, but will soon also need to build with CONFIG_ACPI=n. 
Signed-off-by: Feng Tang Signed-off-by: Len Brown --- arch/x86/include/asm/acpi.h | 1 - include/linux/acpi.h | 11 ++++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 20d1465a2ab..4518dc50090 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -144,7 +144,6 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) #else /* !CONFIG_ACPI */ -#define acpi_disabled 1 #define acpi_lapic 0 #define acpi_ioapic 0 static inline void acpi_noirq_set(void) { } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 34321cfffea..3fce811bf9a 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -292,7 +292,10 @@ void __init acpi_s4_no_nvs(void); extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags); extern void acpi_early_init(void); -#else /* CONFIG_ACPI */ +#else /* !CONFIG_ACPI */ + +#define acpi_disabled 1 + static inline void acpi_early_init(void) { } static inline int early_acpi_boot_init(void) @@ -331,5 +334,11 @@ static inline int acpi_check_mem_region(resource_size_t start, return 0; } +struct acpi_table_header; +static inline int acpi_table_parse(char *id, + int (*handler)(struct acpi_table_header *)) +{ + return -1; +} #endif /* !CONFIG_ACPI */ #endif /*_LINUX_ACPI_H*/ -- cgit v1.2.3 From efafc8b213e67ed148a5b53ade29ee7b48af907d Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 14 Aug 2009 15:23:29 -0400 Subject: x86: add arch-specific SFI support arch/x86/kernel/sfi.c serves the dual-purpose of supporting the SFI core with arch specific code, as well as a home for the arch-specific code that uses SFI. analogous to ACPI, drivers/sfi/Kconfig is pulled in by arch/x86/Kconfig Signed-off-by: Feng Tang Signed-off-by: Len Brown Cc: x86@kernel.org --- arch/x86/Kconfig | 2 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/setup.c | 3 ++ arch/x86/kernel/sfi.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 139 insertions(+) create mode 100644 arch/x86/kernel/sfi.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 13ffa5df37d..d8ba42418d3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1683,6 +1683,8 @@ source "kernel/power/Kconfig" source "drivers/acpi/Kconfig" +source "drivers/sfi/Kconfig" + config X86_APM_BOOT bool default y diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7..6321afaafb2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -55,6 +55,7 @@ obj-y += step.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ +obj-$(CONFIG_SFI) += sfi.o obj-y += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63f32d220ef..d784ea20760 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -990,6 +991,8 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_init(); + sfi_init(); + #if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) /* * get boot-time SMP configuration: diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c new file mode 100644 index 00000000000..761df3f759c --- /dev/null +++ b/arch/x86/kernel/sfi.c @@ -0,0 +1,133 @@ +/* + * sfi.c - x86 architecture SFI support. + * + * Copyright (c) 2009, Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#define KMSG_COMPONENT "SFI" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; + +void __init mp_sfi_register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = read_apic_id(); + + pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid); +} + +/* All CPUs enumerated by SFI must be present and enabled */ +void __cpuinit mp_sfi_register_lapic(u8 id) +{ + int boot_cpu = 0; + + if (MAX_APICS - id <= 0) { + pr_warning("Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + if (id == boot_cpu_physical_apicid) + boot_cpu = 1; + pr_info("registering lapic[%d]\n", id); + + generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); +} + +static int __init sfi_parse_cpus(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_cpu_table_entry *pentry; + int i; + int cpu_num; + + sb = (struct sfi_table_simple *)table; + cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry); + pentry = (struct sfi_cpu_table_entry *)sb->pentry; + + for (i = 0; i < cpu_num; i++) { + mp_sfi_register_lapic(pentry->apic_id); + pentry++; + } + + smp_found_config = 1; + return 0; +} +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +static u32 gsi_base; + +static int __init sfi_parse_ioapic(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_apic_table_entry *pentry; + int i, num; + + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); + pentry = (struct sfi_apic_table_entry *)sb->pentry; + + for (i = 0; i < num; i++) { + mp_register_ioapic(i, pentry->phys_addr, gsi_base); + gsi_base += io_apic_get_redir_entries(i); + pentry++; + } + + WARN(pic_mode, KERN_WARNING + "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); + pic_mode = 0; + return 0; +} +#endif /* CONFIG_X86_IO_APIC */ + +/* + * sfi_platform_init(): register lapics & io-apics + */ +int __init sfi_platform_init(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + mp_sfi_register_lapic_address(sfi_lapic_addr); + sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); +#endif +#ifdef CONFIG_X86_IO_APIC + sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic); +#endif + return 0; +} -- cgit v1.2.3 From 5f0db7a2fb78895a197f64e548333b3bbd433996 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 14 Aug 2009 15:37:50 -0400 Subject: SFI: Hook PCI MMCONFIG First check ACPI, and if that fails, ask SFI to find the MCFG. 
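The contract is exactly what the subject line says: first check ACPI, and if that fails, ask SFI. The helper's internals are not part of this patch, so the sketch below is only a plausible shape for such a fallback, with every function invented for illustration.

#include <stdio.h>

static int acpi_disabled = 1;		/* assumed for the demo */

static int acpi_find_mcfg(void)
{
	return acpi_disabled ? -1 : 0;	/* -1: nothing found */
}

static int sfi_find_mcfg(void)
{
	puts("MCFG located via SFI");
	return 0;
}

/* try the primary firmware interface, fall back to the secondary one */
static int firmware_find_mcfg(void)
{
	if (!acpi_find_mcfg())
		return 0;
	return sfi_find_mcfg();
}

int main(void)
{
	return firmware_find_mcfg();
}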
Signed-off-by: Feng Tang Signed-off-by: Len Brown Cc: Jesse Barnes --- arch/x86/Kconfig | 2 +- arch/x86/pci/mmconfig-shared.c | 6 ++++-- arch/x86/pci/mmconfig_32.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d8ba42418d3..4c92c91f354 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1880,7 +1880,7 @@ config PCI_DIRECT config PCI_MMCONFIG def_bool y - depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) + depends on X86_32 && PCI && (ACPI || SFI) && (PCI_GOMMCONFIG || PCI_GOANY) config PCI_OLPC def_bool y diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index b707a0141d3..602c172d3bd 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -13,10 +13,12 @@ #include #include #include +#include #include #include #include #include +#include #define PREFIX "PCI: " @@ -493,7 +495,7 @@ static void __init pci_mmcfg_reject_broken(int early) (unsigned int)cfg->start_bus_number, (unsigned int)cfg->end_bus_number); - if (!early) + if (!early && !acpi_disabled) valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); if (valid) @@ -608,7 +610,7 @@ static void __init __pci_mmcfg_init(int early) } if (!known_bridge) - acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); + acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); pci_mmcfg_reject_broken(early); diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 8b2d561046a..f10a7e94a84 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -11,9 +11,9 @@ #include #include -#include #include #include +#include /* Assume systems with more busses have correct MCFG */ #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) -- cgit v1.2.3 From 47d25003cbd9e9030a95f7ccc4e70fec6aa7b844 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 28 Aug 2009 14:11:57 +0100 Subject: x86: Fix earlyprintk=dbgp for machines without NX Since parse_early_param() may (e.g. for earlyprintk=dbgp) involve calls to page table manipulation functions (here set_fixmap_nocache()), NX hardware support must be determined before calling that function (so that __supported_pte_mask gets properly set up). But the call after parse_early_param() can also not go away, as that will honor eventual command line specified disabling of the NX functionality. ( This will then just result in whatever mappings got established during parse_early_param() having the NX bit set despite it being disabled on the command line, but I think that's tolerable). Signed-off-by: Jan Beulich Cc: Yinghai Lu LKML-Reference: <4A97F3BD02000078000121B9@vpn.id2.novell.com> [ merged to x86/pat to resolve a conflict. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 02643cc3bf2..eb1f1e6e52b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -714,6 +714,16 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; +#ifdef CONFIG_X86_64 + /* + * Must call this twice: Once just to detect whether hardware doesn't + * support NX (so that the early EHCI debug console setup can safely + * call set_fixmap(), and then again after parsing early parameters to + * honor the respective command line option. 
+ */ + check_efer(); +#endif + parse_early_param(); /* VMI may relocate the fixmap; do this before touching ioremap area */ -- cgit v1.2.3 From b3f1b617f49447df6c3f5fac9c225aaea8b724ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 11:11:52 +0200 Subject: x86: Move get/find_smp_config to x86_init_ops Replace the quirk machinery by a x86_init_ops function which defaults to the standard implementation. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 37 ++++++++++++++++++++++++++++++------- arch/x86/include/asm/setup.h | 2 -- arch/x86/include/asm/x86_init.h | 4 ++++ arch/x86/kernel/apic/numaq_32.c | 2 -- arch/x86/kernel/mpparse.c | 33 ++------------------------------- arch/x86/kernel/setup.c | 2 -- arch/x86/kernel/visws_quirks.c | 14 ++++---------- arch/x86/kernel/x86_init.c | 2 ++ 8 files changed, 42 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index e3c579efde7..79c94500c0b 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -4,6 +4,7 @@ #include #include +#include extern int apic_version[MAX_APICS]; extern int pic_mode; @@ -41,9 +42,6 @@ extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; #endif /* CONFIG_X86_64 */ -extern void early_find_smp_config(void); -extern void early_get_smp_config(void); - #if defined(CONFIG_MCA) || defined(CONFIG_EISA) extern int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -52,14 +50,36 @@ extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern unsigned int boot_cpu_physical_apicid; extern unsigned int max_physical_apicid; -extern int smp_found_config; extern int mpc_default_type; extern unsigned long mp_lapic_addr; -extern void get_smp_config(void); +#ifdef CONFIG_X86_LOCAL_APIC +extern int smp_found_config; +#else +# define smp_found_config 0 +#endif + +static inline void get_smp_config(void) +{ + x86_init.mpparse.get_smp_config(0); +} + +static inline void early_get_smp_config(void) +{ + x86_init.mpparse.get_smp_config(1); +} + +static inline void find_smp_config(void) +{ + x86_init.mpparse.find_smp_config(1); +} + +static inline void early_find_smp_config(void) +{ + x86_init.mpparse.find_smp_config(0); +} #ifdef CONFIG_X86_MPPARSE -extern void find_smp_config(void); extern void early_reserve_e820_mpc_new(void); extern int enable_update_mptable; extern int default_mpc_apic_id(struct mpc_cpu *m); @@ -69,13 +89,16 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str); # else # define default_mpc_oem_bus_info NULL # endif +extern void default_find_smp_config(unsigned int reserve); +extern void default_get_smp_config(unsigned int early); #else -static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } #define enable_update_mptable 0 #define default_mpc_apic_id NULL #define default_smp_read_mpc_oem NULL #define default_mpc_oem_bus_info NULL +#define default_find_smp_config x86_init_uint_noop +#define default_get_smp_config x86_init_uint_noop #endif void __cpuinit generic_processor_info(int apicid, int version); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 6121a8ac7b0..345a2551af9 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -19,8 +19,6 @@ struct x86_quirks { int (*arch_pre_intr_init)(void); int (*arch_intr_init)(void); int (*arch_trap_init)(void); - int (*mach_get_smp_config)(unsigned int early); - int (*mach_find_smp_config)(unsigned int reserve); }; extern void 
x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 2833a873a90..e0d4729c905 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -13,6 +13,8 @@ struct mpc_table; * @smp_read_mpc_oem: platform specific oem mpc table setup * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL) * @mpc_oem_bus_info: platform specific mpc bus info + * @find_smp_config: find the smp configuration + * @get_smp_config: get the smp configuration */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); @@ -21,6 +23,8 @@ struct x86_init_mpparse { void (*smp_read_mpc_oem)(struct mpc_table *mpc); void (*mpc_oem_pci_bus)(struct mpc_bus *m); void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); + void (*find_smp_config)(unsigned int reserve); + void (*get_smp_config)(unsigned int early); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 700273dca68..3dd5fd76534 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -268,8 +268,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_intr_init = NULL, .arch_intr_init = NULL, .arch_trap_init = NULL, - .mach_get_smp_config = NULL, - .mach_find_smp_config = NULL, }; static __init void early_check_numaq(void) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index a42f23f1dc7..75357647b6e 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -610,7 +610,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) /* * Scan the memory blocks for an SMP configuration block. */ -static void __init __get_smp_config(unsigned int early) +void __init default_get_smp_config(unsigned int early) { struct mpf_intel *mpf = mpf_found; @@ -627,11 +627,6 @@ static void __init __get_smp_config(unsigned int early) if (acpi_lapic && acpi_ioapic) return; - if (x86_quirks->mach_get_smp_config) { - if (x86_quirks->mach_get_smp_config(early)) - return; - } - printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) @@ -672,16 +667,6 @@ static void __init __get_smp_config(unsigned int early) */ } -void __init early_get_smp_config(void) -{ - __get_smp_config(1); -} - -void __init get_smp_config(void) -{ - __get_smp_config(0); -} - static void __init smp_reserve_bootmem(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); @@ -747,14 +732,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -static void __init __find_smp_config(unsigned int reserve) +void __init default_find_smp_config(unsigned int reserve) { unsigned int address; - if (x86_quirks->mach_find_smp_config) { - if (x86_quirks->mach_find_smp_config(reserve)) - return; - } /* * FIXME: Linux assumes you have 640K of base ram.. * this continues the error... 
@@ -789,16 +770,6 @@ static void __init __find_smp_config(unsigned int reserve) smp_scan_config(address, 0x400, reserve); } -void __init early_find_smp_config(void) -{ - __find_smp_config(0); -} - -void __init find_smp_config(void) -{ - __find_smp_config(1); -} - #ifdef CONFIG_X86_IO_APIC static u8 __initdata irq_used[MAX_IRQ_SOURCES]; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c2a8090e831..54043cb7ba6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -981,13 +981,11 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_init(); -#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) /* * get boot-time SMP configuration: */ if (smp_found_config) get_smp_config(); -#endif prefill_possible_map(); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 97c670df1ae..31e828118f8 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -156,12 +156,8 @@ static void visws_machine_power_off(void) outl(PIIX_SPECIAL_STOP, 0xCFC); } -static int __init visws_get_smp_config(unsigned int early) +static void __init visws_get_smp_config(unsigned int early) { - /* - * Prevent MP-table parsing by the generic code: - */ - return 1; } /* @@ -208,7 +204,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) apic_version[m->apicid] = ver; } -static int __init visws_find_smp_config(unsigned int reserve) +static void __init visws_find_smp_config(unsigned int reserve) { struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); @@ -230,8 +226,6 @@ static int __init visws_find_smp_config(unsigned int reserve) MP_processor_info(mp++); mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - return 1; } static int visws_trap_init(void); @@ -241,8 +235,6 @@ static struct x86_quirks visws_x86_quirks __initdata = { .arch_pre_intr_init = visws_pre_intr_init, .arch_intr_init = NULL, .arch_trap_init = visws_trap_init, - .mach_get_smp_config = visws_get_smp_config, - .mach_find_smp_config = visws_find_smp_config, }; void __init visws_early_detect(void) @@ -263,6 +255,8 @@ void __init visws_early_detect(void) x86_quirks = &visws_x86_quirks; x86_init.resources.memory_setup = visws_memory_setup; + x86_init.mpparse.get_smp_config = visws_get_smp_config; + x86_init.mpparse.find_smp_config = visws_find_smp_config; /* * Install reboot quirks: diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 27685edc546..3488fb62ac0 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -32,5 +32,7 @@ struct __initdata x86_init_ops x86_init = { .mpc_apic_id = default_mpc_apic_id, .smp_read_mpc_oem = default_smp_read_mpc_oem, .mpc_oem_bus_info = default_mpc_oem_bus_info, + .find_smp_config = default_find_smp_config, + .get_smp_config = default_get_smp_config, }, }; -- cgit v1.2.3 From d9112f43021554ded2ef2b9bea5f88ba4b52abe0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 09:41:38 +0200 Subject: x86: Move pre_intr_init to x86_init_ops Replace the quirk machinery by a x86_init_ops function which defaults to the standard implementation. This is also a preparatory patch for Moorestown support which needs to replace the default init_ISA_irqs as well. 
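There is a contract change buried in these conversions: the old quirks returned nonzero to mean "handled, skip the default", while the new ops slot simply is the behavior, so a platform that wants to suppress the default installs its own (possibly empty) function. A before/after sketch in plain C, with all names invented:

#include <stdio.h>

/* old style: an optional quirk consulted before the default */
static int (*quirk_get_config)(void);	/* may be NULL; nonzero = handled */

static void get_config_old(void)
{
	if (quirk_get_config && quirk_get_config())
		return;			/* quirk suppressed the default */
	puts("default config scan");
}

/* new style: the slot always holds the whole behavior */
static void default_get_config(void) { puts("default config scan"); }
static void visws_like_get_config(void) { /* intentionally empty */ }

static void (*get_config)(void) = default_get_config;

int main(void)
{
	get_config_old();		/* no quirk installed: default runs */
	get_config = visws_like_get_config;
	get_config();			/* platform opted out cleanly */
	return 0;
}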
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq.h | 2 ++ arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 10 ++++++++++ arch/x86/kernel/apic/numaq_32.c | 1 - arch/x86/kernel/irqinit.c | 24 ++---------------------- arch/x86/kernel/visws_quirks.c | 10 +++------- arch/x86/kernel/x86_init.c | 5 +++++ 7 files changed, 22 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index f38481bcd45..8fe2782a253 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -47,4 +47,6 @@ extern unsigned int do_IRQ(struct pt_regs *regs); extern DECLARE_BITMAP(used_vectors, NR_VECTORS); extern int vector_used_by_percpu_irq(unsigned int vector); +extern void init_ISA_irqs(void); + #endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 345a2551af9..66a319709d6 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -16,7 +16,6 @@ struct x86_quirks { int (*arch_pre_time_init)(void); int (*arch_time_init)(void); - int (*arch_pre_intr_init)(void); int (*arch_intr_init)(void); int (*arch_trap_init)(void); }; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index e0d4729c905..65e3394c77f 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -43,6 +43,15 @@ struct x86_init_resources { char *(*memory_setup)(void); }; +/** + * struct x86_init_irqs - platform specific interrupt setup + * @pre_vector_init: init code to run before interrupt vectors + * are set up. + */ +struct x86_init_irqs { + void (*pre_vector_init)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -50,6 +59,7 @@ struct x86_init_resources { struct x86_init_ops { struct x86_init_resources resources; struct x86_init_mpparse mpparse; + struct x86_init_irqs irqs; }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 3dd5fd76534..ec8b3113716 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -265,7 +265,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, - .arch_pre_intr_init = NULL, .arch_intr_init = NULL, .arch_trap_init = NULL, }; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 92b7703d3d5..acdf088c758 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -116,7 +116,7 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -static void __init init_ISA_irqs(void) +void __init init_ISA_irqs(void) { int i; @@ -213,32 +213,12 @@ static void __init apic_intr_init(void) #endif } -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. 
- **/ -static void __init x86_quirk_pre_intr_init(void) -{ -#ifdef CONFIG_X86_32 - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } -#endif - init_ISA_irqs(); -} - void __init native_init_IRQ(void) { int i; /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); + x86_init.irqs.pre_vector_init(); apic_intr_init(); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 31e828118f8..1d6309d70df 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -73,14 +73,10 @@ static int __init visws_time_init(void) return 0; } -static int __init visws_pre_intr_init(void) +/* Replaces the default init_ISA_irqs in the generic setup */ +static void __init visws_pre_intr_init(void) { init_VISWS_APIC_irqs(); - - /* - * We dont want ISA irqs to be set up by the generic code: - */ - return 1; } /* Quirk for machine specific memory setup. */ @@ -232,7 +228,6 @@ static int visws_trap_init(void); static struct x86_quirks visws_x86_quirks __initdata = { .arch_time_init = visws_time_init, - .arch_pre_intr_init = visws_pre_intr_init, .arch_intr_init = NULL, .arch_trap_init = visws_trap_init, }; @@ -257,6 +252,7 @@ void __init visws_early_detect(void) x86_init.resources.memory_setup = visws_memory_setup; x86_init.mpparse.get_smp_config = visws_get_smp_config; x86_init.mpparse.find_smp_config = visws_find_smp_config; + x86_init.irqs.pre_vector_init = visws_pre_intr_init; /* * Install reboot quirks: diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 3488fb62ac0..f2abe2136da 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -9,6 +9,7 @@ #include #include #include +#include void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -35,4 +36,8 @@ struct __initdata x86_init_ops x86_init = { .find_smp_config = default_find_smp_config, .get_smp_config = default_get_smp_config, }, + + .irqs = { + .pre_vector_init = init_ISA_irqs, + }, }; -- cgit v1.2.3 From 66bcaf0bde100a4b54b82fc6fea6ceee2212ffb4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 09:59:09 +0200 Subject: x86: Move irq_init to x86_init_ops irq_init is overridden by x86_quirks and by paravirts. Unify the whole mess and make it an unconditional x86_init_ops function which defaults to the standard function and can be overridden by the early platform code. 
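Before this patch two override mechanisms were stacked: init_IRQ() was a weak alias for native_init_IRQ() that paravirt.c shadowed with a wrapper dispatching through pv_irq_ops.init_IRQ. Afterwards there is exactly one hop: init_IRQ() reads a single x86_init slot, and native, lguest and Xen all rebind that same slot. A compressed model of the two layerings; the struct and function names here are illustrative only.

#include <stdio.h>

static void native_init_IRQ(void) { puts("native vectors"); }
static void xen_init_IRQ(void)    { puts("xen event channels"); }

/* before: init_IRQ() -> pv_irq_ops.init_IRQ -> implementation */
struct pv_irq_model { void (*init_IRQ)(void); };
static struct pv_irq_model pv = { .init_IRQ = native_init_IRQ };
static void init_IRQ_before(void) { pv.init_IRQ(); }

/* after: the paravirt hop is gone; one slot serves every environment */
struct irq_model { void (*intr_init)(void); };
static struct irq_model irqs = { .intr_init = native_init_IRQ };
static void init_IRQ_after(void) { irqs.intr_init(); }

int main(void)
{
	init_IRQ_before();		/* native by default */
	irqs.intr_init = xen_init_IRQ;	/* e.g. from xen_init_irq_ops() */
	init_IRQ_after();
	return 0;
}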
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq.h | 1 - arch/x86/include/asm/paravirt_types.h | 2 -- arch/x86/include/asm/setup.h | 3 --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/numaq_32.c | 1 - arch/x86/kernel/irqinit.c | 12 ++++-------- arch/x86/kernel/paravirt.c | 6 ------ arch/x86/kernel/setup.c | 17 ----------------- arch/x86/kernel/visws_quirks.c | 1 - arch/x86/kernel/x86_init.c | 1 + arch/x86/lguest/boot.c | 2 +- arch/x86/xen/irq.c | 5 +++-- 12 files changed, 11 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 8fe2782a253..ddda6cbed6f 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -37,7 +37,6 @@ extern void fixup_irqs(void); #endif extern void (*generic_interrupt_extension)(void); -extern void init_IRQ(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 6d668968b6b..25922afb634 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -201,8 +201,6 @@ struct pv_cpu_ops { }; struct pv_irq_ops { - void (*init_IRQ)(void); - /* * Get/set interrupt state. save_fl and restore_fl are only * expected to use X86_EFLAGS_IF; all other bits diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 66a319709d6..404086f9411 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -16,12 +16,9 @@ struct x86_quirks { int (*arch_pre_time_init)(void); int (*arch_time_init)(void); - int (*arch_intr_init)(void); int (*arch_trap_init)(void); }; -extern void x86_quirk_intr_init(void); - extern void x86_quirk_trap_init(void); extern void x86_quirk_pre_time_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 65e3394c77f..8d7be65ccf7 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -47,9 +47,11 @@ struct x86_init_resources { * struct x86_init_irqs - platform specific interrupt setup * @pre_vector_init: init code to run before interrupt vectors * are set up. + * @intr_init: interrupt init code */ struct x86_init_irqs { void (*pre_vector_init)(void); + void (*intr_init)(void); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index ec8b3113716..eafd341e42d 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -265,7 +265,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, - .arch_intr_init = NULL, .arch_trap_init = NULL, }; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index acdf088c758..e0142cda239 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -140,8 +140,10 @@ void __init init_ISA_irqs(void) } } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); +void init_IRQ(void) +{ + x86_init.irqs.intr_init(); +} static void __init smp_intr_init(void) { @@ -237,12 +239,6 @@ void __init native_init_IRQ(void) setup_irq(2, &irq2); #ifdef CONFIG_X86_32 - /* - * Call quirks after call gates are initialised (usually add in - * the architecture specific gates): - */ - x86_quirk_intr_init(); - /* * External FPU? Set up irq13 if so, for * original braindamaged IBM FERR coupling. 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 532c9a2626c..d76bfbec71a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -183,11 +183,6 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len, return insn_len; } -void init_IRQ(void) -{ - pv_irq_ops.init_IRQ(); -} - static void native_flush_tlb(void) { __native_flush_tlb(); @@ -328,7 +323,6 @@ struct pv_time_ops pv_time_ops = { }; struct pv_irq_ops pv_irq_ops = { - .init_IRQ = native_init_IRQ, .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 54043cb7ba6..d3da0f7333f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1020,23 +1020,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 -/** - * x86_quirk_intr_init - post gate setup interrupt initialisation - * - * Description: - * Fill in any interrupts that may have been left out by the general - * init_IRQ() routine. interrupts having to do with the machine rather - * than the devices on the I/O bus (like APIC interrupts in intel MP - * systems) are started here. - **/ -void __init x86_quirk_intr_init(void) -{ - if (x86_quirks->arch_intr_init) { - if (x86_quirks->arch_intr_init()) - return; - } -} - /** * x86_quirk_trap_init - initialise system specific traps * diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 1d6309d70df..a49013716da 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -228,7 +228,6 @@ static int visws_trap_init(void); static struct x86_quirks visws_x86_quirks __initdata = { .arch_time_init = visws_time_init, - .arch_intr_init = NULL, .arch_trap_init = visws_trap_init, }; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index f2abe2136da..8cb59332e3b 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -39,5 +39,6 @@ struct __initdata x86_init_ops x86_init = { .irqs = { .pre_vector_init = init_ISA_irqs, + .intr_init = native_init_IRQ, }, }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 11445c176de..1ff986511f1 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1262,7 +1262,6 @@ __init void lguest_init(void) */ /* Interrupt-related operations */ - pv_irq_ops.init_IRQ = lguest_init_IRQ; pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); @@ -1325,6 +1324,7 @@ __init void lguest_init(void) pv_time_ops.get_tsc_khz = lguest_tsc_khz; x86_init.resources.memory_setup = lguest_memory_setup; + x86_init.irqs.intr_init = lguest_init_IRQ; /* * Now is a good time to look at the implementations of these functions diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index cfd17799bd6..9d30105a0c4 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -1,5 +1,7 @@ #include +#include + #include #include #include @@ -112,8 +114,6 @@ static void xen_halt(void) } static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = xen_init_IRQ, - .save_fl = PV_CALLEE_SAVE(xen_save_fl), .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), @@ -129,4 +129,5 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { void __init xen_init_irq_ops() { pv_irq_ops = xen_irq_ops; + x86_init.irqs.intr_init = xen_init_IRQ; } -- cgit v1.2.3 
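The pattern this series converges on is plain function-pointer substitution: x86_init.c carries a default table of native functions, and early platform or guest code replaces individual pointers before the generic code dispatches through them. A minimal sketch of the override side, using hypothetical myplat_* names that are not part of this series and modeled on the lguest/Xen hunks above, assuming the detect function runs before init_IRQ():

static void __init myplat_intr_init(void)
{
	init_ISA_irqs();	/* reuse the generic ISA vector setup */
	/* install additional platform specific vectors here */
}

static void __init myplat_early_detect(void)
{
	/* must run before init_IRQ(), e.g. from early setup_arch() code */
	x86_init.irqs.intr_init = myplat_intr_init;
}

No weak alias, #ifdef or quirk table is needed; init_IRQ() simply calls x86_init.irqs.intr_init().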
From 428cf9025b15573e16e658032f2b963283e34ae0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 10:35:46 +0200 Subject: x86: Move traps_init to x86_init_ops Replace the quirks by a simple x86_init_ops function. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 3 --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/numaq_32.c | 1 - arch/x86/kernel/setup.c | 15 --------------- arch/x86/kernel/traps.c | 5 ++--- arch/x86/kernel/visws_quirks.c | 8 +++----- arch/x86/kernel/x86_init.c | 1 + 7 files changed, 8 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 404086f9411..7751d1f92bc 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -16,11 +16,8 @@ struct x86_quirks { int (*arch_pre_time_init)(void); int (*arch_time_init)(void); - int (*arch_trap_init)(void); }; -extern void x86_quirk_trap_init(void); - extern void x86_quirk_pre_time_init(void); extern void x86_quirk_time_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 8d7be65ccf7..07c37bd879f 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -48,10 +48,12 @@ struct x86_init_resources { * @pre_vector_init: init code to run before interrupt vectors * are set up. * @intr_init: interrupt init code + * @trap_init: platform specific trap setup */ struct x86_init_irqs { void (*pre_vector_init)(void); void (*intr_init)(void); + void (*trap_init)(void); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index eafd341e42d..71c5ea64586 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -265,7 +265,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, - .arch_trap_init = NULL, }; static __init void early_check_numaq(void) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d3da0f7333f..bf3b87f1f7d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1020,21 +1020,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 -/** - * x86_quirk_trap_init - initialise system specific traps - * - * Description: - * Called as the final act of trap_init(). Used in VISWS to initialise - * the various board specific APIC traps. 
- **/ -void __init x86_quirk_trap_init(void) -{ - if (x86_quirks->arch_trap_init) { - if (x86_quirks->arch_trap_init()) - return; - } -} - static struct irqaction irq0 = { .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7e4b1f5dec8..ed96ed53f69 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -59,6 +59,7 @@ #include #ifdef CONFIG_X86_64 +#include #include #include #else @@ -980,7 +981,5 @@ void __init trap_init(void) */ cpu_init(); -#ifdef CONFIG_X86_32 - x86_quirk_trap_init(); -#endif + x86_init.irqs.trap_init(); } diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index a49013716da..2719091b335 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -224,11 +224,10 @@ static void __init visws_find_smp_config(unsigned int reserve) mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; } -static int visws_trap_init(void); +static void visws_trap_init(void); static struct x86_quirks visws_x86_quirks __initdata = { .arch_time_init = visws_time_init, - .arch_trap_init = visws_trap_init, }; void __init visws_early_detect(void) @@ -252,6 +251,7 @@ void __init visws_early_detect(void) x86_init.mpparse.get_smp_config = visws_get_smp_config; x86_init.mpparse.find_smp_config = visws_find_smp_config; x86_init.irqs.pre_vector_init = visws_pre_intr_init; + x86_init.irqs.trap_init = visws_trap_init; /* * Install reboot quirks: @@ -390,12 +390,10 @@ static __init void cobalt_init(void) co_apic_read(CO_APIC_ID)); } -static int __init visws_trap_init(void) +static void __init visws_trap_init(void) { lithium_init(); cobalt_init(); - - return 1; } /* diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 8cb59332e3b..9f2b775dc72 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -40,5 +40,6 @@ struct __initdata x86_init_ops x86_init = { .irqs = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, + .trap_init = x86_init_noop, }, }; -- cgit v1.2.3 From 42bbdb43b16d233b2dacb4cd76e28f61c2a86dc6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 13:04:10 +0200 Subject: x86: Replace ARCH_SETUP by a proper x86_init_ops ARCH_SETUP is a horrible leftover from the old arch/i386 mach support code. It still has a lonely user in xen. Move it to x86_init_ops. 
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 1 - arch/x86/include/asm/paravirt_types.h | 1 - arch/x86/include/asm/x86_init.h | 9 +++++++++ arch/x86/kernel/paravirt.c | 1 - arch/x86/kernel/setup.c | 6 +----- arch/x86/kernel/x86_init.c | 4 ++++ arch/x86/xen/enlighten.c | 2 +- 7 files changed, 15 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6a07af432c8..22cb3872f6d 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -24,7 +24,6 @@ static inline void load_sp0(struct tss_struct *tss, PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); } -#define ARCH_SETUP pv_init_ops.arch_setup(); static inline unsigned long get_wallclock(void) { return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 25922afb634..a05085e5fdb 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -80,7 +80,6 @@ struct pv_init_ops { unsigned long addr, unsigned len); /* Basic arch-specific setup */ - void (*arch_setup)(void); void (*post_allocator_init)(void); /* Print a banner to identify the environment */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 07c37bd879f..ceffbf358fc 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -56,6 +56,14 @@ struct x86_init_irqs { void (*trap_init)(void); }; +/** + * struct x86_init_oem - oem platform specific customizing functions + * @arch_setup: platform specific architecure setup + */ +struct x86_init_oem { + void (*arch_setup)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -64,6 +72,7 @@ struct x86_init_ops { struct x86_init_resources resources; struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; + struct x86_init_oem oem; }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index d76bfbec71a..80275ef1651 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -311,7 +311,6 @@ struct pv_info pv_info = { struct pv_init_ops pv_init_ops = { .patch = native_patch, .banner = default_banner, - .arch_setup = paravirt_nop, }; struct pv_time_ops pv_time_ops = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bf3b87f1f7d..d12aa82c9c3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -108,10 +108,6 @@ #include #endif -#ifndef ARCH_SETUP -#define ARCH_SETUP -#endif - /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 
* The direct mapping extends to max_pfn_mapped, so that we can directly access @@ -750,7 +746,7 @@ void __init setup_arch(char **cmdline_p) } #endif - ARCH_SETUP + x86_init.oem.arch_setup(); setup_memory_map(); parse_setup_data(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 9f2b775dc72..fa2d849be35 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -42,4 +42,8 @@ struct __initdata x86_init_ops x86_init = { .intr_init = native_init_IRQ, .trap_init = x86_init_noop, }, + + .oem = { + .arch_setup = x86_init_noop, + }, }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 50b20c64f0b..73c7b1d610f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -841,7 +841,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, .banner = xen_banner, - .arch_setup = xen_arch_setup, .post_allocator_init = xen_post_allocator_init, }; @@ -982,6 +981,7 @@ asmlinkage void __init xen_start_kernel(void) pv_mmu_ops = xen_mmu_ops; x86_init.resources.memory_setup = xen_memory_setup; + x86_init.oem.arch_setup = xen_arch_setup; #ifdef CONFIG_X86_64 /* -- cgit v1.2.3 From 6f30c1ac3fcf11e08f00670f293546a112cdf4e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 13:19:57 +0200 Subject: x86: Move paravirt banner printout to x86_init_ops Replace another obscure paravirt magic and move it to x86_init_ops. Such a hook is also useful for embedded and special hardware. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 6 +++++- arch/x86/include/asm/paravirt_types.h | 3 --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/paravirt.c | 10 +--------- arch/x86/kernel/setup.c | 1 + arch/x86/kernel/x86_init.c | 2 ++ arch/x86/xen/enlighten.c | 2 +- 7 files changed, 12 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 22cb3872f6d..3de6435a106 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -918,6 +918,8 @@ static inline unsigned long __raw_local_irq_save(void) #undef PVOP_VCALL4 #undef PVOP_CALL4 +extern void default_banner(void); + #else /* __ASSEMBLY__ */ #define _PVSITE(ptype, clobbers, ops, word, algn) \ @@ -1058,5 +1060,7 @@ static inline unsigned long __raw_local_irq_save(void) #endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ -#endif /* CONFIG_PARAVIRT */ +#else /* CONFIG_PARAVIRT */ +# define default_banner x86_init_noop +#endif /* !CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index a05085e5fdb..ce7723c81a1 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -81,9 +81,6 @@ struct pv_init_ops { /* Basic arch-specific setup */ void (*post_allocator_init)(void); - - /* Print a banner to identify the environment */ - void (*banner)(void); }; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ceffbf358fc..ee7c59df781 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -59,9 +59,11 @@ struct x86_init_irqs { /** * struct x86_init_oem - oem platform specific customizing functions * @arch_setup: platform specific architecure setup + * @banner: print a platform specific banner */ struct x86_init_oem { void (*arch_setup)(void); + void (*banner)(void); }; /** diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 
80275ef1651..f7a5fb79d18 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -54,7 +54,7 @@ u64 _paravirt_ident_64(u64 x) return x; } -static void __init default_banner(void) +void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); @@ -208,13 +208,6 @@ extern void native_irq_enable_sysexit(void); extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); -static int __init print_banner(void) -{ - pv_init_ops.banner(); - return 0; -} -core_initcall(print_banner); - static struct resource reserve_ioports = { .start = 0, .end = IO_SPACE_LIMIT, @@ -310,7 +303,6 @@ struct pv_info pv_info = { struct pv_init_ops pv_init_ops = { .patch = native_patch, - .banner = default_banner, }; struct pv_time_ops pv_time_ops = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d12aa82c9c3..bc5f0e561cf 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1012,6 +1012,7 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif + x86_init.oem.banner(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index fa2d849be35..08fea49d59a 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -45,5 +46,6 @@ struct __initdata x86_init_ops x86_init = { .oem = { .arch_setup = x86_init_noop, + .banner = default_banner, }, }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 73c7b1d610f..46e23cde143 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -840,7 +840,6 @@ static const struct pv_info xen_info __initdata = { static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, - .banner = xen_banner, .post_allocator_init = xen_post_allocator_init, }; @@ -982,6 +981,7 @@ asmlinkage void __init xen_start_kernel(void) x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; + x86_init.oem.banner = xen_banner; #ifdef CONFIG_X86_64 /* -- cgit v1.2.3 From 030cb6c00d242c20e92a3327d0cac17ce02d0cc3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 14:30:02 +0200 Subject: x86: Move paravirt pagetable_setup to x86_init_ops Replace more paravirt hackery by proper x86_init_ops. 
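Condensing the hunks that follow (a summary, not a literal quote of the diff): the native pagetable setup functions stay as the default table entries, Xen installs its own pair from xen_init_mmu_ops(), and setup_arch() brackets paging_init() with the two hooks:

	x86_init.paging.pagetable_setup_start(swapper_pg_dir);
	paging_init();
	x86_init.paging.pagetable_setup_done(swapper_pg_dir);

This replaces the old paravirt_pagetable_setup_{start,done} wrappers and the pv_mmu_ops entries with a single override point.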
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 10 ---------- arch/x86/include/asm/paravirt_types.h | 9 --------- arch/x86/include/asm/pgtable.h | 10 ---------- arch/x86/include/asm/pgtable_types.h | 4 ++-- arch/x86/include/asm/x86_init.h | 13 +++++++++++++ arch/x86/kernel/paravirt.c | 7 ------- arch/x86/kernel/setup.c | 4 ++-- arch/x86/kernel/x86_init.c | 6 ++++++ arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/mmu.c | 11 +++++++---- arch/x86/xen/mmu.h | 2 +- 11 files changed, 32 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 3de6435a106..1caf25b91e6 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -351,16 +351,6 @@ static inline void paravirt_post_allocator_init(void) (*pv_init_ops.post_allocator_init)(); } -static inline void paravirt_pagetable_setup_start(pgd_t *base) -{ - (*pv_mmu_ops.pagetable_setup_start)(base); -} - -static inline void paravirt_pagetable_setup_done(pgd_t *base) -{ - (*pv_mmu_ops.pagetable_setup_done)(base); -} - #ifdef CONFIG_SMP static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index ce7723c81a1..4039eefd3eb 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -231,15 +231,6 @@ struct pv_apic_ops { }; struct pv_mmu_ops { - /* - * Called before/after init_mm pagetable setup. setup_start - * may reset %cr3, and may pre-install parts of the pagetable; - * pagetable setup is expected to preserve any existing - * mapping. - */ - void (*pagetable_setup_start)(pgd_t *pgd_base); - void (*pagetable_setup_done)(pgd_t *pgd_base); - unsigned long (*read_cr2)(void); void (*write_cr2)(unsigned long); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 16748077559..60d422adf70 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -56,16 +56,6 @@ extern struct list_head pgd_list; #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) -static inline void __init paravirt_pagetable_setup_start(pgd_t *base) -{ - native_pagetable_setup_start(base); -} - -static inline void __init paravirt_pagetable_setup_done(pgd_t *base) -{ - native_pagetable_setup_done(base); -} - #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 54cb697f490..7b467bf3c68 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -299,8 +299,8 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte); extern void native_pagetable_setup_start(pgd_t *base); extern void native_pagetable_setup_done(pgd_t *base); #else -static inline void native_pagetable_setup_start(pgd_t *base) {} -static inline void native_pagetable_setup_done(pgd_t *base) {} +#define native_pagetable_setup_start x86_init_pgd_noop +#define native_pagetable_setup_done x86_init_pgd_noop #endif struct seq_file; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ee7c59df781..b9bb4faefc4 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +#include + struct mpc_bus; struct mpc_cpu; struct mpc_table; @@ -66,6 +68,16 @@ struct 
x86_init_oem { void (*banner)(void); }; +/** + * struct x86_init_paging - platform specific paging functions + * @pagetable_setup_start: platform specific pre paging_init() call + * @pagetable_setup_done: platform specific post paging_init() call + */ +struct x86_init_paging { + void (*pagetable_setup_start)(pgd_t *base); + void (*pagetable_setup_done)(pgd_t *base); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -75,6 +87,7 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; + struct x86_init_paging paging; }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f7a5fb79d18..8167be0b68c 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -402,13 +402,6 @@ struct pv_apic_ops pv_apic_ops = { #endif struct pv_mmu_ops pv_mmu_ops = { -#ifndef CONFIG_X86_64 - .pagetable_setup_start = native_pagetable_setup_start, - .pagetable_setup_done = native_pagetable_setup_done, -#else - .pagetable_setup_start = paravirt_nop, - .pagetable_setup_done = paravirt_nop, -#endif .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bc5f0e561cf..4952d63dd67 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -959,9 +959,9 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif - paravirt_pagetable_setup_start(swapper_pg_dir); + x86_init.paging.pagetable_setup_start(swapper_pg_dir); paging_init(); - paravirt_pagetable_setup_done(swapper_pg_dir); + x86_init.paging.pagetable_setup_done(swapper_pg_dir); paravirt_post_allocator_init(); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 08fea49d59a..7df020e6740 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,6 +14,7 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } +void __init x86_init_pgd_noop(pgd_t *unused) { } /* * The platform setup functions are preset with the default functions @@ -48,4 +49,9 @@ struct __initdata x86_init_ops x86_init = { .arch_setup = x86_init_noop, .banner = default_banner, }, + + .paging = { + .pagetable_setup_start = native_pagetable_setup_start, + .pagetable_setup_done = native_pagetable_setup_done, + }, }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 46e23cde143..12ea09ec39b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -977,7 +977,6 @@ asmlinkage void __init xen_start_kernel(void) pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; pv_apic_ops = xen_apic_ops; - pv_mmu_ops = xen_mmu_ops; x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; @@ -991,6 +990,7 @@ asmlinkage void __init xen_start_kernel(void) load_percpu_segment(0); #endif + xen_init_mmu_ops(); xen_init_irq_ops(); xen_init_cpuid_mask(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4ceb2858165..dbec51da930 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1875,10 +1875,7 @@ static void xen_leave_lazy_mmu(void) preempt_enable(); } -const struct pv_mmu_ops xen_mmu_ops __initdata = { - .pagetable_setup_start = xen_pagetable_setup_start, - .pagetable_setup_done = xen_pagetable_setup_done, - +static const struct pv_mmu_ops xen_mmu_ops __initdata = { .read_cr2 = xen_read_cr2, .write_cr2 = xen_write_cr2, @@ -1954,6 +1951,12 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = 
{ .set_fixmap = xen_set_fixmap, }; +void __init xen_init_mmu_ops(void) +{ + x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; + x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; + pv_mmu_ops = xen_mmu_ops; +} #ifdef CONFIG_XEN_DEBUG_FS diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index da730262489..5fe6bc7f5ec 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -59,5 +59,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, unsigned long xen_read_cr2_direct(void); -extern const struct pv_mmu_ops xen_mmu_ops; +extern void xen_init_mmu_ops(void); #endif /* _XEN_MMU_H */ -- cgit v1.2.3 From f1d7062a235d057e5d85ed2860bef609e0160cde Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 13:13:52 +0200 Subject: x86: Move xen_post_allocator_init into xen_pagetable_setup_done We really do not need two paravirt/x86_init_ops functions which are called in two consecutive source lines. Move the only user of post_allocator_init into the already existing pagetable_setup_done function. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 6 ------ arch/x86/include/asm/paravirt_types.h | 3 --- arch/x86/include/asm/setup.h | 4 ---- arch/x86/kernel/setup.c | 1 - arch/x86/xen/enlighten.c | 2 -- arch/x86/xen/mmu.c | 5 ++++- arch/x86/xen/xen-ops.h | 2 -- 7 files changed, 4 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 1caf25b91e6..7ce415e844b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -345,12 +345,6 @@ static inline void setup_secondary_clock(void) } #endif -static inline void paravirt_post_allocator_init(void) -{ - if (pv_init_ops.post_allocator_init) - (*pv_init_ops.post_allocator_init)(); -} - #ifdef CONFIG_SMP static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 4039eefd3eb..ecc74e5ad40 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -78,9 +78,6 @@ struct pv_init_ops { */ unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, unsigned long addr, unsigned len); - - /* Basic arch-specific setup */ - void (*post_allocator_init)(void); }; diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 7751d1f92bc..58b58952b80 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -63,10 +63,6 @@ static inline int is_visws_box(void) { return 0; } extern struct x86_quirks *x86_quirks; extern unsigned long saved_video_mode; -#ifndef CONFIG_PARAVIRT -#define paravirt_post_allocator_init() do {} while (0) -#endif - extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4952d63dd67..43ec6aa175b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -962,7 +962,6 @@ void __init setup_arch(char **cmdline_p) x86_init.paging.pagetable_setup_start(swapper_pg_dir); paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); - paravirt_post_allocator_init(); #ifdef CONFIG_X86_64 map_vsyscall(); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 12ea09ec39b..a924caa168d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -839,8 +839,6 @@ static const struct pv_info xen_info __initdata = { 
static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, - - .post_allocator_init = xen_post_allocator_init, }; static const struct pv_time_ops xen_time_ops __initdata = { diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index dbec51da930..093dd59b538 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1229,9 +1229,12 @@ static __init void xen_pagetable_setup_start(pgd_t *base) { } +static void xen_post_allocator_init(void); + static __init void xen_pagetable_setup_done(pgd_t *base) { xen_setup_shared_info(); + xen_post_allocator_init(); } static void xen_write_cr2(unsigned long cr2) @@ -1841,7 +1844,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } -__init void xen_post_allocator_init(void) +static __init void xen_post_allocator_init(void) { pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 22494fd4c9b..355fa6b99c9 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -30,8 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); void xen_reserve_top(void); -void xen_post_allocator_init(void); - char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); -- cgit v1.2.3 From 736decac643e8982655e22ac7f0e5e61c5b7f9bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 12:35:53 +0200 Subject: x86: Move percpu clockevents setup to x86_init_ops paravirt overrides the setup of the default apic timers as per cpu timers. Moorestown needs to override that as well. Move it to x86_init_ops setup and create a separate x86_cpuinit struct which holds the function for the secondary, eventually hotpluggable CPUs.
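The boot CPU and the secondary CPUs reach their clock event setup through different paths, which is why two hooks exist: native_smp_prepare_cpus() and APIC_init_uniprocessor() call x86_init.timers.setup_percpu_clockev for the boot CPU, while start_secondary() calls x86_cpuinit.setup_percpu_clockev for each AP. A guest that only needs to hook the secondary path can leave the boot default untouched; a sketch with a hypothetical myguest_* name, modeled on the kvmclock hunk below:

	/* Boot CPU keeps the default setup_boot_APIC_clock */
	x86_cpuinit.setup_percpu_clockev = myguest_setup_secondary_clock;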
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 5 ++--- arch/x86/include/asm/paravirt.h | 12 ------------ arch/x86/include/asm/paravirt_types.h | 3 --- arch/x86/include/asm/x86_init.h | 19 +++++++++++++++++++ arch/x86/kernel/apic/apic.c | 3 ++- arch/x86/kernel/kvmclock.c | 5 ++++- arch/x86/kernel/paravirt.c | 2 -- arch/x86/kernel/smpboot.c | 4 ++-- arch/x86/kernel/vmi_32.c | 4 ++-- arch/x86/kernel/x86_init.c | 9 +++++++++ arch/x86/xen/enlighten.c | 4 ++-- 11 files changed, 42 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bb7d4792584..6f15b29005a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -70,9 +70,6 @@ static inline void default_inquire_remote_apic(int apicid) */ #ifdef CONFIG_PARAVIRT #include -#else -#define setup_boot_clock setup_boot_APIC_clock -#define setup_secondary_clock setup_secondary_APIC_clock #endif #ifdef CONFIG_X86_64 @@ -245,6 +242,8 @@ static inline void lapic_shutdown(void) { } static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } static inline void apic_disable(void) { } +# define setup_boot_APIC_clock x86_init_noop +# define setup_secondary_APIC_clock x86_init_noop #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 7ce415e844b..825674a968d 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -333,18 +333,6 @@ static inline void slow_down_io(void) #endif } -#ifdef CONFIG_X86_LOCAL_APIC -static inline void setup_boot_clock(void) -{ - PVOP_VCALL0(pv_apic_ops.setup_boot_clock); -} - -static inline void setup_secondary_clock(void) -{ - PVOP_VCALL0(pv_apic_ops.setup_secondary_clock); -} -#endif - #ifdef CONFIG_SMP static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index ecc74e5ad40..1da89276d14 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -218,9 +218,6 @@ struct pv_irq_ops { struct pv_apic_ops { #ifdef CONFIG_X86_LOCAL_APIC - void (*setup_boot_clock)(void); - void (*setup_secondary_clock)(void); - void (*startup_ipi_hook)(int phys_apicid, unsigned long start_eip, unsigned long start_esp); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index b9bb4faefc4..b7d258f4c40 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -78,6 +78,15 @@ struct x86_init_paging { void (*pagetable_setup_done)(pgd_t *base); }; +/** + * struct x86_init_timers - platform specific timer setup + * @setup_perpcu_clockev: set up the per cpu clock event device for the + * boot cpu + */ +struct x86_init_timers { + void (*setup_percpu_clockev)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -88,9 +97,19 @@ struct x86_init_ops { struct x86_init_irqs irqs; struct x86_init_oem oem; struct x86_init_paging paging; + struct x86_init_timers timers; +}; + +/** + * struct x86_cpuinit_ops - platform specific cpu hotplug setups + * @setup_percpu_clockev: set up the per cpu clock event device + */ +struct x86_cpuinit_ops { + void (*setup_percpu_clockev)(void); }; extern struct x86_init_ops x86_init; +extern struct x86_cpuinit_ops x86_cpuinit; extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff 
--git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0a1c2830ec6..ce0098066e9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -1701,7 +1702,7 @@ int __init APIC_init_uniprocessor(void) localise_nmi_watchdog(); #endif - setup_boot_clock(); + x86_init.timers.setup_percpu_clockev(); #ifdef CONFIG_X86_64 check_nmi_watchdog(); #endif diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 223af43f152..64e9b5f59d2 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -22,6 +22,8 @@ #include #include #include + +#include #include #define KVM_SCALE 22 @@ -187,7 +189,8 @@ void __init kvmclock_init(void) pv_time_ops.sched_clock = kvm_clock_read; pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC - pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; + x86_cpuinit.setup_percpu_clockev = + kvm_setup_secondary_clock; #endif #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 8167be0b68c..1ed32c79679 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -387,8 +387,6 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC - .setup_boot_clock = setup_boot_APIC_clock, - .setup_secondary_clock = setup_secondary_APIC_clock, .startup_ipi_hook = paravirt_nop, #endif }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..6eb81a87b4b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -323,7 +323,7 @@ notrace static void __cpuinit start_secondary(void *unused) /* enable local interrupts */ local_irq_enable(); - setup_secondary_clock(); + x86_cpuinit.setup_percpu_clockev(); wmb(); cpu_idle(); @@ -1112,7 +1112,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) printk(KERN_INFO "CPU%d: ", 0); print_cpu_info(&cpu_data(0)); - setup_boot_clock(); + x86_init.timers.setup_percpu_clockev(); if (is_uv_system()) uv_system_init(); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 95a7289e4b0..b43b6685cae 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -821,8 +821,8 @@ static inline int __init activate_vmi(void) pv_time_ops.get_wallclock = vmi_get_wallclock; pv_time_ops.set_wallclock = vmi_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - pv_apic_ops.setup_boot_clock = vmi_time_bsp_init; - pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; + x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init; + x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init; #endif pv_time_ops.sched_clock = vmi_sched_clock; pv_time_ops.get_tsc_khz = vmi_tsc_khz; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 7df020e6740..e666a98db7c 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -54,4 +55,12 @@ struct __initdata x86_init_ops x86_init = { .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, }, + + .timers = { + .setup_percpu_clockev = setup_boot_APIC_clock, + }, +}; + +__cpuinitdata struct x86_cpuinit_ops x86_cpuinit = { + .setup_percpu_clockev = setup_secondary_APIC_clock, }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a924caa168d..14e597e0c16 100644 --- 
a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -912,8 +912,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC - .setup_boot_clock = paravirt_nop, - .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, #endif }; @@ -979,6 +977,8 @@ asmlinkage void __init xen_start_kernel(void) x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; + x86_init.timers.setup_percpu_clockev = x86_init_noop; + x86_cpuinit.setup_percpu_clockev = x86_init_noop; #ifdef CONFIG_X86_64 /* -- cgit v1.2.3 From 845b3944bbdf9e9247849bf037f27ff3a3f26d87 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 15:37:03 +0200 Subject: x86: Add timer_init to x86_init_ops The timer init code is convoluted with several quirks and the paravirt timer chooser. Figuring out which code path is actually taken is not for the faint hearted. Move the numaq TSC quirk to tsc_pre_init x86_init_ops function and replace the paravirt time chooser and the remaining x86 quirk with a simple x86_init_ops function. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 5 ---- arch/x86/include/asm/paravirt_types.h | 2 -- arch/x86/include/asm/setup.h | 21 ++--------------- arch/x86/include/asm/time.h | 1 - arch/x86/include/asm/timer.h | 3 +-- arch/x86/include/asm/x86_init.h | 4 ++++ arch/x86/kernel/apic/numaq_32.c | 10 ++------ arch/x86/kernel/paravirt.c | 1 - arch/x86/kernel/setup.c | 43 ----------------------------------- arch/x86/kernel/time_32.c | 34 ++++++++++++++++++--------- arch/x86/kernel/time_64.c | 9 ++++++-- arch/x86/kernel/tsc.c | 2 ++ arch/x86/kernel/visws_quirks.c | 20 ++++------------ arch/x86/kernel/vmi_32.c | 2 +- arch/x86/kernel/x86_init.c | 3 +++ arch/x86/lguest/boot.c | 2 +- arch/x86/xen/enlighten.c | 4 ++-- 17 files changed, 53 insertions(+), 113 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 825674a968d..11a4ba7b209 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -34,11 +34,6 @@ static inline int set_wallclock(unsigned long nowtime) return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime); } -static inline void (*choose_time_init(void))(void) -{ - return pv_time_ops.time_init; -} - /* The paravirtualized CPUID instruction. */ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 1da89276d14..0d812e592e3 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -88,8 +88,6 @@ struct pv_lazy_ops { }; struct pv_time_ops { - void (*time_init)(void); - /* Set and set time of day */ unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 58b58952b80..861e1fe2303 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -5,24 +5,6 @@ #define COMMAND_LINE_SIZE 2048 -#ifndef __ASSEMBLY__ - -#include - -/* - * Any setup quirks to be performed? 
- */ - -struct x86_quirks { - int (*arch_pre_time_init)(void); - int (*arch_time_init)(void); -}; - -extern void x86_quirk_pre_time_init(void); -extern void x86_quirk_time_init(void); - -#endif /* __ASSEMBLY__ */ - #ifdef __i386__ #include @@ -42,6 +24,7 @@ extern void x86_quirk_time_init(void); #ifndef __ASSEMBLY__ #include +#include /* Interrupt control for vSMPowered x86_64 systems */ #ifdef CONFIG_X86_64 @@ -60,11 +43,11 @@ static inline void visws_early_detect(void) { } static inline int is_visws_box(void) { return 0; } #endif -extern struct x86_quirks *x86_quirks; extern unsigned long saved_video_mode; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); +extern void setup_default_timer_irq(void); #ifndef _SETUP diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index 50c733aac42..91bb162b5a3 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -54,7 +54,6 @@ extern void time_init(void); #define get_wallclock() native_get_wallclock() #define set_wallclock(x) native_set_wallclock(x) -#define choose_time_init() hpet_time_init #endif /* CONFIG_PARAVIRT */ diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 20ca9c4d468..e854c7ab416 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -12,8 +12,7 @@ unsigned long native_calibrate_tsc(void); #ifdef CONFIG_X86_32 extern int timer_ack; -extern irqreturn_t timer_interrupt(int irq, void *dev_id); -#endif /* CONFIG_X86_32 */ +#endif extern int recalibrate_cpu_khz(void); extern int no_timer_check; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index b7d258f4c40..f8bdd2271a0 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -82,9 +82,13 @@ struct x86_init_paging { * struct x86_init_timers - platform specific timer setup * @setup_perpcu_clockev: set up the per cpu clock event device for the * boot cpu + * @tsc_pre_init: platform function called before TSC init + * @timer_init: initialize the platform timer (default PIT/HPET) */ struct x86_init_timers { void (*setup_percpu_clockev)(void); + void (*tsc_pre_init)(void); + void (*timer_init)(void); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 71c5ea64586..f1ebed6bd15 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -129,10 +129,9 @@ void __cpuinit numaq_tsc_disable(void) } } -static int __init numaq_pre_time_init(void) +static void __init numaq_tsc_init(void) { numaq_tsc_disable(); - return 0; } static inline int generate_logical_apicid(int quad, int phys_apicid) @@ -262,11 +261,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) } } -static struct x86_quirks numaq_x86_quirks __initdata = { - .arch_pre_time_init = numaq_pre_time_init, - .arch_time_init = NULL, -}; - static __init void early_check_numaq(void) { /* @@ -281,13 +275,13 @@ static __init void early_check_numaq(void) early_get_smp_config(); if (found_numaq) { - x86_quirks = &numaq_x86_quirks; x86_init.mpparse.mpc_record = numaq_mpc_record; x86_init.mpparse.setup_ioapic_ids = x86_init_noop; x86_init.mpparse.mpc_apic_id = mpc_apic_id; x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; + x86_init.timers.tsc_pre_init = numaq_tsc_init; } } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1ed32c79679..9c0e644a76d 100644 
--- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -306,7 +306,6 @@ struct pv_init_ops pv_init_ops = { }; struct pv_time_ops pv_time_ops = { - .time_init = hpet_time_init, .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, .sched_clock = native_sched_clock, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 43ec6aa175b..bb207a47c63 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -626,10 +626,6 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; - -struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; - #ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { @@ -1016,45 +1012,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" -}; - -/** - * x86_quirk_pre_time_init - do any specific initialisations before. - * - **/ -void __init x86_quirk_pre_time_init(void) -{ - if (x86_quirks->arch_pre_time_init) - x86_quirks->arch_pre_time_init(); -} - -/** - * x86_quirk_time_init - do any specific initialisations for the system timer. - * - * Description: - * Must plug the system timer interrupt source at HZ into the IRQ listed - * in irq_vectors.h:TIMER_IRQ - **/ -void __init x86_quirk_time_init(void) -{ - if (x86_quirks->arch_time_init) { - /* - * A nonzero return code does not mean failure, it means - * that the architecture quirk does not want any - * generic (timer) setup to be performed after this: - */ - if (x86_quirks->arch_time_init()) - return; - } - - irq0.mask = cpumask_of_cpu(0); - setup_irq(0, &irq0); -} - static struct resource video_ram_resource = { .name = "Video RAM area", .start = 0xa0000, diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 5c5d87f0b2e..89bbb52218b 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -72,7 +72,7 @@ EXPORT_SYMBOL(profile_pc); * Time Stamp Counter value at the time of the timer interrupt, so that * we later on can estimate the time of day more exactly. */ -irqreturn_t timer_interrupt(int irq, void *dev_id) +static irqreturn_t timer_interrupt(int irq, void *dev_id) { /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); @@ -113,25 +113,37 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* Duplicate of time_init() below, with hpet_enable part added */ +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" +}; + +void __init setup_default_timer_irq(void) +{ + irq0.mask = cpumask_of_cpu(0); + setup_irq(0, &irq0); +} + +/* Default timer init function */ void __init hpet_time_init(void) { if (!hpet_enable()) setup_pit_timer(); - x86_quirk_time_init(); + setup_default_timer_irq(); +} + +static void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); } /* - * This is called directly from init code; we must delay timer setup in the - * HPET case as we can't make the decision to turn on HPET this early in the - * boot process. - * - * The chosen time_init function will usually be hpet_time_init, above, but - * in the case of virtual hardware, an alternative function may be substituted. 
+ * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. */ void __init time_init(void) { - x86_quirk_pre_time_init(); tsc_init(); - late_time_init = choose_time_init(); + late_time_init = x86_late_time_init; } diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 5ba343e6184..38a7df94c10 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -127,9 +128,13 @@ void __init hpet_time_init(void) setup_irq(0, &irq0); } +static void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); +} + void __init time_init(void) { tsc_init(); - - late_time_init = choose_time_init(); + late_time_init = x86_late_time_init; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 71f4368b357..652bc214eeb 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -857,6 +857,8 @@ void __init tsc_init(void) u64 lpj; int cpu; + x86_init.timers.tsc_pre_init(); + if (!cpu_has_tsc) return; diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 2719091b335..f068553a1b1 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -53,7 +54,7 @@ int is_visws_box(void) return visws_board_type >= 0; } -static int __init visws_time_init(void) +static void __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -66,11 +67,7 @@ static int __init visws_time_init(void) /* Enable (unmask) the timer interrupt */ co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); - /* - * Zero return means the generic timer setup code will set up - * the standard vector: - */ - return 0; + setup_default_timer_irq(); } /* Replaces the default init_ISA_irqs in the generic setup */ @@ -226,10 +223,6 @@ static void __init visws_find_smp_config(unsigned int reserve) static void visws_trap_init(void); -static struct x86_quirks visws_x86_quirks __initdata = { - .arch_time_init = visws_time_init, -}; - void __init visws_early_detect(void) { int raw; @@ -241,17 +234,14 @@ void __init visws_early_detect(void) return; /* - * Install special quirks for timer, interrupt and memory setup: - * Fall back to generic behavior for traps: - * Override generic MP-table parsing: + * Override the default platform setup functions */ - x86_quirks = &visws_x86_quirks; - x86_init.resources.memory_setup = visws_memory_setup; x86_init.mpparse.get_smp_config = visws_get_smp_config; x86_init.mpparse.find_smp_config = visws_find_smp_config; x86_init.irqs.pre_vector_init = visws_pre_intr_init; x86_init.irqs.trap_init = visws_trap_init; + x86_init.timers.timer_init = visws_time_init; /* * Install reboot quirks: diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b43b6685cae..cd7d0fbbf66 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -817,7 +817,7 @@ static inline int __init activate_vmi(void) vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); vmi_timer_ops.cancel_alarm = vmi_get_function(VMI_CALL_CancelAlarm); - pv_time_ops.time_init = vmi_time_init; + x86_init.timers.timer_init = vmi_time_init; pv_time_ops.get_wallclock = vmi_get_wallclock; pv_time_ops.set_wallclock = vmi_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index e666a98db7c..4790b92714a 100644 --- a/arch/x86/kernel/x86_init.c +++ 
b/arch/x86/kernel/x86_init.c @@ -11,6 +11,7 @@ #include #include #include +#include #include void __cpuinit x86_init_noop(void) { } @@ -58,6 +59,8 @@ struct __initdata x86_init_ops x86_init = { .timers = { .setup_percpu_clockev = setup_boot_APIC_clock, + .tsc_pre_init = x86_init_noop, + .timer_init = hpet_time_init, }, }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 1ff986511f1..6caa8c0c793 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1320,11 +1320,11 @@ __init void lguest_init(void) /* Time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; - pv_time_ops.time_init = lguest_time_init; pv_time_ops.get_tsc_khz = lguest_tsc_khz; x86_init.resources.memory_setup = lguest_memory_setup; x86_init.irqs.intr_init = lguest_init_IRQ; + x86_init.timers.timer_init = lguest_time_init; /* * Now is a good time to look at the implementations of these functions diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 14e597e0c16..84826b842b5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -842,8 +842,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { }; static const struct pv_time_ops xen_time_ops __initdata = { - .time_init = xen_time_init, - .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, .get_tsc_khz = xen_tsc_khz, @@ -977,6 +975,8 @@ asmlinkage void __init xen_start_kernel(void) x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; + + x86_init.timers.timer_init = xen_time_init; x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_cpuinit.setup_percpu_clockev = x86_init_noop; -- cgit v1.2.3 From ecce85089e6d31eed7535b68f5acdd194265690c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 15:28:50 +0200 Subject: x86: Remove do_timer hook This is a left over of the old x86 sub arch support. Remove it and open code it like we do in time_64.c Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/do_timer.h | 16 ---------------- arch/x86/kernel/time_32.c | 7 ++++--- 2 files changed, 4 insertions(+), 19 deletions(-) delete mode 100644 arch/x86/include/asm/do_timer.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/do_timer.h b/arch/x86/include/asm/do_timer.h deleted file mode 100644 index 23ecda0b28a..00000000000 --- a/arch/x86/include/asm/do_timer.h +++ /dev/null @@ -1,16 +0,0 @@ -/* defines for inline arch setup functions */ -#include - -#include -#include - -/** - * do_timer_interrupt_hook - hook into timer tick - * - * Call the pit clock event handler. see asm/i8253.h - **/ - -static inline void do_timer_interrupt_hook(void) -{ - global_clock_event->event_handler(global_clock_event); -} diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 89bbb52218b..6fef4ea1e7a 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -28,6 +28,7 @@ * serialize accesses to xtime/lost_ticks). 
*/ +#include #include #include #include @@ -37,8 +38,8 @@ #include #include #include - -#include +#include +#include int timer_ack; @@ -92,7 +93,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) } #endif - do_timer_interrupt_hook(); + global_clock_event->event_handler(global_clock_event); #ifdef CONFIG_MCA if (MCA_bus) { -- cgit v1.2.3 From dd3e6e8c6e7a2294f137c4dbccb3e73e7fa8ba15 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 15:35:23 +0200 Subject: x86: Prepare unification of time_32/64.c Unify the top comment and the includes. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time_32.c | 44 +++++++++++++------------------------------- arch/x86/kernel/time_64.c | 13 +++++-------- 2 files changed, 18 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 6fef4ea1e7a..acbaefd61e8 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -1,45 +1,27 @@ /* - * Copyright (C) 1991, 1992, 1995 Linus Torvalds + * Copyright (c) 1991,1992,1995 Linus Torvalds + * Copyright (c) 1994 Alan Modra + * Copyright (c) 1995 Markus Kuhn + * Copyright (c) 1996 Ingo Molnar + * Copyright (c) 1998 Andrea Arcangeli + * Copyright (c) 2002,2006 Vojtech Pavlik + * Copyright (c) 2003 Andi Kleen * - * This file contains the PC-specific time handling details: - * reading the RTC at bootup, etc.. - * 1994-07-02 Alan Modra - * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime - * 1995-03-26 Markus Kuhn - * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 - * precision CMOS clock update - * 1996-05-03 Ingo Molnar - * fixed time warps in do_[slow|fast]_gettimeoffset() - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-09-05 (Various) - * More robust do_fast_gettimeoffset() algorithm implemented - * (works with APM, Cyrix 6x86MX and Centaur C6), - * monotonic gettimeofday() with fast_get_timeoffset(), - * drift-proof precision TSC calibration on boot - * (C. Scott Ananian , Andrew D. - * Balsa , Philip Gladstone ; - * ported from 2.0.35 Jumbo-9 by Michael Krause ). - * 1998-12-16 Andrea Arcangeli - * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy - * because was not accounting lost_ticks. - * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli - * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). */ #include -#include #include #include #include -#include -#include -#include -#include +#include +#include #include #include +#include +#include +#include +#include int timer_ack; diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 38a7df94c10..45914f8844a 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -1,6 +1,4 @@ /* - * "High Precision Event Timer" based timekeeping. 
- * * Copyright (c) 1991,1992,1995 Linus Torvalds * Copyright (c) 1994 Alan Modra * Copyright (c) 1995 Markus Kuhn @@ -8,23 +6,22 @@ * Copyright (c) 1998 Andrea Arcangeli * Copyright (c) 2002,2006 Vojtech Pavlik * Copyright (c) 2003 Andi Kleen - * RTC support code taken from arch/i386/kernel/timers/time_hpet.c + * */ #include -#include #include -#include #include #include -#include +#include #include +#include #include +#include #include -#include #include -#include +#include volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -- cgit v1.2.3 From 64fcbac1f38882d8ae82c44a1c2a676cfa5e79e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 15:54:21 +0200 Subject: x86: Simplify timer_ack magic in time_32.c Let the compiler optimize the timer_ack magic away in the 32bit timer interrupt and put the same code into time_64.c. It's optimized out for CONFIG_X86_IO_APIC on 32bit and for 64bit because timer_ack is const 0 in both cases. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/timer.h | 6 ++++-- arch/x86/kernel/time_32.c | 5 +++-- arch/x86/kernel/time_64.c | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index e854c7ab416..65228ccc5f0 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -9,11 +9,13 @@ unsigned long long native_sched_clock(void); unsigned long native_calibrate_tsc(void); +extern int recalibrate_cpu_khz(void); -#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) extern int timer_ack; +#else +# define timer_ack (0) #endif -extern int recalibrate_cpu_khz(void); extern int no_timer_check; diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index acbaefd61e8..7a26bcf887f 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -23,7 +23,9 @@ #include #include +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) int timer_ack; +#endif unsigned long profile_pc(struct pt_regs *regs) { @@ -60,7 +62,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); -#ifdef CONFIG_X86_IO_APIC + /* Optimized out for !IO_APIC and x86_64 */ if (timer_ack) { /* * Subtle, when I/O APICs are used we have to ack timer IRQ @@ -73,7 +75,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) inb(PIC_MASTER_POLL); spin_unlock(&i8259A_lock); } -#endif global_clock_event->event_handler(global_clock_event); diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 45914f8844a..35e0a925da5 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -51,6 +51,20 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) { inc_irq_stat(irq0_irqs); + /* Optimized out for !IO_APIC and x86_64 */ + if (timer_ack) { + /* + * Subtle, when I/O APICs are used we have to ack timer IRQ + * manually to deassert NMI lines for the watchdog if run + * on an 82489DX-based system. + */ + spin_lock(&i8259A_lock); + outb(0x0c, PIC_MASTER_OCW3); + /* Ack the IRQ; AEOI will end it automatically. */ + inb(PIC_MASTER_POLL); + spin_unlock(&i8259A_lock); + } + global_clock_event->event_handler(global_clock_event); #ifdef CONFIG_MCA -- cgit v1.2.3 From 0be6939422eb2f54df4b3d8763c569c6759c1a42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 15:59:35 +0200 Subject: x86: Remove mca bus ifdef from timer interrupt MCA_bus is constant 0 when CONFIG_MCA=n. 
So the compiler removes that code w/o needing an extra #ifdef Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time_32.c | 18 +++--------------- arch/x86/kernel/time_64.c | 9 +++------ 2 files changed, 6 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 7a26bcf887f..ec729cdcfa3 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -78,21 +78,9 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) global_clock_event->event_handler(global_clock_event); -#ifdef CONFIG_MCA - if (MCA_bus) { - /* The PS/2 uses level-triggered interrupts. You can't - turn them off, nor would you want to (any attempt to - enable edge-triggered interrupts usually gets intercepted by a - special hardware circuit). Hence we have to acknowledge - the timer interrupt. Through some incredibly stupid - design idea, the reset for IRQ 0 is done by setting the - high bit of the PPI port B (0x61). Note that some PS/2s, - notably the 55SX, work fine if this is removed. */ - - u8 irq_v = inb_p(0x61); /* read the current state */ - outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */ - } -#endif + /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ + if (MCA_bus) + outb_p(inb_p(0x61)| 0x80, 0x61); return IRQ_HANDLED; } diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 35e0a925da5..7db3912b869 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -67,12 +67,9 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) global_clock_event->event_handler(global_clock_event); -#ifdef CONFIG_MCA - if (MCA_bus) { - u8 irq_v = inb_p(0x61); /* read the current state */ - outb_p(irq_v|0x80, 0x61); /* reset the IRQ */ - } -#endif + /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ + if (MCA_bus) + outb_p(inb_p(0x61)| 0x80, 0x61); return IRQ_HANDLED; } -- cgit v1.2.3 From 454ede7eebf91b92ab1eafe10c6b6ed04de29bf8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 16:07:40 +0200 Subject: x86: Make timer setup and global variables the same in time_32/64.c The timer and timer irq setup code is identical in 32 and 64 bit. Make it the same formatting as well. Also add the global variables under the necessary ifdefs to both files. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time_32.c | 8 +++++--- arch/x86/kernel/time_64.c | 38 ++++++++++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index ec729cdcfa3..186abc577b2 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -27,6 +27,10 @@ int timer_ack; #endif +#ifdef CONFIG_X86_64 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +#endif + unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -53,9 +57,7 @@ unsigned long profile_pc(struct pt_regs *regs) EXPORT_SYMBOL(profile_pc); /* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. 
+ * Default timer interrupt handler for PIT/HPET */ static irqreturn_t timer_interrupt(int irq, void *dev_id) { diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 7db3912b869..78cbdf5c006 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -23,7 +23,13 @@ #include #include +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) +int timer_ack; +#endif + +#ifdef CONFIG_X86_64 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +#endif unsigned long profile_pc(struct pt_regs *regs) { @@ -47,8 +53,12 @@ unsigned long profile_pc(struct pt_regs *regs) } EXPORT_SYMBOL(profile_pc); +/* + * Default timer interrupt handler for PIT/HPET + */ static irqreturn_t timer_interrupt(int irq, void *dev_id) { + /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); /* Optimized out for !IO_APIC and x86_64 */ @@ -74,8 +84,10 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency */ +/* + * calibrate_cpu is used on systems with fixed rate TSCs to determine + * processor frequency + */ #define TICK_COUNT 100000000 unsigned long __init calibrate_cpu(void) { @@ -122,18 +134,24 @@ unsigned long __init calibrate_cpu(void) return pmc_now * tsc_khz / (tsc_now - tsc_start); } -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER, - .name = "timer" +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" }; +void __init setup_default_timer_irq(void) +{ + irq0.mask = cpumask_of_cpu(0); + setup_irq(0, &irq0); +} + +/* Default timer init function */ void __init hpet_time_init(void) { if (!hpet_enable()) setup_pit_timer(); - - setup_irq(0, &irq0); + setup_default_timer_irq(); } static void x86_late_time_init(void) @@ -141,6 +159,10 @@ static void x86_late_time_init(void) x86_init.timers.timer_init(); } +/* + * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. + */ void __init time_init(void) { tsc_init(); -- cgit v1.2.3 From 08047c4f1740c7cee75d58e2919d48c09f951649 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 16:27:41 +0200 Subject: x86: Move calibrate_cpu to tsc.c Move the code where its only user is. Also we need to look whether this hardwired hackery might interfere with perfcounters.
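[ Editor's sketch, not part of the patch: calibrate_cpu() derives the core frequency from two counts taken over the same interval - unhalted core cycles (AMD event 0x76) read from a perfctr and TSC ticks from rdtscl() - as pmc_delta * tsc_khz / tsc_delta. A self-contained illustration of that arithmetic with made-up sample counts: ]

#include <stdio.h>

int main(void)
{
	/* made-up sample counts; the kernel reads these via RDMSR/RDTSC */
	unsigned long long tsc_khz   = 2200000ULL;	/* assumed TSC rate */
	unsigned long long tsc_delta = 100000000ULL;	/* TICK_COUNT TSC ticks */
	unsigned long long pmc_delta = 99875000ULL;	/* core cycles meanwhile */

	/* same ratio as calibrate_cpu(): core cycles per TSC tick, scaled */
	printf("cpu_khz ~= %llu\n", pmc_delta * tsc_khz / tsc_delta);
	return 0;
}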
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/time.h | 2 -- arch/x86/kernel/time_32.c | 1 - arch/x86/kernel/time_64.c | 51 ---------------------------------------- arch/x86/kernel/tsc.c | 57 +++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 55 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index 91bb162b5a3..9c5608b21c2 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -57,6 +57,4 @@ extern void time_init(void); #endif /* CONFIG_PARAVIRT */ -extern unsigned long __init calibrate_cpu(void); - #endif /* _ASM_X86_TIME_H */ diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 186abc577b2..fd876cc7748 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -21,7 +21,6 @@ #include #include #include -#include #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) int timer_ack; diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 78cbdf5c006..e59a40ebff1 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -21,7 +21,6 @@ #include #include #include -#include #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) int timer_ack; @@ -84,56 +83,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* - * calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency - */ -#define TICK_COUNT 100000000 -unsigned long __init calibrate_cpu(void) -{ - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... 
" - "cpu_khz value may be incorrect.\n"); - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start measuring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); -} - static struct irqaction irq0 = { .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 652bc214eeb..97a0bcbad10 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -17,6 +17,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -852,6 +853,60 @@ static void __init init_tsc_clocksource(void) clocksource_register(&clocksource_tsc); } +#ifdef CONFIG_X86_64 +/* + * calibrate_cpu is used on systems with fixed rate TSCs to determine + * processor frequency + */ +#define TICK_COUNT 100000000 +static unsigned long __init calibrate_cpu(void) +{ + int tsc_start, tsc_now; + int i, no_ctr_free; + unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; + unsigned long flags; + + for (i = 0; i < 4; i++) + if (avail_to_resrv_perfctr_nmi_bit(i)) + break; + no_ctr_free = (i == 4); + if (no_ctr_free) { + WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " + "cpu_khz value may be incorrect.\n"); + i = 3; + rdmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + rdmsrl(MSR_K7_PERFCTR3, pmc3); + } else { + reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); + reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + local_irq_save(flags); + /* start measuring cycles, incrementing from 0 */ + wrmsrl(MSR_K7_PERFCTR0 + i, 0); + wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); + rdtscl(tsc_start); + do { + rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); + tsc_now = get_cycles(); + } while ((tsc_now - tsc_start) < TICK_COUNT); + + local_irq_restore(flags); + if (no_ctr_free) { + wrmsrl(MSR_K7_EVNTSEL3, 0); + wrmsrl(MSR_K7_PERFCTR3, pmc3); + wrmsrl(MSR_K7_EVNTSEL3, evntsel3); + } else { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + + return pmc_now * tsc_khz / (tsc_now - tsc_start); +} +#else +static inline unsigned long calibrate_cpu(void) { return cpu_khz; } +#endif + void __init tsc_init(void) { u64 lpj; @@ -870,11 +925,9 @@ void __init tsc_init(void) return; } -#ifdef CONFIG_X86_64 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) cpu_khz = calibrate_cpu(); -#endif printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, -- cgit v1.2.3 From ef4512882dbe9978e7a18ccbcb4cb45705ce5560 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 21 Aug 2009 13:24:08 +0200 Subject: x86: time_32/64.c unify profile_pc The code is identical except for the formatting and a useless #ifdef. Make it the same. 
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time_32.c | 13 ++++++------- arch/x86/kernel/time_64.c | 8 +++++--- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index fd876cc7748..fda0c34da75 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -34,23 +34,22 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); -#ifdef CONFIG_SMP if (!user_mode_vm(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else - unsigned long *sp = (unsigned long *)®s->sp; - - /* Return address is either directly at stack pointer - or above a saved flags. Eflags has bits 22-31 zero, - kernel addresses don't. */ + unsigned long *sp = (unsigned long *)regs->sp; + /* + * Return address is either directly at stack pointer + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't. + */ if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) return sp[1]; #endif } -#endif return pc; } EXPORT_SYMBOL(profile_pc); diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index e59a40ebff1..fda0c34da75 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -34,14 +34,16 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - /* Assume the lock function has either no stack frame or a copy - of flags from PUSHF - Eflags always has bits 22 and up cleared unlike kernel addresses. */ if (!user_mode_vm(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else unsigned long *sp = (unsigned long *)regs->sp; + /* + * Return address is either directly at stack pointer + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't. + */ if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) -- cgit v1.2.3 From 47926214d8b2bef13b2be57c500194a804f16198 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 16:47:19 +0200 Subject: x86: Replace the now identical time_32/64.c by time.c Remove the redundant copy. 
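[ Editor's note, not part of the patch: the stack heuristic kept in the unified profile_pc() below relies on a saved EFLAGS word always having bits 22-31 clear, while kernel text addresses never do. A minimal standalone check of that invariant: ]

#include <stdio.h>

int main(void)
{
	unsigned long eflags = 0x00000246UL;	/* a typical saved EFLAGS */
	unsigned long kaddr  = 0xc01234f0UL;	/* a 32-bit kernel text address */

	/* sp[n] >> 22: zero for EFLAGS, non-zero for a return address */
	printf("eflags >> 22 = %lu, kaddr >> 22 = %lu\n",
	       eflags >> 22, kaddr >> 22);
	return 0;
}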
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/time.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/time_32.c | 121 ---------------------------------------------- arch/x86/kernel/time_64.c | 121 ---------------------------------------------- 4 files changed, 122 insertions(+), 243 deletions(-) create mode 100644 arch/x86/kernel/time.c delete mode 100644 arch/x86/kernel/time_32.c delete mode 100644 arch/x86/kernel/time_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 313ed6fca9b..ccf3db607c2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -31,7 +31,7 @@ GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o +obj-y += time.o ioport.o ldt.o dumpstack.o obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c new file mode 100644 index 00000000000..fda0c34da75 --- /dev/null +++ b/arch/x86/kernel/time.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 1991,1992,1995 Linus Torvalds + * Copyright (c) 1994 Alan Modra + * Copyright (c) 1995 Markus Kuhn + * Copyright (c) 1996 Ingo Molnar + * Copyright (c) 1998 Andrea Arcangeli + * Copyright (c) 2002,2006 Vojtech Pavlik + * Copyright (c) 2003 Andi Kleen + * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) +int timer_ack; +#endif + +#ifdef CONFIG_X86_64 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +#endif + +unsigned long profile_pc(struct pt_regs *regs) +{ + unsigned long pc = instruction_pointer(regs); + + if (!user_mode_vm(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER + return *(unsigned long *)(regs->bp + sizeof(long)); +#else + unsigned long *sp = (unsigned long *)regs->sp; + /* + * Return address is either directly at stack pointer + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't. + */ + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; +#endif + } + return pc; +} +EXPORT_SYMBOL(profile_pc); + +/* + * Default timer interrupt handler for PIT/HPET + */ +static irqreturn_t timer_interrupt(int irq, void *dev_id) +{ + /* Keep nmi watchdog up to date */ + inc_irq_stat(irq0_irqs); + + /* Optimized out for !IO_APIC and x86_64 */ + if (timer_ack) { + /* + * Subtle, when I/O APICs are used we have to ack timer IRQ + * manually to deassert NMI lines for the watchdog if run + * on an 82489DX-based system. + */ + spin_lock(&i8259A_lock); + outb(0x0c, PIC_MASTER_OCW3); + /* Ack the IRQ; AEOI will end it automatically. 
*/ + inb(PIC_MASTER_POLL); + spin_unlock(&i8259A_lock); + } + + global_clock_event->event_handler(global_clock_event); + + /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ + if (MCA_bus) + outb_p(inb_p(0x61)| 0x80, 0x61); + + return IRQ_HANDLED; +} + +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" +}; + +void __init setup_default_timer_irq(void) +{ + irq0.mask = cpumask_of_cpu(0); + setup_irq(0, &irq0); +} + +/* Default timer init function */ +void __init hpet_time_init(void) +{ + if (!hpet_enable()) + setup_pit_timer(); + setup_default_timer_irq(); +} + +static void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); +} + +/* + * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. + */ +void __init time_init(void) +{ + tsc_init(); + late_time_init = x86_late_time_init; +} diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c deleted file mode 100644 index fda0c34da75..00000000000 --- a/arch/x86/kernel/time_32.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 1991,1992,1995 Linus Torvalds - * Copyright (c) 1994 Alan Modra - * Copyright (c) 1995 Markus Kuhn - * Copyright (c) 1996 Ingo Molnar - * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002,2006 Vojtech Pavlik - * Copyright (c) 2003 Andi Kleen - * - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) -int timer_ack; -#endif - -#ifdef CONFIG_X86_64 -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -#endif - -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - - if (!user_mode_vm(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long *sp = (unsigned long *)regs->sp; - /* - * Return address is either directly at stack pointer - * or above a saved flags. Eflags has bits 22-31 zero, - * kernel addresses don't. - */ - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } - return pc; -} -EXPORT_SYMBOL(profile_pc); - -/* - * Default timer interrupt handler for PIT/HPET - */ -static irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - /* Keep nmi watchdog up to date */ - inc_irq_stat(irq0_irqs); - - /* Optimized out for !IO_APIC and x86_64 */ - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. 
*/ - inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); - } - - global_clock_event->event_handler(global_clock_event); - - /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ - if (MCA_bus) - outb_p(inb_p(0x61)| 0x80, 0x61); - - return IRQ_HANDLED; -} - -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" -}; - -void __init setup_default_timer_irq(void) -{ - irq0.mask = cpumask_of_cpu(0); - setup_irq(0, &irq0); -} - -/* Default timer init function */ -void __init hpet_time_init(void) -{ - if (!hpet_enable()) - setup_pit_timer(); - setup_default_timer_irq(); -} - -static void x86_late_time_init(void) -{ - x86_init.timers.timer_init(); -} - -/* - * Initialize TSC and delay the periodic timer init to - * late x86_late_time_init() so ioremap works. - */ -void __init time_init(void) -{ - tsc_init(); - late_time_init = x86_late_time_init; -} diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c deleted file mode 100644 index fda0c34da75..00000000000 --- a/arch/x86/kernel/time_64.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 1991,1992,1995 Linus Torvalds - * Copyright (c) 1994 Alan Modra - * Copyright (c) 1995 Markus Kuhn - * Copyright (c) 1996 Ingo Molnar - * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002,2006 Vojtech Pavlik - * Copyright (c) 2003 Andi Kleen - * - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) -int timer_ack; -#endif - -#ifdef CONFIG_X86_64 -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -#endif - -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - - if (!user_mode_vm(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long *sp = (unsigned long *)regs->sp; - /* - * Return address is either directly at stack pointer - * or above a saved flags. Eflags has bits 22-31 zero, - * kernel addresses don't. - */ - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } - return pc; -} -EXPORT_SYMBOL(profile_pc); - -/* - * Default timer interrupt handler for PIT/HPET - */ -static irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - /* Keep nmi watchdog up to date */ - inc_irq_stat(irq0_irqs); - - /* Optimized out for !IO_APIC and x86_64 */ - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. 
*/ - inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); - } - - global_clock_event->event_handler(global_clock_event); - - /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ - if (MCA_bus) - outb_p(inb_p(0x61)| 0x80, 0x61); - - return IRQ_HANDLED; -} - -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" -}; - -void __init setup_default_timer_irq(void) -{ - irq0.mask = cpumask_of_cpu(0); - setup_irq(0, &irq0); -} - -/* Default timer init function */ -void __init hpet_time_init(void) -{ - if (!hpet_enable()) - setup_pit_timer(); - setup_default_timer_irq(); -} - -static void x86_late_time_init(void) -{ - x86_init.timers.timer_init(); -} - -/* - * Initialize TSC and delay the periodic timer init to - * late x86_late_time_init() so ioremap works. - */ -void __init time_init(void) -{ - tsc_init(); - late_time_init = x86_late_time_init; -} -- cgit v1.2.3 From 2d826404f0bdcac2a4dd7e3c446b70d6a3b63b78 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 17:06:25 +0200 Subject: x86: Move tsc_calibration to x86_init_ops TSC calibration is modified by the vmware hypervisor and paravirt by separate means. Moorestown wants to add its own calibration routine as well. So make calibrate_tsc a proper x86_init_ops function and override it by paravirt or by the early setup of the vmware hypervisor. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hypervisor.h | 2 +- arch/x86/include/asm/paravirt.h | 1 - arch/x86/include/asm/timer.h | 5 ----- arch/x86/include/asm/tsc.h | 3 ++- arch/x86/include/asm/vmware.h | 2 +- arch/x86/include/asm/x86_init.h | 9 +++++++++ arch/x86/kernel/cpu/hypervisor.c | 14 +++++++------- arch/x86/kernel/cpu/vmware.c | 21 ++++++++++++--------- arch/x86/kernel/kvmclock.c | 2 +- arch/x86/kernel/paravirt.c | 1 - arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/tsc.c | 13 ++++--------- arch/x86/kernel/vmi_32.c | 2 +- arch/x86/kernel/vmiclock_32.c | 2 +- arch/x86/kernel/x86_init.c | 5 +++++ arch/x86/lguest/boot.c | 2 +- arch/x86/xen/enlighten.c | 3 ++- 17 files changed, 48 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 369f5c5d09a..b78c0941e42 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -20,7 +20,7 @@ #ifndef ASM_X86__HYPERVISOR_H #define ASM_X86__HYPERVISOR_H -extern unsigned long get_hypervisor_tsc_freq(void); extern void init_hypervisor(struct cpuinfo_x86 *c); +extern void init_hypervisor_platform(void); #endif diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 11a4ba7b209..1e458a55330 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -210,7 +210,6 @@ static inline unsigned long long paravirt_sched_clock(void) { return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } -#define calibrate_tsc() (pv_time_ops.get_tsc_khz()) static inline unsigned long long paravirt_read_pmc(int counter) { diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 65228ccc5f0..5469630b27f 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -8,7 +8,6 @@ #define TICK_SIZE (tick_nsec / 1000) unsigned long long native_sched_clock(void); -unsigned long native_calibrate_tsc(void); extern int recalibrate_cpu_khz(void); #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) @@ -19,10 +18,6 @@ extern int 
timer_ack; extern int no_timer_check; -#ifndef CONFIG_PARAVIRT -#define calibrate_tsc() native_calibrate_tsc() -#endif - /* Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 38ae163cc91..c0427295e8f 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -48,7 +48,8 @@ static __always_inline cycles_t vget_cycles(void) extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); -int check_tsc_unstable(void); +extern int check_tsc_unstable(void); +extern unsigned long native_calibrate_tsc(void); /* * Boot-time check whether the TSCs are synchronized across diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h index c11b7e100d8..e49ed6d2fd4 100644 --- a/arch/x86/include/asm/vmware.h +++ b/arch/x86/include/asm/vmware.h @@ -20,7 +20,7 @@ #ifndef ASM_X86__VMWARE_H #define ASM_X86__VMWARE_H -extern unsigned long vmware_get_tsc_khz(void); +extern void vmware_platform_setup(void); extern int vmware_platform(void); extern void vmware_set_feature_bits(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f8bdd2271a0..20df5187171 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -112,8 +112,17 @@ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); }; +/** + * struct x86_platform_ops - platform specific runtime functions + * @calibrate_tsc: calibrate TSC + */ +struct x86_platform_ops { + unsigned long (*calibrate_tsc)(void); +}; + extern struct x86_init_ops x86_init; extern struct x86_cpuinit_ops x86_cpuinit; +extern struct x86_platform_ops x86_platform; extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 93ba8eeb100..08be922de33 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,13 +34,6 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c) c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; } -unsigned long get_hypervisor_tsc_freq(void) -{ - if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) - return vmware_get_tsc_khz(); - return 0; -} - static inline void __cpuinit hypervisor_set_feature_bits(struct cpuinfo_x86 *c) { @@ -55,3 +48,10 @@ void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) detect_hypervisor_vendor(c); hypervisor_set_feature_bits(c); } + +void __init init_hypervisor_platform(void) +{ + init_hypervisor(&boot_cpu_data); + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) + vmware_platform_setup(); +} diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index bc24f514ec9..0a46b4df5d8 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,6 +24,7 @@ #include #include #include +#include #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -47,21 +48,29 @@ static inline int __vmware_platform(void) return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; } -static unsigned long __vmware_get_tsc_khz(void) +static unsigned long vmware_get_tsc_khz(void) { uint64_t tsc_hz; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - if (ebx == UINT_MAX) - return 0; tsc_hz = eax | (((uint64_t)ebx) << 32); do_div(tsc_hz, 1000); BUG_ON(tsc_hz >> 32); return tsc_hz; } +void __init vmware_platform_setup(void) +{ + 
uint32_t eax, ebx, ecx, edx; + + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + + if (ebx != UINT_MAX) + x86_platform.calibrate_tsc = vmware_get_tsc_khz; +} + /* * While checking the dmi string infomation, just checking the product * serial key should be enough, as this will always have a VMware @@ -87,12 +96,6 @@ int vmware_platform(void) return 0; } -unsigned long vmware_get_tsc_khz(void) -{ - BUG_ON(!vmware_platform()); - return __vmware_get_tsc_khz(); -} - /* * VMware hypervisor takes care of exporting a reliable TSC to the guest. * Still, due to timing difference when running on virtual cpus, the TSC can diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 64e9b5f59d2..75a21b61b86 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -187,7 +187,7 @@ void __init kvmclock_init(void) pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; - pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; + x86_platform.calibrate_tsc = kvm_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC x86_cpuinit.setup_percpu_clockev = kvm_setup_secondary_clock; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9c0e644a76d..7cbf898d839 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -309,7 +309,6 @@ struct pv_time_ops pv_time_ops = { .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, .sched_clock = native_sched_clock, - .get_tsc_khz = native_calibrate_tsc, }; struct pv_irq_ops pv_irq_ops = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bb207a47c63..2d93026af7c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -818,7 +818,7 @@ void __init setup_arch(char **cmdline_p) * VMware detection requires dmi to be available, so this * needs to be done after dmi_scan_machine, for the BP. 
*/ - init_hypervisor(&boot_cpu_data); + init_hypervisor_platform(); x86_init.resources.probe_roms(); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 97a0bcbad10..9917632a8b4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -18,6 +18,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -401,15 +402,9 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; + unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; - hv_tsc_khz = get_hypervisor_tsc_freq(); - if (hv_tsc_khz) { - printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return hv_tsc_khz; - } - local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); @@ -567,7 +562,7 @@ int recalibrate_cpu_khz(void) unsigned long cpu_khz_old = cpu_khz; if (cpu_has_tsc) { - tsc_khz = calibrate_tsc(); + tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, @@ -917,7 +912,7 @@ void __init tsc_init(void) if (!cpu_has_tsc) return; - tsc_khz = calibrate_tsc(); + tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; if (!tsc_khz) { diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index cd7d0fbbf66..052ae81ee08 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -825,7 +825,7 @@ static inline int __init activate_vmi(void) x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init; #endif pv_time_ops.sched_clock = vmi_sched_clock; - pv_time_ops.get_tsc_khz = vmi_tsc_khz; + x86_platform.calibrate_tsc = vmi_tsc_khz; /* We have true wallclock functions; disable CMOS clock sync */ no_sync_cmos_clock = 1; diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 2b3eb82efee..611b9e2360d 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -68,7 +68,7 @@ unsigned long long vmi_sched_clock(void) return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } -/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ +/* x86_platform.calibrate_tsc = vmi_tsc_khz */ unsigned long vmi_tsc_khz(void) { unsigned long long khz; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 4790b92714a..13081b92191 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -13,6 +13,7 @@ #include #include #include +#include void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -67,3 +68,7 @@ struct __initdata x86_init_ops x86_init = { __cpuinitdata struct x86_cpuinit_ops x86_cpuinit = { .setup_percpu_clockev = setup_secondary_APIC_clock, }; + +struct x86_platform_ops x86_platform = { + .calibrate_tsc = native_calibrate_tsc, +}; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 6caa8c0c793..fabe745513d 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1320,11 +1320,11 @@ __init void lguest_init(void) /* Time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; - pv_time_ops.get_tsc_khz = lguest_tsc_khz; x86_init.resources.memory_setup = lguest_memory_setup; x86_init.irqs.intr_init = lguest_init_IRQ; x86_init.timers.timer_init = lguest_time_init; + x86_platform.calibrate_tsc = lguest_tsc_khz; /* * Now is a good time to look at the implementations of 
these functions diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 84826b842b5..ee8cac77c8a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -844,7 +844,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { static const struct pv_time_ops xen_time_ops __initdata = { .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, - .get_tsc_khz = xen_tsc_khz, .sched_clock = xen_sched_clock, }; @@ -980,6 +979,8 @@ asmlinkage void __init xen_start_kernel(void) x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_cpuinit.setup_percpu_clockev = x86_init_noop; + x86_platform.calibrate_tsc = xen_tsc_khz; + #ifdef CONFIG_X86_64 /* * Setup percpu state. We only need to do this for 64-bit -- cgit v1.2.3 From dd0a70c8f921708eba29ef9f30dde1f14a74af05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 16:51:07 +0200 Subject: x86: Move tsc_init to late_time_init We do not need the TSC before late_time_init. Move the tsc_init to the late time init code so we can also utilize HPET for calibration (which we claimed to do but never did except in some older kernel version). This also helps Moorestown to calibrate the TSC with the AHBT timer which needs to be initialized in late_time_init like HPET. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index fda0c34da75..fcece00356a 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -108,6 +108,7 @@ void __init hpet_time_init(void) static void x86_late_time_init(void) { x86_init.timers.timer_init(); + tsc_init(); } /* @@ -116,6 +117,5 @@ static void x86_late_time_init(void) */ void __init time_init(void) { - tsc_init(); late_time_init = x86_late_time_init; } -- cgit v1.2.3 From 47a3d5da70f411bc044ecd3c0593b158b09d0efa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 15:03:59 +0200 Subject: x86: Add early platform detection Platforms like Moorestown require early setup and want to avoid the call to reserve_ebda_region. The x86_init override is too late when the MRST detection happens in setup_arch. Move the default i386 x86_init overrides and the call to reserve_ebda_region into a separate function which is called as the default of a switch case depending on the hardware_subarch id in boot params. This allows us to add a case for MRST and let MRST have its own early setup function. 
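[ Editor's sketch, not part of the patch: the dispatch pattern described above, reduced to a standalone program. The real switch lives in i386_start_kernel() in the diff below; the X86_SUBARCH_* names and the MRST case are wired up by later patches in this series. ]

#include <stdio.h>

enum {
	X86_SUBARCH_PC = 0,
	X86_SUBARCH_LGUEST,
	X86_SUBARCH_XEN,
	X86_SUBARCH_MRST,
};

static void i386_default_early_setup(void)
{
	puts("default: install i386 x86_init overrides, reserve EBDA region");
}

static void x86_mrst_early_setup(void)
{
	puts("mrst: no EBDA to reserve, stub out ROM/resource setup");
}

/* boot_params.hdr.hardware_subarch selects the early setup path */
static void early_setup(unsigned int hardware_subarch)
{
	switch (hardware_subarch) {
	case X86_SUBARCH_MRST:
		x86_mrst_early_setup();
		break;
	default:
		i386_default_early_setup();
		break;
	}
}

int main(void)
{
	early_setup(X86_SUBARCH_PC);
	early_setup(X86_SUBARCH_MRST);
	return 0;
}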
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/x86_init.h | 3 +-- arch/x86/kernel/head32.c | 22 +++++++++++++++++----- arch/x86/kernel/head64.c | 3 ++- arch/x86/kernel/x86_init.c | 1 - 4 files changed, 20 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 20df5187171..b6c89428137 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -2,6 +2,7 @@ #define _ASM_X86_PLATFORM_H #include +#include struct mpc_bus; struct mpc_cpu; @@ -34,14 +35,12 @@ struct x86_init_mpparse { * @probe_roms: probe BIOS roms * @reserve_resources: reserve the standard resources for the * platform - * @reserve_ebda_region: reserve the extended bios data area * @memory_setup: platform specific memory setup * */ struct x86_init_resources { void (*probe_roms)(void); void (*reserve_resources)(void); - void (*reserve_ebda_region)(void); char *(*memory_setup)(void); }; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index a21398fac4f..441c075e2b8 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -15,6 +15,17 @@ #include #include #include +#include + +static void __init i386_default_early_setup(void) +{ + /* Initilize 32bit specific setup functions */ + x86_init.resources.probe_roms = probe_roms; + x86_init.resources.reserve_resources = i386_reserve_resources; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; + + reserve_ebda_region(); +} void __init i386_start_kernel(void) { @@ -31,12 +42,13 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif - /* Initilize 32bit specific setup functions */ - x86_init.resources.probe_roms = probe_roms; - x86_init.resources.reserve_resources = i386_reserve_resources; - x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; - x86_init.resources.reserve_ebda_region(); + /* Call the subarch specific early setup function */ + switch (boot_params.hdr.hardware_subarch) { + default: + i386_default_early_setup(); + break; + } /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index cead8149c3d..0b06cd778fd 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -24,6 +24,7 @@ #include #include #include +#include static void __init zap_identity_mappings(void) { @@ -111,7 +112,7 @@ void __init x86_64_start_reservations(char *real_mode_data) } #endif - x86_init.resources.reserve_ebda_region(); + reserve_ebda_region(); /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 13081b92191..24be7f39789 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -28,7 +28,6 @@ struct __initdata x86_init_ops x86_init = { .resources = { .probe_roms = x86_init_noop, .reserve_resources = reserve_standard_io_resources, - .reserve_ebda_region = reserve_ebda_region, .memory_setup = default_machine_specific_memory_setup, }, -- cgit v1.2.3 From 162bc7ab01a00eba1c5d614e64a51e1268ee3f96 Mon Sep 17 00:00:00 2001 From: "Pan, Jacob jun" Date: Fri, 28 Aug 2009 14:52:47 -0700 Subject: x86: Add hardware_subarch ID for Moorestown x86 bootprotocol 2.07 has introduced hardware_subarch ID in the boot parameters provided by FW. We use it to identify Moorestown platforms. 
[ tglx: Cleanup and paravirt fix ] Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner --- Documentation/x86/boot.txt | 1 + arch/x86/include/asm/bootparam.h | 10 ++++++++++ arch/x86/kernel/head_32.S | 1 + 3 files changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 8da3a795083..30b43e1b269 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -599,6 +599,7 @@ Protocol: 2.07+ 0x00000000 The default x86/PC environment 0x00000001 lguest 0x00000002 Xen + 0x00000003 Moorestown MID Field name: hardware_subarch_data Type: write (subarch-dependent) diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 1724e8de317..283a9a1b3ef 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -109,4 +109,14 @@ struct boot_params { __u8 _pad9[276]; /* 0xeec */ } __attribute__((packed)); +enum { + X86_SUBARCH_PC = 0, + X86_SUBARCH_LGUEST, + X86_SUBARCH_XEN, + X86_SUBARCH_MRST, + X86_NR_SUBARCHS, +}; + + + #endif /* _ASM_X86_BOOTPARAM_H */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index cc827ac9e8d..304e3f3d747 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -157,6 +157,7 @@ subarch_entries: .long default_entry /* normal x86/PC */ .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ + .long default_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 .previous #endif /* CONFIG_PARAVIRT */ -- cgit v1.2.3 From 3f4110a48a749a1aa1c54fb807afb3f32f49711c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 14:54:20 +0200 Subject: x86: Add Moorestown early detection Moorestown MID devices need to be detected early in the boot process to setup and do not call x86_default_early_setup as there is no EBDA region to reserve. [ Copied the minimal code from Jacobs latest MRST series ] Signed-off-by: Thomas Gleixner Cc: Jacob Pan --- arch/x86/Kconfig | 13 +++++++++++++ arch/x86/include/asm/setup.h | 6 ++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/head32.c | 3 +++ arch/x86/kernel/mrst.c | 24 ++++++++++++++++++++++++ 5 files changed, 47 insertions(+) create mode 100644 arch/x86/kernel/mrst.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 13ffa5df37d..586d84557f7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -318,6 +318,7 @@ config X86_EXTENDED_PLATFORM SGI 320/540 (Visual Workstation) Summit/EXA (IBM x440) Unisys ES7000 IA32 series + Moorestown MID devices If you have one of these systems, or if you want to build a generic distribution kernel, say Y here - otherwise say N. @@ -377,6 +378,18 @@ config X86_ELAN If unsure, choose "PC-compatible" instead. +config X86_MRST + bool "Moorestown MID platform" + depends on X86_32 + depends on X86_EXTENDED_PLATFORM + ---help--- + Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin + Internet Device(MID) platform. Moorestown consists of two chips: + Lincroft (CPU core, graphics, and memory controller) and Langwell IOH. + Unlike standard x86 PCs, Moorestown does not have many legacy devices + nor standard legacy replacement devices/features. e.g. Moorestown does + not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. 
+ config X86_RDC321X bool "RDC R-321x SoC" depends on X86_32 diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 861e1fe2303..18e496c98ff 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -49,6 +49,12 @@ extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern void setup_default_timer_irq(void); +#ifdef CONFIG_X86_MRST +extern void x86_mrst_early_setup(void); +#else +static inline void x86_mrst_early_setup(void) { } +#endif + #ifndef _SETUP /* diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index ccf3db607c2..5f33316610d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_X86_MRST) += mrst.o microcode-y := microcode_core.o microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 441c075e2b8..4f8e2507e8f 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -45,6 +45,9 @@ void __init i386_start_kernel(void) /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_MRST: + x86_mrst_early_setup(); + break; default: i386_default_early_setup(); break; diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c new file mode 100644 index 00000000000..3b7078abc87 --- /dev/null +++ b/arch/x86/kernel/mrst.c @@ -0,0 +1,24 @@ +/* + * mrst.c: Intel Moorestown platform specific setup code + * + * (C) Copyright 2008 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include + +#include + +/* + * Moorestown specific x86_init function overrides and early setup + * calls. + */ +void __init x86_mrst_early_setup(void) +{ + x86_init.resources.probe_roms = x86_init_noop; + x86_init.resources.reserve_resources = x86_init_noop; +} -- cgit v1.2.3 From bc07844a33734c4b2f32ef26d942d2f3ef9302ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 18:09:57 +0200 Subject: x86: Disentangle ioapic and i8259 The proposed Moorestown support patches use an extra feature flag mechanism to make the ioapic work w/o an i8259. There is a much simpler solution. Most i8259 specific functions are already called depending on the irq number being less than NR_IRQS_LEGACY. Replacing that constant by a read_mostly variable which can be set to 0 by the platform setup code allows us to achieve the same without any special feature flags. That trivial change allows us to proceed with MRST w/o doing a full blown overhaul of the ioapic code which would delay MRST unduly.
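[ Editor's sketch, not part of the patch: the conversion replaces the compile-time bound NR_IRQS_LEGACY with a runtime variable that platform setup code can zero, so every "irq < bound" check skips the i8259 paths automatically. Reduced to a standalone program: ]

#include <stdio.h>

#define NR_IRQS_LEGACY 16

static int nr_legacy_irqs = NR_IRQS_LEGACY;	/* __read_mostly in the kernel */

static void io_apic_disable_legacy(void)
{
	nr_legacy_irqs = 0;	/* platform without an i8259, e.g. Moorestown */
}

static void setup_one_irq(int irq)
{
	if (irq < nr_legacy_irqs)
		printf("irq %d: legacy 8259 handling\n", irq);
	else
		printf("irq %d: pure IO-APIC handling\n", irq);
}

int main(void)
{
	setup_one_irq(5);		/* legacy path on a standard PC */
	io_apic_disable_legacy();
	setup_one_irq(5);		/* IO-APIC path after the override */
	return 0;
}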
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 2 ++ arch/x86/kernel/apic/io_apic.c | 41 +++++++++++++++++++++++++++++------------ 2 files changed, 31 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 2b8aeb89933..e1f89a1a07e 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -143,6 +143,8 @@ extern int noioapicreroute; /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ extern int timer_through_8259; +extern void io_apic_disable_legacy(void); + /* * If we use the IO-APIC for IRQ routing, disable automatic * assignment of PCI IRQ's. diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5f4687187ce..6c961290a5f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -91,6 +91,11 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +/* Number of legacy interrupts */ +static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; +/* GSI interrupts */ +static int nr_irqs_gsi = NR_IRQS_LEGACY; + #if defined (CONFIG_MCA) || defined (CONFIG_EISA) int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -172,6 +177,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = { [15] = { .vector = IRQ15_VECTOR, }, }; +void __init io_apic_disable_legacy(void) +{ + nr_legacy_irqs = 0; + nr_irqs_gsi = 0; +} + int __init arch_early_irq_init(void) { struct irq_cfg *cfg; @@ -189,7 +200,7 @@ int __init arch_early_irq_init(void) desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < NR_IRQS_LEGACY) + if (i < nr_legacy_irqs) cpumask_setall(cfg[i].domain); } @@ -883,7 +894,7 @@ static int __init find_isa_irq_apic(int irq, int type) */ static int EISA_ELCR(unsigned int irq) { - if (irq < NR_IRQS_LEGACY) { + if (irq < nr_legacy_irqs) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1480,7 +1491,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq } ioapic_register_intr(irq, desc, trigger); - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs) disable_8259A_irq(irq); ioapic_write_entry(apic_id, pin, entry); @@ -1851,7 +1862,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (apic_verbosity == APIC_QUIET) + if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1914,6 +1925,10 @@ void __init enable_IO_APIC(void) spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } + + if (!nr_legacy_irqs) + return; + for(apic = 0; apic < nr_ioapics; apic++) { int pin; /* See if any of the pins is in ExtINT mode */ @@ -1968,6 +1983,9 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); + if (!nr_legacy_irqs) + return; + /* * If the i8259 is routed through an IOAPIC * Put that IOAPIC in virtual wire mode @@ -2198,7 +2216,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < NR_IRQS_LEGACY) { + if (irq < nr_legacy_irqs) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) was_pending = 1; @@ -2709,7 +2727,7 @@ static inline void init_IO_APIC_traps(void) * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs) make_8259A_irq(irq); else /* Strange. Oh, well.. 
*/ @@ -3045,7 +3063,7 @@ out: * the I/O APIC in all cases now. No actual device should request * it anyway. --macro */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) +#define PIC_IRQS (1UL << PIC_CASCADE_IR) void __init setup_IO_APIC(void) { @@ -3053,8 +3071,7 @@ void __init setup_IO_APIC(void) /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - - io_apic_irqs = ~PIC_IRQS; + io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); /* @@ -3065,7 +3082,8 @@ void __init setup_IO_APIC(void) sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); - check_timer(); + if (nr_legacy_irqs) + check_timer(); } /* @@ -3166,7 +3184,6 @@ static int __init ioapic_init_sysfs(void) device_initcall(ioapic_init_sysfs); -static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ @@ -3907,7 +3924,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= NR_IRQS_LEGACY) { + if (irq >= nr_legacy_irqs) { cfg = desc->chip_data; add_pin_to_irq_node(cfg, node, ioapic, pin); } -- cgit v1.2.3 From e11dadabf443dc3101f28b74d8b9d56870a87db4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 31 Aug 2009 15:18:40 +0200 Subject: x86: apic namespace cleanup boot_cpu_physical_apicid is a global variable and used as function argument as well. Rename the function arguments to avoid confusion. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 14 +++++++------- arch/x86/kernel/apic/bigsmp_32.c | 2 +- arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/apic/summit_32.c | 2 +- arch/x86/kernel/setup.c | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 6f15b29005a..d6a0f2636a6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -292,7 +292,7 @@ struct apic { int (*cpu_present_to_apicid)(int mps_cpu); physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); void (*setup_portio_remap)(void); - int (*check_phys_apicid_present)(int boot_cpu_physical_apicid); + int (*check_phys_apicid_present)(int phys_apicid); void (*enable_apic_mode)(void); int (*phys_pkg_id)(int cpuid_apic, int index_msb); @@ -426,7 +426,7 @@ extern struct apic apic_x2apic_uv_x; DECLARE_PER_CPU(int, x2apic_extra_bits); extern int default_cpu_present_to_apicid(int mps_cpu); -extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); +extern int default_check_phys_apicid_present(int phys_apicid); #endif static inline void default_wait_for_init_deassert(atomic_t *deassert) @@ -542,9 +542,9 @@ static inline int __default_cpu_present_to_apicid(int mps_cpu) } static inline int -__default_check_phys_apicid_present(int boot_cpu_physical_apicid) +__default_check_phys_apicid_present(int phys_apicid) { - return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map); + return physid_isset(phys_apicid, phys_cpu_present_map); } #ifdef CONFIG_X86_32 @@ -554,13 +554,13 @@ static inline int default_cpu_present_to_apicid(int mps_cpu) } static inline int -default_check_phys_apicid_present(int boot_cpu_physical_apicid) +default_check_phys_apicid_present(int phys_apicid) { - return __default_check_phys_apicid_present(boot_cpu_physical_apicid); + return __default_check_phys_apicid_present(phys_apicid); } #else extern int default_cpu_present_to_apicid(int mps_cpu); -extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); 
+extern int default_check_phys_apicid_present(int phys_apicid); #endif static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid) diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 676cdac385c..77a06413b6b 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -112,7 +112,7 @@ static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) return physids_promote(0xFFL); } -static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid) +static int bigsmp_check_phys_apicid_present(int phys_apicid) { return 1; } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index f1ebed6bd15..efa00e2b850 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -413,7 +413,7 @@ static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; -static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid) +static inline int numaq_check_phys_apicid_present(int phys_apicid) { return 1; } diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index eafdfbd1ea9..645ecc4ff0b 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -272,7 +272,7 @@ static physid_mask_t summit_apicid_to_cpu_present(int apicid) return physid_mask_of_physid(0); } -static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) +static int summit_check_phys_apicid_present(int physical_apicid) { return 1; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2d93026af7c..fda22ec1a93 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -129,9 +129,9 @@ int default_cpu_present_to_apicid(int mps_cpu) return __default_cpu_present_to_apicid(mps_cpu); } -int default_check_phys_apicid_present(int boot_cpu_physical_apicid) +int default_check_phys_apicid_present(int phys_apicid) { - return __default_check_phys_apicid_present(boot_cpu_physical_apicid); + return __default_check_phys_apicid_present(phys_apicid); } #endif -- cgit v1.2.3 From db39d5529d347de5e2eec1a72d67fcfacae6c5a2 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Fri, 21 Aug 2009 19:15:28 -0500 Subject: [CPUFREQ] Powernow-k8: Enable more than 2 low P-states Remove an obsolete check that used to prevent there being more than 2 low P-states. Now that low-to-low P-state changes are enabled, the old check rejects otherwise workable configurations with multiple low P-states.
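As a standalone sketch of what the removed guard enforced: a P-state whose FID sits below HI_FID_TABLE_BOTTOM counts as "low", and the old code bailed out as soon as it saw a second distinct low entry; counting them is now enough. HI_FID_TABLE_BOTTOM is real driver terminology, but the threshold value and the loop below are simplified stand-ins, not the driver code:

#include <stdio.h>

#define HI_FID_TABLE_BOTTOM 0x20	/* stand-in threshold, not the driver's value */

/* Count "low" P-states in a FID table.  The removed check instead
 * treated a second distinct low entry as a fatal table error; with
 * low-to-low transitions supported, any count is acceptable. */
static int count_low_pstates(const unsigned int *fid, int n)
{
	int i, nlow = 0;

	for (i = 0; i < n; i++)
		if (fid[i] < HI_FID_TABLE_BOTTOM)
			nlow++;
	return nlow;
}

int main(void)
{
	unsigned int fids[] = { 0x02, 0x0a, 0x10, 0x2c };

	/* three low entries: previously rejected, now a valid configuration */
	printf("low P-states: %d\n", count_low_pstates(fids, 4));
	return 0;
}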
Signed-off-by: Mark Langsdorf Tested-by: Krists Krilovs Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 2a50ef89100..0cbce0481a5 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -854,6 +854,10 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) goto err_out; } + /* fill in data */ + data->numps = data->acpi_data.state_count; + powernow_k8_acpi_pst_values(data, 0); + if (cpu_family == CPU_HW_PSTATE) ret_val = fill_powernow_table_pstate(data, powernow_table); else @@ -866,11 +870,8 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) powernow_table[data->acpi_data.state_count].index = 0; data->powernow_table = powernow_table; - /* fill in data */ - data->numps = data->acpi_data.state_count; if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) print_basics(data); - powernow_k8_acpi_pst_values(data, 0); /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); @@ -941,7 +942,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table) { int i; - int cntlofreq = 0; for (i = 0; i < data->acpi_data.state_count; i++) { u32 fid; @@ -982,27 +982,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, continue; } - /* verify only 1 entry from the lo frequency table */ - if (fid < HI_FID_TABLE_BOTTOM) { - if (cntlofreq) { - /* if both entries are the same, - * ignore this one ... */ - if ((freq != powernow_table[cntlofreq].frequency) || - (index != powernow_table[cntlofreq].index)) { - printk(KERN_ERR PFX - "Too many lo freq table " - "entries\n"); - return 1; - } - - dprintk("double low frequency table entry, " - "ignoring it.\n"); - invalidate_entry(data, i); - continue; - } else - cntlofreq = i; - } - if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { printk(KERN_INFO PFX "invalid freq entries " "%u kHz vs. %u kHz\n", freq, -- cgit v1.2.3 From 1a8e42fa81e62d47cc471f7764f906bb42b27a54 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 26 Aug 2009 13:19:37 -0400 Subject: [CPUFREQ] Create a blacklist for processors that should not load the acpi-cpufreq module. Create a blacklist for processors that should not load the acpi-cpufreq module. The initial entry in the blacklist function is the Intel 0f68 processor. Its specification update mentions erratum AL30, which implies that cpufreq should not run on this processor.
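The "0f68" name is the part's raw CPUID signature: 0x0f68 decodes to family 15, model 6, stepping 8, exactly the triple the new blacklist function matches below. A minimal standalone sketch of that decoding (the helper name is ours; the extended family/model bits of CPUID leaf 1 are ignored here since they are zero for this part):

#include <stdio.h>

/* Decode the base fields of a CPUID leaf-1 signature (EAX).  For
 * family-0xf parts such as 0x0f68 the extended family/model bits
 * happen to be zero, so the base fields alone give the full triple. */
static void decode_sig(unsigned int sig, unsigned int *family,
		       unsigned int *model, unsigned int *stepping)
{
	*family = (sig >> 8) & 0xf;
	*model = (sig >> 4) & 0xf;
	*stepping = sig & 0xf;
}

int main(void)
{
	unsigned int f, m, s;

	decode_sig(0x0f68, &f, &m, &s);
	/* prints: family 15, model 6, stepping 8 */
	printf("family %u, model %u, stepping %u\n", f, m, s);
	return 0;
}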
Signed-off-by: Prarit Bhargava Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index ae9b503220c..badce508406 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -588,6 +588,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = { }, { } }; + +static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) +{ + /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf + * AL30: A Machine Check Exception (MCE) Occurring during an + * Enhanced Intel SpeedStep Technology Ratio Change May Cause + * Both Processor Cores to Lock Up when HT is enabled*/ + if (c->x86_vendor == X86_VENDOR_INTEL) { + if ((c->x86 == 15) && + (c->x86_model == 6) && + (c->x86_mask == 8) && smt_capable()) + return -ENODEV; + } + return 0; +} #endif static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) @@ -602,6 +617,12 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) dprintk("acpi_cpufreq_cpu_init\n"); +#ifdef CONFIG_SMP + result = acpi_cpufreq_blacklist(c); + if (result) + return result; +#endif + data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); if (!data) return -ENOMEM; -- cgit v1.2.3 From 69575d388603365f2afbf4166df93152df59b165 Mon Sep 17 00:00:00 2001 From: Shane Wang Date: Tue, 1 Sep 2009 18:25:07 -0700 Subject: x86, intel_txt: clean up the impact on generic code, unbreak non-x86 Move tboot.h from asm to linux to fix the build errors of the intel_txt patch on non-x86 platforms. Remove the tboot code from the generic code in init/main.c and kernel/cpu.c. Signed-off-by: Shane Wang Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 4 + arch/x86/include/asm/tboot.h | 197 ------------------------------------------ arch/x86/kernel/reboot.c | 3 +- arch/x86/kernel/setup.c | 3 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/tboot.c | 58 ++++++++++--- drivers/acpi/acpica/hwsleep.c | 2 +- drivers/pci/dmar.c | 2 +- drivers/pci/intel-iommu.c | 2 +- include/linux/tboot.h | 162 ++++++++++++++++++++++++++++++++++ init/main.c | 3 - kernel/cpu.c | 6 +- security/Kconfig | 2 +- 13 files changed, 221 insertions(+), 225 deletions(-) delete mode 100644 arch/x86/include/asm/tboot.h create mode 100644 include/linux/tboot.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 738bdc6b0f8..b66f2102c35 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -178,6 +178,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_INTEL_TXT + def_bool y + depends on EXPERIMENTAL && DMAR && ACPI + # Use the generic interrupt handling code in kernel/irq/: config GENERIC_HARDIRQS bool diff --git a/arch/x86/include/asm/tboot.h b/arch/x86/include/asm/tboot.h deleted file mode 100644 index b13929d4e5f..00000000000 --- a/arch/x86/include/asm/tboot.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * tboot.h: shared data structure with tboot and kernel and functions - * used by kernel for runtime support of Intel(R) Trusted - * Execution Technology - * - * Copyright (c) 2006-2009, Intel Corporation - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation.
- * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - */ - -#ifndef _ASM_TBOOT_H -#define _ASM_TBOOT_H - -#include - -/* these must have the values from 0-5 in this order */ -enum { - TB_SHUTDOWN_REBOOT = 0, - TB_SHUTDOWN_S5, - TB_SHUTDOWN_S4, - TB_SHUTDOWN_S3, - TB_SHUTDOWN_HALT, - TB_SHUTDOWN_WFS -}; - -#ifdef CONFIG_INTEL_TXT - -/* used to communicate between tboot and the launched kernel */ - -#define TB_KEY_SIZE 64 /* 512 bits */ - -#define MAX_TB_MAC_REGIONS 32 - -struct tboot_mac_region { - u64 start; /* must be 64 byte -aligned */ - u32 size; /* must be 64 byte -granular */ -} __packed; - -/* GAS - Generic Address Structure (ACPI 2.0+) */ -struct tboot_acpi_generic_address { - u8 space_id; - u8 bit_width; - u8 bit_offset; - u8 access_width; - u64 address; -} __packed; - -/* - * combines Sx info from FADT and FACS tables per ACPI 2.0+ spec - * (http://www.acpi.info/) - */ -struct tboot_acpi_sleep_info { - struct tboot_acpi_generic_address pm1a_cnt_blk; - struct tboot_acpi_generic_address pm1b_cnt_blk; - struct tboot_acpi_generic_address pm1a_evt_blk; - struct tboot_acpi_generic_address pm1b_evt_blk; - u16 pm1a_cnt_val; - u16 pm1b_cnt_val; - u64 wakeup_vector; - u32 vector_width; - u64 kernel_s3_resume_vector; -} __packed; - -/* - * shared memory page used for communication between tboot and kernel - */ -struct tboot { - /* - * version 3+ fields: - */ - - /* TBOOT_UUID */ - u8 uuid[16]; - - /* version number: 5 is current */ - u32 version; - - /* physical addr of tb_log_t log */ - u32 log_addr; - - /* - * physical addr of entry point for tboot shutdown and - * type of shutdown (TB_SHUTDOWN_*) being requested - */ - u32 shutdown_entry; - u32 shutdown_type; - - /* kernel-specified ACPI info for Sx shutdown */ - struct tboot_acpi_sleep_info acpi_sinfo; - - /* tboot location in memory (physical) */ - u32 tboot_base; - u32 tboot_size; - - /* memory regions (phys addrs) for tboot to MAC on S3 */ - u8 num_mac_regions; - struct tboot_mac_region mac_regions[MAX_TB_MAC_REGIONS]; - - - /* - * version 4+ fields: - */ - - /* symmetric key for use by kernel; will be encrypted on S3 */ - u8 s3_key[TB_KEY_SIZE]; - - - /* - * version 5+ fields: - */ - - /* used to 4byte-align num_in_wfs */ - u8 reserved_align[3]; - - /* number of processors in wait-for-SIPI */ - u32 num_in_wfs; -} __packed; - -/* - * UUID for tboot data struct to facilitate matching - * defined as {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} by tboot, which is - * represented as {} in the char array used here - */ -#define TBOOT_UUID {0xff, 0x8d, 0x3c, 0x66, 0xb3, 0xe8, 0x82, 0x4b, 0xbf,\ - 0xaa, 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8} - -extern struct tboot *tboot; - -static inline int tboot_enabled(void) -{ - return tboot != NULL; -} - -extern void tboot_probe(void); -extern void tboot_create_trampoline(void); -extern void tboot_shutdown(u32 shutdown_type); -extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); -extern int tboot_wait_for_aps(int num_aps); -extern struct acpi_table_header *tboot_get_dmar_table( - struct acpi_table_header *dmar_tbl); -extern int tboot_force_iommu(void); - -#else /* 
CONFIG_INTEL_TXT */ -static inline int tboot_enabled(void) -{ - return 0; -} - -static inline void tboot_probe(void) -{ -} - -static inline void tboot_create_trampoline(void) -{ -} - -static inline void tboot_shutdown(u32 shutdown_type) -{ -} - -static inline void tboot_sleep(u8 sleep_state, u32 pm1a_control, - u32 pm1b_control) -{ -} - -static inline int tboot_wait_for_aps(int num_aps) -{ - return 0; -} - -static inline struct acpi_table_header *tboot_get_dmar_table( - struct acpi_table_header *dmar_tbl) -{ - return dmar_tbl; -} - -static inline int tboot_force_iommu(void) -{ - return 0; -} - -#endif /* !CONFIG_INTEL_TXT */ - -#endif /* _ASM_TBOOT_H */ diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9de01c5d979..18ce5c04242 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -3,6 +3,7 @@ #include #include #include +#include <linux/tboot.h> #include #include #include @@ -24,8 +25,6 @@ # include #endif -#include <asm/tboot.h> - /* * Power off function, if any */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 80d6e9e3248..6ce0d6f38f7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -66,6 +66,7 @@ #include #include +#include <linux/tboot.h> #include