diff options
Diffstat (limited to 'arch/x86')
41 files changed, 391 insertions, 175 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a84fc34c8f77..cea0cd9a316f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -245,6 +245,11 @@ config ARCH_HWEIGHT_CFLAGS config KTIME_SCALAR def_bool X86_32 + +config ARCH_CPU_PROBE_RELEASE + def_bool y + depends on HOTPLUG_CPU + source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -749,11 +754,11 @@ config IOMMU_API def_bool (AMD_IOMMU || DMAR) config MAXSMP - bool "Configure Maximum number of SMP Processors and NUMA Nodes" + bool "Enable Maximum number of SMP Processors and NUMA Nodes" depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL select CPUMASK_OFFSTACK ---help--- - Configure maximum number of CPUS and NUMA Nodes for this architecture. + Enable maximum number of CPUS and NUMA Nodes for this architecture. If unsure, say N. config NR_CPUS diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 8aa1b59b9074..e8c8881351b3 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -74,7 +74,7 @@ endif ifdef CONFIG_CC_STACKPROTECTOR cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh - ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(biarch)),y) + ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y) stackp-y := -fstack-protector KBUILD_CFLAGS += $(stackp-y) else diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index b86feabed69b..518bb99c3394 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -50,7 +50,12 @@ /* * Reload arg registers from stack in case ptrace changed them. * We don't reload %eax because syscall_trace_enter() returned - * the value it wants us to use in the table lookup. + * the %rax value we should see. Instead, we just truncate that + * value to 32 bits again as we did on entry from user mode. + * If it's a new value set by user_regset during entry tracing, + * this matches the normal truncation of the user-mode value. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. */ .macro LOAD_ARGS32 offset, _r9=0 .if \_r9 @@ -60,6 +65,7 @@ movl \offset+48(%rsp),%edx movl \offset+56(%rsp),%esi movl \offset+64(%rsp),%edi + movl %eax,%eax /* zero extension */ .endm .macro CFI_STARTPROC32 simple @@ -153,7 +159,7 @@ ENTRY(ia32_sysenter_target) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz sysenter_tracesys - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys sysenter_do_call: IA32_ARG_FIXUP @@ -195,7 +201,7 @@ sysexit_from_sys_call: movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ call audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys movl %ebx,%edi /* reload 1st syscall arg */ movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ @@ -248,7 +254,7 @@ sysenter_tracesys: call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ jmp sysenter_do_call CFI_ENDPROC @@ -314,7 +320,7 @@ ENTRY(ia32_cstar_target) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz cstar_tracesys - cmpl $IA32_NR_syscalls-1,%eax + cmpq $IA32_NR_syscalls-1,%rax ja ia32_badsys cstar_do_call: IA32_ARG_FIXUP 1 @@ -367,7 +373,7 @@ cstar_tracesys: LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ RESTORE_REST xchgl %ebp,%r9d - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ jmp cstar_do_call END(ia32_cstar_target) @@ -425,7 +431,7 @@ ENTRY(ia32_syscall) orl $TS_COMPAT,TI_status(%r10) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) jnz ia32_tracesys - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys ia32_do_call: IA32_ARG_FIXUP @@ -444,7 +450,7 @@ ia32_tracesys: call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ jmp ia32_do_call END(ia32_syscall) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 306160e58b48..1d9cd27c2920 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -205,7 +205,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr) return (u32)(unsigned long)uptr; } -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { struct pt_regs *regs = task_pt_regs(current); return (void __user *)regs->sp - len; diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 781a50b29a49..c6fbb7b430d1 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -296,6 +296,7 @@ extern const char * const x86_power_flags[32]; #endif /* CONFIG_X86_64 */ +#if __GNUC__ >= 4 /* * Static testing of CPU features. Used the same as boot_cpu_has(). * These are only valid after alternatives have run, but will statically @@ -304,7 +305,7 @@ extern const char * const x86_power_flags[32]; */ static __always_inline __pure bool __static_cpu_has(u16 bit) { -#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) +#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5 asm goto("1: jmp %l[t_no]\n" "2:\n" ".section .altinstructions,\"a\"\n" @@ -345,7 +346,6 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) #endif } -#if __GNUC__ >= 4 #define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 004e6e25e913..1d5c08a1bdfd 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -68,7 +68,6 @@ extern unsigned long force_hpet_address; extern u8 hpet_blockid; extern int hpet_force_user; extern u8 hpet_msi_disable; -extern u8 hpet_readback_cmp; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 528a11e8d3e3..824ca07860d0 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -20,7 +20,7 @@ struct arch_hw_breakpoint { #include <linux/list.h> /* Available HW breakpoint length encodings */ -#define X86_BREAKPOINT_LEN_X 0x00 +#define X86_BREAKPOINT_LEN_X 0x40 #define X86_BREAKPOINT_LEN_1 0x40 #define X86_BREAKPOINT_LEN_2 0x44 #define X86_BREAKPOINT_LEN_4 0x4c diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index f35eb45d6576..c4191b3b7056 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -26,11 +26,11 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> -void * +void __iomem * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); void -iounmap_atomic(void *kvaddr, enum km_type type); +iounmap_atomic(void __iomem *kvaddr, enum km_type type); int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 51cfd730ac5d..1f99ecfc48e1 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -152,9 +152,14 @@ struct x86_emulate_ops { struct operand { enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; unsigned int bytes; - unsigned long orig_val, *ptr; + union { + unsigned long orig_val; + u64 orig_val64; + }; + unsigned long *ptr; union { unsigned long val; + u64 val64; char valptr[sizeof(unsigned long) + 2]; }; }; diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 404a880ea325..d395540ff894 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -27,6 +27,9 @@ extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node); extern struct pci_bus *pci_scan_bus_with_sysdata(int busno); +#ifdef CONFIG_PCI + +#ifdef CONFIG_PCI_DOMAINS static inline int pci_domain_nr(struct pci_bus *bus) { struct pci_sysdata *sd = bus->sysdata; @@ -37,13 +40,12 @@ static inline int pci_proc_domain(struct pci_bus *bus) { return pci_domain_nr(bus); } - +#endif /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes or architectures with incomplete PCI setup by the loader */ -#ifdef CONFIG_PCI extern unsigned int pcibios_assign_all_busses(void); extern int pci_legacy_init(void); # ifdef CONFIG_ACPI diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 2984a25ff383..f686f49e8b7b 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -26,6 +26,7 @@ struct mm_struct; struct vm_area_struct; extern pgd_t swapper_pg_dir[1024]; +extern pgd_t trampoline_pg_dir[1024]; static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index cb507bb05d79..4dde797c0578 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -13,14 +13,17 @@ extern unsigned char *trampoline_base; extern unsigned long init_rsp; extern unsigned long initial_code; +extern unsigned long initial_page_table; extern unsigned long initial_gs; #define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) extern unsigned long setup_trampoline(void); +extern void __init setup_trampoline_page_table(void); extern void __init reserve_trampoline_memory(void); #else -static inline void reserve_trampoline_memory(void) {}; +static inline void setup_trampoline_page_table(void) {} +static inline void reserve_trampoline_memory(void) {} #endif /* CONFIG_X86_TRAMPOLINE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index c0427295e8f5..1ca132fc0d03 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -59,5 +59,7 @@ extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); extern int notsc_setup(char *); +extern void save_sched_clock_state(void); +extern void restore_sched_clock_state(void); #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4dc0084ec1b1..f1efebaf5510 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1728,6 +1728,8 @@ __apicdebuginit(void) print_IO_APIC(void) struct irq_pin_list *entry; cfg = desc->chip_data; + if (!cfg) + continue; entry = cfg->irq_2_pin; if (!entry) continue; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 7b598b84c902..f744f54cb248 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -698,9 +698,11 @@ void __init uv_system_init(void) for (j = 0; j < 64; j++) { if (!test_bit(j, &present)) continue; - uv_blade_info[blade].pnode = (i * 64 + j); + pnode = (i * 64 + j); + uv_blade_info[blade].pnode = pnode; uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; + max_pnode = max(pnode, max_pnode); blade++; } } @@ -738,7 +740,6 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; - max_pnode = max(pnode, max_pnode); } /* Add blade/pnode info for nodes without cpus */ @@ -750,7 +751,6 @@ void __init uv_system_init(void) pnode = (paddr >> m_val) & pnode_mask; blade = boot_pnode_to_blade(pnode); uv_node_to_blade[nid] = blade; - max_pnode = max(pnode, max_pnode); } map_gru_high(max_pnode); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 60a57b13082d..ba5f62f45f01 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -669,7 +669,7 @@ bool cpu_has_amd_erratum(const int *erratum) } /* OSVW unavailable or ID unknown, match family-model-stepping range */ - ms = (cpu->x86_model << 8) | cpu->x86_mask; + ms = (cpu->x86_model << 4) | cpu->x86_mask; while ((range = *erratum++)) if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && (ms >= AMD_MODEL_RANGE_START(range)) && diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 224392d8fe8c..5e975298fa81 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -530,7 +530,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) err = -ENOMEM; goto out; } - if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) { kfree(b); err = -ENOMEM; goto out; @@ -543,7 +543,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) #ifndef CONFIG_SMP cpumask_setall(b->cpus); #else - cpumask_copy(b->cpus, c->llc_shared_map); + cpumask_set_cpu(cpu, b->cpus); #endif per_cpu(threshold_banks, cpu)[bank] = b; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index c2a8b26d4fea..d9368eeda309 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -202,10 +202,11 @@ static int therm_throt_process(bool new_event, int event, int level) #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) +static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, + unsigned int cpu) { int err; - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + struct cpuinfo_x86 *c = &cpu_data(cpu); err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); if (err) @@ -251,7 +252,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: mutex_lock(&therm_cpu_lock); - err = thermal_throttle_add_dev(sys_dev); + err = thermal_throttle_add_dev(sys_dev, cpu); mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; @@ -287,7 +288,7 @@ static __init int thermal_throttle_init_device(void) #endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_sysdev(cpu)); + err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); WARN_ON(err); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index f2da20fda02d..3efdf2870a35 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1154,7 +1154,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) /* * event overflow */ - handled = 1; + handled++; data.period = event->hw.last_period; if (!x86_perf_event_set_period(event)) @@ -1200,12 +1200,20 @@ void perf_events_lapic_init(void) apic_write(APIC_LVTPC, APIC_DM_NMI); } +struct pmu_nmi_state { + unsigned int marked; + int handled; +}; + +static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi); + static int __kprobes perf_event_nmi_handler(struct notifier_block *self, unsigned long cmd, void *__args) { struct die_args *args = __args; - struct pt_regs *regs; + unsigned int this_nmi; + int handled; if (!atomic_read(&active_events)) return NOTIFY_DONE; @@ -1214,22 +1222,47 @@ perf_event_nmi_handler(struct notifier_block *self, case DIE_NMI: case DIE_NMI_IPI: break; - + case DIE_NMIUNKNOWN: + this_nmi = percpu_read(irq_stat.__nmi_count); + if (this_nmi != __get_cpu_var(pmu_nmi).marked) + /* let the kernel handle the unknown nmi */ + return NOTIFY_DONE; + /* + * This one is a PMU back-to-back nmi. Two events + * trigger 'simultaneously' raising two back-to-back + * NMIs. If the first NMI handles both, the latter + * will be empty and daze the CPU. So, we drop it to + * avoid false-positive 'unknown nmi' messages. + */ + return NOTIFY_STOP; default: return NOTIFY_DONE; } - regs = args->regs; - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* - * Can't rely on the handled return value to say it was our NMI, two - * events could trigger 'simultaneously' raising two back-to-back NMIs. - * - * If the first NMI handles both, the latter will be empty and daze - * the CPU. - */ - x86_pmu.handle_irq(regs); + + handled = x86_pmu.handle_irq(args->regs); + if (!handled) + return NOTIFY_DONE; + + this_nmi = percpu_read(irq_stat.__nmi_count); + if ((handled > 1) || + /* the next nmi could be a back-to-back nmi */ + ((__get_cpu_var(pmu_nmi).marked == this_nmi) && + (__get_cpu_var(pmu_nmi).handled > 1))) { + /* + * We could have two subsequent back-to-back nmis: The + * first handles more than one counter, the 2nd + * handles only one counter and the 3rd handles no + * counter. + * + * This is the 2nd nmi because the previous was + * handling more than one counter. We will mark the + * next (3rd) and then drop it if unhandled. + */ + __get_cpu_var(pmu_nmi).marked = this_nmi + 1; + __get_cpu_var(pmu_nmi).handled = handled; + } return NOTIFY_STOP; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 214ac860ebe0..ee05c90012d2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -491,33 +491,78 @@ static void intel_pmu_enable_all(int added) * Intel Errata AAP53 (model 30) * Intel Errata BD53 (model 44) * - * These chips need to be 'reset' when adding counters by programming - * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5 - * either in sequence on the same PMC or on different PMCs. + * The official story: + * These chips need to be 'reset' when adding counters by programming the + * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either + * in sequence on the same PMC or on different PMCs. + * + * In practise it appears some of these events do in fact count, and + * we need to programm all 4 events. */ -static void intel_pmu_nhm_enable_all(int added) +static void intel_pmu_nhm_workaround(void) { - if (added) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - int i; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + static const unsigned long nhm_magic[4] = { + 0x4300B5, + 0x4300D2, + 0x4300B1, + 0x4300B1 + }; + struct perf_event *event; + int i; + + /* + * The Errata requires below steps: + * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL; + * 2) Configure 4 PERFEVTSELx with the magic events and clear + * the corresponding PMCx; + * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL; + * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL; + * 5) Clear 4 pairs of ERFEVTSELx and PMCx; + */ - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2); - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1); - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5); + /* + * The real steps we choose are a little different from above. + * A) To reduce MSR operations, we don't run step 1) as they + * are already cleared before this function is called; + * B) Call x86_perf_event_update to save PMCx before configuring + * PERFEVTSELx with magic number; + * C) With step 5), we do clear only when the PERFEVTSELx is + * not used currently. + * D) Call x86_perf_event_set_period to restore PMCx; + */ + + /* We always operate 4 pairs of PERF Counters */ + for (i = 0; i < 4; i++) { + event = cpuc->events[i]; + if (event) + x86_perf_event_update(event); + } - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); + for (i = 0; i < 4; i++) { + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]); + wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0); + } - for (i = 0; i < 3; i++) { - struct perf_event *event = cpuc->events[i]; + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); - if (!event) - continue; + for (i = 0; i < 4; i++) { + event = cpuc->events[i]; + if (event) { + x86_perf_event_set_period(event); __x86_pmu_enable_event(&event->hw, - ARCH_PERFMON_EVENTSEL_ENABLE); - } + ARCH_PERFMON_EVENTSEL_ENABLE); + } else + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0); } +} + +static void intel_pmu_nhm_enable_all(int added) +{ + if (added) + intel_pmu_nhm_workaround(); intel_pmu_enable_all(added); } @@ -667,7 +712,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) struct perf_sample_data data; struct cpu_hw_events *cpuc; int bit, loops; - u64 ack, status; + u64 status; + int handled = 0; perf_sample_data_init(&data, 0); @@ -683,6 +729,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) loops = 0; again: + intel_pmu_ack_status(status); if (++loops > 100) { WARN_ONCE(1, "perfevents: irq loop stuck!\n"); perf_event_print_debug(); @@ -691,19 +738,22 @@ again: } inc_irq_stat(apic_perf_irqs); - ack = status; intel_pmu_lbr_read(); /* * PEBS overflow sets bit 62 in the global status register */ - if (__test_and_clear_bit(62, (unsigned long *)&status)) + if (__test_and_clear_bit(62, (unsigned long *)&status)) { + handled++; x86_pmu.drain_pebs(regs); + } for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; + handled++; + if (!test_bit(bit, cpuc->active_mask)) continue; @@ -716,8 +766,6 @@ again: x86_pmu_stop(event); } - intel_pmu_ack_status(ack); - /* * Repeat if there is more work to be done: */ @@ -727,7 +775,7 @@ again: done: intel_pmu_enable_all(0); - return 1; + return handled; } static struct event_constraint * diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index febb12cea795..b560db3305be 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -497,6 +497,8 @@ static int p4_hw_config(struct perf_event *event) event->hw.config |= event->attr.config & (p4_config_pack_escr(P4_ESCR_MASK_HT) | p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); + + event->hw.config &= ~P4_CCCR_FORCE_OVF; } rc = x86_setup_perfctr(event); @@ -690,7 +692,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) inc_irq_stat(apic_perf_irqs); } - return handled > 0; + return handled; } /* diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index e5cc7e82e60d..ebdb85cf2686 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -18,7 +18,6 @@ #include <asm/apic.h> #include <asm/iommu.h> #include <asm/gart.h> -#include <asm/hpet.h> static void __init fix_hypertransport_config(int num, int slot, int func) { @@ -192,21 +191,6 @@ static void __init ati_bugs_contd(int num, int slot, int func) } #endif -/* - * Force the read back of the CMP register in hpet_next_event() - * to work around the problem that the CMP register write seems to be - * delayed. See hpet_next_event() for details. - * - * We do this on all SMBUS incarnations for now until we have more - * information about the affected chipsets. - */ -static void __init ati_hpet_bugs(int num, int slot, int func) -{ -#ifdef CONFIG_HPET_TIMER - hpet_readback_cmp = 1; -#endif -} - #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -236,8 +220,6 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, - { PCI_VENDOR_ID_ATI, PCI_ANY_ID, - PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs }, {} }; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index ff4c453e13f3..fa8c1b8e09fb 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -334,7 +334,7 @@ ENTRY(startup_32_smp) /* * Enable paging */ - movl $pa(swapper_pg_dir),%eax + movl pa(initial_page_table), %eax movl %eax,%cr3 /* set the page table pointer.. */ movl %cr0,%eax orl $X86_CR0_PG,%eax @@ -614,6 +614,8 @@ ignore_int: .align 4 ENTRY(initial_code) .long i386_start_kernel +ENTRY(initial_page_table) + .long pa(swapper_pg_dir) /* * BSS section @@ -629,6 +631,10 @@ ENTRY(swapper_pg_dir) #endif swapper_pg_fixmap: .fill 1024,4,0 +#ifdef CONFIG_X86_TRAMPOLINE +ENTRY(trampoline_pg_dir) + .fill 1024,4,0 +#endif ENTRY(empty_zero_page) .fill 4096,1,0 diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 351f9c0fea1f..410fdb3f1939 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -35,7 +35,6 @@ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ u8 hpet_msi_disable; -u8 hpet_readback_cmp; #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; @@ -395,23 +394,27 @@ static int hpet_next_event(unsigned long delta, * at that point and we would wait for the next hpet interrupt * forever. We found out that reading the CMP register back * forces the transfer so we can rely on the comparison with - * the counter register below. + * the counter register below. If the read back from the + * compare register does not match the value we programmed + * then we might have a real hardware problem. We can not do + * much about it here, but at least alert the user/admin with + * a prominent warning. * - * That works fine on those ATI chipsets, but on newer Intel - * chipsets (ICH9...) this triggers due to an erratum: Reading - * the comparator immediately following a write is returning - * the old value. + * An erratum on some chipsets (ICH9,..), results in + * comparator read immediately following a write returning old + * value. Workaround for this is to read this value second + * time, when first read returns old value. * - * We restrict the read back to the affected ATI chipsets (set - * by quirks) and also run it with hpet=verbose for debugging - * purposes. + * In fact the write to the comparator register is delayed up + * to two HPET cycles so the workaround we tried to restrict + * the readback to those known to be borked ATI chipsets + * failed miserably. So we give up on optimizations forever + * and penalize all HPET incarnations unconditionally. */ - if (hpet_readback_cmp || hpet_verbose) { - u32 cmp = hpet_readl(HPET_Tn_CMP(timer)); - - if (cmp != cnt) + if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { + if (hpet_readl(HPET_Tn_CMP(timer)) != cnt) printk_once(KERN_WARNING - "hpet: compare register read back failed.\n"); + "hpet: compare register read back failed.\n"); } return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index a474ec37c32f..ff15c9dcc25d 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -206,11 +206,27 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { - /* Len */ - switch (x86_len) { - case X86_BREAKPOINT_LEN_X: + /* Type */ + switch (x86_type) { + case X86_BREAKPOINT_EXECUTE: + if (x86_len != X86_BREAKPOINT_LEN_X) + return -EINVAL; + + *gen_type = HW_BREAKPOINT_X; *gen_len = sizeof(long); + return 0; + case X86_BREAKPOINT_WRITE: + *gen_type = HW_BREAKPOINT_W; break; + case X86_BREAKPOINT_RW: + *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; + break; + default: + return -EINVAL; + } + + /* Len */ + switch (x86_len) { case X86_BREAKPOINT_LEN_1: *gen_len = HW_BREAKPOINT_LEN_1; break; @@ -229,21 +245,6 @@ int arch_bp_generic_fields(int x86_len, int x86_type, return -EINVAL; } - /* Type */ - switch (x86_type) { - case X86_BREAKPOINT_EXECUTE: - *gen_type = HW_BREAKPOINT_X; - break; - case X86_BREAKPOINT_WRITE: - *gen_type = HW_BREAKPOINT_W; - break; - case X86_BREAKPOINT_RW: - *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; - break; - default: - return -EINVAL; - } - return 0; } @@ -316,9 +317,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) ret = -EINVAL; switch (info->len) { - case X86_BREAKPOINT_LEN_X: - align = sizeof(long) -1; - break; case X86_BREAKPOINT_LEN_1: align = 0; break; diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 1f11f5ce668f..a46cb3522c0c 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -40,6 +40,7 @@ static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; unsigned int xstate_size; +EXPORT_SYMBOL_GPL(xstate_size); unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); static struct i387_fxsave_struct fx_scratch __cpuinitdata; diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 1bfb6cf4dd55..770ebfb349e9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -709,6 +709,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) struct hlist_node *node, *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; + kprobe_opcode_t *correct_ret_addr = NULL; INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); @@ -740,14 +741,34 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) /* another task is sharing our hash bucket */ continue; + orig_ret_address = (unsigned long)ri->ret_addr; + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + + correct_ret_addr = ri->ret_addr; + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { __get_cpu_var(current_kprobe) = &ri->rp->kp; get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; + ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); __get_cpu_var(current_kprobe) = NULL; } - orig_ret_address = (unsigned long)ri->ret_addr; recycle_rp_inst(ri, &empty_rp); if (orig_ret_address != trampoline_address) @@ -759,8 +780,6 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) break; } - kretprobe_assert(ri, orig_ret_address, trampoline_address); - kretprobe_hash_unlock(current, &flags); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b008e7883207..c3a4fbb2b996 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1014,6 +1014,8 @@ void __init setup_arch(char **cmdline_p) paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); + setup_trampoline_page_table(); + tboot_probe(); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a5e928b0cb5f..8b3bfc4dd708 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -73,7 +73,6 @@ #ifdef CONFIG_X86_32 u8 apicid_2_node[MAX_APICID]; -static int low_mappings; #endif /* State of each CPU */ @@ -91,6 +90,25 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 }; static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) + +/* + * We need this for trampoline_base protection from concurrent accesses when + * off- and onlining cores wildly. + */ +static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); + +void cpu_hotplug_driver_lock() +{ + mutex_lock(&x86_cpu_hotplug_driver_mutex); +} + +void cpu_hotplug_driver_unlock() +{ + mutex_unlock(&x86_cpu_hotplug_driver_mutex); +} + +ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; } +ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; } #else static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; #define get_idle_for_cpu(x) (idle_thread_array[(x)]) @@ -281,6 +299,18 @@ notrace static void __cpuinit start_secondary(void *unused) * fragile that we want to limit the things done here to the * most necessary things. */ + +#ifdef CONFIG_X86_32 + /* + * Switch away from the trampoline page-table + * + * Do this before cpu_init() because it needs to access per-cpu + * data which may not be mapped in the trampoline page-table. + */ + load_cr3(swapper_pg_dir); + __flush_tlb_all(); +#endif + vmi_bringup(); cpu_init(); preempt_disable(); @@ -299,12 +329,6 @@ notrace static void __cpuinit start_secondary(void *unused) legacy_pic->chip->unmask(0); } -#ifdef CONFIG_X86_32 - while (low_mappings) - cpu_relax(); - __flush_tlb_all(); -#endif - /* This must be done before setting cpu_online_mask */ set_cpu_sibling_map(raw_smp_processor_id()); wmb(); @@ -750,6 +774,7 @@ do_rest: #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); + initial_page_table = __pa(&trampoline_pg_dir); #else clear_tsk_thread_flag(c_idle.idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); @@ -897,20 +922,8 @@ int __cpuinit native_cpu_up(unsigned int cpu) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; -#ifdef CONFIG_X86_32 - /* init low mem mapping */ - clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); - flush_tlb_all(); - low_mappings = 1; - err = do_boot_cpu(apicid, cpu); - zap_low_mappings(false); - low_mappings = 0; -#else - err = do_boot_cpu(apicid, cpu); -#endif if (err) { pr_debug("do_boot_cpu failed %d\n", err); return -EIO; diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index c652ef62742d..e2a595257390 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -1,6 +1,7 @@ #include <linux/io.h> #include <asm/trampoline.h> +#include <asm/pgtable.h> #include <asm/e820.h> #if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) @@ -37,3 +38,19 @@ unsigned long __trampinit setup_trampoline(void) memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); } + +void __init setup_trampoline_page_table(void) +{ +#ifdef CONFIG_X86_32 + /* Copy kernel address range */ + clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + /* Initialize low mappings */ + clone_pgd_range(trampoline_pg_dir, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, + KERNEL_PGD_BOUNDARY)); +#endif +} diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ce8e50239332..26a863a9c2a8 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -626,6 +626,44 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) local_irq_restore(flags); } +static unsigned long long cyc2ns_suspend; + +void save_sched_clock_state(void) +{ + if (!sched_clock_stable) + return; + + cyc2ns_suspend = sched_clock(); +} + +/* + * Even on processors with invariant TSC, TSC gets reset in some the + * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to + * arbitrary value (still sync'd across cpu's) during resume from such sleep + * states. To cope up with this, recompute the cyc2ns_offset for each cpu so + * that sched_clock() continues from the point where it was left off during + * suspend. + */ +void restore_sched_clock_state(void) +{ + unsigned long long offset; + unsigned long flags; + int cpu; + + if (!sched_clock_stable) + return; + + local_irq_save(flags); + + __get_cpu_var(cyc2ns_offset) = 0; + offset = cyc2ns_suspend - sched_clock(); + + for_each_possible_cpu(cpu) + per_cpu(cyc2ns_offset, cpu) = offset; + + local_irq_restore(flags); +} + #ifdef CONFIG_CPU_FREQ /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b38bd8b92aa6..66ca98aafdd6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1870,17 +1870,16 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - u64 old = c->dst.orig_val; + u64 old = c->dst.orig_val64; if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { - c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); ctxt->eflags &= ~EFLG_ZF; } else { - c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) | - (u32) c->regs[VCPU_REGS_RBX]; + c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) | + (u32) c->regs[VCPU_REGS_RBX]; ctxt->eflags |= EFLG_ZF; } @@ -2616,7 +2615,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; - c->src.orig_val = c->src.val; + c->src.orig_val64 = c->src.val64; } if (c->src2.type == OP_MEM) { diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 0fd6378981f4..ddeb2314b522 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -697,6 +697,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit->wq = create_singlethread_workqueue("kvm-pit-wq"); if (!pit->wq) { mutex_unlock(&pit->pit_state.lock); + kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); return NULL; } @@ -742,7 +743,7 @@ fail: kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); kvm_free_irq_source_id(kvm, pit->irq_source_id); - + destroy_workqueue(pit->wq); kfree(pit); return NULL; } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 8d10c063d7f2..4b7b73ce2098 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -64,6 +64,9 @@ static void pic_unlock(struct kvm_pic *s) if (!found) found = s->kvm->bsp_vcpu; + if (!found) + return; + kvm_vcpu_kick(found); } } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index ffed06871c5c..63c314502993 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -43,7 +43,6 @@ struct kvm_kpic_state { u8 irr; /* interrupt request register */ u8 imr; /* interrupt mask register */ u8 isr; /* interrupt service register */ - u8 isr_ack; /* interrupt ack detection */ u8 priority_add; /* highest irq priority */ u8 irq_base; u8 read_reg_select; @@ -56,6 +55,7 @@ struct kvm_kpic_state { u8 init4; /* true if 4 byte init */ u8 elcr; /* PIIX edge/trigger selection */ u8 elcr_mask; + u8 isr_ack; /* interrupt ack detection */ struct kvm_pic *pics_state; }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 25f19078b321..3a09c625d526 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2387,7 +2387,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, if (cpu_has_xsave) memcpy(guest_xsave->region, &vcpu->arch.guest_fpu.state->xsave, - sizeof(struct xsave_struct)); + xstate_size); else { memcpy(guest_xsave->region, &vcpu->arch.guest_fpu.state->fxsave, @@ -2405,7 +2405,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, if (cpu_has_xsave) memcpy(&vcpu->arch.guest_fpu.state->xsave, - guest_xsave->region, sizeof(struct xsave_struct)); + guest_xsave->region, xstate_size); else { if (xstate_bv & ~XSTATE_FPSSE) return -EINVAL; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 9257510b4836..9d5f55848455 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -324,9 +324,8 @@ static void lguest_load_gdt(const struct desc_ptr *desc) } /* - * For a single GDT entry which changes, we do the lazy thing: alter our GDT, - * then tell the Host to reload the entire thing. This operation is so rare - * that this naive implementation is reasonable. + * For a single GDT entry which changes, we simply change our copy and + * then tell the host about it. */ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, const void *desc, int type) @@ -338,9 +337,13 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, } /* - * OK, I lied. There are three "thread local storage" GDT entries which change + * There are three "thread local storage" GDT entries which change * on every context switch (these three entries are how glibc implements - * __thread variables). So we have a hypercall specifically for this case. + * __thread variables). As an optimization, we have a hypercall + * specifically for this case. + * + * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall + * which took a range of entries? */ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) { diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 84e236ce76ba..72fc70cf6184 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -74,7 +74,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) /* * Map 'pfn' using fixed map 'type' and protections 'prot' */ -void * +void __iomem * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) { /* @@ -86,12 +86,12 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) prot = PAGE_KERNEL_UC_MINUS; - return kmap_atomic_prot_pfn(pfn, type, prot); + return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, type, prot); } EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); void -iounmap_atomic(void *kvaddr, enum km_type type) +iounmap_atomic(void __iomem *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index f6b48f6c5951..009b819f48d0 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -568,8 +568,13 @@ static int __init init_sysfs(void) int error; error = sysdev_class_register(&oprofile_sysclass); - if (!error) - error = sysdev_register(&device_oprofile); + if (error) + return error; + + error = sysdev_register(&device_oprofile); + if (error) + sysdev_class_unregister(&oprofile_sysclass); + return error; } @@ -580,8 +585,10 @@ static void exit_sysfs(void) } #else -#define init_sysfs() do { } while (0) -#define exit_sysfs() do { } while (0) + +static inline int init_sysfs(void) { return 0; } +static inline void exit_sysfs(void) { } + #endif /* CONFIG_PM */ static int __init p4_init(char **cpu_type) @@ -664,7 +671,9 @@ static int __init ppro_init(char **cpu_type) case 14: *cpu_type = "i386/core"; break; - case 15: case 23: + case 0x0f: + case 0x16: + case 0x17: *cpu_type = "i386/core_2"; break; case 0x1a: @@ -695,6 +704,8 @@ int __init op_nmi_init(struct oprofile_operations *ops) char *cpu_type = NULL; int ret = 0; + using_nmi = 0; + if (!cpu_has_apic) return -ENODEV; @@ -774,7 +785,10 @@ int __init op_nmi_init(struct oprofile_operations *ops) mux_init(ops); - init_sysfs(); + ret = init_sysfs(); + if (ret) + return ret; + using_nmi = 1; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index e7e8c5f54956..87bb35e34ef1 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -113,6 +113,7 @@ static void __save_processor_state(struct saved_context *ctxt) void save_processor_state(void) { __save_processor_state(&saved_context); + save_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(save_processor_state); @@ -229,6 +230,7 @@ static void __restore_processor_state(struct saved_context *ctxt) void restore_processor_state(void) { __restore_processor_state(&saved_context); + restore_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(restore_processor_state); diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 554c002a1e1a..0f456386cce5 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -72,13 +72,17 @@ void __init xen_unplug_emulated_devices(void) { int r; + /* user explicitly requested no unplug */ + if (xen_emul_unplug & XEN_UNPLUG_NEVER) + return; /* check the version of the xen platform PCI device */ r = check_platform_magic(); /* If the version matches enable the Xen platform PCI driver. - * Also enable the Xen platform PCI driver if the version is really old - * and the user told us to ignore it. */ + * Also enable the Xen platform PCI driver if the host does + * not support the unplug protocol (XEN_PLATFORM_ERR_MAGIC) + * but the user told us that unplugging is unnecessary. */ if (r && !(r == XEN_PLATFORM_ERR_MAGIC && - (xen_emul_unplug & XEN_UNPLUG_IGNORE))) + (xen_emul_unplug & XEN_UNPLUG_UNNECESSARY))) return; /* Set the default value of xen_emul_unplug depending on whether or * not the Xen PV frontends and the Xen platform PCI driver have @@ -99,7 +103,7 @@ void __init xen_unplug_emulated_devices(void) } } /* Now unplug the emulated devices */ - if (!(xen_emul_unplug & XEN_UNPLUG_IGNORE)) + if (!(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY)) outw(xen_emul_unplug, XEN_IOPORT_UNPLUG); xen_platform_pci_unplug = xen_emul_unplug; } @@ -125,8 +129,10 @@ static int __init parse_xen_emul_unplug(char *arg) xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS; else if (!strncmp(p, "nics", l)) xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; - else if (!strncmp(p, "ignore", l)) - xen_emul_unplug |= XEN_UNPLUG_IGNORE; + else if (!strncmp(p, "unnecessary", l)) + xen_emul_unplug |= XEN_UNPLUG_UNNECESSARY; + else if (!strncmp(p, "never", l)) + xen_emul_unplug |= XEN_UNPLUG_NEVER; else printk(KERN_WARNING "unrecognised option '%s' " "in parameter 'xen_emul_unplug'\n", p); |