diff options
Diffstat (limited to 'arch/x86/kernel')
54 files changed, 463 insertions, 1433 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5eeb808eb024..2ddf08351f0b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -116,7 +116,6 @@ obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o -obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index f1bb57b0e41e..cf340d85946a 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 5d3a0b8fd379..56b6865afb2a 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -1,12 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0-only */ .text #include <linux/linkage.h> +#include <linux/objtool.h> #include <asm/segment.h> #include <asm/pgtable_types.h> #include <asm/page_types.h> #include <asm/msr.h> #include <asm/asm-offsets.h> #include <asm/frame.h> +#include <asm/nospec-branch.h> # Copyright 2003 Pavel Machek <pavel@suse.cz @@ -39,6 +41,7 @@ SYM_FUNC_START(wakeup_long64) movq saved_rbp, %rbp movq saved_rip, %rax + ANNOTATE_RETPOLINE_SAFE jmp *%rax SYM_FUNC_END(wakeup_long64) @@ -126,6 +129,7 @@ SYM_FUNC_START(do_suspend_lowlevel) FRAME_END jmp restore_processor_state SYM_FUNC_END(do_suspend_lowlevel) +STACK_FRAME_NON_STANDARD do_suspend_lowlevel .data saved_rbp: .quad 0 diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c deleted file mode 100644 index 263eeaddb0aa..000000000000 --- a/arch/x86/kernel/apb_timer.c +++ /dev/null @@ -1,347 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * apb_timer.c: Driver for Langwell APB timers - * - * (C) Copyright 2009 Intel Corporation - * Author: Jacob Pan (jacob.jun.pan@intel.com) - * - * Note: - * Langwell is the south complex of Intel Moorestown MID platform. There are - * eight external timers in total that can be used by the operating system. - * The timer information, such as frequency and addresses, is provided to the - * OS via SFI tables. - * Timer interrupts are routed via FW/HW emulated IOAPIC independently via - * individual redirection table entries (RTE). - * Unlike HPET, there is no master counter, therefore one of the timers are - * used as clocksource. The overall allocation looks like: - * - timer 0 - NR_CPUs for per cpu timer - * - one timer for clocksource - * - one timer for watchdog driver. - * It is also worth notice that APB timer does not support true one-shot mode, - * free-running mode will be used here to emulate one-shot mode. - * APB timer can also be used as broadcast timer along with per cpu local APIC - * timer, but by default APB timer has higher rating than local APIC timers. - */ - -#include <linux/delay.h> -#include <linux/dw_apb_timer.h> -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/pm.h> -#include <linux/sfi.h> -#include <linux/interrupt.h> -#include <linux/cpu.h> -#include <linux/irq.h> - -#include <asm/fixmap.h> -#include <asm/apb_timer.h> -#include <asm/intel-mid.h> -#include <asm/time.h> - -#define APBT_CLOCKEVENT_RATING 110 -#define APBT_CLOCKSOURCE_RATING 250 - -#define APBT_CLOCKEVENT0_NUM (0) -#define APBT_CLOCKSOURCE_NUM (2) - -static phys_addr_t apbt_address; -static int apb_timer_block_enabled; -static void __iomem *apbt_virt_address; - -/* - * Common DW APB timer info - */ -static unsigned long apbt_freq; - -struct apbt_dev { - struct dw_apb_clock_event_device *timer; - unsigned int num; - int cpu; - unsigned int irq; - char name[10]; -}; - -static struct dw_apb_clocksource *clocksource_apbt; - -static inline void __iomem *adev_virt_addr(struct apbt_dev *adev) -{ - return apbt_virt_address + adev->num * APBTMRS_REG_SIZE; -} - -static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); - -#ifdef CONFIG_SMP -static unsigned int apbt_num_timers_used; -#endif - -static inline void apbt_set_mapping(void) -{ - struct sfi_timer_table_entry *mtmr; - int phy_cs_timer_id = 0; - - if (apbt_virt_address) { - pr_debug("APBT base already mapped\n"); - return; - } - mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); - if (mtmr == NULL) { - printk(KERN_ERR "Failed to get MTMR %d from SFI\n", - APBT_CLOCKEVENT0_NUM); - return; - } - apbt_address = (phys_addr_t)mtmr->phys_addr; - if (!apbt_address) { - printk(KERN_WARNING "No timer base from SFI, use default\n"); - apbt_address = APBT_DEFAULT_BASE; - } - apbt_virt_address = ioremap(apbt_address, APBT_MMAP_SIZE); - if (!apbt_virt_address) { - pr_debug("Failed mapping APBT phy address at %lu\n",\ - (unsigned long)apbt_address); - goto panic_noapbt; - } - apbt_freq = mtmr->freq_hz; - sfi_free_mtmr(mtmr); - - /* Now figure out the physical timer id for clocksource device */ - mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); - if (mtmr == NULL) - goto panic_noapbt; - - /* Now figure out the physical timer id */ - pr_debug("Use timer %d for clocksource\n", - (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE); - phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) / - APBTMRS_REG_SIZE; - - clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING, - "apbt0", apbt_virt_address + phy_cs_timer_id * - APBTMRS_REG_SIZE, apbt_freq); - return; - -panic_noapbt: - panic("Failed to setup APB system timer\n"); - -} - -static inline void apbt_clear_mapping(void) -{ - iounmap(apbt_virt_address); - apbt_virt_address = NULL; -} - -static int __init apbt_clockevent_register(void) -{ - struct sfi_timer_table_entry *mtmr; - struct apbt_dev *adev = this_cpu_ptr(&cpu_apbt_dev); - - mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); - if (mtmr == NULL) { - printk(KERN_ERR "Failed to get MTMR %d from SFI\n", - APBT_CLOCKEVENT0_NUM); - return -ENODEV; - } - - adev->num = smp_processor_id(); - adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0", - intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ? - APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING, - adev_virt_addr(adev), 0, apbt_freq); - /* Firmware does EOI handling for us. */ - adev->timer->eoi = NULL; - - if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) { - global_clock_event = &adev->timer->ced; - printk(KERN_DEBUG "%s clockevent registered as global\n", - global_clock_event->name); - } - - dw_apb_clockevent_register(adev->timer); - - sfi_free_mtmr(mtmr); - return 0; -} - -#ifdef CONFIG_SMP - -static void apbt_setup_irq(struct apbt_dev *adev) -{ - irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); - irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); -} - -/* Should be called with per cpu */ -void apbt_setup_secondary_clock(void) -{ - struct apbt_dev *adev; - int cpu; - - /* Don't register boot CPU clockevent */ - cpu = smp_processor_id(); - if (!cpu) - return; - - adev = this_cpu_ptr(&cpu_apbt_dev); - if (!adev->timer) { - adev->timer = dw_apb_clockevent_init(cpu, adev->name, - APBT_CLOCKEVENT_RATING, adev_virt_addr(adev), - adev->irq, apbt_freq); - adev->timer->eoi = NULL; - } else { - dw_apb_clockevent_resume(adev->timer); - } - - printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n", - cpu, adev->name, adev->cpu); - - apbt_setup_irq(adev); - dw_apb_clockevent_register(adev->timer); - - return; -} - -/* - * this notify handler process CPU hotplug events. in case of S0i3, nonboot - * cpus are disabled/enabled frequently, for performance reasons, we keep the - * per cpu timer irq registered so that we do need to do free_irq/request_irq. - * - * TODO: it might be more reliable to directly disable percpu clockevent device - * without the notifier chain. currently, cpu 0 may get interrupts from other - * cpu timers during the offline process due to the ordering of notification. - * the extra interrupt is harmless. - */ -static int apbt_cpu_dead(unsigned int cpu) -{ - struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); - - dw_apb_clockevent_pause(adev->timer); - if (system_state == SYSTEM_RUNNING) { - pr_debug("skipping APBT CPU %u offline\n", cpu); - } else { - pr_debug("APBT clockevent for cpu %u offline\n", cpu); - dw_apb_clockevent_stop(adev->timer); - } - return 0; -} - -static __init int apbt_late_init(void) -{ - if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT || - !apb_timer_block_enabled) - return 0; - return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "x86/apb:dead", NULL, - apbt_cpu_dead); -} -fs_initcall(apbt_late_init); -#else - -void apbt_setup_secondary_clock(void) {} - -#endif /* CONFIG_SMP */ - -static int apbt_clocksource_register(void) -{ - u64 start, now; - u64 t1; - - /* Start the counter, use timer 2 as source, timer 0/1 for event */ - dw_apb_clocksource_start(clocksource_apbt); - - /* Verify whether apbt counter works */ - t1 = dw_apb_clocksource_read(clocksource_apbt); - start = rdtsc(); - - /* - * We don't know the TSC frequency yet, but waiting for - * 200000 TSC cycles is safe: - * 4 GHz == 50us - * 1 GHz == 200us - */ - do { - rep_nop(); - now = rdtsc(); - } while ((now - start) < 200000UL); - - /* APBT is the only always on clocksource, it has to work! */ - if (t1 == dw_apb_clocksource_read(clocksource_apbt)) - panic("APBT counter not counting. APBT disabled\n"); - - dw_apb_clocksource_register(clocksource_apbt); - - return 0; -} - -/* - * Early setup the APBT timer, only use timer 0 for booting then switch to - * per CPU timer if possible. - * returns 1 if per cpu apbt is setup - * returns 0 if no per cpu apbt is chosen - * panic if set up failed, this is the only platform timer on Moorestown. - */ -void __init apbt_time_init(void) -{ -#ifdef CONFIG_SMP - int i; - struct sfi_timer_table_entry *p_mtmr; - struct apbt_dev *adev; -#endif - - if (apb_timer_block_enabled) - return; - apbt_set_mapping(); - if (!apbt_virt_address) - goto out_noapbt; - /* - * Read the frequency and check for a sane value, for ESL model - * we extend the possible clock range to allow time scaling. - */ - - if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { - pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq); - goto out_noapbt; - } - if (apbt_clocksource_register()) { - pr_debug("APBT has failed to register clocksource\n"); - goto out_noapbt; - } - if (!apbt_clockevent_register()) - apb_timer_block_enabled = 1; - else { - pr_debug("APBT has failed to register clockevent\n"); - goto out_noapbt; - } -#ifdef CONFIG_SMP - /* kernel cmdline disable apb timer, so we will use lapic timers */ - if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) { - printk(KERN_INFO "apbt: disabled per cpu timer\n"); - return; - } - pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); - if (num_possible_cpus() <= sfi_mtimer_num) - apbt_num_timers_used = num_possible_cpus(); - else - apbt_num_timers_used = 1; - pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); - - /* here we set up per CPU timer data structure */ - for (i = 0; i < apbt_num_timers_used; i++) { - adev = &per_cpu(cpu_apbt_dev, i); - adev->num = i; - adev->cpu = i; - p_mtmr = sfi_get_mtmr(i); - if (p_mtmr) - adev->irq = p_mtmr->irq; - else - printk(KERN_ERR "Failed to get timer for cpu %d\n", i); - snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i); - } -#endif - - return; - -out_noapbt: - apbt_clear_mapping(); - apb_timer_block_enabled = 0; - panic("failed to enable APB timer\n"); -} diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6bd20c0de8bc..bda4f2a36868 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -41,6 +41,7 @@ #include <asm/perf_event.h> #include <asm/x86_init.h> #include <linux/atomic.h> +#include <asm/barrier.h> #include <asm/mpspec.h> #include <asm/i8259.h> #include <asm/proto.h> @@ -477,6 +478,9 @@ static int lapic_next_deadline(unsigned long delta, { u64 tsc; + /* This MSR is special and need a special fence: */ + weak_wrmsr_fence(); + tsc = rdtsc(); wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); return 0; @@ -1743,6 +1747,7 @@ void apic_ap_setup(void) #ifdef CONFIG_X86_X2APIC int x2apic_mode; +EXPORT_SYMBOL_GPL(x2apic_mode); enum { X2APIC_OFF, @@ -2133,18 +2138,11 @@ void __init register_lapic_address(unsigned long address) * Local APIC interrupts */ -/** - * spurious_interrupt - Catch all for interrupts raised on unused vectors - * @regs: Pointer to pt_regs on stack - * @vector: The vector number - * - * This is invoked from ASM entry code to catch all interrupts which - * trigger on an entry which is routed to the common_spurious idtentry - * point. - * - * Also called from sysvec_spurious_apic_interrupt(). +/* + * Common handling code for spurious_interrupt and spurious_vector entry + * points below. No point in allowing the compiler to inline it twice. */ -DEFINE_IDTENTRY_IRQ(spurious_interrupt) +static noinline void handle_spurious_interrupt(u8 vector) { u32 v; @@ -2179,9 +2177,23 @@ out: trace_spurious_apic_exit(vector); } +/** + * spurious_interrupt - Catch all for interrupts raised on unused vectors + * @regs: Pointer to pt_regs on stack + * @vector: The vector number + * + * This is invoked from ASM entry code to catch all interrupts which + * trigger on an entry which is routed to the common_spurious idtentry + * point. + */ +DEFINE_IDTENTRY_IRQ(spurious_interrupt) +{ + handle_spurious_interrupt(vector); +} + DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt) { - __spurious_interrupt(regs, SPURIOUS_APIC_VECTOR); + handle_spurious_interrupt(SPURIOUS_APIC_VECTOR); } /* diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e4ab4804b20d..c3b60c37c728 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -198,7 +198,7 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ +/* Will be called in mpparse/ACPI codes for saving IRQ info */ void mp_save_irq(struct mpc_intsrc *m) { int i; @@ -2863,7 +2863,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, /* * If mp_register_ioapic() is called during early boot stage when - * walking ACPI/SFI/DT tables, it's too early to create irqdomain, + * walking ACPI/DT tables, it's too early to create irqdomain, * we are still using bootmem allocator. So delay it to setup_IO_APIC(). */ if (hotplug) { diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index df6adc5674c9..f4da9bb69a88 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -29,7 +29,8 @@ static void x2apic_send_IPI(int cpu, int vector) { u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); } @@ -41,7 +42,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) unsigned long flags; u32 dest; - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); local_irq_save(flags); tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 0e4e81971567..6bde05a86b4e 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -43,7 +43,8 @@ static void x2apic_send_IPI(int cpu, int vector) { u32 dest = per_cpu(x86_cpu_to_apicid, cpu); - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL); } @@ -54,7 +55,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) unsigned long this_cpu; unsigned long flags; - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); local_irq_save(flags); @@ -125,7 +127,8 @@ void __x2apic_send_IPI_shorthand(int vector, u32 which) { unsigned long cfg = __prepare_ICR(which, vector, 0); - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); native_x2apic_icr_write(cfg, 0); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 828be792231e..b14533af7676 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -13,9 +13,6 @@ int main(void) { #ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT_XXL - OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template, - cpu.usergs_sysret64); - OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs); #ifdef CONFIG_DEBUG_ENTRY OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl); #endif diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index 0b2c03943ac6..23f5f27b5a02 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -10,6 +10,8 @@ */ #include <linux/interrupt.h> + +#include <asm/acrn.h> #include <asm/apic.h> #include <asm/cpufeatures.h> #include <asm/desc.h> @@ -19,7 +21,7 @@ static u32 __init acrn_detect(void) { - return hypervisor_cpuid_base("ACRNACRNACRN", 0); + return acrn_cpuid_base(); } static void __init acrn_init_platform(void) @@ -55,6 +57,18 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback) set_irq_regs(old_regs); } +void acrn_setup_intr_handler(void (*handler)(void)) +{ + acrn_intr_handler = handler; +} +EXPORT_SYMBOL_GPL(acrn_setup_intr_handler); + +void acrn_remove_intr_handler(void) +{ + acrn_intr_handler = NULL; +} +EXPORT_SYMBOL_GPL(acrn_remove_intr_handler); + const __initconst struct hypervisor_x86 x86_hyper_acrn = { .name = "ACRN", .detect = acrn_detect, diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 35ad8480c464..ab640abe26b6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -960,6 +960,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + if (c->extended_cpuid_level >= 0x8000001f) + c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f); + init_scattered_cpuid_features(c); init_speculation_control(c); @@ -1739,8 +1742,8 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); -DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; +DEFINE_PER_CPU(void *, hardirq_stack_ptr); +DEFINE_PER_CPU(bool, hardirq_stack_inuse); DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 59a1e3ce3f14..0e422a544835 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -24,6 +24,7 @@ #include <asm/traps.h> #include <asm/resctrl.h> #include <asm/numa.h> +#include <asm/thermal.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -719,6 +720,8 @@ static void init_intel(struct cpuinfo_x86 *c) tsx_disable(); split_lock_init(); + + intel_init_thermal(c); } #ifdef CONFIG_X86_32 @@ -1159,6 +1162,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1), {} }; diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile index 9f020c994154..015856abdbb1 100644 --- a/arch/x86/kernel/cpu/mce/Makefile +++ b/arch/x86/kernel/cpu/mce/Makefile @@ -9,8 +9,6 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o mce-inject-y := inject.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o -obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o - obj-$(CONFIG_ACPI_APEI) += apei.o obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index e133ce1e562b..7962355436da 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -878,6 +878,12 @@ static atomic_t mce_executing; static atomic_t mce_callin; /* + * Track which CPUs entered the MCA broadcast synchronization and which not in + * order to print holdouts. + */ +static cpumask_t mce_missing_cpus = CPU_MASK_ALL; + +/* * Check if a timeout waiting for other CPUs happened. */ static int mce_timed_out(u64 *t, const char *msg) @@ -894,8 +900,12 @@ static int mce_timed_out(u64 *t, const char *msg) if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { - if (mca_cfg.tolerant <= 1) + if (mca_cfg.tolerant <= 1) { + if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus)) + pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n", + cpumask_pr_args(&mce_missing_cpus)); mce_panic(msg, NULL, NULL); + } cpu_missing = 1; return 1; } @@ -1006,6 +1016,7 @@ static int mce_start(int *no_way_out) * is updated before mce_callin. */ order = atomic_inc_return(&mce_callin); + cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus); /* * Wait for everyone. @@ -1114,6 +1125,7 @@ static int mce_end(int order) reset: atomic_set(&global_nwo, 0); atomic_set(&mce_callin, 0); + cpumask_setall(&mce_missing_cpus); barrier(); /* @@ -2178,7 +2190,6 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { - mcheck_intel_therm_init(); mce_register_decode_chain(&early_nb); mce_register_decode_chain(&mce_uc_nb); mce_register_decode_chain(&mce_default_nb); @@ -2713,6 +2724,7 @@ static void mce_reset(void) atomic_set(&mce_executing, 0); atomic_set(&mce_callin, 0); atomic_set(&global_nwo, 0); + cpumask_setall(&mce_missing_cpus); } static int fake_panic_get(void *data, u64 *val) diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index c2476fe0682e..e309476743b7 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -531,7 +531,6 @@ static void intel_imc_init(struct cpuinfo_x86 *c) void mce_intel_feature_init(struct cpuinfo_x86 *c) { - intel_init_thermal(c); intel_init_cmci(); intel_init_lmce(); intel_ppin_init(c); diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c deleted file mode 100644 index a7cd2d203ced..000000000000 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ /dev/null @@ -1,739 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Thermal throttle event support code (such as syslog messaging and rate - * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). - * - * This allows consistent reporting of CPU thermal throttle events. - * - * Maintains a counter in /sys that keeps track of the number of thermal - * events, such that the user knows how bad the thermal problem might be - * (since the logging to syslog is rate limited). - * - * Author: Dmitriy Zavin (dmitriyz@google.com) - * - * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. - * Inspired by Ross Biro's and Al Borchers' counter code. - */ -#include <linux/interrupt.h> -#include <linux/notifier.h> -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/percpu.h> -#include <linux/export.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/cpu.h> - -#include <asm/processor.h> -#include <asm/traps.h> -#include <asm/apic.h> -#include <asm/mce.h> -#include <asm/msr.h> -#include <asm/trace/irq_vectors.h> - -#include "internal.h" - -/* How long to wait between reporting thermal events */ -#define CHECK_INTERVAL (300 * HZ) - -#define THERMAL_THROTTLING_EVENT 0 -#define POWER_LIMIT_EVENT 1 - -/** - * struct _thermal_state - Represent the current thermal event state - * @next_check: Stores the next timestamp, when it is allowed - * to log the next warning message. - * @last_interrupt_time: Stores the timestamp for the last threshold - * high event. - * @therm_work: Delayed workqueue structure - * @count: Stores the current running count for thermal - * or power threshold interrupts. - * @last_count: Stores the previous running count for thermal - * or power threshold interrupts. - * @max_time_ms: This shows the maximum amount of time CPU was - * in throttled state for a single thermal - * threshold high to low state. - * @total_time_ms: This is a cumulative time during which CPU was - * in the throttled state. - * @rate_control_active: Set when a throttling message is logged. - * This is used for the purpose of rate-control. - * @new_event: Stores the last high/low status of the - * THERM_STATUS_PROCHOT or - * THERM_STATUS_POWER_LIMIT. - * @level: Stores whether this _thermal_state instance is - * for a CORE level or for PACKAGE level. - * @sample_index: Index for storing the next sample in the buffer - * temp_samples[]. - * @sample_count: Total number of samples collected in the buffer - * temp_samples[]. - * @average: The last moving average of temperature samples - * @baseline_temp: Temperature at which thermal threshold high - * interrupt was generated. - * @temp_samples: Storage for temperature samples to calculate - * moving average. - * - * This structure is used to represent data related to thermal state for a CPU. - * There is a separate storage for core and package level for each CPU. - */ -struct _thermal_state { - u64 next_check; - u64 last_interrupt_time; - struct delayed_work therm_work; - unsigned long count; - unsigned long last_count; - unsigned long max_time_ms; - unsigned long total_time_ms; - bool rate_control_active; - bool new_event; - u8 level; - u8 sample_index; - u8 sample_count; - u8 average; - u8 baseline_temp; - u8 temp_samples[3]; -}; - -struct thermal_state { - struct _thermal_state core_throttle; - struct _thermal_state core_power_limit; - struct _thermal_state package_throttle; - struct _thermal_state package_power_limit; - struct _thermal_state core_thresh0; - struct _thermal_state core_thresh1; - struct _thermal_state pkg_thresh0; - struct _thermal_state pkg_thresh1; -}; - -/* Callback to handle core threshold interrupts */ -int (*platform_thermal_notify)(__u64 msr_val); -EXPORT_SYMBOL(platform_thermal_notify); - -/* Callback to handle core package threshold_interrupts */ -int (*platform_thermal_package_notify)(__u64 msr_val); -EXPORT_SYMBOL_GPL(platform_thermal_package_notify); - -/* Callback support of rate control, return true, if - * callback has rate control */ -bool (*platform_thermal_package_rate_control)(void); -EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control); - - -static DEFINE_PER_CPU(struct thermal_state, thermal_state); - -static atomic_t therm_throt_en = ATOMIC_INIT(0); - -static u32 lvtthmr_init __read_mostly; - -#ifdef CONFIG_SYSFS -#define define_therm_throt_device_one_ro(_name) \ - static DEVICE_ATTR(_name, 0444, \ - therm_throt_device_show_##_name, \ - NULL) \ - -#define define_therm_throt_device_show_func(event, name) \ - \ -static ssize_t therm_throt_device_show_##event##_##name( \ - struct device *dev, \ - struct device_attribute *attr, \ - char *buf) \ -{ \ - unsigned int cpu = dev->id; \ - ssize_t ret; \ - \ - preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) { \ - ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).event.name); \ - } else \ - ret = 0; \ - preempt_enable(); \ - \ - return ret; \ -} - -define_therm_throt_device_show_func(core_throttle, count); -define_therm_throt_device_one_ro(core_throttle_count); - -define_therm_throt_device_show_func(core_power_limit, count); -define_therm_throt_device_one_ro(core_power_limit_count); - -define_therm_throt_device_show_func(package_throttle, count); -define_therm_throt_device_one_ro(package_throttle_count); - -define_therm_throt_device_show_func(package_power_limit, count); -define_therm_throt_device_one_ro(package_power_limit_count); - -define_therm_throt_device_show_func(core_throttle, max_time_ms); -define_therm_throt_device_one_ro(core_throttle_max_time_ms); - -define_therm_throt_device_show_func(package_throttle, max_time_ms); -define_therm_throt_device_one_ro(package_throttle_max_time_ms); - -define_therm_throt_device_show_func(core_throttle, total_time_ms); -define_therm_throt_device_one_ro(core_throttle_total_time_ms); - -define_therm_throt_device_show_func(package_throttle, total_time_ms); -define_therm_throt_device_one_ro(package_throttle_total_time_ms); - -static struct attribute *thermal_throttle_attrs[] = { - &dev_attr_core_throttle_count.attr, - &dev_attr_core_throttle_max_time_ms.attr, - &dev_attr_core_throttle_total_time_ms.attr, - NULL -}; - -static const struct attribute_group thermal_attr_group = { - .attrs = thermal_throttle_attrs, - .name = "thermal_throttle" -}; -#endif /* CONFIG_SYSFS */ - -#define CORE_LEVEL 0 -#define PACKAGE_LEVEL 1 - -#define THERM_THROT_POLL_INTERVAL HZ -#define THERM_STATUS_PROCHOT_LOG BIT(1) - -#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15)) -#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11)) - -static void clear_therm_status_log(int level) -{ - int msr; - u64 mask, msr_val; - - if (level == CORE_LEVEL) { - msr = MSR_IA32_THERM_STATUS; - mask = THERM_STATUS_CLEAR_CORE_MASK; - } else { - msr = MSR_IA32_PACKAGE_THERM_STATUS; - mask = THERM_STATUS_CLEAR_PKG_MASK; - } - - rdmsrl(msr, msr_val); - msr_val &= mask; - wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG); -} - -static void get_therm_status(int level, bool *proc_hot, u8 *temp) -{ - int msr; - u64 msr_val; - - if (level == CORE_LEVEL) - msr = MSR_IA32_THERM_STATUS; - else - msr = MSR_IA32_PACKAGE_THERM_STATUS; - - rdmsrl(msr, msr_val); - if (msr_val & THERM_STATUS_PROCHOT_LOG) - *proc_hot = true; - else - *proc_hot = false; - - *temp = (msr_val >> 16) & 0x7F; -} - -static void __maybe_unused throttle_active_work(struct work_struct *work) -{ - struct _thermal_state *state = container_of(to_delayed_work(work), - struct _thermal_state, therm_work); - unsigned int i, avg, this_cpu = smp_processor_id(); - u64 now = get_jiffies_64(); - bool hot; - u8 temp; - - get_therm_status(state->level, &hot, &temp); - /* temperature value is offset from the max so lesser means hotter */ - if (!hot && temp > state->baseline_temp) { - if (state->rate_control_active) - pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n", - this_cpu, - state->level == CORE_LEVEL ? "Core" : "Package", - state->count); - - state->rate_control_active = false; - return; - } - - if (time_before64(now, state->next_check) && - state->rate_control_active) - goto re_arm; - - state->next_check = now + CHECK_INTERVAL; - - if (state->count != state->last_count) { - /* There was one new thermal interrupt */ - state->last_count = state->count; - state->average = 0; - state->sample_count = 0; - state->sample_index = 0; - } - - state->temp_samples[state->sample_index] = temp; - state->sample_count++; - state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples); - if (state->sample_count < ARRAY_SIZE(state->temp_samples)) - goto re_arm; - - avg = 0; - for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i) - avg += state->temp_samples[i]; - - avg /= ARRAY_SIZE(state->temp_samples); - - if (state->average > avg) { - pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n", - this_cpu, - state->level == CORE_LEVEL ? "Core" : "Package", - state->count); - state->rate_control_active = true; - } - - state->average = avg; - -re_arm: - clear_therm_status_log(state->level); - schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); -} - -/*** - * therm_throt_process - Process thermal throttling event from interrupt - * @curr: Whether the condition is current or not (boolean), since the - * thermal interrupt normally gets called both when the thermal - * event begins and once the event has ended. - * - * This function is called by the thermal interrupt after the - * IRQ has been acknowledged. - * - * It will take care of rate limiting and printing messages to the syslog. - */ -static void therm_throt_process(bool new_event, int event, int level) -{ - struct _thermal_state *state; - unsigned int this_cpu = smp_processor_id(); - bool old_event; - u64 now; - struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - - now = get_jiffies_64(); - if (level == CORE_LEVEL) { - if (event == THERMAL_THROTTLING_EVENT) - state = &pstate->core_throttle; - else if (event == POWER_LIMIT_EVENT) - state = &pstate->core_power_limit; - else - return; - } else if (level == PACKAGE_LEVEL) { - if (event == THERMAL_THROTTLING_EVENT) - state = &pstate->package_throttle; - else if (event == POWER_LIMIT_EVENT) - state = &pstate->package_power_limit; - else - return; - } else - return; - - old_event = state->new_event; - state->new_event = new_event; - - if (new_event) - state->count++; - - if (event != THERMAL_THROTTLING_EVENT) - return; - - if (new_event && !state->last_interrupt_time) { - bool hot; - u8 temp; - - get_therm_status(state->level, &hot, &temp); - /* - * Ignore short temperature spike as the system is not close - * to PROCHOT. 10C offset is large enough to ignore. It is - * already dropped from the high threshold temperature. - */ - if (temp > 10) - return; - - state->baseline_temp = temp; - state->last_interrupt_time = now; - schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); - } else if (old_event && state->last_interrupt_time) { - unsigned long throttle_time; - - throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time); - if (throttle_time > state->max_time_ms) - state->max_time_ms = throttle_time; - state->total_time_ms += throttle_time; - state->last_interrupt_time = 0; - } -} - -static int thresh_event_valid(int level, int event) -{ - struct _thermal_state *state; - unsigned int this_cpu = smp_processor_id(); - struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - u64 now = get_jiffies_64(); - - if (level == PACKAGE_LEVEL) - state = (event == 0) ? &pstate->pkg_thresh0 : - &pstate->pkg_thresh1; - else - state = (event == 0) ? &pstate->core_thresh0 : - &pstate->core_thresh1; - - if (time_before64(now, state->next_check)) - return 0; - - state->next_check = now + CHECK_INTERVAL; - - return 1; -} - -static bool int_pln_enable; -static int __init int_pln_enable_setup(char *s) -{ - int_pln_enable = true; - - return 1; -} -__setup("int_pln_enable", int_pln_enable_setup); - -#ifdef CONFIG_SYSFS -/* Add/Remove thermal_throttle interface for CPU device: */ -static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) -{ - int err; - struct cpuinfo_x86 *c = &cpu_data(cpu); - - err = sysfs_create_group(&dev->kobj, &thermal_attr_group); - if (err) - return err; - - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_core_power_limit_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - } - - if (cpu_has(c, X86_FEATURE_PTS)) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_max_time_ms.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_total_time_ms.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_power_limit_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - } - } - - return 0; - -del_group: - sysfs_remove_group(&dev->kobj, &thermal_attr_group); - - return err; -} - -static void thermal_throttle_remove_dev(struct device *dev) -{ - sysfs_remove_group(&dev->kobj, &thermal_attr_group); -} - -/* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int thermal_throttle_online(unsigned int cpu) -{ - struct thermal_state *state = &per_cpu(thermal_state, cpu); - struct device *dev = get_cpu_device(cpu); - u32 l; - - state->package_throttle.level = PACKAGE_LEVEL; - state->core_throttle.level = CORE_LEVEL; - - INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work); - INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work); - - /* Unmask the thermal vector after the above workqueues are initialized. */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - - return thermal_throttle_add_dev(dev, cpu); -} - -static int thermal_throttle_offline(unsigned int cpu) -{ - struct thermal_state *state = &per_cpu(thermal_state, cpu); - struct device *dev = get_cpu_device(cpu); - u32 l; - - /* Mask the thermal vector before draining evtl. pending work */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED); - - cancel_delayed_work_sync(&state->package_throttle.therm_work); - cancel_delayed_work_sync(&state->core_throttle.therm_work); - - state->package_throttle.rate_control_active = false; - state->core_throttle.rate_control_active = false; - - thermal_throttle_remove_dev(dev); - return 0; -} - -static __init int thermal_throttle_init_device(void) -{ - int ret; - - if (!atomic_read(&therm_throt_en)) - return 0; - - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online", - thermal_throttle_online, - thermal_throttle_offline); - return ret < 0 ? ret : 0; -} -device_initcall(thermal_throttle_init_device); - -#endif /* CONFIG_SYSFS */ - -static void notify_package_thresholds(__u64 msr_val) -{ - bool notify_thres_0 = false; - bool notify_thres_1 = false; - - if (!platform_thermal_package_notify) - return; - - /* lower threshold check */ - if (msr_val & THERM_LOG_THRESHOLD0) - notify_thres_0 = true; - /* higher threshold check */ - if (msr_val & THERM_LOG_THRESHOLD1) - notify_thres_1 = true; - - if (!notify_thres_0 && !notify_thres_1) - return; - - if (platform_thermal_package_rate_control && - platform_thermal_package_rate_control()) { - /* Rate control is implemented in callback */ - platform_thermal_package_notify(msr_val); - return; - } - - /* lower threshold reached */ - if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0)) - platform_thermal_package_notify(msr_val); - /* higher threshold reached */ - if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1)) - platform_thermal_package_notify(msr_val); -} - -static void notify_thresholds(__u64 msr_val) -{ - /* check whether the interrupt handler is defined; - * otherwise simply return - */ - if (!platform_thermal_notify) - return; - - /* lower threshold reached */ - if ((msr_val & THERM_LOG_THRESHOLD0) && - thresh_event_valid(CORE_LEVEL, 0)) - platform_thermal_notify(msr_val); - /* higher threshold reached */ - if ((msr_val & THERM_LOG_THRESHOLD1) && - thresh_event_valid(CORE_LEVEL, 1)) - platform_thermal_notify(msr_val); -} - -/* Thermal transition interrupt handler */ -static void intel_thermal_interrupt(void) -{ - __u64 msr_val; - - if (static_cpu_has(X86_FEATURE_HWP)) - wrmsrl_safe(MSR_HWP_STATUS, 0); - - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - - /* Check for violation of core thermal thresholds*/ - notify_thresholds(msr_val); - - therm_throt_process(msr_val & THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - CORE_LEVEL); - - if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) - therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, - POWER_LIMIT_EVENT, - CORE_LEVEL); - - if (this_cpu_has(X86_FEATURE_PTS)) { - rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); - /* check violations of package thermal thresholds */ - notify_package_thresholds(msr_val); - therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - PACKAGE_LEVEL); - if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) - therm_throt_process(msr_val & - PACKAGE_THERM_STATUS_POWER_LIMIT, - POWER_LIMIT_EVENT, - PACKAGE_LEVEL); - } -} - -static void unexpected_thermal_interrupt(void) -{ - pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", - smp_processor_id()); -} - -static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; - -DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) -{ - trace_thermal_apic_entry(THERMAL_APIC_VECTOR); - inc_irq_stat(irq_thermal_count); - smp_thermal_vector(); - trace_thermal_apic_exit(THERMAL_APIC_VECTOR); - ack_APIC_irq(); -} - -/* Thermal monitoring depends on APIC, ACPI and clock modulation */ -static int intel_thermal_supported(struct cpuinfo_x86 *c) -{ - if (!boot_cpu_has(X86_FEATURE_APIC)) - return 0; - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) - return 0; - return 1; -} - -void __init mcheck_intel_therm_init(void) -{ - /* - * This function is only called on boot CPU. Save the init thermal - * LVT value on BSP and use that value to restore APs' thermal LVT - * entry BIOS programmed later - */ - if (intel_thermal_supported(&boot_cpu_data)) - lvtthmr_init = apic_read(APIC_LVTTHMR); -} - -void intel_init_thermal(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - int tm2 = 0; - u32 l, h; - - if (!intel_thermal_supported(c)) - return; - - /* - * First check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already: - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - - h = lvtthmr_init; - /* - * The initial value of thermal LVT entries on all APs always reads - * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI - * sequence to them and LVT registers are reset to 0s except for - * the mask bits which are set to 1s when APs receive INIT IPI. - * If BIOS takes over the thermal interrupt and sets its interrupt - * delivery mode to SMI (not fixed), it restores the value that the - * BIOS has programmed on AP based on BSP's info we saved since BIOS - * is always setting the same value for all threads/cores. - */ - if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) - apic_write(APIC_LVTTHMR, lvtthmr_init); - - - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - if (system_state == SYSTEM_BOOTING) - pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - /* early Pentium M models use different method for enabling TM2 */ - if (cpu_has(c, X86_FEATURE_TM2)) { - if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { - rdmsr(MSR_THERM2_CTL, l, h); - if (l & MSR_THERM2_CTL_TM_SELECT) - tm2 = 1; - } else if (l & MSR_IA32_MISC_ENABLE_TM2) - tm2 = 1; - } - - /* We'll mask the thermal vector in the lapic till we're ready: */ - h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) - wrmsr(MSR_IA32_THERM_INTERRUPT, - (l | (THERM_INT_LOW_ENABLE - | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h); - else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE - | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); - else - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); - - if (cpu_has(c, X86_FEATURE_PTS)) { - rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - (l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE)) - & ~PACKAGE_THERM_INT_PLN_ENABLE, h); - else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE - | PACKAGE_THERM_INT_PLN_ENABLE), h); - else - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE), h); - } - - smp_thermal_vector = intel_thermal_interrupt; - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - pr_info_once("CPU0: Thermal monitoring enabled (%s)\n", - tm2 ? "TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); -} diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index ec6f0415bc6d..b935e1b5f115 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -830,7 +830,7 @@ static const struct attribute_group cpu_root_microcode_group = { .attrs = cpu_root_microcode_attrs, }; -int __init microcode_init(void) +static int __init microcode_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; int error; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 43b54bef5448..e88bc296afca 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -31,6 +31,11 @@ #include <asm/reboot.h> #include <asm/nmi.h> #include <clocksource/hyperv_timer.h> +#include <asm/numa.h> + +/* Is Linux running as the root partition? */ +bool hv_root_partition; +EXPORT_SYMBOL_GPL(hv_root_partition); struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); @@ -226,6 +231,32 @@ static void __init hv_smp_prepare_boot_cpu(void) hv_init_spinlocks(); #endif } + +static void __init hv_smp_prepare_cpus(unsigned int max_cpus) +{ +#ifdef CONFIG_X86_64 + int i; + int ret; +#endif + + native_smp_prepare_cpus(max_cpus); + +#ifdef CONFIG_X86_64 + for_each_present_cpu(i) { + if (i == 0) + continue; + ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i)); + BUG_ON(ret); + } + + for_each_present_cpu(i) { + if (i == 0) + continue; + ret = hv_call_create_vp(numa_cpu_node(i), hv_current_partition_id, i, i); + BUG_ON(ret); + } +#endif +} #endif static void __init ms_hyperv_init_platform(void) @@ -243,6 +274,7 @@ static void __init ms_hyperv_init_platform(void) * Extract the features and hints */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); + ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES); ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); @@ -256,6 +288,22 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); /* + * Check CPU management privilege. + * + * To mirror what Windows does we should extract CPU management + * features and use the ReservedIdentityBit to detect if Linux is the + * root partition. But that requires negotiating CPU management + * interface (a process to be finalized). + * + * For now, use the privilege flag as the indicator for running as + * root. + */ + if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) { + hv_root_partition = true; + pr_info("Hyper-V: running as root partition\n"); + } + + /* * Extract host information. */ if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >= @@ -277,6 +325,14 @@ static void __init ms_hyperv_init_platform(void) x86_platform.calibrate_cpu = hv_get_tsc_khz; } + if (ms_hyperv.features_b & HV_ISOLATION) { + ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG); + ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG); + + pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n", + ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); + } + if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) { ms_hyperv.nested_features = cpuid_eax(HYPERV_CPUID_NESTED_FEATURES); @@ -366,6 +422,8 @@ static void __init ms_hyperv_init_platform(void) # ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; + if (hv_root_partition) + smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus; # endif /* diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 5bd011737272..9231640782fa 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -537,9 +537,9 @@ static void __init print_out_mtrr_range_state(void) if (!size_base) continue; - size_base = to_size_factor(size_base, &size_factor), + size_base = to_size_factor(size_base, &size_factor); start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); - start_base = to_size_factor(start_base, &start_factor), + start_base = to_size_factor(start_base, &start_factor); type = range_state[i].type; pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index a29997e6cf9e..b90f3f437765 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -3,7 +3,6 @@ * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong * because MTRRs can span up to 40 bits (36bits on most modern x86) */ -#define DEBUG #include <linux/export.h> #include <linux/init.h> diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 61eb26edc6d2..28c8a23aa42e 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -31,8 +31,6 @@ System Programming Guide; Section 9.11. (1997 edition - PPro). */ -#define DEBUG - #include <linux/types.h> /* FIXME: kvm_para.h needs this */ #include <linux/stop_machine.h> diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index a5ee607a3b89..3ef5868ac588 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -3,7 +3,7 @@ * local apic based NMI watchdog for various CPUs. * * This file also handles reservation of performance counters for coordination - * with other users (like oprofile). + * with other users. * * Note that these events normally don't tick when the CPU idles. This means * the frequency varies with CPU load. @@ -105,15 +105,6 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) } -/* checks for a bit availability (hack for oprofile) */ -int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) -{ - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - return !test_bit(counter, perfctr_nmi_owner); -} -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); - int reserve_perfctr_nmi(unsigned int msr) { unsigned int counter; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index ee71c47844cb..c4d320d02fd5 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -572,6 +572,7 @@ union cpuid_0x10_x_edx { void rdt_last_cmd_clear(void); void rdt_last_cmd_puts(const char *s); +__printf(1, 2) void rdt_last_cmd_printf(const char *fmt, ...); void rdt_ctrl_update(void *arg); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 460f3e0df106..f9190adc52cb 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -563,11 +563,11 @@ static int __rdtgroup_move_task(struct task_struct *tsk, */ if (rdtgrp->type == RDTCTRL_GROUP) { - tsk->closid = rdtgrp->closid; - tsk->rmid = rdtgrp->mon.rmid; + WRITE_ONCE(tsk->closid, rdtgrp->closid); + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); } else if (rdtgrp->type == RDTMON_GROUP) { if (rdtgrp->mon.parent->closid == tsk->closid) { - tsk->rmid = rdtgrp->mon.rmid; + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); } else { rdt_last_cmd_puts("Can't move task to different control group\n"); return -EINVAL; @@ -2310,22 +2310,18 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, for_each_process_thread(p, t) { if (!from || is_closid_match(t, from) || is_rmid_match(t, from)) { - t->closid = to->closid; - t->rmid = to->mon.rmid; + WRITE_ONCE(t->closid, to->closid); + WRITE_ONCE(t->rmid, to->mon.rmid); -#ifdef CONFIG_SMP /* - * This is safe on x86 w/o barriers as the ordering - * of writing to task_cpu() and t->on_cpu is - * reverse to the reading here. The detection is - * inaccurate as tasks might move or schedule - * before the smp function call takes place. In - * such a case the function call is pointless, but + * If the task is on a CPU, set the CPU in the mask. + * The detection is inaccurate as tasks might move or + * schedule before the smp function call takes place. + * In such a case the function call is pointless, but * there is no other side effect. */ - if (mask && t->on_cpu) + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) cpumask_set_cpu(task_cpu(t), mask); -#endif } } read_unlock(&tasklist_lock); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 236924930bf0..972ec3bfa9c0 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,11 +40,6 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, - { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, - { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, - { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 }, - { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, - { X86_FEATURE_VM_PAGE_FLUSH, CPUID_EAX, 2, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index f2eac41bb4ff..8ce6d8371cfb 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -72,6 +72,9 @@ static int sgx_release(struct inode *inode, struct file *file) synchronize_srcu(&encl->srcu); mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm); kfree(encl_mm); + + /* 'encl_mm' is gone, put encl_mm->encl reference: */ + kref_put(&encl->refcount, sgx_encl_release); } kref_put(&encl->refcount, sgx_encl_release); diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index ee50a5010277..7449ef33f081 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -141,7 +141,6 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) struct sgx_encl_page *entry; unsigned long phys_addr; struct sgx_encl *encl; - unsigned long pfn; vm_fault_t ret; encl = vma->vm_private_data; @@ -168,13 +167,6 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) phys_addr = sgx_get_epc_phys_addr(entry->epc_page); - /* Check if another thread got here first to insert the PTE. */ - if (!follow_pfn(vma, addr, &pfn)) { - mutex_unlock(&encl->lock); - - return VM_FAULT_NOPAGE; - } - ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); if (ret != VM_FAULT_NOPAGE) { mutex_unlock(&encl->lock); @@ -481,6 +473,9 @@ static void sgx_mmu_notifier_free(struct mmu_notifier *mn) { struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); + /* 'encl_mm' is going away, put encl_mm->encl reference: */ + kref_put(&encl_mm->encl->refcount, sgx_encl_release); + kfree(encl_mm); } @@ -534,6 +529,8 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) if (!encl_mm) return -ENOMEM; + /* Grab a refcount for the encl_mm->encl reference: */ + kref_get(&encl->refcount); encl_mm->encl = encl; encl_mm->mm = mm; encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops; diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index c519fc5f6948..8df81a3ed945 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -700,25 +700,27 @@ static bool __init sgx_page_cache_init(void) return true; } -static void __init sgx_init(void) +static int __init sgx_init(void) { int ret; int i; if (!cpu_feature_enabled(X86_FEATURE_SGX)) - return; + return -ENODEV; if (!sgx_page_cache_init()) - return; + return -ENOMEM; - if (!sgx_page_reclaimer_init()) + if (!sgx_page_reclaimer_init()) { + ret = -ENOMEM; goto err_page_cache; + } ret = sgx_drv_init(); if (ret) goto err_kthread; - return; + return 0; err_kthread: kthread_stop(ksgxd_tsk); @@ -728,6 +730,8 @@ err_page_cache: vfree(sgx_epc_sections[i].pages); memunmap(sgx_epc_sections[i].virt_addr); } + + return ret; } device_initcall(sgx_init); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 1dd851397bd9..5601b95944fa 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -128,12 +128,21 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); - unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); + unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *begin; /* - * This is a software stack, so 'end' can be a valid stack pointer. - * It just means the stack is empty. + * @end points directly to the top most stack entry to avoid a -8 + * adjustment in the stack switch hotpath. Adjust it back before + * calculating @begin. + */ + end++; + begin = end - (IRQ_STACK_SIZE / sizeof(long)); + + /* + * Due to the switching logic RSP can never be == @end because the + * final operation is 'popq %rsp' which means after that RSP points + * to the original stack and not to @end. */ if (stack < begin || stack >= end) return false; @@ -143,8 +152,9 @@ static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info info->end = end; /* - * The next stack pointer is the first thing pushed by the entry code - * after switching to the irq stack. + * The next stack pointer is stored at the top of the irq stack + * before switching to the irq stack. Actual stack entries are all + * below that. */ info->next_sp = (unsigned long *)*(end - 1); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 5d8047441a0a..683749b80ae2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -167,14 +167,14 @@ void fpstate_sanitize_xstate(struct fpu *fpu) fx->fop = 0; fx->rip = 0; fx->rdp = 0; - memset(&fx->st_space[0], 0, 128); + memset(fx->st_space, 0, sizeof(fx->st_space)); } /* * SSE is in init state */ if (!(xfeatures & XFEATURE_MASK_SSE)) - memset(&fx->xmm_space[0], 0, 256); + memset(fx->xmm_space, 0, sizeof(fx->xmm_space)); /* * First two features are FPU and SSE, which above we handled diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 0d54099c2a3a..7c273846c687 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -184,6 +184,7 @@ SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) * It is also used to copy the retq for trampolines. */ SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) + UNWIND_HINT_FUNC retq SYM_FUNC_END(ftrace_epilogue) @@ -276,7 +277,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) restore_mcount_regs 8 /* Restore flags */ popfq - UNWIND_HINT_RET_OFFSET + UNWIND_HINT_FUNC jmp ftrace_epilogue SYM_FUNC_END(ftrace_regs_caller) @@ -333,8 +334,7 @@ SYM_FUNC_START(ftrace_graph_caller) retq SYM_FUNC_END(ftrace_graph_caller) -SYM_CODE_START(return_to_handler) - UNWIND_HINT_EMPTY +SYM_FUNC_START(return_to_handler) subq $24, %rsp /* Save the return values */ @@ -349,5 +349,5 @@ SYM_CODE_START(return_to_handler) movq (%rsp), %rax addq $24, %rsp JMP_NOSPEC rdi -SYM_CODE_END(return_to_handler) +SYM_FUNC_END(return_to_handler) #endif diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 03aa33b58165..668a4a6533d9 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -269,6 +269,20 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end) CPU_ENTRY_AREA_TOTAL_SIZE)) return true; + /* + * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU + * GSBASE value via __per_cpu_offset or pcpu_unit_offsets. + */ +#ifdef CONFIG_SMP + if (within_area(addr, end, (unsigned long)__per_cpu_offset, + sizeof(unsigned long) * nr_cpu_ids)) + return true; +#else + if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets, + sizeof(pcpu_unit_offsets))) + return true; +#endif + for_each_possible_cpu(cpu) { /* The original rw GDT is being used after load_direct_gdt() */ if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu), @@ -293,6 +307,14 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end) (unsigned long)&per_cpu(cpu_tlbstate, cpu), sizeof(struct tlb_state))) return true; + + /* + * When in guest (X86_FEATURE_HYPERVISOR), local_db_save() + * will read per-cpu cpu_dr7 before clear dr7 register. + */ + if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu), + sizeof(cpu_dr7))) + return true; } return false; @@ -491,15 +513,12 @@ static int hw_breakpoint_handler(struct die_args *args) struct perf_event *bp; unsigned long *dr6_p; unsigned long dr6; + bool bpx; /* The DR6 value is pointed by args->err */ dr6_p = (unsigned long *)ERR_PTR(args->err); dr6 = *dr6_p; - /* If it's a single step, TRAP bits are random */ - if (dr6 & DR_STEP) - return NOTIFY_DONE; - /* Do an early return if no trap bits are set in DR6 */ if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; @@ -509,28 +528,29 @@ static int hw_breakpoint_handler(struct die_args *args) if (likely(!(dr6 & (DR_TRAP0 << i)))) continue; + bp = this_cpu_read(bp_per_reg[i]); + if (!bp) + continue; + + bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE; + /* - * The counter may be concurrently released but that can only - * occur from a call_rcu() path. We can then safely fetch - * the breakpoint, use its callback, touch its counter - * while we are in an rcu_read_lock() path. + * TF and data breakpoints are traps and can be merged, however + * instruction breakpoints are faults and will be raised + * separately. + * + * However DR6 can indicate both TF and instruction + * breakpoints. In that case take TF as that has precedence and + * delay the instruction breakpoint for the next exception. */ - rcu_read_lock(); + if (bpx && (dr6 & DR_STEP)) + continue; - bp = this_cpu_read(bp_per_reg[i]); /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling */ (*dr6_p) &= ~(DR_TRAP0 << i); - /* - * bp can be NULL due to lazy debug register switching - * or due to concurrent perf counter removing. - */ - if (!bp) { - rcu_read_unlock(); - break; - } perf_bp_event(bp, args->regs); @@ -538,11 +558,10 @@ static int hw_breakpoint_handler(struct die_args *args) * Set up resume flag to avoid breakpoint recursion when * returning back to origin. */ - if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE) + if (bpx) args->regs->flags |= X86_EFLAGS_RF; - - rcu_read_unlock(); } + /* * Further processing in do_debug() is needed for a) user-space * breakpoints (to generate signals) and b) when the system has diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c5dd50369e2f..58aa712973ac 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -21,6 +21,7 @@ #include <asm/hw_irq.h> #include <asm/desc.h> #include <asm/traps.h> +#include <asm/thermal.h> #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> @@ -227,7 +228,7 @@ static __always_inline void handle_irq(struct irq_desc *desc, struct pt_regs *regs) { if (IS_ENABLED(CONFIG_X86_64)) - run_irq_on_irqstack_cond(desc->handle_irq, desc, regs); + generic_handle_irq_desc(desc); else __handle_irq(desc, regs); } @@ -374,3 +375,23 @@ void fixup_irqs(void) } } #endif + +#ifdef CONFIG_X86_THERMAL_VECTOR +static void smp_thermal_vector(void) +{ + if (x86_thermal_enabled()) + intel_thermal_interrupt(); + else + pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", + smp_processor_id()); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) +{ + trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); + trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + ack_APIC_irq(); +} +#endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 0b79efc87be5..044902d5a3c4 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -22,6 +22,7 @@ #include <asm/apic.h> #include <asm/nospec-branch.h> +#include <asm/softirq_stack.h> #ifdef CONFIG_DEBUG_STACKOVERFLOW diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 440eed558558..1c0fb96b9e39 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,6 +20,7 @@ #include <linux/sched/task_stack.h> #include <asm/cpu_entry_area.h> +#include <asm/softirq_stack.h> #include <asm/irq_stack.h> #include <asm/io_apic.h> #include <asm/apic.h> @@ -48,7 +49,8 @@ static int map_irq_stack(unsigned int cpu) if (!va) return -ENOMEM; - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + /* Store actual TOS to avoid adjustment in the hotpath */ + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #else @@ -60,7 +62,8 @@ static int map_irq_stack(unsigned int cpu) { void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + /* Store actual TOS to avoid adjustment in the hotpath */ + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #endif @@ -71,8 +74,3 @@ int irq_init_percpu_irqstack(unsigned int cpu) return 0; return map_irq_stack(cpu); } - -void do_softirq_own_stack(void) -{ - run_on_irqstack_cond(__do_softirq, NULL); -} diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index 0db0375235b4..8ef35063964b 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -13,14 +13,3 @@ SYM_FUNC_START(native_save_fl) ret SYM_FUNC_END(native_save_fl) EXPORT_SYMBOL(native_save_fl) - -/* - * void native_restore_fl(unsigned long flags) - * %eax/%rdi: flags - */ -SYM_FUNC_START(native_restore_fl) - push %_ASM_ARG1 - popf - ret -SYM_FUNC_END(native_restore_fl) -EXPORT_SYMBOL(native_restore_fl) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index a65e9e97857f..df776cdca327 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -133,26 +133,6 @@ void synthesize_relcall(void *dest, void *from, void *to) NOKPROBE_SYMBOL(synthesize_relcall); /* - * Skip the prefixes of the instruction. - */ -static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn) -{ - insn_attr_t attr; - - attr = inat_get_opcode_attribute((insn_byte_t)*insn); - while (inat_is_legacy_prefix(attr)) { - insn++; - attr = inat_get_opcode_attribute((insn_byte_t)*insn); - } -#ifdef CONFIG_X86_64 - if (inat_is_rex_prefix(attr)) - insn++; -#endif - return insn; -} -NOKPROBE_SYMBOL(skip_prefixes); - -/* * Returns non-zero if INSN is boostable. * RIP relative instructions are adjusted at copying time in 64 bits mode */ @@ -312,25 +292,6 @@ static int can_probe(unsigned long paddr) } /* - * Returns non-zero if opcode modifies the interrupt flag. - */ -static int is_IF_modifier(kprobe_opcode_t *insn) -{ - /* Skip prefixes */ - insn = skip_prefixes(insn); - - switch (*insn) { - case 0xfa: /* cli */ - case 0xfb: /* sti */ - case 0xcf: /* iret/iretd */ - case 0x9d: /* popf/popfd */ - return 1; - } - - return 0; -} - -/* * Copy an instruction with recovering modified instruction by kprobes * and adjust the displacement if the instruction uses the %rip-relative * addressing mode. Note that since @real will be the final place of copied @@ -411,9 +372,9 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, synthesize_reljump(buf + len, p->ainsn.insn + len, p->addr + insn->length); len += JMP32_INSN_SIZE; - p->ainsn.boostable = true; + p->ainsn.boostable = 1; } else { - p->ainsn.boostable = false; + p->ainsn.boostable = 0; } return len; @@ -450,6 +411,67 @@ void free_insn_page(void *page) module_memfree(page); } +static void set_resume_flags(struct kprobe *p, struct insn *insn) +{ + insn_byte_t opcode = insn->opcode.bytes[0]; + + switch (opcode) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0x9d: /* popf/popfd */ + /* Check whether the instruction modifies Interrupt Flag or not */ + p->ainsn.if_modifier = 1; + break; + case 0x9c: /* pushfl */ + p->ainsn.is_pushf = 1; + break; + case 0xcf: /* iret */ + p->ainsn.if_modifier = 1; + fallthrough; + case 0xc2: /* ret/lret */ + case 0xc3: + case 0xca: + case 0xcb: + case 0xea: /* jmp absolute -- ip is correct */ + /* ip is already adjusted, no more changes required */ + p->ainsn.is_abs_ip = 1; + /* Without resume jump, this is boostable */ + p->ainsn.boostable = 1; + break; + case 0xe8: /* call relative - Fix return addr */ + p->ainsn.is_call = 1; + break; +#ifdef CONFIG_X86_32 + case 0x9a: /* call absolute -- same as call absolute, indirect */ + p->ainsn.is_call = 1; + p->ainsn.is_abs_ip = 1; + break; +#endif + case 0xff: + opcode = insn->opcode.bytes[1]; + if ((opcode & 0x30) == 0x10) { + /* + * call absolute, indirect + * Fix return addr; ip is correct. + * But this is not boostable + */ + p->ainsn.is_call = 1; + p->ainsn.is_abs_ip = 1; + break; + } else if (((opcode & 0x31) == 0x20) || + ((opcode & 0x31) == 0x21)) { + /* + * jmp near and far, absolute indirect + * ip is correct. + */ + p->ainsn.is_abs_ip = 1; + /* Without resume jump, this is boostable */ + p->ainsn.boostable = 1; + } + break; + } +} + static int arch_copy_kprobe(struct kprobe *p) { struct insn insn; @@ -467,8 +489,8 @@ static int arch_copy_kprobe(struct kprobe *p) */ len = prepare_boost(buf, p, &insn); - /* Check whether the instruction modifies Interrupt Flag or not */ - p->ainsn.if_modifier = is_IF_modifier(buf); + /* Analyze the opcode and set resume flags */ + set_resume_flags(p, &insn); /* Also, displacement change doesn't affect the first byte */ p->opcode = buf[0]; @@ -491,6 +513,9 @@ int arch_prepare_kprobe(struct kprobe *p) if (!can_probe((unsigned long)p->addr)) return -EILSEQ; + + memset(&p->ainsn, 0, sizeof(p->ainsn)); + /* insn: must be on special executable page on x86. */ p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) @@ -806,11 +831,6 @@ NOKPROBE_SYMBOL(trampoline_handler); * 2) If the single-stepped instruction was a call, the return address * that is atop the stack is the address following the copied instruction. * We need to make it the address following the original instruction. - * - * If this is the first time we've single-stepped the instruction at - * this probepoint, and the instruction is boostable, boost it: add a - * jump instruction after the copied instruction, that jumps to the next - * instruction after the probepoint. */ static void resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) @@ -818,60 +838,20 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs, unsigned long *tos = stack_addr(regs); unsigned long copy_ip = (unsigned long)p->ainsn.insn; unsigned long orig_ip = (unsigned long)p->addr; - kprobe_opcode_t *insn = p->ainsn.insn; - - /* Skip prefixes */ - insn = skip_prefixes(insn); regs->flags &= ~X86_EFLAGS_TF; - switch (*insn) { - case 0x9c: /* pushfl */ + + /* Fixup the contents of top of stack */ + if (p->ainsn.is_pushf) { *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF); *tos |= kcb->kprobe_old_flags; - break; - case 0xc2: /* iret/ret/lret */ - case 0xc3: - case 0xca: - case 0xcb: - case 0xcf: - case 0xea: /* jmp absolute -- ip is correct */ - /* ip is already adjusted, no more changes required */ - p->ainsn.boostable = true; - goto no_change; - case 0xe8: /* call relative - Fix return addr */ + } else if (p->ainsn.is_call) { *tos = orig_ip + (*tos - copy_ip); - break; -#ifdef CONFIG_X86_32 - case 0x9a: /* call absolute -- same as call absolute, indirect */ - *tos = orig_ip + (*tos - copy_ip); - goto no_change; -#endif - case 0xff: - if ((insn[1] & 0x30) == 0x10) { - /* - * call absolute, indirect - * Fix return addr; ip is correct. - * But this is not boostable - */ - *tos = orig_ip + (*tos - copy_ip); - goto no_change; - } else if (((insn[1] & 0x31) == 0x20) || - ((insn[1] & 0x31) == 0x21)) { - /* - * jmp near and far, absolute indirect - * ip is correct. And this is boostable - */ - p->ainsn.boostable = true; - goto no_change; - } - break; - default: - break; } - regs->ip += orig_ip - copy_ip; + if (!p->ainsn.is_abs_ip) + regs->ip += orig_ip - copy_ip; -no_change: restore_btf(); } NOKPROBE_SYMBOL(resume_execution); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index b8aee71840ae..aa15132228da 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -398,9 +398,15 @@ static void free_ldt_pgtables(struct mm_struct *mm) if (!boot_cpu_has(X86_FEATURE_PTI)) return; - tlb_gather_mmu(&tlb, mm, start, end); + /* + * Although free_pgd_range() is intended for freeing user + * page-tables, it also works out for kernel mappings on x86. + * We use tlb_gather_mmu_fullmm() to avoid confusing the + * range-tracking logic in __tlb_adjust_range(). + */ + tlb_gather_mmu_fullmm(&tlb, mm); free_pgd_range(&tlb, start, end, start, end); - tlb_finish_mmu(&tlb, start, end); + tlb_finish_mmu(&tlb); #endif } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 34b153cbd4ac..5e9a34b5bd74 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -114,6 +114,7 @@ int apply_relocate(Elf32_Shdr *sechdrs, *location += sym->st_value; break; case R_386_PC32: + case R_386_PLT32: /* Add the value, subtract its position */ *location += sym->st_value - (uint32_t)location; break; diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 8a67d1fa8dc5..ed8ac6bcbafb 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -182,6 +182,13 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) err = security_locked_down(LOCKDOWN_MSR); if (err) break; + + err = filter_write(regs[1]); + if (err) + return err; + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + err = wrmsr_safe_regs_on_cpu(cpu, regs); if (err) break; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 6c3407ba6ee9..c60222ab8ab9 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -135,8 +135,7 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, else if (opfunc == _paravirt_ident_64) ret = paravirt_patch_ident_64(insn_buff, len); - else if (type == PARAVIRT_PATCH(cpu.iret) || - type == PARAVIRT_PATCH(cpu.usergs_sysret64)) + else if (type == PARAVIRT_PATCH(cpu.iret)) /* If operation requires a jmp, then jmp */ ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len); #endif @@ -170,7 +169,6 @@ static u64 native_steal_clock(int cpu) /* These are in entry.S */ extern void native_iret(void); -extern void native_usergs_sysret64(void); static struct resource reserve_ioports = { .start = 0, @@ -310,9 +308,7 @@ struct paravirt_patch_template pv_ops = { .cpu.load_sp0 = native_load_sp0, - .cpu.usergs_sysret64 = native_usergs_sysret64, .cpu.iret = native_iret, - .cpu.swapgs = native_swapgs, #ifdef CONFIG_X86_IOPL_IOPERM .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, @@ -324,7 +320,6 @@ struct paravirt_patch_template pv_ops = { /* Irq ops. */ .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), - .irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), .irq.safe_halt = native_safe_halt, diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c index ace6e334cb39..abd27ec67397 100644 --- a/arch/x86/kernel/paravirt_patch.c +++ b/arch/x86/kernel/paravirt_patch.c @@ -25,10 +25,7 @@ struct patch_xxl { const unsigned char mmu_read_cr2[3]; const unsigned char mmu_read_cr3[3]; const unsigned char mmu_write_cr3[3]; - const unsigned char irq_restore_fl[2]; const unsigned char cpu_wbinvd[2]; - const unsigned char cpu_usergs_sysret64[6]; - const unsigned char cpu_swapgs[3]; const unsigned char mov64[3]; }; @@ -39,11 +36,7 @@ static const struct patch_xxl patch_data_xxl = { .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3 - .irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd - .cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8, - 0x48, 0x0f, 0x07 }, // swapgs; sysretq - .cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax }; @@ -76,7 +69,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, switch (type) { #ifdef CONFIG_PARAVIRT_XXL - PATCH_CASE(irq, restore_fl, xxl, insn_buff, len); PATCH_CASE(irq, save_fl, xxl, insn_buff, len); PATCH_CASE(irq, irq_enable, xxl, insn_buff, len); PATCH_CASE(irq, irq_disable, xxl, insn_buff, len); @@ -85,8 +77,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len); PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len); - PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len); - PATCH_CASE(cpu, swapgs, xxl, insn_buff, len); PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len); #endif diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index 2e9006c1e240..42e92ec62973 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -4,9 +4,6 @@ #include <linux/string.h> #include <linux/kallsyms.h> - -#define DEBUG 1 - static struct iommu_table_entry * __init find_dependents_of(struct iommu_table_entry *start, struct iommu_table_entry *finish, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 145a7ac0c19a..9c214d7085a4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -161,7 +161,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, #endif /* Kernel thread ? */ - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { memset(childregs, 0, sizeof(struct pt_regs)); kthread_frame_init(frame, sp, arg); return 0; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ad582f9ac5a6..d08307df69ad 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -539,7 +539,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && - this_cpu_read(irq_count) != -1); + this_cpu_read(hardirq_stack_inuse)); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_prepare(prev_fpu, cpu); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index bedca011459c..87a4143aa7d7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -704,6 +704,9 @@ void ptrace_disable(struct task_struct *child) #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static const struct user_regset_view user_x86_32_view; /* Initialized below. */ #endif +#ifdef CONFIG_X86_64 +static const struct user_regset_view user_x86_64_view; /* Initialized below. */ +#endif long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) @@ -711,6 +714,14 @@ long arch_ptrace(struct task_struct *child, long request, int ret; unsigned long __user *datap = (unsigned long __user *)data; +#ifdef CONFIG_X86_64 + /* This is native 64-bit ptrace() */ + const struct user_regset_view *regset_view = &user_x86_64_view; +#else + /* This is native 32-bit ptrace() */ + const struct user_regset_view *regset_view = &user_x86_32_view; +#endif + switch (request) { /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { @@ -749,28 +760,28 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, - task_user_regset_view(current), + regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, - task_user_regset_view(current), + regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, - task_user_regset_view(current), + regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, - task_user_regset_view(current), + regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); @@ -1152,28 +1163,28 @@ static long x32_arch_ptrace(struct task_struct *child, case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); @@ -1309,6 +1320,25 @@ void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; } +/* + * This is used by the core dump code to decide which regset to dump. The + * core dump code writes out the resulting .e_machine and the corresponding + * regsets. This is suboptimal if the task is messing around with its CS.L + * field, but at worst the core dump will end up missing some information. + * + * Unfortunately, it is also used by the broken PTRACE_GETREGSET and + * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have + * no way to make sure that the e_machine they use matches the caller's + * expectations. The result is that the data format returned by + * PTRACE_GETREGSET depends on the returned CS field (and even the offset + * of the returned CS field depends on its value!) and the data format + * accepted by PTRACE_SETREGSET is determined by the old CS value. The + * upshot is that it is basically impossible to use these APIs correctly. + * + * The best way to fix it in the long run would probably be to add new + * improved ptrace() APIs to read and write registers reliably, possibly by + * allowing userspace to select the ELF e_machine variant that they expect. + */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index db115943e8bd..b29657b76e3f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -477,6 +477,15 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { }, }, + { /* PCIe Wifi card isn't detected after reboot otherwise */ + .callback = set_pci_reboot, + .ident = "Zotac ZBOX CI327 nano", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "NA"), + DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"), + }, + }, + /* Sony */ { /* Handle problems with rebooting on Sony VGN-Z540N */ .callback = set_bios_reboot, @@ -538,31 +547,21 @@ static void emergency_vmx_disable_all(void) local_irq_disable(); /* - * We need to disable VMX on all CPUs before rebooting, otherwise - * we risk hanging up the machine, because the CPU ignores INIT - * signals when VMX is enabled. - * - * We can't take any locks and we may be on an inconsistent - * state, so we use NMIs as IPIs to tell the other CPUs to disable - * VMX and halt. + * Disable VMX on all CPUs before rebooting, otherwise we risk hanging + * the machine, because the CPU blocks INIT when it's in VMX root. * - * For safety, we will avoid running the nmi_shootdown_cpus() - * stuff unnecessarily, but we don't have a way to check - * if other CPUs have VMX enabled. So we will call it only if the - * CPU we are running on has VMX enabled. + * We can't take any locks and we may be on an inconsistent state, so + * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt. * - * We will miss cases where VMX is not enabled on all CPUs. This - * shouldn't do much harm because KVM always enable VMX on all - * CPUs anyway. But we can miss it on the small window where KVM - * is still enabling VMX. + * Do the NMI shootdown even if VMX if off on _this_ CPU, as that + * doesn't prevent a different CPU from being in VMX root operation. */ - if (cpu_has_vmx() && cpu_vmx_enabled()) { - /* Disable VMX on this CPU. */ - cpu_vmxoff(); + if (cpu_has_vmx()) { + /* Safely force _this_ CPU out of VMX root operation. */ + __cpu_emergency_vmxoff(); - /* Halt and disable VMX on the other CPUs */ + /* Halt and exit VMX root operation on the other CPUs. */ nmi_shootdown_cpus(vmxoff_nmi); - } } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3412c4595efd..d883176ef2ce 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -16,7 +16,6 @@ #include <linux/memblock.h> #include <linux/pci.h> #include <linux/root_dev.h> -#include <linux/sfi.h> #include <linux/hugetlb.h> #include <linux/tboot.h> #include <linux/usb/xhci-dbgp.h> @@ -661,6 +660,17 @@ static void __init trim_platform_memory_ranges(void) static void __init trim_bios_range(void) { /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + * + * This typically reserves additional memory (64KiB by default) + * since some BIOSes are known to corrupt low memory. See the + * Kconfig help text for X86_RESERVE_LOW. + */ + e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + + /* * special case: Some BIOSes report the PC BIOS * area (640Kb -> 1Mb) as RAM even though it is not. * take them out. @@ -717,15 +727,6 @@ early_param("reservelow", parse_reservelow); static void __init trim_low_memory_range(void) { - /* - * A special case is the first 4Kb of memory; - * This is a BIOS owned area, not kernel ram, but generally - * not listed as such in the E820 table. - * - * This typically reserves additional memory (64KiB by default) - * since some BIOSes are known to corrupt low memory. See the - * Kconfig help text for X86_RESERVE_LOW. - */ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); } @@ -1183,7 +1184,6 @@ void __init setup_arch(char **cmdline_p) * Read APIC and some other early information from ACPI tables. */ acpi_boot_init(); - sfi_init(); x86_dtb_init(); /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 117e24fbfd8a..02813a7f3a7c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1833,6 +1833,7 @@ void arch_set_max_freq_ratio(bool turbo_disabled) arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : arch_turbo_freq_ratio; } +EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); static bool turbo_disabled(void) { diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index ca9a380d9c0b..9442c4136c38 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -11,14 +11,26 @@ enum insn_type { RET = 3, /* tramp / site cond-tail-call */ }; +/* + * data16 data16 xorq %rax, %rax - a single 5 byte instruction that clears %rax + * The REX.W cancels the effect of any data16. + */ +static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 }; + static void __ref __static_call_transform(void *insn, enum insn_type type, void *func) { + const void *emulate = NULL; int size = CALL_INSN_SIZE; const void *code; switch (type) { case CALL: code = text_gen_insn(CALL_INSN_OPCODE, insn, func); + if (func == &__static_call_return0) { + emulate = code; + code = &xor5rax; + } + break; case NOP: @@ -41,7 +53,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void if (unlikely(system_state == SYSTEM_BOOTING)) return text_poke_early(insn, code, size); - text_poke_bp(insn, code, size, NULL); + text_poke_bp(insn, code, size, emulate); } static void __static_call_validate(void *insn, bool tail) @@ -54,7 +66,8 @@ static void __static_call_validate(void *insn, bool tail) return; } else { if (opcode == CALL_INSN_OPCODE || - !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5)) + !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) || + !memcmp(insn, xor5rax, 5)) return; } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 60d2c3798ba2..0f3c307b37b3 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -127,12 +127,17 @@ static int enable_single_step(struct task_struct *child) regs->flags |= X86_EFLAGS_TF; /* - * Always set TIF_SINGLESTEP - this guarantees that - * we single-step system calls etc.. This will also + * Always set TIF_SINGLESTEP. This will also * cause us to set TF when returning to user mode. */ set_tsk_thread_flag(child, TIF_SINGLESTEP); + /* + * Ensure that a trap is triggered once stepping out of a system + * call prior to executing any user instruction. + */ + set_task_syscall_work(child, SYSCALL_EXIT_TRAP); + oflags = regs->flags; /* Set TF on the kernel stack.. */ @@ -230,6 +235,7 @@ void user_disable_single_step(struct task_struct *child) /* Always clear TIF_SINGLESTEP... */ clear_tsk_thread_flag(child, TIF_SINGLESTEP); + clear_task_syscall_work(child, SYSCALL_EXIT_TRAP); /* But touch TF only if it was set by us.. */ if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF)) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 504fa5425bce..660b78827638 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -90,14 +90,10 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) { - long error; - error = -EINVAL; if (off & ~PAGE_MASK) - goto out; + return -EINVAL; - error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); -out: - return error; + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } static void find_start_end(unsigned long addr, unsigned long flags, diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 73f800100066..2a1d47f47eee 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -471,7 +471,7 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_REG_SP_INDIRECT: - sp = state->sp + orc->sp_offset; + sp = state->sp; indirect = true; break; @@ -521,6 +521,9 @@ bool unwind_next_frame(struct unwind_state *state) if (indirect) { if (!deref_stack_reg(state, sp, &sp)) goto err; + + if (orc->sp_reg == ORC_REG_SP_INDIRECT) + sp += orc->sp_offset; } /* Find IP, SP and possibly regs: */ diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 764573de3996..e5a7a10a0164 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -134,7 +134,11 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) unsafe_put_user(regs->ds, &user->regs.ds, Efault_end); unsafe_put_user(regs->fs, &user->regs.fs, Efault_end); unsafe_put_user(regs->gs, &user->regs.gs, Efault_end); - unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end); + + /* + * Don't write screen_bitmap in case some user had a value there + * and expected it to remain unchanged. + */ user_access_end(); @@ -160,49 +164,6 @@ Efault: do_exit(SIGSEGV); } -static void mark_screen_rdonly(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - spinlock_t *ptl; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i; - - mmap_write_lock(mm); - pgd = pgd_offset(mm, 0xA0000); - if (pgd_none_or_clear_bad(pgd)) - goto out; - p4d = p4d_offset(pgd, 0xA0000); - if (p4d_none_or_clear_bad(p4d)) - goto out; - pud = pud_offset(p4d, 0xA0000); - if (pud_none_or_clear_bad(pud)) - goto out; - pmd = pmd_offset(pud, 0xA0000); - - if (pmd_trans_huge(*pmd)) { - vma = find_vma(mm, 0xA0000); - split_huge_pmd(vma, pmd, 0xA0000); - } - if (pmd_none_or_clear_bad(pmd)) - goto out; - pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); - for (i = 0; i < 32; i++) { - if (pte_present(*pte)) - set_pte(pte, pte_wrprotect(*pte)); - pte++; - } - pte_unmap_unlock(pte, ptl); -out: - mmap_write_unlock(mm); - flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false); -} - - - static int do_vm86_irq_handling(int subfunction, int irqnumber); static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus); @@ -282,6 +243,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) offsetof(struct vm86_struct, int_revectored))) return -EFAULT; + + /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */ + if (v.flags & VM86_SCREEN_BITMAP) { + char comm[TASK_COMM_LEN]; + + pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current)); + return -EINVAL; + } + memset(&vm86regs, 0, sizeof(vm86regs)); vm86regs.pt.bx = v.regs.ebx; @@ -302,7 +272,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) vm86regs.gs = v.regs.gs; vm86->flags = v.flags; - vm86->screen_bitmap = v.screen_bitmap; vm86->cpu_type = v.cpu_type; if (copy_from_user(&vm86->int_revectored, @@ -370,9 +339,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) update_task_stack(tsk); preempt_enable(); - if (vm86->flags & VM86_SCREEN_BITMAP) - mark_screen_rdonly(tsk->mm); - memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs)); return regs->ax; } |