summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild1
-rw-r--r--arch/x86/Kconfig148
-rw-r--r--arch/x86/Kconfig.cpu19
-rw-r--r--arch/x86/Kconfig.debug20
-rw-r--r--arch/x86/Makefile_32.cpu2
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/crypto/Makefile4
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c9
-rw-r--r--arch/x86/crypto/fpu.c10
-rw-r--r--arch/x86/ia32/ia32_signal.c22
-rw-r--r--arch/x86/ia32/ia32entry.S12
-rw-r--r--arch/x86/include/asm/acpi.h4
-rw-r--r--arch/x86/include/asm/alternative-asm.h9
-rw-r--r--arch/x86/include/asm/alternative.h17
-rw-r--r--arch/x86/include/asm/amd_iommu.h35
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h47
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h560
-rw-r--r--arch/x86/include/asm/amd_nb.h1
-rw-r--r--arch/x86/include/asm/apb_timer.h25
-rw-r--r--arch/x86/include/asm/apic.h38
-rw-r--r--arch/x86/include/asm/asm.h5
-rw-r--r--arch/x86/include/asm/bios_ebda.h28
-rw-r--r--arch/x86/include/asm/calling.h130
-rw-r--r--arch/x86/include/asm/clocksource.h18
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h48
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h45
-rw-r--r--arch/x86/include/asm/cpufeature.h27
-rw-r--r--arch/x86/include/asm/desc.h152
-rw-r--r--arch/x86/include/asm/dma.h12
-rw-r--r--arch/x86/include/asm/efi.h1
-rw-r--r--arch/x86/include/asm/entry_arch.h4
-rw-r--r--arch/x86/include/asm/fixmap.h1
-rw-r--r--arch/x86/include/asm/frame.h11
-rw-r--r--arch/x86/include/asm/ftrace.h7
-rw-r--r--arch/x86/include/asm/hw_irq.h1
-rw-r--r--arch/x86/include/asm/i8253.h18
-rw-r--r--arch/x86/include/asm/idle.h2
-rw-r--r--arch/x86/include/asm/io.h24
-rw-r--r--arch/x86/include/asm/io_apic.h28
-rw-r--r--arch/x86/include/asm/irq_vectors.h11
-rw-r--r--arch/x86/include/asm/irqflags.h11
-rw-r--r--arch/x86/include/asm/jump_label.h27
-rw-r--r--arch/x86/include/asm/kgdb.h1
-rw-r--r--arch/x86/include/asm/kvm_emulate.h193
-rw-r--r--arch/x86/include/asm/kvm_host.h55
-rw-r--r--arch/x86/include/asm/lguest_hcall.h1
-rw-r--r--arch/x86/include/asm/linkage.h5
-rw-r--r--arch/x86/include/asm/mce.h21
-rw-r--r--arch/x86/include/asm/memblock.h2
-rw-r--r--arch/x86/include/asm/mmu.h4
-rw-r--r--arch/x86/include/asm/mmzone_32.h39
-rw-r--r--arch/x86/include/asm/mmzone_64.h26
-rw-r--r--arch/x86/include/asm/module.h2
-rw-r--r--arch/x86/include/asm/msr-index.h4
-rw-r--r--arch/x86/include/asm/nops.h146
-rw-r--r--arch/x86/include/asm/numa.h32
-rw-r--r--arch/x86/include/asm/numa_32.h10
-rw-r--r--arch/x86/include/asm/numa_64.h36
-rw-r--r--arch/x86/include/asm/numaq.h7
-rw-r--r--arch/x86/include/asm/olpc_ofw.h9
-rw-r--r--arch/x86/include/asm/pci.h2
-rw-r--r--arch/x86/include/asm/percpu.h45
-rw-r--r--arch/x86/include/asm/perf_event.h5
-rw-r--r--arch/x86/include/asm/perf_event_p4.h33
-rw-r--r--arch/x86/include/asm/pgtable_types.h6
-rw-r--r--arch/x86/include/asm/probe_roms.h8
-rw-r--r--arch/x86/include/asm/processor-flags.h1
-rw-r--r--arch/x86/include/asm/processor.h4
-rw-r--r--arch/x86/include/asm/prom.h11
-rw-r--r--arch/x86/include/asm/ptrace.h18
-rw-r--r--arch/x86/include/asm/pvclock.h9
-rw-r--r--arch/x86/include/asm/rwlock.h43
-rw-r--r--arch/x86/include/asm/segment.h2
-rw-r--r--arch/x86/include/asm/setup.h4
-rw-r--r--arch/x86/include/asm/smpboot_hooks.h8
-rw-r--r--arch/x86/include/asm/spinlock.h37
-rw-r--r--arch/x86/include/asm/spinlock_types.h6
-rw-r--r--arch/x86/include/asm/srat.h39
-rw-r--r--arch/x86/include/asm/stacktrace.h3
-rw-r--r--arch/x86/include/asm/suspend_32.h2
-rw-r--r--arch/x86/include/asm/suspend_64.h5
-rw-r--r--arch/x86/include/asm/system.h85
-rw-r--r--arch/x86/include/asm/time.h6
-rw-r--r--arch/x86/include/asm/topology.h8
-rw-r--r--arch/x86/include/asm/traps.h4
-rw-r--r--arch/x86/include/asm/uaccess.h6
-rw-r--r--arch/x86/include/asm/uaccess_32.h1
-rw-r--r--arch/x86/include/asm/uaccess_64.h1
-rw-r--r--arch/x86/include/asm/unistd_32.h4
-rw-r--r--arch/x86/include/asm/unistd_64.h4
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h599
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h71
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h2513
-rw-r--r--arch/x86/include/asm/vdso.h14
-rw-r--r--arch/x86/include/asm/vgtod.h5
-rw-r--r--arch/x86/include/asm/vsyscall.h16
-rw-r--r--arch/x86/include/asm/vvar.h50
-rw-r--r--arch/x86/include/asm/x2apic.h62
-rw-r--r--arch/x86/include/asm/xen/hypercall.h7
-rw-r--r--arch/x86/include/asm/xen/page.h5
-rw-r--r--arch/x86/include/asm/xen/pci.h21
-rw-r--r--arch/x86/kernel/Makefile7
-rw-r--r--arch/x86/kernel/acpi/boot.c8
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S14
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.h6
-rw-r--r--arch/x86/kernel/acpi/sleep.c11
-rw-r--r--arch/x86/kernel/alternative.c224
-rw-r--r--arch/x86/kernel/amd_gart_64.c (renamed from arch/x86/kernel/pci-gart_64.c)0
-rw-r--r--arch/x86/kernel/amd_iommu.c2635
-rw-r--r--arch/x86/kernel/amd_iommu_init.c1540
-rw-r--r--arch/x86/kernel/apb_timer.c418
-rw-r--r--arch/x86/kernel/aperture_64.c34
-rw-r--r--arch/x86/kernel/apic/Makefile17
-rw-r--r--arch/x86/kernel/apic/apic.c125
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c26
-rw-r--r--arch/x86/kernel/apic/apic_noop.c9
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c13
-rw-r--r--arch/x86/kernel/apic/es7000_32.c17
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c4
-rw-r--r--arch/x86/kernel/apic/io_apic.c400
-rw-r--r--arch/x86/kernel/apic/numaq_32.c40
-rw-r--r--arch/x86/kernel/apic/probe_32.c118
-rw-r--r--arch/x86/kernel/apic/probe_64.c61
-rw-r--r--arch/x86/kernel/apic/summit_32.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c222
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c115
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c61
-rw-r--r--arch/x86/kernel/apm_32.c14
-rw-r--r--arch/x86/kernel/asm-offsets_32.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c16
-rw-r--r--arch/x86/kernel/cpu/bugs.c1
-rw-r--r--arch/x86/kernel/cpu/common.c35
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig266
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile21
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c776
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c446
-rw-r--r--arch/x86/kernel/cpu/cpufreq/e_powersaver.c367
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c309
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c517
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c1029
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.h353
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c327
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.c51
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.h9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c331
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c624
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c261
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c752
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.h43
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c1607
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h224
-rw-r--r--arch/x86/kernel/cpu/cpufreq/sc520_freq.c194
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c636
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c452
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c481
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.h49
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c467
-rw-r--r--arch/x86/kernel/cpu/intel.c47
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c20
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c152
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c313
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c10
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c10
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c182
-rw-r--r--arch/x86/kernel/cpu/perf_event.c196
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c28
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c418
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c4
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c126
-rw-r--r--arch/x86/kernel/devicetree.c77
-rw-r--r--arch/x86/kernel/dumpstack.c17
-rw-r--r--arch/x86/kernel/dumpstack_64.c37
-rw-r--r--arch/x86/kernel/entry_64.S84
-rw-r--r--arch/x86/kernel/ftrace.c16
-rw-r--r--arch/x86/kernel/head32.c1
-rw-r--r--arch/x86/kernel/hpet.c86
-rw-r--r--arch/x86/kernel/i8253.c183
-rw-r--r--arch/x86/kernel/irq.c5
-rw-r--r--arch/x86/kernel/irqinit.c3
-rw-r--r--arch/x86/kernel/jump_label.c5
-rw-r--r--arch/x86/kernel/kgdb.c4
-rw-r--r--arch/x86/kernel/kvmclock.c6
-rw-r--r--arch/x86/kernel/microcode_amd.c21
-rw-r--r--arch/x86/kernel/module.c1
-rw-r--r--arch/x86/kernel/mpparse.c8
-rw-r--r--arch/x86/kernel/pci-dma.c64
-rw-r--r--arch/x86/kernel/pci-iommu_table.c18
-rw-r--r--arch/x86/kernel/probe_roms.c (renamed from arch/x86/kernel/probe_roms_32.c)101
-rw-r--r--arch/x86/kernel/process.c45
-rw-r--r--arch/x86/kernel/process_32.c1
-rw-r--r--arch/x86/kernel/process_64.c1
-rw-r--r--arch/x86/kernel/ptrace.c9
-rw-r--r--arch/x86/kernel/reboot.c56
-rw-r--r--arch/x86/kernel/setup.c16
-rw-r--r--arch/x86/kernel/signal.c70
-rw-r--r--arch/x86/kernel/smp.c5
-rw-r--r--arch/x86/kernel/smpboot.c21
-rw-r--r--arch/x86/kernel/stacktrace.c15
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/tboot.c2
-rw-r--r--arch/x86/kernel/test_nx.c2
-rw-r--r--arch/x86/kernel/time.c4
-rw-r--r--arch/x86/kernel/traps.c6
-rw-r--r--arch/x86/kernel/tsc.c21
-rw-r--r--arch/x86/kernel/vmlinux.lds.S72
-rw-r--r--arch/x86/kernel/vsyscall_64.c322
-rw-r--r--arch/x86/kernel/vsyscall_emu_64.S27
-rw-r--r--arch/x86/kernel/x86_init.c2
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/emulate.c1830
-rw-r--r--arch/x86/kvm/i8254.h2
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/mmu.c21
-rw-r--r--arch/x86/kvm/paging_tmpl.h83
-rw-r--r--arch/x86/kvm/svm.c585
-rw-r--r--arch/x86/kvm/vmx.c231
-rw-r--r--arch/x86/kvm/x86.c570
-rw-r--r--arch/x86/kvm/x86.h2
-rw-r--r--arch/x86/lguest/boot.c43
-rw-r--r--arch/x86/lguest/i386_head.S35
-rw-r--r--arch/x86/lib/Makefile9
-rw-r--r--arch/x86/lib/clear_page_64.S33
-rw-r--r--arch/x86/lib/copy_page_64.S9
-rw-r--r--arch/x86/lib/copy_user_64.S69
-rw-r--r--arch/x86/lib/memcpy_64.S47
-rw-r--r--arch/x86/lib/memmove_64.S28
-rw-r--r--arch/x86/lib/memset_64.S54
-rw-r--r--arch/x86/lib/rwlock.S44
-rw-r--r--arch/x86/lib/rwlock_64.S38
-rw-r--r--arch/x86/lib/rwsem.S (renamed from arch/x86/lib/rwsem_64.S)75
-rw-r--r--arch/x86/lib/semaphore_32.S124
-rw-r--r--arch/x86/lib/thunk_64.S45
-rw-r--r--arch/x86/lib/usercopy.c43
-rw-r--r--arch/x86/mm/Makefile4
-rw-r--r--arch/x86/mm/amdtopology.c (renamed from arch/x86/mm/amdtopology_64.c)21
-rw-r--r--arch/x86/mm/fault.c34
-rw-r--r--arch/x86/mm/hugetlbpage.c4
-rw-r--r--arch/x86/mm/init.c2
-rw-r--r--arch/x86/mm/init_32.c3
-rw-r--r--arch/x86/mm/init_64.c13
-rw-r--r--arch/x86/mm/ioremap.c14
-rw-r--r--arch/x86/mm/kmemcheck/error.c2
-rw-r--r--arch/x86/mm/memblock.c4
-rw-r--r--arch/x86/mm/numa.c565
-rw-r--r--arch/x86/mm/numa_32.c404
-rw-r--r--arch/x86/mm/numa_64.c644
-rw-r--r--arch/x86/mm/numa_emulation.c16
-rw-r--r--arch/x86/mm/numa_internal.h8
-rw-r--r--arch/x86/mm/pageattr-test.c3
-rw-r--r--arch/x86/mm/pf_in.c14
-rw-r--r--arch/x86/mm/srat.c (renamed from arch/x86/mm/srat_64.c)82
-rw-r--r--arch/x86/mm/srat_32.c288
-rw-r--r--arch/x86/net/Makefile4
-rw-r--r--arch/x86/net/bpf_jit.S140
-rw-r--r--arch/x86/net/bpf_jit_comp.c654
-rw-r--r--arch/x86/oprofile/backtrace.c34
-rw-r--r--arch/x86/oprofile/nmi_int.c14
-rw-r--r--arch/x86/oprofile/op_model_amd.c100
-rw-r--r--arch/x86/pci/acpi.c2
-rw-r--r--arch/x86/pci/direct.c17
-rw-r--r--arch/x86/pci/irq.c4
-rw-r--r--arch/x86/pci/mmconfig-shared.c13
-rw-r--r--arch/x86/pci/xen.c469
-rw-r--r--arch/x86/platform/efi/efi.c231
-rw-r--r--arch/x86/platform/efi/efi_64.c37
-rw-r--r--arch/x86/platform/mrst/mrst.c4
-rw-r--r--arch/x86/platform/olpc/Makefile4
-rw-r--r--arch/x86/platform/olpc/olpc.c51
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c19
-rw-r--r--arch/x86/platform/uv/tlb_uv.c1531
-rw-r--r--arch/x86/platform/uv/uv_time.c22
-rw-r--r--arch/x86/vdso/Makefile18
-rw-r--r--arch/x86/vdso/vclock_gettime.c141
-rw-r--r--arch/x86/vdso/vdso.S15
-rw-r--r--arch/x86/vdso/vdso.lds.S9
-rw-r--r--arch/x86/vdso/vextern.h16
-rw-r--r--arch/x86/vdso/vgetcpu.c3
-rw-r--r--arch/x86/vdso/vma.c77
-rw-r--r--arch/x86/vdso/vvar.c12
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/enlighten.c37
-rw-r--r--arch/x86/xen/irq.c2
-rw-r--r--arch/x86/xen/mmu.c334
-rw-r--r--arch/x86/xen/mmu.h37
-rw-r--r--arch/x86/xen/multicalls.c12
-rw-r--r--arch/x86/xen/p2m.c43
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c2
-rw-r--r--arch/x86/xen/platform-pci-unplug.c2
-rw-r--r--arch/x86/xen/setup.c14
-rw-r--r--arch/x86/xen/smp.c20
-rw-r--r--arch/x86/xen/time.c14
-rw-r--r--arch/x86/xen/vga.c67
-rw-r--r--arch/x86/xen/xen-ops.h13
295 files changed, 11928 insertions, 24154 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 0e103236b75..0e9dec6cadd 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -15,3 +15,4 @@ obj-y += vdso/
obj-$(CONFIG_IA32_EMULATION) += ia32/
obj-y += platform/
+obj-y += net/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc6c53a95bf..c3e03e46bc6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -8,6 +8,7 @@ config 64BIT
config X86_32
def_bool !64BIT
+ select CLKSRC_I8253
config X86_64
def_bool 64BIT
@@ -16,11 +17,10 @@ config X86_64
config X86
def_bool y
select HAVE_AOUT if X86_32
- select HAVE_READQ
- select HAVE_WRITEQ
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_IDE
select HAVE_OPROFILE
+ select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
select HAVE_IRQ_WORK
select HAVE_IOREMAP_PROT
@@ -65,13 +65,13 @@ config X86
select HAVE_GENERIC_HARDIRQS
select HAVE_SPARSE_IRQ
select GENERIC_FIND_FIRST_BIT
- select GENERIC_FIND_NEXT_BIT
select GENERIC_IRQ_PROBE
select GENERIC_PENDING_IRQ if SMP
select GENERIC_IRQ_SHOW
select IRQ_FORCED_THREADING
select USE_GENERIC_SMP_HELPERS if SMP
- select ARCH_NO_SYSDEV_OPS
+ select HAVE_BPF_JIT if (X86_64 && NET)
+ select CLKEVT_I8253
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS)
@@ -95,6 +95,10 @@ config CLOCKSOURCE_WATCHDOG
config GENERIC_CLOCKEVENTS
def_bool y
+config ARCH_CLOCKSOURCE_DATA
+ def_bool y
+ depends on X86_64
+
config GENERIC_CLOCKEVENTS_BROADCAST
def_bool y
depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
@@ -112,7 +116,14 @@ config MMU
def_bool y
config ZONE_DMA
- def_bool y
+ bool "DMA memory allocation support" if EXPERT
+ default y
+ help
+ DMA memory allocation support allows devices with less than 32-bit
+ addressing to allocate within the first 16MB of address space.
+ Disable if no such devices will be used.
+
+ If unsure, say Y.
config SBUS
bool
@@ -365,17 +376,6 @@ config X86_UV
# Following is an alphabetically sorted list of 32 bit extended platforms
# Please maintain the alphabetic order if and when there are additions
-config X86_ELAN
- bool "AMD Elan"
- depends on X86_32
- depends on X86_EXTENDED_PLATFORM
- ---help---
- Select this for an AMD Elan processor.
-
- Do not use this option for K6/Athlon/Opteron processors!
-
- If unsure, choose "PC-compatible" instead.
-
config X86_INTEL_CE
bool "CE4100 TV platform"
depends on PCI
@@ -623,6 +623,7 @@ config HPET_EMULATE_RTC
config APB_TIMER
def_bool y if MRST
prompt "Langwell APB Timer Support" if X86_MRST
+ select DW_APB_TIMER
help
APB timer is the replacement for 8254, HPET on X86 MID platforms.
The APBT provides a stable time base on SMP
@@ -686,32 +687,6 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
Calgary anyway, pass 'iommu=calgary' on the kernel command line.
If unsure, say Y.
-config AMD_IOMMU
- bool "AMD IOMMU support"
- select SWIOTLB
- select PCI_MSI
- depends on X86_64 && PCI && ACPI
- ---help---
- With this option you can enable support for AMD IOMMU hardware in
- your system. An IOMMU is a hardware component which provides
- remapping of DMA memory accesses from devices. With an AMD IOMMU you
- can isolate the the DMA memory of different devices and protect the
- system from misbehaving device drivers or hardware.
-
- You can find out if your system has an AMD IOMMU if you look into
- your BIOS for an option to enable it or if you have an IVRS ACPI
- table.
-
-config AMD_IOMMU_STATS
- bool "Export AMD IOMMU statistics to debugfs"
- depends on AMD_IOMMU
- select DEBUG_FS
- ---help---
- This option enables code in the AMD IOMMU driver to collect various
- statistics about whats happening in the driver and exports that
- information to userspace via debugfs.
- If unsure, say N.
-
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
def_bool y if X86_64
@@ -725,9 +700,6 @@ config SWIOTLB
config IOMMU_HELPER
def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
-config IOMMU_API
- def_bool (AMD_IOMMU || DMAR)
-
config MAXSMP
bool "Enable Maximum number of SMP Processors and NUMA Nodes"
depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
@@ -919,6 +891,7 @@ config TOSHIBA
config I8K
tristate "Dell laptop support"
+ select HWMON
---help---
This adds a driver to safely access the System Management Mode
of the CPU on the Dell Inspiron 8000. The System Management Mode
@@ -1201,7 +1174,7 @@ config NODES_SPAN_OTHER_NODES
config NUMA_EMU
bool "NUMA emulation"
- depends on X86_64 && NUMA
+ depends on NUMA
---help---
Enable NUMA emulation. A flat machine will be split
into virtual nodes when booted with "numa=fake=N", where N is the
@@ -1223,6 +1196,10 @@ config HAVE_ARCH_BOOTMEM
def_bool y
depends on X86_32 && NUMA
+config HAVE_ARCH_ALLOC_REMAP
+ def_bool y
+ depends on X86_32 && NUMA
+
config ARCH_HAVE_MEMORY_PRESENT
def_bool y
depends on X86_32 && DISCONTIGMEM
@@ -1231,13 +1208,9 @@ config NEED_NODE_MEMMAP_SIZE
def_bool y
depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
-config HAVE_ARCH_ALLOC_REMAP
- def_bool y
- depends on X86_32 && NUMA
-
config ARCH_FLATMEM_ENABLE
def_bool y
- depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
+ depends on X86_32 && !NUMA
config ARCH_DISCONTIGMEM_ENABLE
def_bool y
@@ -1247,20 +1220,16 @@ config ARCH_DISCONTIGMEM_DEFAULT
def_bool y
depends on NUMA && X86_32
-config ARCH_PROC_KCORE_TEXT
- def_bool y
- depends on X86_64 && PROC_KCORE
-
-config ARCH_SPARSEMEM_DEFAULT
- def_bool y
- depends on X86_64
-
config ARCH_SPARSEMEM_ENABLE
def_bool y
depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
+config ARCH_SPARSEMEM_DEFAULT
+ def_bool y
+ depends on X86_64
+
config ARCH_SELECT_MEMORY_MODEL
def_bool y
depends on ARCH_SPARSEMEM_ENABLE
@@ -1269,6 +1238,10 @@ config ARCH_MEMORY_PROBE
def_bool X86_64
depends on MEMORY_HOTPLUG
+config ARCH_PROC_KCORE_TEXT
+ def_bool y
+ depends on X86_64 && PROC_KCORE
+
config ILLEGAL_POINTER_VALUE
hex
default 0 if X86_32
@@ -1703,10 +1676,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE
def_bool y
depends on MEMORY_HOTPLUG
-config HAVE_ARCH_EARLY_PFN_TO_NID
- def_bool X86_64
- depends on NUMA
-
config USE_PERCPU_NUMA_NODE_ID
def_bool y
depends on NUMA
@@ -1848,7 +1817,7 @@ config APM_ALLOW_INTS
endif # APM
-source "arch/x86/kernel/cpu/cpufreq/Kconfig"
+source "drivers/cpufreq/Kconfig"
source "drivers/cpuidle/Kconfig"
@@ -1950,55 +1919,6 @@ config PCI_CNB20LE_QUIRK
You should say N unless you know you need this.
-config DMAR
- bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
- depends on PCI_MSI && ACPI && EXPERIMENTAL
- help
- DMA remapping (DMAR) devices support enables independent address
- translations for Direct Memory Access (DMA) from devices.
- These DMA remapping devices are reported via ACPI tables
- and include PCI device scope covered by these DMA
- remapping devices.
-
-config DMAR_DEFAULT_ON
- def_bool y
- prompt "Enable DMA Remapping Devices by default"
- depends on DMAR
- help
- Selecting this option will enable a DMAR device at boot time if
- one is found. If this option is not selected, DMAR support can
- be enabled by passing intel_iommu=on to the kernel. It is
- recommended you say N here while the DMAR code remains
- experimental.
-
-config DMAR_BROKEN_GFX_WA
- bool "Workaround broken graphics drivers (going away soon)"
- depends on DMAR && BROKEN
- ---help---
- Current Graphics drivers tend to use physical address
- for DMA and avoid using DMA APIs. Setting this config
- option permits the IOMMU driver to set a unity map for
- all the OS-visible memory. Hence the driver can continue
- to use physical addresses for DMA, at least until this
- option is removed in the 2.6.32 kernel.
-
-config DMAR_FLOPPY_WA
- def_bool y
- depends on DMAR
- ---help---
- Floppy disk drivers are known to bypass DMA API calls
- thereby failing to work when IOMMU is enabled. This
- workaround will setup a 1:1 mapping for the first
- 16MiB to make floppy (an ISA device) work.
-
-config INTR_REMAP
- bool "Support for Interrupt Remapping (EXPERIMENTAL)"
- depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
- ---help---
- Supports Interrupt remapping for IO-APIC and MSI devices.
- To use x2apic mode in the CPU's which support x2APIC enhancements or
- to support platforms with CPU's having > 8 bit APIC ID, say Y.
-
source "drivers/pci/pcie/Kconfig"
source "drivers/pci/Kconfig"
@@ -2076,7 +1996,7 @@ config OLPC
depends on !X86_PAE
select GPIOLIB
select OF
- select OF_PROMTREE if PROC_DEVICETREE
+ select OF_PROMTREE
---help---
Add support for detecting the unique features of the OLPC
XO hardware.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index d161e939df6..e3ca7e0d858 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -1,6 +1,4 @@
# Put here option for CPU selection and depending optimization
-if !X86_ELAN
-
choice
prompt "Processor family"
default M686 if X86_32
@@ -203,6 +201,14 @@ config MWINCHIP3D
stores for this CPU, which can increase performance of some
operations.
+config MELAN
+ bool "AMD Elan"
+ depends on X86_32
+ ---help---
+ Select this for an AMD Elan processor.
+
+ Do not use this option for K6/Athlon/Opteron processors!
+
config MGEODEGX1
bool "GeodeGX1"
depends on X86_32
@@ -292,8 +298,6 @@ config X86_GENERIC
This is really intended for distributors who need more
generic optimizations.
-endif
-
#
# Define implied options from the CPU selection here
config X86_INTERNODE_CACHE_SHIFT
@@ -308,11 +312,14 @@ config X86_CMPXCHG
config CMPXCHG_LOCAL
def_bool X86_64 || (X86_32 && !M386)
+config CMPXCHG_DOUBLE
+ def_bool y
+
config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || MPSC
default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
- default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
+ default "4" if MELAN || M486 || M386 || MGEODEGX1
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
config X86_XADD
@@ -358,7 +365,7 @@ config X86_POPAD_OK
config X86_ALIGNMENT_16
def_bool y
- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
config X86_INTEL_USERCOPY
def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 615e18810f4..c0f8a5c8891 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -66,26 +66,6 @@ config DEBUG_STACKOVERFLOW
This option will cause messages to be printed if free stack space
drops below a certain limit.
-config DEBUG_STACK_USAGE
- bool "Stack utilization instrumentation"
- depends on DEBUG_KERNEL
- ---help---
- Enables the display of the minimum amount of free stack which each
- task has ever had available in the sysrq-T and sysrq-P debug output.
-
- This option will slow down process creation somewhat.
-
-config DEBUG_PER_CPU_MAPS
- bool "Debug access to per_cpu maps"
- depends on DEBUG_KERNEL
- depends on SMP
- ---help---
- Say Y to verify that the per_cpu map being accessed has
- been setup. Adds a fair amount of code to kernel memory
- and decreases performance.
-
- Say N if unsure.
-
config X86_PTDUMP
bool "Export kernel pagetable layout to userspace via debugfs"
depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index f2ee1abb1df..86cee7b749e 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -37,7 +37,7 @@ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=
$(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
# AMD Elan support
-cflags-$(CONFIG_X86_ELAN) += -march=i486
+cflags-$(CONFIG_MELAN) += -march=i486
# Geode GX1 support
cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 6f9872658dd..2bf18059fbe 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -10,7 +10,6 @@ CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_AUDIT=y
CONFIG_LOG_BUF_SHIFT=18
CONFIG_CGROUPS=y
-CONFIG_CGROUP_NS=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_CPUACCT=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index ee01a9d5d4f..22a0dc8e51d 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -11,7 +11,6 @@ CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_AUDIT=y
CONFIG_LOG_BUF_SHIFT=18
CONFIG_CGROUPS=y
-CONFIG_CGROUP_NS=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_CPUACCT=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 1a58ad89fdf..c04f1b7a913 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,8 +2,6 @@
# Arch-specific CryptoAPI modules.
#
-obj-$(CONFIG_CRYPTO_FPU) += fpu.o
-
obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
@@ -24,6 +22,6 @@ aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
-aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2577613fb32..feee8ff1d05 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -94,6 +94,10 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
+
+int crypto_fpu_init(void);
+void crypto_fpu_exit(void);
+
#ifdef CONFIG_X86_64
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
@@ -1257,6 +1261,8 @@ static int __init aesni_init(void)
return -ENODEV;
}
+ if ((err = crypto_fpu_init()))
+ goto fpu_err;
if ((err = crypto_register_alg(&aesni_alg)))
goto aes_err;
if ((err = crypto_register_alg(&__aesni_alg)))
@@ -1334,6 +1340,7 @@ blk_ecb_err:
__aes_err:
crypto_unregister_alg(&aesni_alg);
aes_err:
+fpu_err:
return err;
}
@@ -1363,6 +1370,8 @@ static void __exit aesni_exit(void)
crypto_unregister_alg(&blk_ecb_alg);
crypto_unregister_alg(&__aesni_alg);
crypto_unregister_alg(&aesni_alg);
+
+ crypto_fpu_exit();
}
module_init(aesni_init);
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index 1a8f8649c03..98d7a188f46 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -150,18 +150,12 @@ static struct crypto_template crypto_fpu_tmpl = {
.module = THIS_MODULE,
};
-static int __init crypto_fpu_module_init(void)
+int __init crypto_fpu_init(void)
{
return crypto_register_template(&crypto_fpu_tmpl);
}
-static void __exit crypto_fpu_module_exit(void)
+void __exit crypto_fpu_exit(void)
{
crypto_unregister_template(&crypto_fpu_tmpl);
}
-
-module_init(crypto_fpu_module_init);
-module_exit(crypto_fpu_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("FPU block cipher wrapper");
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 588a7aa937e..65577698cab 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -127,15 +127,17 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
{
- mask &= _BLOCKABLE;
- spin_lock_irq(&current->sighand->siglock);
+ sigset_t blocked;
+
current->saved_sigmask = current->blocked;
- siginitset(&current->blocked, mask);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+
+ mask &= _BLOCKABLE;
+ siginitset(&blocked, mask);
+ set_current_blocked(&blocked);
current->state = TASK_INTERRUPTIBLE;
schedule();
+
set_restore_sigmask();
return -ERESTARTNOHAND;
}
@@ -279,10 +281,7 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs)
goto badframe;
sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ set_current_blocked(&set);
if (ia32_restore_sigcontext(regs, &frame->sc, &ax))
goto badframe;
@@ -308,10 +307,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
goto badframe;
sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ set_current_blocked(&set);
if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
goto badframe;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 849a9d23c71..a0e866d233e 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -143,7 +143,7 @@ ENTRY(ia32_sysenter_target)
CFI_REL_OFFSET rip,0
pushq_cfi %rax
cld
- SAVE_ARGS 0,0,1
+ SAVE_ARGS 0,1,0
/* no need to do an access_ok check here because rbp has been
32bit zero extended */
1: movl (%rbp),%ebp
@@ -173,7 +173,7 @@ sysexit_from_sys_call:
andl $~0x200,EFLAGS-R11(%rsp)
movl RIP-R11(%rsp),%edx /* User %eip */
CFI_REGISTER rip,rdx
- RESTORE_ARGS 1,24,1,1,1,1
+ RESTORE_ARGS 0,24,0,0,0,0
xorq %r8,%r8
xorq %r9,%r9
xorq %r10,%r10
@@ -289,7 +289,7 @@ ENTRY(ia32_cstar_target)
* disabled irqs and here we enable it straight after entry:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,1,1
+ SAVE_ARGS 8,0,0
movl %eax,%eax /* zero extension */
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
@@ -328,7 +328,7 @@ cstar_dispatch:
jnz sysretl_audit
sysretl_from_sys_call:
andl $~TS_COMPAT,TI_status(%r10)
- RESTORE_ARGS 1,-ARG_SKIP,1,1,1
+ RESTORE_ARGS 0,-ARG_SKIP,0,0,0
movl RIP-ARGOFFSET(%rsp),%ecx
CFI_REGISTER rip,rcx
movl EFLAGS-ARGOFFSET(%rsp),%r11d
@@ -419,7 +419,7 @@ ENTRY(ia32_syscall)
cld
/* note the registers are not zero extended to the sf.
this could be a problem. */
- SAVE_ARGS 0,0,1
+ SAVE_ARGS 0,1,0
GET_THREAD_INFO(%r10)
orl $TS_COMPAT,TI_status(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
@@ -848,4 +848,6 @@ ia32_sys_call_table:
.quad compat_sys_open_by_handle_at
.quad compat_sys_clock_adjtime
.quad sys_syncfs
+ .quad compat_sys_sendmmsg /* 345 */
+ .quad sys_setns
ia32_syscall_end:
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 12e0e7dd869..610001d385d 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -139,7 +139,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
boot_cpu_data.x86_model <= 0x05 &&
boot_cpu_data.x86_mask < 0x0A)
return 1;
- else if (c1e_detected)
+ else if (amd_e400_c1e_detected)
return 1;
else
return max_cstate;
@@ -183,8 +183,6 @@ static inline void disable_acpi(void) { }
#define ARCH_HAS_POWER_INIT 1
-struct bootnode;
-
#ifdef CONFIG_ACPI_NUMA
extern int acpi_numa;
extern int x86_acpi_numa_init(void);
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index a63a68be1cc..4554cc6fb96 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -15,4 +15,13 @@
.endm
#endif
+.macro altinstruction_entry orig alt feature orig_len alt_len
+ .align 8
+ .long \orig - .
+ .long \alt - .
+ .word \feature
+ .byte \orig_len
+ .byte \alt_len
+.endm
+
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 13009d1af99..23fb6d79f20 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -4,7 +4,6 @@
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/stringify.h>
-#include <linux/jump_label.h>
#include <asm/asm.h>
/*
@@ -44,8 +43,8 @@
#endif
struct alt_instr {
- u8 *instr; /* original instruction */
- u8 *replacement;
+ s32 instr_offset; /* original instruction */
+ s32 repl_offset; /* offset to replacement instruction */
u16 cpuid; /* cpuid bit set for replacement */
u8 instrlen; /* length of original instruction */
u8 replacementlen; /* length of new instruction, <= instrlen */
@@ -85,8 +84,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
"661:\n\t" oldinstr "\n662:\n" \
".section .altinstructions,\"a\"\n" \
_ASM_ALIGN "\n" \
- _ASM_PTR "661b\n" /* label */ \
- _ASM_PTR "663f\n" /* new instruction */ \
+ " .long 661b - .\n" /* label */ \
+ " .long 663f - .\n" /* new instruction */ \
" .word " __stringify(feature) "\n" /* feature bit */ \
" .byte 662b-661b\n" /* sourcelen */ \
" .byte 664f-663f\n" /* replacementlen */ \
@@ -191,12 +190,4 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
extern void text_poke_smp_batch(struct text_poke_param *params, int n);
-#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
-#define IDEAL_NOP_SIZE_5 5
-extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
-extern void arch_init_ideal_nop5(void);
-#else
-static inline void arch_init_ideal_nop5(void) {}
-#endif
-
#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
deleted file mode 100644
index a6863a2dec1..00000000000
--- a/arch/x86/include/asm/amd_iommu.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_H
-#define _ASM_X86_AMD_IOMMU_H
-
-#include <linux/irqreturn.h>
-
-#ifdef CONFIG_AMD_IOMMU
-
-extern int amd_iommu_detect(void);
-
-#else
-
-static inline int amd_iommu_detect(void) { return -ENODEV; }
-
-#endif
-
-#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
deleted file mode 100644
index 916bc8111a0..00000000000
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
-#define _ASM_X86_AMD_IOMMU_PROTO_H
-
-struct amd_iommu;
-
-extern int amd_iommu_init_dma_ops(void);
-extern int amd_iommu_init_passthrough(void);
-extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
-extern void amd_iommu_flush_all_domains(void);
-extern void amd_iommu_flush_all_devices(void);
-extern void amd_iommu_apply_erratum_63(u16 devid);
-extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
-extern int amd_iommu_init_devices(void);
-extern void amd_iommu_uninit_devices(void);
-extern void amd_iommu_init_notifier(void);
-extern void amd_iommu_init_api(void);
-#ifndef CONFIG_AMD_IOMMU_STATS
-
-static inline void amd_iommu_stats_init(void) { }
-
-#endif /* !CONFIG_AMD_IOMMU_STATS */
-
-static inline bool is_rd890_iommu(struct pci_dev *pdev)
-{
- return (pdev->vendor == PCI_VENDOR_ID_ATI) &&
- (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
-}
-
-#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
deleted file mode 100644
index e3509fc303b..00000000000
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ /dev/null
@@ -1,560 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
-#define _ASM_X86_AMD_IOMMU_TYPES_H
-
-#include <linux/types.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
-/*
- * Maximum number of IOMMUs supported
- */
-#define MAX_IOMMUS 32
-
-/*
- * some size calculation constants
- */
-#define DEV_TABLE_ENTRY_SIZE 32
-#define ALIAS_TABLE_ENTRY_SIZE 2
-#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *))
-
-/* Length of the MMIO region for the AMD IOMMU */
-#define MMIO_REGION_LENGTH 0x4000
-
-/* Capability offsets used by the driver */
-#define MMIO_CAP_HDR_OFFSET 0x00
-#define MMIO_RANGE_OFFSET 0x0c
-#define MMIO_MISC_OFFSET 0x10
-
-/* Masks, shifts and macros to parse the device range capability */
-#define MMIO_RANGE_LD_MASK 0xff000000
-#define MMIO_RANGE_FD_MASK 0x00ff0000
-#define MMIO_RANGE_BUS_MASK 0x0000ff00
-#define MMIO_RANGE_LD_SHIFT 24
-#define MMIO_RANGE_FD_SHIFT 16
-#define MMIO_RANGE_BUS_SHIFT 8
-#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
-#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
-#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
-#define MMIO_MSI_NUM(x) ((x) & 0x1f)
-
-/* Flag masks for the AMD IOMMU exclusion range */
-#define MMIO_EXCL_ENABLE_MASK 0x01ULL
-#define MMIO_EXCL_ALLOW_MASK 0x02ULL
-
-/* Used offsets into the MMIO space */
-#define MMIO_DEV_TABLE_OFFSET 0x0000
-#define MMIO_CMD_BUF_OFFSET 0x0008
-#define MMIO_EVT_BUF_OFFSET 0x0010
-#define MMIO_CONTROL_OFFSET 0x0018
-#define MMIO_EXCL_BASE_OFFSET 0x0020
-#define MMIO_EXCL_LIMIT_OFFSET 0x0028
-#define MMIO_CMD_HEAD_OFFSET 0x2000
-#define MMIO_CMD_TAIL_OFFSET 0x2008
-#define MMIO_EVT_HEAD_OFFSET 0x2010
-#define MMIO_EVT_TAIL_OFFSET 0x2018
-#define MMIO_STATUS_OFFSET 0x2020
-
-/* MMIO status bits */
-#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
-
-/* event logging constants */
-#define EVENT_ENTRY_SIZE 0x10
-#define EVENT_TYPE_SHIFT 28
-#define EVENT_TYPE_MASK 0xf
-#define EVENT_TYPE_ILL_DEV 0x1
-#define EVENT_TYPE_IO_FAULT 0x2
-#define EVENT_TYPE_DEV_TAB_ERR 0x3
-#define EVENT_TYPE_PAGE_TAB_ERR 0x4
-#define EVENT_TYPE_ILL_CMD 0x5
-#define EVENT_TYPE_CMD_HARD_ERR 0x6
-#define EVENT_TYPE_IOTLB_INV_TO 0x7
-#define EVENT_TYPE_INV_DEV_REQ 0x8
-#define EVENT_DEVID_MASK 0xffff
-#define EVENT_DEVID_SHIFT 0
-#define EVENT_DOMID_MASK 0xffff
-#define EVENT_DOMID_SHIFT 0
-#define EVENT_FLAGS_MASK 0xfff
-#define EVENT_FLAGS_SHIFT 0x10
-
-/* feature control bits */
-#define CONTROL_IOMMU_EN 0x00ULL
-#define CONTROL_HT_TUN_EN 0x01ULL
-#define CONTROL_EVT_LOG_EN 0x02ULL
-#define CONTROL_EVT_INT_EN 0x03ULL
-#define CONTROL_COMWAIT_EN 0x04ULL
-#define CONTROL_PASSPW_EN 0x08ULL
-#define CONTROL_RESPASSPW_EN 0x09ULL
-#define CONTROL_COHERENT_EN 0x0aULL
-#define CONTROL_ISOC_EN 0x0bULL
-#define CONTROL_CMDBUF_EN 0x0cULL
-#define CONTROL_PPFLOG_EN 0x0dULL
-#define CONTROL_PPFINT_EN 0x0eULL
-
-/* command specific defines */
-#define CMD_COMPL_WAIT 0x01
-#define CMD_INV_DEV_ENTRY 0x02
-#define CMD_INV_IOMMU_PAGES 0x03
-
-#define CMD_COMPL_WAIT_STORE_MASK 0x01
-#define CMD_COMPL_WAIT_INT_MASK 0x02
-#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01
-#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02
-
-#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL
-
-/* macros and definitions for device table entries */
-#define DEV_ENTRY_VALID 0x00
-#define DEV_ENTRY_TRANSLATION 0x01
-#define DEV_ENTRY_IR 0x3d
-#define DEV_ENTRY_IW 0x3e
-#define DEV_ENTRY_NO_PAGE_FAULT 0x62
-#define DEV_ENTRY_EX 0x67
-#define DEV_ENTRY_SYSMGT1 0x68
-#define DEV_ENTRY_SYSMGT2 0x69
-#define DEV_ENTRY_INIT_PASS 0xb8
-#define DEV_ENTRY_EINT_PASS 0xb9
-#define DEV_ENTRY_NMI_PASS 0xba
-#define DEV_ENTRY_LINT0_PASS 0xbe
-#define DEV_ENTRY_LINT1_PASS 0xbf
-#define DEV_ENTRY_MODE_MASK 0x07
-#define DEV_ENTRY_MODE_SHIFT 0x09
-
-/* constants to configure the command buffer */
-#define CMD_BUFFER_SIZE 8192
-#define CMD_BUFFER_UNINITIALIZED 1
-#define CMD_BUFFER_ENTRIES 512
-#define MMIO_CMD_SIZE_SHIFT 56
-#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
-
-/* constants for event buffer handling */
-#define EVT_BUFFER_SIZE 8192 /* 512 entries */
-#define EVT_LEN_MASK (0x9ULL << 56)
-
-#define PAGE_MODE_NONE 0x00
-#define PAGE_MODE_1_LEVEL 0x01
-#define PAGE_MODE_2_LEVEL 0x02
-#define PAGE_MODE_3_LEVEL 0x03
-#define PAGE_MODE_4_LEVEL 0x04
-#define PAGE_MODE_5_LEVEL 0x05
-#define PAGE_MODE_6_LEVEL 0x06
-
-#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9))
-#define PM_LEVEL_SIZE(x) (((x) < 6) ? \
- ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
- (0xffffffffffffffffULL))
-#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
-#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
-#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
- IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
-#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)
-
-#define PM_MAP_4k 0
-#define PM_ADDR_MASK 0x000ffffffffff000ULL
-#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \
- (~((1ULL << (12 + ((lvl) * 9))) - 1)))
-#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
-
-/*
- * Returns the page table level to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_LEVEL(pagesize) \
- ((__ffs(pagesize) - 12) / 9)
-/*
- * Returns the number of ptes to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_PTE_COUNT(pagesize) \
- (1ULL << ((__ffs(pagesize) - 12) % 9))
-
-/*
- * Aligns a given io-virtual address to a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_ALIGN(address, pagesize) \
- ((address) & ~((pagesize) - 1))
-/*
- * Creates an IOMMU PTE for an address an a given pagesize
- * The PTE has no permission bits set
- * Pagesize is expected to be a power-of-two larger than 4096
- */
-#define PAGE_SIZE_PTE(address, pagesize) \
- (((address) | ((pagesize) - 1)) & \
- (~(pagesize >> 1)) & PM_ADDR_MASK)
-
-/*
- * Takes a PTE value with mode=0x07 and returns the page size it maps
- */
-#define PTE_PAGE_SIZE(pte) \
- (1ULL << (1 + ffz(((pte) | 0xfffULL))))
-
-#define IOMMU_PTE_P (1ULL << 0)
-#define IOMMU_PTE_TV (1ULL << 1)
-#define IOMMU_PTE_U (1ULL << 59)
-#define IOMMU_PTE_FC (1ULL << 60)
-#define IOMMU_PTE_IR (1ULL << 61)
-#define IOMMU_PTE_IW (1ULL << 62)
-
-#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
-#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
-#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
-#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
-
-#define IOMMU_PROT_MASK 0x03
-#define IOMMU_PROT_IR 0x01
-#define IOMMU_PROT_IW 0x02
-
-/* IOMMU capabilities */
-#define IOMMU_CAP_IOTLB 24
-#define IOMMU_CAP_NPCACHE 26
-
-#define MAX_DOMAIN_ID 65536
-
-/* FIXME: move this macro to <linux/pci.h> */
-#define PCI_BUS(x) (((x) >> 8) & 0xff)
-
-/* Protection domain flags */
-#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
-#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
- domain for an IOMMU */
-#define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page
- translation */
-
-extern bool amd_iommu_dump;
-#define DUMP_printk(format, arg...) \
- do { \
- if (amd_iommu_dump) \
- printk(KERN_INFO "AMD-Vi: " format, ## arg); \
- } while(0);
-
-/* global flag if IOMMUs cache non-present entries */
-extern bool amd_iommu_np_cache;
-
-/*
- * Make iterating over all IOMMUs easier
- */
-#define for_each_iommu(iommu) \
- list_for_each_entry((iommu), &amd_iommu_list, list)
-#define for_each_iommu_safe(iommu, next) \
- list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
-
-#define APERTURE_RANGE_SHIFT 27 /* 128 MB */
-#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT)
-#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
-#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */
-#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT)
-#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
-
-/*
- * This structure contains generic data for IOMMU protection domains
- * independent of their use.
- */
-struct protection_domain {
- struct list_head list; /* for list of all protection domains */
- struct list_head dev_list; /* List of all devices in this domain */
- spinlock_t lock; /* mostly used to lock the page table*/
- struct mutex api_lock; /* protect page tables in the iommu-api path */
- u16 id; /* the domain id written to the device table */
- int mode; /* paging mode (0-6 levels) */
- u64 *pt_root; /* page table root pointer */
- unsigned long flags; /* flags to find out type of domain */
- bool updated; /* complete domain flush required */
- unsigned dev_cnt; /* devices assigned to this domain */
- unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
- void *priv; /* private data */
-
-};
-
-/*
- * This struct contains device specific data for the IOMMU
- */
-struct iommu_dev_data {
- struct list_head list; /* For domain->dev_list */
- struct device *dev; /* Device this data belong to */
- struct device *alias; /* The Alias Device */
- struct protection_domain *domain; /* Domain the device is bound to */
- atomic_t bind; /* Domain attach reverent count */
-};
-
-/*
- * For dynamic growth the aperture size is split into ranges of 128MB of
- * DMA address space each. This struct represents one such range.
- */
-struct aperture_range {
-
- /* address allocation bitmap */
- unsigned long *bitmap;
-
- /*
- * Array of PTE pages for the aperture. In this array we save all the
- * leaf pages of the domain page table used for the aperture. This way
- * we don't need to walk the page table to find a specific PTE. We can
- * just calculate its address in constant time.
- */
- u64 *pte_pages[64];
-
- unsigned long offset;
-};
-
-/*
- * Data container for a dma_ops specific protection domain
- */
-struct dma_ops_domain {
- struct list_head list;
-
- /* generic protection domain information */
- struct protection_domain domain;
-
- /* size of the aperture for the mappings */
- unsigned long aperture_size;
-
- /* address we start to search for free addresses */
- unsigned long next_address;
-
- /* address space relevant data */
- struct aperture_range *aperture[APERTURE_MAX_RANGES];
-
- /* This will be set to true when TLB needs to be flushed */
- bool need_flush;
-
- /*
- * if this is a preallocated domain, keep the device for which it was
- * preallocated in this variable
- */
- u16 target_dev;
-};
-
-/*
- * Structure where we save information about one hardware AMD IOMMU in the
- * system.
- */
-struct amd_iommu {
- struct list_head list;
-
- /* Index within the IOMMU array */
- int index;
-
- /* locks the accesses to the hardware */
- spinlock_t lock;
-
- /* Pointer to PCI device of this IOMMU */
- struct pci_dev *dev;
-
- /* physical address of MMIO space */
- u64 mmio_phys;
- /* virtual address of MMIO space */
- u8 *mmio_base;
-
- /* capabilities of that IOMMU read from ACPI */
- u32 cap;
-
- /* flags read from acpi table */
- u8 acpi_flags;
-
- /*
- * Capability pointer. There could be more than one IOMMU per PCI
- * device function if there are more than one AMD IOMMU capability
- * pointers.
- */
- u16 cap_ptr;
-
- /* pci domain of this IOMMU */
- u16 pci_seg;
-
- /* first device this IOMMU handles. read from PCI */
- u16 first_device;
- /* last device this IOMMU handles. read from PCI */
- u16 last_device;
-
- /* start of exclusion range of that IOMMU */
- u64 exclusion_start;
- /* length of exclusion range of that IOMMU */
- u64 exclusion_length;
-
- /* command buffer virtual address */
- u8 *cmd_buf;
- /* size of command buffer */
- u32 cmd_buf_size;
-
- /* size of event buffer */
- u32 evt_buf_size;
- /* event buffer virtual address */
- u8 *evt_buf;
- /* MSI number for event interrupt */
- u16 evt_msi_num;
-
- /* true if interrupts for this IOMMU are already enabled */
- bool int_enabled;
-
- /* if one, we need to send a completion wait command */
- bool need_sync;
-
- /* becomes true if a command buffer reset is running */
- bool reset_in_progress;
-
- /* default dma_ops domain for that IOMMU */
- struct dma_ops_domain *default_dom;
-
- /*
- * We can't rely on the BIOS to restore all values on reinit, so we
- * need to stash them
- */
-
- /* The iommu BAR */
- u32 stored_addr_lo;
- u32 stored_addr_hi;
-
- /*
- * Each iommu has 6 l1s, each of which is documented as having 0x12
- * registers
- */
- u32 stored_l1[6][0x12];
-
- /* The l2 indirect registers */
- u32 stored_l2[0x83];
-};
-
-/*
- * List with all IOMMUs in the system. This list is not locked because it is
- * only written and read at driver initialization or suspend time
- */
-extern struct list_head amd_iommu_list;
-
-/*
- * Array with pointers to each IOMMU struct
- * The indices are referenced in the protection domains
- */
-extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
-
-/* Number of IOMMUs present in the system */
-extern int amd_iommus_present;
-
-/*
- * Declarations for the global list of all protection domains
- */
-extern spinlock_t amd_iommu_pd_lock;
-extern struct list_head amd_iommu_pd_list;
-
-/*
- * Structure defining one entry in the device table
- */
-struct dev_table_entry {
- u32 data[8];
-};
-
-/*
- * One entry for unity mappings parsed out of the ACPI table.
- */
-struct unity_map_entry {
- struct list_head list;
-
- /* starting device id this entry is used for (including) */
- u16 devid_start;
- /* end device id this entry is used for (including) */
- u16 devid_end;
-
- /* start address to unity map (including) */
- u64 address_start;
- /* end address to unity map (including) */
- u64 address_end;
-
- /* required protection */
- int prot;
-};
-
-/*
- * List of all unity mappings. It is not locked because as runtime it is only
- * read. It is created at ACPI table parsing time.
- */
-extern struct list_head amd_iommu_unity_map;
-
-/*
- * Data structures for device handling
- */
-
-/*
- * Device table used by hardware. Read and write accesses by software are
- * locked with the amd_iommu_pd_table lock.
- */
-extern struct dev_table_entry *amd_iommu_dev_table;
-
-/*
- * Alias table to find requestor ids to device ids. Not locked because only
- * read on runtime.
- */
-extern u16 *amd_iommu_alias_table;
-
-/*
- * Reverse lookup table to find the IOMMU which translates a specific device.
- */
-extern struct amd_iommu **amd_iommu_rlookup_table;
-
-/* size of the dma_ops aperture as power of 2 */
-extern unsigned amd_iommu_aperture_order;
-
-/* largest PCI device id we expect translation requests for */
-extern u16 amd_iommu_last_bdf;
-
-/* allocation bitmap for domain ids */
-extern unsigned long *amd_iommu_pd_alloc_bitmap;
-
-/*
- * If true, the addresses will be flushed on unmap time, not when
- * they are reused
- */
-extern bool amd_iommu_unmap_flush;
-
-/* takes bus and device/function and returns the device id
- * FIXME: should that be in generic PCI code? */
-static inline u16 calc_devid(u8 bus, u8 devfn)
-{
- return (((u16)bus) << 8) | devfn;
-}
-
-#ifdef CONFIG_AMD_IOMMU_STATS
-
-struct __iommu_counter {
- char *name;
- struct dentry *dent;
- u64 value;
-};
-
-#define DECLARE_STATS_COUNTER(nm) \
- static struct __iommu_counter nm = { \
- .name = #nm, \
- }
-
-#define INC_STATS_COUNTER(name) name.value += 1
-#define ADD_STATS_COUNTER(name, x) name.value += (x)
-#define SUB_STATS_COUNTER(name, x) name.value -= (x)
-
-#else /* CONFIG_AMD_IOMMU_STATS */
-
-#define DECLARE_STATS_COUNTER(name)
-#define INC_STATS_COUNTER(name)
-#define ADD_STATS_COUNTER(name, x)
-#define SUB_STATS_COUNTER(name, x)
-
-#endif /* CONFIG_AMD_IOMMU_STATS */
-
-#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 331682231bb..67f87f25761 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -11,7 +11,6 @@ struct amd_nb_bus_dev_range {
extern const struct pci_device_id amd_nb_misc_ids[];
extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
-struct bootnode;
extern bool early_is_amd_nb(u32 value);
extern int amd_cache_northbridges(void);
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index 2fefa501d3b..0acbac299e4 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -18,24 +18,6 @@
#ifdef CONFIG_APB_TIMER
-/* Langwell DW APB timer registers */
-#define APBTMR_N_LOAD_COUNT 0x00
-#define APBTMR_N_CURRENT_VALUE 0x04
-#define APBTMR_N_CONTROL 0x08
-#define APBTMR_N_EOI 0x0c
-#define APBTMR_N_INT_STATUS 0x10
-
-#define APBTMRS_INT_STATUS 0xa0
-#define APBTMRS_EOI 0xa4
-#define APBTMRS_RAW_INT_STATUS 0xa8
-#define APBTMRS_COMP_VERSION 0xac
-#define APBTMRS_REG_SIZE 0x14
-
-/* register bits */
-#define APBTMR_CONTROL_ENABLE (1<<0)
-#define APBTMR_CONTROL_MODE_PERIODIC (1<<1) /*1: periodic 0:free running */
-#define APBTMR_CONTROL_INT (1<<2)
-
/* default memory mapped register base */
#define LNW_SCU_ADDR 0xFF100000
#define LNW_EXT_TIMER_OFFSET 0x1B800
@@ -43,14 +25,13 @@
#define LNW_EXT_TIMER_PGOFFSET 0x800
/* APBT clock speed range from PCLK to fabric base, 25-100MHz */
-#define APBT_MAX_FREQ 50
-#define APBT_MIN_FREQ 1
+#define APBT_MAX_FREQ 50000000
+#define APBT_MIN_FREQ 1000000
#define APBT_MMAP_SIZE 1024
#define APBT_DEV_USED 1
extern void apbt_time_init(void);
-extern struct clock_event_device *global_clock_event;
extern unsigned long apbt_quick_calibrate(void);
extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
extern void apbt_setup_secondary_clock(void);
@@ -62,7 +43,7 @@ extern int sfi_mtimer_num;
#else /* CONFIG_APB_TIMER */
static inline unsigned long apbt_quick_calibrate(void) {return 0; }
-static inline void apbt_time_init(void) {return 0; }
+static inline void apbt_time_init(void) { }
#endif
#endif /* ASM_X86_APBT_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 2b7d573be54..4a0b7c7e2cc 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -363,7 +363,12 @@ struct apic {
*/
int (*x86_32_early_logical_apicid)(int cpu);
- /* determine CPU -> NUMA node mapping */
+ /*
+ * Optional method called from setup_local_APIC() after logical
+ * apicid is guaranteed to be known to initialize apicid -> node
+ * mapping if NUMA initialization hasn't done so already. Don't
+ * add new users.
+ */
int (*x86_32_numa_cpu_node)(int cpu);
#endif
};
@@ -376,6 +381,26 @@ struct apic {
extern struct apic *apic;
/*
+ * APIC drivers are probed based on how they are listed in the .apicdrivers
+ * section. So the order is important and enforced by the ordering
+ * of different apic driver files in the Makefile.
+ *
+ * For the files having two apic drivers, we use apic_drivers()
+ * to enforce the order with in them.
+ */
+#define apic_driver(sym) \
+ static struct apic *__apicdrivers_##sym __used \
+ __aligned(sizeof(struct apic *)) \
+ __section(.apicdrivers) = { &sym }
+
+#define apic_drivers(sym1, sym2) \
+ static struct apic *__apicdrivers_##sym1##sym2[2] __used \
+ __aligned(sizeof(struct apic *)) \
+ __section(.apicdrivers) = { &sym1, &sym2 }
+
+extern struct apic *__apicdrivers[], *__apicdrivers_end[];
+
+/*
* APIC functionality to boot other CPUs - only used on SMP:
*/
#ifdef CONFIG_SMP
@@ -453,15 +478,10 @@ static inline unsigned default_get_apic_id(unsigned long x)
#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469
#ifdef CONFIG_X86_64
-extern struct apic apic_flat;
-extern struct apic apic_physflat;
-extern struct apic apic_x2apic_cluster;
-extern struct apic apic_x2apic_phys;
extern int default_acpi_madt_oem_check(char *, char *);
extern void apic_send_IPI_self(int vector);
-extern struct apic apic_x2apic_uv_x;
DECLARE_PER_CPU(int, x2apic_extra_bits);
extern int default_cpu_present_to_apicid(int mps_cpu);
@@ -475,7 +495,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert)
return;
}
-extern void generic_bigsmp_probe(void);
+extern struct apic *generic_bigsmp_probe(void);
#ifdef CONFIG_X86_LOCAL_APIC
@@ -511,8 +531,6 @@ extern struct apic apic_noop;
#ifdef CONFIG_X86_32
-extern struct apic apic_default;
-
static inline int noop_x86_32_early_logical_apicid(int cpu)
{
return BAD_APICID;
@@ -537,8 +555,6 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
return cpuid_apic >> index_msb;
}
-extern int default_x86_32_numa_cpu_node(int cpu);
-
#endif
static inline unsigned int
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index b3ed1e1460f..9412d6558c8 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -3,9 +3,11 @@
#ifdef __ASSEMBLY__
# define __ASM_FORM(x) x
+# define __ASM_FORM_COMMA(x) x,
# define __ASM_EX_SEC .section __ex_table, "a"
#else
# define __ASM_FORM(x) " " #x " "
+# define __ASM_FORM_COMMA(x) " " #x ","
# define __ASM_EX_SEC " .section __ex_table,\"a\"\n"
#endif
@@ -15,7 +17,8 @@
# define __ASM_SEL(a,b) __ASM_FORM(b)
#endif
-#define __ASM_SIZE(inst) __ASM_SEL(inst##l, inst##q)
+#define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \
+ inst##q##__VA_ARGS__)
#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg)
#define _ASM_PTR __ASM_SEL(.long, .quad)
diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h
index 3c7521063d3..aa6a3170ab5 100644
--- a/arch/x86/include/asm/bios_ebda.h
+++ b/arch/x86/include/asm/bios_ebda.h
@@ -4,16 +4,40 @@
#include <asm/io.h>
/*
- * there is a real-mode segmented pointer pointing to the
- * 4K EBDA area at 0x40E.
+ * Returns physical address of EBDA. Returns 0 if there is no EBDA.
*/
static inline unsigned int get_bios_ebda(void)
{
+ /*
+ * There is a real-mode segmented pointer pointing to the
+ * 4K EBDA area at 0x40E.
+ */
unsigned int address = *(unsigned short *)phys_to_virt(0x40E);
address <<= 4;
return address; /* 0 means none */
}
+/*
+ * Return the sanitized length of the EBDA in bytes, if it exists.
+ */
+static inline unsigned int get_bios_ebda_length(void)
+{
+ unsigned int address;
+ unsigned int length;
+
+ address = get_bios_ebda();
+ if (!address)
+ return 0;
+
+ /* EBDA length is byte 0 of the EBDA (stored in KiB) */
+ length = *(unsigned char *)phys_to_virt(address);
+ length <<= 10;
+
+ /* Trim the length if it extends beyond 640KiB */
+ length = min_t(unsigned int, (640 * 1024) - address, length);
+ return length;
+}
+
void reserve_ebda_region(void);
#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 30af5a83216..a9e3a740f69 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -46,6 +46,7 @@ For 32-bit we have the following conventions - kernel is built with
*/
+#include "dwarf2.h"
/*
* 64-bit system call stack frame layout defines and helpers, for
@@ -84,72 +85,57 @@ For 32-bit we have the following conventions - kernel is built with
#define ARGOFFSET R11
#define SWFRAME ORIG_RAX
- .macro SAVE_ARGS addskip=0, norcx=0, nor891011=0
+ .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
subq $9*8+\addskip, %rsp
CFI_ADJUST_CFA_OFFSET 9*8+\addskip
- movq %rdi, 8*8(%rsp)
- CFI_REL_OFFSET rdi, 8*8
- movq %rsi, 7*8(%rsp)
- CFI_REL_OFFSET rsi, 7*8
- movq %rdx, 6*8(%rsp)
- CFI_REL_OFFSET rdx, 6*8
- .if \norcx
- .else
- movq %rcx, 5*8(%rsp)
- CFI_REL_OFFSET rcx, 5*8
+ movq_cfi rdi, 8*8
+ movq_cfi rsi, 7*8
+ movq_cfi rdx, 6*8
+
+ .if \save_rcx
+ movq_cfi rcx, 5*8
.endif
- movq %rax, 4*8(%rsp)
- CFI_REL_OFFSET rax, 4*8
- .if \nor891011
- .else
- movq %r8, 3*8(%rsp)
- CFI_REL_OFFSET r8, 3*8
- movq %r9, 2*8(%rsp)
- CFI_REL_OFFSET r9, 2*8
- movq %r10, 1*8(%rsp)
- CFI_REL_OFFSET r10, 1*8
- movq %r11, (%rsp)
- CFI_REL_OFFSET r11, 0*8
+
+ movq_cfi rax, 4*8
+
+ .if \save_r891011
+ movq_cfi r8, 3*8
+ movq_cfi r9, 2*8
+ movq_cfi r10, 1*8
+ movq_cfi r11, 0*8
.endif
+
.endm
#define ARG_SKIP (9*8)
- .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
- skipr8910=0, skiprdx=0
- .if \skipr11
- .else
- movq (%rsp), %r11
- CFI_RESTORE r11
+ .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
+ rstor_r8910=1, rstor_rdx=1
+ .if \rstor_r11
+ movq_cfi_restore 0*8, r11
.endif
- .if \skipr8910
- .else
- movq 1*8(%rsp), %r10
- CFI_RESTORE r10
- movq 2*8(%rsp), %r9
- CFI_RESTORE r9
- movq 3*8(%rsp), %r8
- CFI_RESTORE r8
+
+ .if \rstor_r8910
+ movq_cfi_restore 1*8, r10
+ movq_cfi_restore 2*8, r9
+ movq_cfi_restore 3*8, r8
.endif
- .if \skiprax
- .else
- movq 4*8(%rsp), %rax
- CFI_RESTORE rax
+
+ .if \rstor_rax
+ movq_cfi_restore 4*8, rax
.endif
- .if \skiprcx
- .else
- movq 5*8(%rsp), %rcx
- CFI_RESTORE rcx
+
+ .if \rstor_rcx
+ movq_cfi_restore 5*8, rcx
.endif
- .if \skiprdx
- .else
- movq 6*8(%rsp), %rdx
- CFI_RESTORE rdx
+
+ .if \rstor_rdx
+ movq_cfi_restore 6*8, rdx
.endif
- movq 7*8(%rsp), %rsi
- CFI_RESTORE rsi
- movq 8*8(%rsp), %rdi
- CFI_RESTORE rdi
+
+ movq_cfi_restore 7*8, rsi
+ movq_cfi_restore 8*8, rdi
+
.if ARG_SKIP+\addskip > 0
addq $ARG_SKIP+\addskip, %rsp
CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
@@ -176,33 +162,21 @@ For 32-bit we have the following conventions - kernel is built with
.macro SAVE_REST
subq $REST_SKIP, %rsp
CFI_ADJUST_CFA_OFFSET REST_SKIP
- movq %rbx, 5*8(%rsp)
- CFI_REL_OFFSET rbx, 5*8
- movq %rbp, 4*8(%rsp)
- CFI_REL_OFFSET rbp, 4*8
- movq %r12, 3*8(%rsp)
- CFI_REL_OFFSET r12, 3*8
- movq %r13, 2*8(%rsp)
- CFI_REL_OFFSET r13, 2*8
- movq %r14, 1*8(%rsp)
- CFI_REL_OFFSET r14, 1*8
- movq %r15, (%rsp)
- CFI_REL_OFFSET r15, 0*8
+ movq_cfi rbx, 5*8
+ movq_cfi rbp, 4*8
+ movq_cfi r12, 3*8
+ movq_cfi r13, 2*8
+ movq_cfi r14, 1*8
+ movq_cfi r15, 0*8
.endm
.macro RESTORE_REST
- movq (%rsp), %r15
- CFI_RESTORE r15
- movq 1*8(%rsp), %r14
- CFI_RESTORE r14
- movq 2*8(%rsp), %r13
- CFI_RESTORE r13
- movq 3*8(%rsp), %r12
- CFI_RESTORE r12
- movq 4*8(%rsp), %rbp
- CFI_RESTORE rbp
- movq 5*8(%rsp), %rbx
- CFI_RESTORE rbx
+ movq_cfi_restore 0*8, r15
+ movq_cfi_restore 1*8, r14
+ movq_cfi_restore 2*8, r13
+ movq_cfi_restore 3*8, r12
+ movq_cfi_restore 4*8, rbp
+ movq_cfi_restore 5*8, rbx
addq $REST_SKIP, %rsp
CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
.endm
@@ -214,7 +188,7 @@ For 32-bit we have the following conventions - kernel is built with
.macro RESTORE_ALL addskip=0
RESTORE_REST
- RESTORE_ARGS 0, \addskip
+ RESTORE_ARGS 1, \addskip
.endm
.macro icebp
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
new file mode 100644
index 00000000000..0bdbbb3b9ce
--- /dev/null
+++ b/arch/x86/include/asm/clocksource.h
@@ -0,0 +1,18 @@
+/* x86-specific clocksource additions */
+
+#ifndef _ASM_X86_CLOCKSOURCE_H
+#define _ASM_X86_CLOCKSOURCE_H
+
+#ifdef CONFIG_X86_64
+
+#define VCLOCK_NONE 0 /* No vDSO clock available. */
+#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */
+#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */
+
+struct arch_clocksource_data {
+ int vclock_mode;
+};
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 284a6e8f7ce..3deb7250624 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -280,4 +280,52 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
#endif
+#define cmpxchg8b(ptr, o1, o2, n1, n2) \
+({ \
+ char __ret; \
+ __typeof__(o2) __dummy; \
+ __typeof__(*(ptr)) __old1 = (o1); \
+ __typeof__(o2) __old2 = (o2); \
+ __typeof__(*(ptr)) __new1 = (n1); \
+ __typeof__(o2) __new2 = (n2); \
+ asm volatile(LOCK_PREFIX "cmpxchg8b %2; setz %1" \
+ : "=d"(__dummy), "=a" (__ret), "+m" (*ptr)\
+ : "a" (__old1), "d"(__old2), \
+ "b" (__new1), "c" (__new2) \
+ : "memory"); \
+ __ret; })
+
+
+#define cmpxchg8b_local(ptr, o1, o2, n1, n2) \
+({ \
+ char __ret; \
+ __typeof__(o2) __dummy; \
+ __typeof__(*(ptr)) __old1 = (o1); \
+ __typeof__(o2) __old2 = (o2); \
+ __typeof__(*(ptr)) __new1 = (n1); \
+ __typeof__(o2) __new2 = (n2); \
+ asm volatile("cmpxchg8b %2; setz %1" \
+ : "=d"(__dummy), "=a"(__ret), "+m" (*ptr)\
+ : "a" (__old), "d"(__old2), \
+ "b" (__new1), "c" (__new2), \
+ : "memory"); \
+ __ret; })
+
+
+#define cmpxchg_double(ptr, o1, o2, n1, n2) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
+ VM_BUG_ON((unsigned long)(ptr) % 8); \
+ cmpxchg8b((ptr), (o1), (o2), (n1), (n2)); \
+})
+
+#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
+ VM_BUG_ON((unsigned long)(ptr) % 8); \
+ cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \
+})
+
+#define system_has_cmpxchg_double() cpu_has_cx8
+
#endif /* _ASM_X86_CMPXCHG_32_H */
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 423ae58aa02..7cf5c0a2443 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -151,4 +151,49 @@ extern void __cmpxchg_wrong_size(void);
cmpxchg_local((ptr), (o), (n)); \
})
+#define cmpxchg16b(ptr, o1, o2, n1, n2) \
+({ \
+ char __ret; \
+ __typeof__(o2) __junk; \
+ __typeof__(*(ptr)) __old1 = (o1); \
+ __typeof__(o2) __old2 = (o2); \
+ __typeof__(*(ptr)) __new1 = (n1); \
+ __typeof__(o2) __new2 = (n2); \
+ asm volatile(LOCK_PREFIX "cmpxchg16b %2;setz %1" \
+ : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
+ : "b"(__new1), "c"(__new2), \
+ "a"(__old1), "d"(__old2)); \
+ __ret; })
+
+
+#define cmpxchg16b_local(ptr, o1, o2, n1, n2) \
+({ \
+ char __ret; \
+ __typeof__(o2) __junk; \
+ __typeof__(*(ptr)) __old1 = (o1); \
+ __typeof__(o2) __old2 = (o2); \
+ __typeof__(*(ptr)) __new1 = (n1); \
+ __typeof__(o2) __new2 = (n2); \
+ asm volatile("cmpxchg16b %2;setz %1" \
+ : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
+ : "b"(__new1), "c"(__new2), \
+ "a"(__old1), "d"(__old2)); \
+ __ret; })
+
+#define cmpxchg_double(ptr, o1, o2, n1, n2) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ VM_BUG_ON((unsigned long)(ptr) % 16); \
+ cmpxchg16b((ptr), (o1), (o2), (n1), (n2)); \
+})
+
+#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ VM_BUG_ON((unsigned long)(ptr) % 16); \
+ cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \
+})
+
+#define system_has_cmpxchg_double() cpu_has_cx16
+
#endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 91f3e087cf2..4258aac99a6 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -125,7 +125,7 @@
#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */
-#define X86_FEATURE_RDRND (4*32+30) /* The RDRAND instruction */
+#define X86_FEATURE_RDRAND (4*32+30) /* The RDRAND instruction */
#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
@@ -195,6 +195,8 @@
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
@@ -207,8 +209,7 @@ extern const char * const x86_power_flags[32];
#define test_cpu_cap(c, bit) \
test_bit(bit, (unsigned long *)((c)->x86_capability))
-#define cpu_has(c, bit) \
- (__builtin_constant_p(bit) && \
+#define REQUIRED_MASK_BIT_SET(bit) \
( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
(((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
(((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
@@ -218,10 +219,16 @@ extern const char * const x86_power_flags[32];
(((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
(((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
(((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
- (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \
- ? 1 : \
+ (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
+
+#define cpu_has(c, bit) \
+ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
test_cpu_cap(c, bit))
+#define this_cpu_has(bit) \
+ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
+ x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
+
#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
@@ -281,6 +288,8 @@ extern const char * const x86_power_flags[32];
#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
+#define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8)
+#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16)
#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
# define cpu_has_invlpg 1
@@ -324,8 +333,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
"2:\n"
".section .altinstructions,\"a\"\n"
_ASM_ALIGN "\n"
- _ASM_PTR "1b\n"
- _ASM_PTR "0\n" /* no replacement */
+ " .long 1b - .\n"
+ " .long 0\n" /* no replacement */
" .word %P0\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 0\n" /* replacement len */
@@ -342,8 +351,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
"2:\n"
".section .altinstructions,\"a\"\n"
_ASM_ALIGN "\n"
- _ASM_PTR "1b\n"
- _ASM_PTR "3f\n"
+ " .long 1b - .\n"
+ " .long 3f - .\n"
" .word %P1\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 4f - 3f\n" /* replacement len */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 617bd56b307..7b439d9aea2 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -4,30 +4,33 @@
#include <asm/desc_defs.h>
#include <asm/ldt.h>
#include <asm/mmu.h>
+
#include <linux/smp.h>
-static inline void fill_ldt(struct desc_struct *desc,
- const struct user_desc *info)
-{
- desc->limit0 = info->limit & 0x0ffff;
- desc->base0 = info->base_addr & 0x0000ffff;
-
- desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
- desc->type = (info->read_exec_only ^ 1) << 1;
- desc->type |= info->contents << 2;
- desc->s = 1;
- desc->dpl = 0x3;
- desc->p = info->seg_not_present ^ 1;
- desc->limit = (info->limit & 0xf0000) >> 16;
- desc->avl = info->useable;
- desc->d = info->seg_32bit;
- desc->g = info->limit_in_pages;
- desc->base2 = (info->base_addr & 0xff000000) >> 24;
+static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
+{
+ desc->limit0 = info->limit & 0x0ffff;
+
+ desc->base0 = (info->base_addr & 0x0000ffff);
+ desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
+
+ desc->type = (info->read_exec_only ^ 1) << 1;
+ desc->type |= info->contents << 2;
+
+ desc->s = 1;
+ desc->dpl = 0x3;
+ desc->p = info->seg_not_present ^ 1;
+ desc->limit = (info->limit & 0xf0000) >> 16;
+ desc->avl = info->useable;
+ desc->d = info->seg_32bit;
+ desc->g = info->limit_in_pages;
+
+ desc->base2 = (info->base_addr & 0xff000000) >> 24;
/*
* Don't allow setting of the lm bit. It is useless anyway
* because 64bit system calls require __USER_CS:
*/
- desc->l = 0;
+ desc->l = 0;
}
extern struct desc_ptr idt_descr;
@@ -36,6 +39,7 @@ extern gate_desc idt_table[];
struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));
+
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
@@ -48,16 +52,16 @@ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
unsigned dpl, unsigned ist, unsigned seg)
{
- gate->offset_low = PTR_LOW(func);
- gate->segment = __KERNEL_CS;
- gate->ist = ist;
- gate->p = 1;
- gate->dpl = dpl;
- gate->zero0 = 0;
- gate->zero1 = 0;
- gate->type = type;
- gate->offset_middle = PTR_MIDDLE(func);
- gate->offset_high = PTR_HIGH(func);
+ gate->offset_low = PTR_LOW(func);
+ gate->segment = __KERNEL_CS;
+ gate->ist = ist;
+ gate->p = 1;
+ gate->dpl = dpl;
+ gate->zero0 = 0;
+ gate->zero1 = 0;
+ gate->type = type;
+ gate->offset_middle = PTR_MIDDLE(func);
+ gate->offset_high = PTR_HIGH(func);
}
#else
@@ -66,8 +70,7 @@ static inline void pack_gate(gate_desc *gate, unsigned char type,
unsigned short seg)
{
gate->a = (seg << 16) | (base & 0xffff);
- gate->b = (base & 0xffff0000) |
- (((0x80 | type | (dpl << 5)) & 0xff) << 8);
+ gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8);
}
#endif
@@ -75,31 +78,29 @@ static inline void pack_gate(gate_desc *gate, unsigned char type,
static inline int desc_empty(const void *ptr)
{
const u32 *desc = ptr;
+
return !(desc[0] | desc[1]);
}
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
-#define load_TR_desc() native_load_tr_desc()
-#define load_gdt(dtr) native_load_gdt(dtr)
-#define load_idt(dtr) native_load_idt(dtr)
-#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
-#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
-
-#define store_gdt(dtr) native_store_gdt(dtr)
-#define store_idt(dtr) native_store_idt(dtr)
-#define store_tr(tr) (tr = native_store_tr())
-
-#define load_TLS(t, cpu) native_load_tls(t, cpu)
-#define set_ldt native_set_ldt
-
-#define write_ldt_entry(dt, entry, desc) \
- native_write_ldt_entry(dt, entry, desc)
-#define write_gdt_entry(dt, entry, desc, type) \
- native_write_gdt_entry(dt, entry, desc, type)
-#define write_idt_entry(dt, entry, g) \
- native_write_idt_entry(dt, entry, g)
+#define load_TR_desc() native_load_tr_desc()
+#define load_gdt(dtr) native_load_gdt(dtr)
+#define load_idt(dtr) native_load_idt(dtr)
+#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr) native_store_gdt(dtr)
+#define store_idt(dtr) native_store_idt(dtr)
+#define store_tr(tr) (tr = native_store_tr())
+
+#define load_TLS(t, cpu) native_load_tls(t, cpu)
+#define set_ldt native_set_ldt
+
+#define write_ldt_entry(dt, entry, desc) native_write_ldt_entry(dt, entry, desc)
+#define write_gdt_entry(dt, entry, desc, type) native_write_gdt_entry(dt, entry, desc, type)
+#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
@@ -112,33 +113,27 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
-static inline void native_write_idt_entry(gate_desc *idt, int entry,
- const gate_desc *gate)
+static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)
{
memcpy(&idt[entry], gate, sizeof(*gate));
}
-static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
- const void *desc)
+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
{
memcpy(&ldt[entry], desc, 8);
}
-static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
- const void *desc, int type)
+static inline void
+native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type)
{
unsigned int size;
+
switch (type) {
- case DESC_TSS:
- size = sizeof(tss_desc);
- break;
- case DESC_LDT:
- size = sizeof(ldt_desc);
- break;
- default:
- size = sizeof(struct desc_struct);
- break;
+ case DESC_TSS: size = sizeof(tss_desc); break;
+ case DESC_LDT: size = sizeof(ldt_desc); break;
+ default: size = sizeof(*gdt); break;
}
+
memcpy(&gdt[entry], desc, size);
}
@@ -154,20 +149,21 @@ static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
}
-static inline void set_tssldt_descriptor(void *d, unsigned long addr,
- unsigned type, unsigned size)
+static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size)
{
#ifdef CONFIG_X86_64
struct ldttss_desc64 *desc = d;
+
memset(desc, 0, sizeof(*desc));
- desc->limit0 = size & 0xFFFF;
- desc->base0 = PTR_LOW(addr);
- desc->base1 = PTR_MIDDLE(addr) & 0xFF;
- desc->type = type;
- desc->p = 1;
- desc->limit1 = (size >> 16) & 0xF;
- desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
- desc->base3 = PTR_HIGH(addr);
+
+ desc->limit0 = size & 0xFFFF;
+ desc->base0 = PTR_LOW(addr);
+ desc->base1 = PTR_MIDDLE(addr) & 0xFF;
+ desc->type = type;
+ desc->p = 1;
+ desc->limit1 = (size >> 16) & 0xF;
+ desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
+ desc->base3 = PTR_HIGH(addr);
#else
pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
#endif
@@ -237,14 +233,16 @@ static inline void native_store_idt(struct desc_ptr *dtr)
static inline unsigned long native_store_tr(void)
{
unsigned long tr;
+
asm volatile("str %0":"=r" (tr));
+
return tr;
}
static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
{
- unsigned int i;
struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+ unsigned int i;
for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
@@ -313,6 +311,7 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
unsigned dpl, unsigned ist, unsigned seg)
{
gate_desc s;
+
pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
/*
* does not need to be atomic because it is only done once at
@@ -343,8 +342,9 @@ static inline void alloc_system_vector(int vector)
set_bit(vector, used_vectors);
if (first_system_vector > vector)
first_system_vector = vector;
- } else
+ } else {
BUG();
+ }
}
static inline void alloc_intr_gate(unsigned int n, void *addr)
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index 057099e5fab..0bdb0c54d9a 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -69,22 +69,18 @@
#define MAX_DMA_CHANNELS 8
-#ifdef CONFIG_X86_32
-
-/* The maximum address that we can perform a DMA transfer to on this platform */
-#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
-
-#else
-
/* 16MB ISA DMA zone */
#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT)
/* 4GB broken PCI/AGP hardware bus master zone */
#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
+#ifdef CONFIG_X86_32
+/* The maximum address that we can perform a DMA transfer to on this platform */
+#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
+#else
/* Compat define for old dma zone */
#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
-
#endif
/* 8237 DMA controllers */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 8e4a16508d4..7093e4a6a0b 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,6 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
#endif /* CONFIG_X86_32 */
extern int add_efi_memmap;
+extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
extern void efi_memblock_x86_reserve_range(void);
extern void efi_call_phys_prelog(void);
extern void efi_call_phys_epilog(void);
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 1cd6d26a0a8..0baa628e330 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -53,8 +53,4 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
#endif
-#ifdef CONFIG_X86_MCE
-BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
-#endif
-
#endif
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4729b2b6311..460c74e4852 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -78,6 +78,7 @@ enum fixed_addresses {
VSYSCALL_LAST_PAGE,
VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+ VVAR_PAGE,
VSYSCALL_HPET,
#endif
FIX_DBGP_BASE,
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 2c6fc9e6281..3b629f47eb6 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -1,5 +1,6 @@
#ifdef __ASSEMBLY__
+#include <asm/asm.h>
#include <asm/dwarf2.h>
/* The annotation hides the frame from the unwinder and makes it look
@@ -7,13 +8,13 @@
frame pointer later */
#ifdef CONFIG_FRAME_POINTER
.macro FRAME
- pushl_cfi %ebp
- CFI_REL_OFFSET ebp,0
- movl %esp,%ebp
+ __ASM_SIZE(push,_cfi) %__ASM_REG(bp)
+ CFI_REL_OFFSET __ASM_REG(bp), 0
+ __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp)
.endm
.macro ENDFRAME
- popl_cfi %ebp
- CFI_RESTORE ebp
+ __ASM_SIZE(pop,_cfi) %__ASM_REG(bp)
+ CFI_RESTORE __ASM_REG(bp)
.endm
#else
.macro FRAME
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index db24c2278be..268c783ab1c 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -38,11 +38,10 @@ extern void mcount(void);
static inline unsigned long ftrace_call_adjust(unsigned long addr)
{
/*
- * call mcount is "e8 <4 byte offset>"
- * The addr points to the 4 byte offset and the caller of this
- * function wants the pointer to e8. Simply subtract one.
+ * addr is the address of the mcount call instruction.
+ * recordmcount does the necessary offset calculation.
*/
- return addr - 1;
+ return addr;
}
#ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index bb9efe8706e..13f5504c76c 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -34,7 +34,6 @@ extern void irq_work_interrupt(void);
extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
extern void reschedule_interrupt(void);
-extern void mce_self_interrupt(void);
extern void invalidate_interrupt(void);
extern void invalidate_interrupt0(void);
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
deleted file mode 100644
index fc1f579fb96..00000000000
--- a/arch/x86/include/asm/i8253.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef _ASM_X86_I8253_H
-#define _ASM_X86_I8253_H
-
-/* i8253A PIT registers */
-#define PIT_MODE 0x43
-#define PIT_CH0 0x40
-#define PIT_CH2 0x42
-
-extern raw_spinlock_t i8253_lock;
-
-extern struct clock_event_device *global_clock_event;
-
-extern void setup_pit_timer(void);
-
-#define inb_pit inb_p
-#define outb_pit outb_p
-
-#endif /* _ASM_X86_I8253_H */
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
index 38d87379e27..f49253d7571 100644
--- a/arch/x86/include/asm/idle.h
+++ b/arch/x86/include/asm/idle.h
@@ -16,6 +16,6 @@ static inline void enter_idle(void) { }
static inline void exit_idle(void) { }
#endif /* CONFIG_X86_64 */
-void c1e_remove_cpu(int cpu);
+void amd_e400_remove_cpu(int cpu);
#endif /* _ASM_X86_IDLE_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 07227308252..d02804d650c 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -38,7 +38,6 @@
#include <linux/string.h>
#include <linux/compiler.h>
-#include <asm-generic/int-ll64.h>
#include <asm/page.h>
#include <xen/xen.h>
@@ -87,27 +86,6 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
-#else
-
-static inline __u64 readq(const volatile void __iomem *addr)
-{
- const volatile u32 __iomem *p = addr;
- u32 low, high;
-
- low = readl(p);
- high = readl(p + 1);
-
- return low + ((u64)high << 32);
-}
-
-static inline void writeq(__u64 val, volatile void __iomem *addr)
-{
- writel(val, addr);
- writel(val >> 32, addr+4);
-}
-
-#endif
-
#define readq_relaxed(a) readq(a)
#define __raw_readq(a) readq(a)
@@ -117,6 +95,8 @@ static inline void writeq(__u64 val, volatile void __iomem *addr)
#define readq readq
#define writeq writeq
+#endif
+
/**
* virt_to_phys - map virtual addresses to physical
* @address: address to remap
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index a97a240f67f..690d1cc9a87 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -105,12 +105,12 @@ struct IR_IO_APIC_route_entry {
* # of IO-APICs and # of IRQ routing registers
*/
extern int nr_ioapics;
-extern int nr_ioapic_registers[MAX_IO_APICS];
-#define MP_MAX_IOAPIC_PIN 127
+extern int mpc_ioapic_id(int ioapic);
+extern unsigned int mpc_ioapic_addr(int ioapic);
+extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic);
-/* I/O APIC entries */
-extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
+#define MP_MAX_IOAPIC_PIN 127
/* # of MP IRQ source entries */
extern int mp_irq_entries;
@@ -152,11 +152,9 @@ extern void ioapic_insert_resources(void);
int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr);
-extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
-extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
-extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
+extern int save_ioapic_entries(void);
+extern void mask_ioapic_entries(void);
+extern int restore_ioapic_entries(void);
extern int get_nr_irqs_gsi(void);
@@ -192,19 +190,13 @@ struct io_apic_irq_attr;
static inline int io_apic_set_pci_routing(struct device *dev, int irq,
struct io_apic_irq_attr *irq_attr) { return 0; }
-static inline struct IO_APIC_route_entry **alloc_ioapic_entries(void)
-{
- return NULL;
-}
-
-static inline void free_ioapic_entries(struct IO_APIC_route_entry **ent) { }
-static inline int save_IO_APIC_setup(struct IO_APIC_route_entry **ent)
+static inline int save_ioapic_entries(void)
{
return -ENOMEM;
}
-static inline void mask_IO_APIC_setup(struct IO_APIC_route_entry **ent) { }
-static inline int restore_IO_APIC_setup(struct IO_APIC_route_entry **ent)
+static inline void mask_ioapic_entries(void) { }
+static inline int restore_ioapic_entries(void)
{
return -ENOMEM;
}
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6e976ee3b3e..f9a320984a1 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -17,7 +17,8 @@
* Vectors 0 ... 31 : system traps and exceptions - hardcoded events
* Vectors 32 ... 127 : device interrupts
* Vector 128 : legacy int80 syscall interface
- * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts
+ * Vector 204 : legacy x86_64 vsyscall emulation
+ * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts
* Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
*
* 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
@@ -50,6 +51,9 @@
#ifdef CONFIG_X86_32
# define SYSCALL_VECTOR 0x80
#endif
+#ifdef CONFIG_X86_64
+# define VSYSCALL_EMU_VECTOR 0xcc
+#endif
/*
* Vectors 0x30-0x3f are used for ISA interrupts.
@@ -109,11 +113,6 @@
#define UV_BAU_MESSAGE 0xf5
-/*
- * Self IPI vector for machine checks
- */
-#define MCE_SELF_VECTOR 0xf4
-
/* Xen vector callback to receive events in a HVM domain */
#define XEN_HVM_EVTCHN_CALLBACK 0xf3
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 5745ce8bf10..bba3cf88e62 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -60,23 +60,24 @@ static inline void native_halt(void)
#include <asm/paravirt.h>
#else
#ifndef __ASSEMBLY__
+#include <linux/types.h>
-static inline unsigned long arch_local_save_flags(void)
+static inline notrace unsigned long arch_local_save_flags(void)
{
return native_save_fl();
}
-static inline void arch_local_irq_restore(unsigned long flags)
+static inline notrace void arch_local_irq_restore(unsigned long flags)
{
native_restore_fl(flags);
}
-static inline void arch_local_irq_disable(void)
+static inline notrace void arch_local_irq_disable(void)
{
native_irq_disable();
}
-static inline void arch_local_irq_enable(void)
+static inline notrace void arch_local_irq_enable(void)
{
native_irq_enable();
}
@@ -102,7 +103,7 @@ static inline void halt(void)
/*
* For spinlocks, etc:
*/
-static inline unsigned long arch_local_irq_save(void)
+static inline notrace unsigned long arch_local_irq_save(void)
{
unsigned long flags = arch_local_save_flags();
arch_local_irq_disable();
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 574dbc22893..a32b18ce6ea 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -5,20 +5,25 @@
#include <linux/types.h>
#include <asm/nops.h>
+#include <asm/asm.h>
#define JUMP_LABEL_NOP_SIZE 5
-# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
-
-# define JUMP_LABEL(key, label) \
- do { \
- asm goto("1:" \
- JUMP_LABEL_INITIAL_NOP \
- ".pushsection __jump_table, \"aw\" \n\t"\
- _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \
- ".popsection \n\t" \
- : : "i" (key) : : label); \
- } while (0)
+#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
+
+static __always_inline bool arch_static_branch(struct jump_label_key *key)
+{
+ asm goto("1:"
+ JUMP_LABEL_INITIAL_NOP
+ ".pushsection __jump_table, \"aw\" \n\t"
+ _ASM_ALIGN "\n\t"
+ _ASM_PTR "1b, %l[l_yes], %c0 \n\t"
+ ".popsection \n\t"
+ : : "i" (key) : : l_yes);
+ return false;
+l_yes:
+ return true;
+}
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index 396f5b5fc4d..77e95f54570 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -77,6 +77,7 @@ static inline void arch_kgdb_breakpoint(void)
}
#define BREAK_INSTR_SIZE 1
#define CACHE_FLUSH_IS_SAFE 1
+#define GDB_ADJUSTS_BREAK_OFFSET
extern int kgdb_ll_trap(int cmd, const char *str,
struct pt_regs *regs, long err, int trap, int sig);
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0f521356432..0049211959c 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -14,6 +14,8 @@
#include <asm/desc_defs.h>
struct x86_emulate_ctxt;
+enum x86_intercept;
+enum x86_intercept_stage;
struct x86_exception {
u8 vector;
@@ -24,6 +26,24 @@ struct x86_exception {
};
/*
+ * This struct is used to carry enough information from the instruction
+ * decoder to main KVM so that a decision can be made whether the
+ * instruction needs to be intercepted or not.
+ */
+struct x86_instruction_info {
+ u8 intercept; /* which intercept */
+ u8 rep_prefix; /* rep prefix? */
+ u8 modrm_mod; /* mod part of modrm */
+ u8 modrm_reg; /* index of register used */
+ u8 modrm_rm; /* rm part of modrm */
+ u64 src_val; /* value of source operand */
+ u8 src_bytes; /* size of source operand */
+ u8 dst_bytes; /* size of destination operand */
+ u8 ad_bytes; /* size of src/dst address */
+ u64 next_rip; /* rip following the instruction */
+};
+
+/*
* x86_emulate_ops:
*
* These operations represent the instruction emulator's interface to memory.
@@ -62,6 +82,7 @@ struct x86_exception {
#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
+#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
struct x86_emulate_ops {
/*
@@ -71,8 +92,9 @@ struct x86_emulate_ops {
* @val: [OUT] Value read from memory, zero-extended to 'u_long'.
* @bytes: [IN ] Number of bytes to read from memory.
*/
- int (*read_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu,
+ int (*read_std)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val,
+ unsigned int bytes,
struct x86_exception *fault);
/*
@@ -82,8 +104,8 @@ struct x86_emulate_ops {
* @val: [OUT] Value write to memory, zero-extended to 'u_long'.
* @bytes: [IN ] Number of bytes to write to memory.
*/
- int (*write_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu,
+ int (*write_std)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
struct x86_exception *fault);
/*
* fetch: Read bytes of standard (non-emulated/special) memory.
@@ -92,8 +114,8 @@ struct x86_emulate_ops {
* @val: [OUT] Value read from memory, zero-extended to 'u_long'.
* @bytes: [IN ] Number of bytes to read from memory.
*/
- int (*fetch)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu,
+ int (*fetch)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
struct x86_exception *fault);
/*
@@ -102,11 +124,9 @@ struct x86_emulate_ops {
* @val: [OUT] Value read from memory, zero-extended to 'u_long'.
* @bytes: [IN ] Number of bytes to read from memory.
*/
- int (*read_emulated)(unsigned long addr,
- void *val,
- unsigned int bytes,
- struct x86_exception *fault,
- struct kvm_vcpu *vcpu);
+ int (*read_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
+ struct x86_exception *fault);
/*
* write_emulated: Write bytes to emulated/special memory area.
@@ -115,11 +135,10 @@ struct x86_emulate_ops {
* required).
* @bytes: [IN ] Number of bytes to write to memory.
*/
- int (*write_emulated)(unsigned long addr,
- const void *val,
+ int (*write_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, const void *val,
unsigned int bytes,
- struct x86_exception *fault,
- struct kvm_vcpu *vcpu);
+ struct x86_exception *fault);
/*
* cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
@@ -129,40 +148,54 @@ struct x86_emulate_ops {
* @new: [IN ] Value to write to @addr.
* @bytes: [IN ] Number of bytes to access using CMPXCHG.
*/
- int (*cmpxchg_emulated)(unsigned long addr,
+ int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
- struct x86_exception *fault,
- struct kvm_vcpu *vcpu);
-
- int (*pio_in_emulated)(int size, unsigned short port, void *val,
- unsigned int count, struct kvm_vcpu *vcpu);
-
- int (*pio_out_emulated)(int size, unsigned short port, const void *val,
- unsigned int count, struct kvm_vcpu *vcpu);
-
- bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3,
- int seg, struct kvm_vcpu *vcpu);
- void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3,
- int seg, struct kvm_vcpu *vcpu);
- u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
- void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
- unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
- void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
- void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
- ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
- int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
- int (*cpl)(struct kvm_vcpu *vcpu);
- int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
- int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
- int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
- int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+ struct x86_exception *fault);
+ void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr);
+
+ int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count);
+
+ int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, const void *val,
+ unsigned int count);
+
+ bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector,
+ struct desc_struct *desc, u32 *base3, int seg);
+ void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector,
+ struct desc_struct *desc, u32 base3, int seg);
+ unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt,
+ int seg);
+ void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
+ int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
+ int (*cpl)(struct x86_emulate_ctxt *ctxt);
+ int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
+ int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
+ int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
+ int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+ void (*halt)(struct x86_emulate_ctxt *ctxt);
+ void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
+ int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
+ void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */
+ void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */
+ int (*intercept)(struct x86_emulate_ctxt *ctxt,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage);
};
+typedef u32 __attribute__((vector_size(16))) sse128_t;
+
/* Type, address-of, and value of an instruction's operand. */
struct operand {
- enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
+ enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type;
unsigned int bytes;
union {
unsigned long orig_val;
@@ -174,11 +207,13 @@ struct operand {
ulong ea;
unsigned seg;
} mem;
+ unsigned xmm;
} addr;
union {
unsigned long val;
u64 val64;
char valptr[sizeof(unsigned long) + 2];
+ sse128_t vec_val;
};
};
@@ -197,6 +232,7 @@ struct read_cache {
struct decode_cache {
u8 twobyte;
u8 b;
+ u8 intercept;
u8 lock_prefix;
u8 rep_prefix;
u8 op_bytes;
@@ -209,6 +245,7 @@ struct decode_cache {
u8 seg_override;
unsigned int d;
int (*execute)(struct x86_emulate_ctxt *ctxt);
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
unsigned long regs[NR_VCPU_REGS];
unsigned long eip;
/* modrm */
@@ -227,17 +264,15 @@ struct x86_emulate_ctxt {
struct x86_emulate_ops *ops;
/* Register state before/after emulation. */
- struct kvm_vcpu *vcpu;
-
unsigned long eflags;
unsigned long eip; /* eip before instruction emulation */
/* Emulated execution mode, represented by an X86EMUL_MODE value. */
int mode;
- u32 cs_base;
/* interruptibility state, as a result of execution of STI or MOV SS */
int interruptibility;
+ bool guest_mode; /* guest running a nested guest */
bool perm_ok; /* do not check permissions if true */
bool only_vendor_specific_insn;
@@ -249,8 +284,8 @@ struct x86_emulate_ctxt {
};
/* Repeat String Operation Prefix */
-#define REPE_PREFIX 1
-#define REPNE_PREFIX 2
+#define REPE_PREFIX 0xf3
+#define REPNE_PREFIX 0xf2
/* Execution mode, passed to the emulator. */
#define X86EMUL_MODE_REAL 0 /* Real mode. */
@@ -259,6 +294,69 @@ struct x86_emulate_ctxt {
#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
+/* any protected mode */
+#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \
+ X86EMUL_MODE_PROT64)
+
+enum x86_intercept_stage {
+ X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */
+ X86_ICPT_PRE_EXCEPT,
+ X86_ICPT_POST_EXCEPT,
+ X86_ICPT_POST_MEMACCESS,
+};
+
+enum x86_intercept {
+ x86_intercept_none,
+ x86_intercept_cr_read,
+ x86_intercept_cr_write,
+ x86_intercept_clts,
+ x86_intercept_lmsw,
+ x86_intercept_smsw,
+ x86_intercept_dr_read,
+ x86_intercept_dr_write,
+ x86_intercept_lidt,
+ x86_intercept_sidt,
+ x86_intercept_lgdt,
+ x86_intercept_sgdt,
+ x86_intercept_lldt,
+ x86_intercept_sldt,
+ x86_intercept_ltr,
+ x86_intercept_str,
+ x86_intercept_rdtsc,
+ x86_intercept_rdpmc,
+ x86_intercept_pushf,
+ x86_intercept_popf,
+ x86_intercept_cpuid,
+ x86_intercept_rsm,
+ x86_intercept_iret,
+ x86_intercept_intn,
+ x86_intercept_invd,
+ x86_intercept_pause,
+ x86_intercept_hlt,
+ x86_intercept_invlpg,
+ x86_intercept_invlpga,
+ x86_intercept_vmrun,
+ x86_intercept_vmload,
+ x86_intercept_vmsave,
+ x86_intercept_vmmcall,
+ x86_intercept_stgi,
+ x86_intercept_clgi,
+ x86_intercept_skinit,
+ x86_intercept_rdtscp,
+ x86_intercept_icebp,
+ x86_intercept_wbinvd,
+ x86_intercept_monitor,
+ x86_intercept_mwait,
+ x86_intercept_rdmsr,
+ x86_intercept_wrmsr,
+ x86_intercept_in,
+ x86_intercept_ins,
+ x86_intercept_out,
+ x86_intercept_outs,
+
+ nr_x86_intercepts
+};
+
/* Host execution mode. */
#if defined(CONFIG_X86_32)
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
@@ -270,6 +368,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
#define EMULATION_FAILED -1
#define EMULATION_OK 0
#define EMULATION_RESTART 1
+#define EMULATION_INTERCEPTED 2
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
u16 tss_selector, int reason,
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c8af0991fdf..d2ac8e2ee89 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -30,14 +30,30 @@
#define KVM_MEMORY_SLOTS 32
/* memory slots that does not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 4
+#define KVM_MMIO_SIZE 16
#define KVM_PIO_PAGE_OFFSET 1
#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+#define CR0_RESERVED_BITS \
+ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
+ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
+ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
+
#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
0xFFFFFF0000000000ULL)
+#define CR4_RESERVED_BITS \
+ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
+ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
+ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXSAVE \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
+
+#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+
#define INVALID_PAGE (~(hpa_t)0)
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
@@ -118,6 +134,9 @@ enum kvm_reg {
enum kvm_reg_ex {
VCPU_EXREG_PDPTR = NR_VCPU_REGS,
VCPU_EXREG_CR3,
+ VCPU_EXREG_RFLAGS,
+ VCPU_EXREG_CPL,
+ VCPU_EXREG_SEGMENTS,
};
enum {
@@ -256,7 +275,7 @@ struct kvm_mmu {
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte, unsigned long mmu_seq);
+ u64 *spte, const void *pte);
hpa_t root_hpa;
int root_level;
int shadow_root_level;
@@ -340,7 +359,6 @@ struct kvm_vcpu_arch {
struct fpu guest_fpu;
u64 xcr0;
- gva_t mmio_fault_cr2;
struct kvm_pio_request pio;
void *pio_data;
@@ -367,18 +385,22 @@ struct kvm_vcpu_arch {
/* emulate context */
struct x86_emulate_ctxt emulate_ctxt;
+ bool emulate_regs_need_sync_to_vcpu;
+ bool emulate_regs_need_sync_from_vcpu;
gpa_t time;
struct pvclock_vcpu_time_info hv_clock;
unsigned int hw_tsc_khz;
unsigned int time_offset;
struct page *time_page;
- u64 last_host_tsc;
u64 last_guest_tsc;
u64 last_kernel_ns;
u64 last_tsc_nsec;
u64 last_tsc_write;
+ u32 virtual_tsc_khz;
bool tsc_catchup;
+ u32 tsc_catchup_mult;
+ s8 tsc_catchup_shift;
bool nmi_pending;
bool nmi_injected;
@@ -448,9 +470,6 @@ struct kvm_arch {
u64 last_tsc_nsec;
u64 last_tsc_offset;
u64 last_tsc_write;
- u32 virtual_tsc_khz;
- u32 virtual_tsc_mult;
- s8 virtual_tsc_shift;
struct kvm_xen_hvm_config xen_hvm_config;
@@ -502,6 +521,8 @@ struct kvm_vcpu_stat {
u32 nmi_injections;
};
+struct x86_instruction_info;
+
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
@@ -586,9 +607,17 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
+ void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+ u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
+
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
+
+ int (*check_intercept)(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage);
+
const struct trace_print_flags *exit_reasons_str;
};
@@ -627,6 +656,13 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
extern bool tdp_enabled;
+/* control of guest tsc rate supported? */
+extern bool kvm_has_tsc_control;
+/* minimum supported tsc_khz for guests */
+extern u32 kvm_min_guest_tsc_khz;
+/* maximum supported tsc_khz for guests */
+extern u32 kvm_max_guest_tsc_khz;
+
enum emulation_result {
EMULATE_DONE, /* no further processing */
EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
@@ -645,9 +681,6 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
}
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-
void kvm_enable_efer_bits(u64);
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
@@ -657,8 +690,6 @@ struct x86_emulate_ctxt;
int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
-int emulate_clts(struct kvm_vcpu *vcpu);
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -721,8 +752,6 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
-int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
-
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
void *insn, int insn_len);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index b60f2924c41..879fd7d3387 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -61,6 +61,7 @@ hcall(unsigned long call,
: "memory");
return call;
}
+/*:*/
/* Can't use our min() macro here: needs to be a constant */
#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 12d55e773eb..48142971b25 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -8,11 +8,6 @@
#ifdef CONFIG_X86_32
#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
-/*
- * For 32-bit UML - mark functions implemented in assembly that use
- * regparm input parameters:
- */
-#define asmregparm __attribute__((regparm(3)))
/*
* Make sure the compiler doesn't do anything stupid with the
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index eb16e94ae04..716b48af786 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -8,6 +8,7 @@
* Machine Check support for x86
*/
+/* MCG_CAP register defines */
#define MCG_BANKCNT_MASK 0xff /* Number of Banks */
#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */
#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
@@ -17,10 +18,12 @@
#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
+/* MCG_STATUS register defines */
#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
+/* MCi_STATUS register defines */
#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */
@@ -31,12 +34,14 @@
#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
#define MCI_STATUS_AR (1ULL<<55) /* Action required */
-/* MISC register defines */
-#define MCM_ADDR_SEGOFF 0 /* segment offset */
-#define MCM_ADDR_LINEAR 1 /* linear address */
-#define MCM_ADDR_PHYS 2 /* physical address */
-#define MCM_ADDR_MEM 3 /* memory address */
-#define MCM_ADDR_GENERIC 7 /* generic */
+/* MCi_MISC register defines */
+#define MCI_MISC_ADDR_LSB(m) ((m) & 0x3f)
+#define MCI_MISC_ADDR_MODE(m) (((m) >> 6) & 7)
+#define MCI_MISC_ADDR_SEGOFF 0 /* segment offset */
+#define MCI_MISC_ADDR_LINEAR 1 /* linear address */
+#define MCI_MISC_ADDR_PHYS 2 /* physical address */
+#define MCI_MISC_ADDR_MEM 3 /* memory address */
+#define MCI_MISC_ADDR_GENERIC 7 /* generic */
/* CTL2 register defines */
#define MCI_CTL2_CMCI_EN (1ULL << 30)
@@ -142,11 +147,9 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
static inline void enable_p5_mce(void) {}
#endif
-extern void (*x86_mce_decode_callback)(struct mce *m);
-
void mce_setup(struct mce *m);
void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct sys_device, mce_dev);
+DECLARE_PER_CPU(struct sys_device, mce_sysdev);
/*
* Maximum banks number.
diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h
index 19ae14ba697..0cd3800f33b 100644
--- a/arch/x86/include/asm/memblock.h
+++ b/arch/x86/include/asm/memblock.h
@@ -4,7 +4,6 @@
#define ARCH_DISCARD_MEMBLOCK
u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align);
-void memblock_x86_to_bootmem(u64 start, u64 end);
void memblock_x86_reserve_range(u64 start, u64 end, char *name);
void memblock_x86_free_range(u64 start, u64 end);
@@ -19,5 +18,6 @@ u64 memblock_x86_hole_size(u64 start, u64 end);
u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align);
u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit);
u64 memblock_x86_memory_in_range(u64 addr, u64 limit);
+bool memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align);
#endif
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index aeff3e89b22..5f55e696276 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -11,14 +11,14 @@
typedef struct {
void *ldt;
int size;
- struct mutex lock;
- void *vdso;
#ifdef CONFIG_X86_64
/* True if mm supports a task running in 32 bit compatibility mode. */
unsigned short ia32_compat;
#endif
+ struct mutex lock;
+ void *vdso;
} mm_context_t;
#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 91df7c51806..55728e12147 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -13,31 +13,11 @@ extern struct pglist_data *node_data[];
#define NODE_DATA(nid) (node_data[nid])
#include <asm/numaq.h>
-/* summit or generic arch */
-#include <asm/srat.h>
-
-extern int get_memcfg_numa_flat(void);
-/*
- * This allows any one NUMA architecture to be compiled
- * for, and still fall back to the flat function if it
- * fails.
- */
-static inline void get_memcfg_numa(void)
-{
-
- if (get_memcfg_numaq())
- return;
- if (get_memcfg_from_srat())
- return;
- get_memcfg_numa_flat();
-}
extern void resume_map_numa_kva(pgd_t *pgd);
#else /* !CONFIG_NUMA */
-#define get_memcfg_numa get_memcfg_numa_flat
-
static inline void resume_map_numa_kva(pgd_t *pgd) {}
#endif /* CONFIG_NUMA */
@@ -54,31 +34,20 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {}
* 64Gb / 4096bytes/page = 16777216 pages
*/
#define MAX_NR_PAGES 16777216
-#define MAX_ELEMENTS 1024
-#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS)
+#define MAX_SECTIONS 1024
+#define PAGES_PER_SECTION (MAX_NR_PAGES/MAX_SECTIONS)
extern s8 physnode_map[];
static inline int pfn_to_nid(unsigned long pfn)
{
#ifdef CONFIG_NUMA
- return((int) physnode_map[(pfn) / PAGES_PER_ELEMENT]);
+ return((int) physnode_map[(pfn) / PAGES_PER_SECTION]);
#else
return 0;
#endif
}
-/*
- * Following are macros that each numa implmentation must define.
- */
-
-#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid) \
-({ \
- pg_data_t *__pgdat = NODE_DATA(nid); \
- __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \
-})
-
static inline int pfn_valid(int pfn)
{
int nid = pfn_to_nid(pfn);
@@ -88,6 +57,8 @@ static inline int pfn_valid(int pfn)
return 0;
}
+#define early_pfn_valid(pfn) pfn_valid((pfn))
+
#endif /* CONFIG_DISCONTIGMEM */
#ifdef CONFIG_NEED_MULTIPLE_NODES
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index 288b96f815a..129d9aa3ceb 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -4,40 +4,14 @@
#ifndef _ASM_X86_MMZONE_64_H
#define _ASM_X86_MMZONE_64_H
-
#ifdef CONFIG_NUMA
#include <linux/mmdebug.h>
-
#include <asm/smp.h>
-/* Simple perfect hash to map physical addresses to node numbers */
-struct memnode {
- int shift;
- unsigned int mapsize;
- s16 *map;
- s16 embedded_map[64 - 8];
-} ____cacheline_aligned; /* total size = 128 bytes */
-extern struct memnode memnode;
-#define memnode_shift memnode.shift
-#define memnodemap memnode.map
-#define memnodemapsize memnode.mapsize
-
extern struct pglist_data *node_data[];
-static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
-{
- unsigned nid;
- VIRTUAL_BUG_ON(!memnodemap);
- nid = memnodemap[addr >> memnode_shift];
- VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
- return nid;
-}
-
#define NODE_DATA(nid) (node_data[nid])
-#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
- NODE_DATA(nid)->node_spanned_pages)
#endif
#endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index 67763c5d8b4..9eae7752ae9 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -35,7 +35,7 @@
#define MODULE_PROC_FAMILY "K7 "
#elif defined CONFIG_MK8
#define MODULE_PROC_FAMILY "K8 "
-#elif defined CONFIG_X86_ELAN
+#elif defined CONFIG_MELAN
#define MODULE_PROC_FAMILY "ELAN "
#elif defined CONFIG_MCRUSOE
#define MODULE_PROC_FAMILY "CRUSOE "
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 3cce71413d0..d96bdb25ca3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -118,6 +118,7 @@
complete list. */
#define MSR_AMD64_PATCH_LEVEL 0x0000008b
+#define MSR_AMD64_TSC_RATIO 0xc0000104
#define MSR_AMD64_NB_CFG 0xc001001f
#define MSR_AMD64_PATCH_LOADER 0xc0010020
#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
@@ -258,6 +259,9 @@
#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
+#define ENERGY_PERF_BIAS_PERFORMANCE 0
+#define ENERGY_PERF_BIAS_NORMAL 6
+#define ENERGY_PERF_BIAS_POWERSAVE 15
#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index af788496020..405b4032a60 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -1,7 +1,13 @@
#ifndef _ASM_X86_NOPS_H
#define _ASM_X86_NOPS_H
-/* Define nops for use with alternative() */
+/*
+ * Define nops for use with alternative() and for tracing.
+ *
+ * *_NOP5_ATOMIC must be a single instruction.
+ */
+
+#define NOP_DS_PREFIX 0x3e
/* generic versions from gas
1: nop
@@ -13,14 +19,15 @@
6: leal 0x00000000(%esi),%esi
7: leal 0x00000000(,%esi,1),%esi
*/
-#define GENERIC_NOP1 ".byte 0x90\n"
-#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
-#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
-#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
-#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
-#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
-#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
-#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
+#define GENERIC_NOP1 0x90
+#define GENERIC_NOP2 0x89,0xf6
+#define GENERIC_NOP3 0x8d,0x76,0x00
+#define GENERIC_NOP4 0x8d,0x74,0x26,0x00
+#define GENERIC_NOP5 GENERIC_NOP1,GENERIC_NOP4
+#define GENERIC_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00
+#define GENERIC_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
+#define GENERIC_NOP8 GENERIC_NOP1,GENERIC_NOP7
+#define GENERIC_NOP5_ATOMIC NOP_DS_PREFIX,GENERIC_NOP4
/* Opteron 64bit nops
1: nop
@@ -29,13 +36,14 @@
4: osp osp osp nop
*/
#define K8_NOP1 GENERIC_NOP1
-#define K8_NOP2 ".byte 0x66,0x90\n"
-#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
-#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
-#define K8_NOP5 K8_NOP3 K8_NOP2
-#define K8_NOP6 K8_NOP3 K8_NOP3
-#define K8_NOP7 K8_NOP4 K8_NOP3
-#define K8_NOP8 K8_NOP4 K8_NOP4
+#define K8_NOP2 0x66,K8_NOP1
+#define K8_NOP3 0x66,K8_NOP2
+#define K8_NOP4 0x66,K8_NOP3
+#define K8_NOP5 K8_NOP3,K8_NOP2
+#define K8_NOP6 K8_NOP3,K8_NOP3
+#define K8_NOP7 K8_NOP4,K8_NOP3
+#define K8_NOP8 K8_NOP4,K8_NOP4
+#define K8_NOP5_ATOMIC 0x66,K8_NOP4
/* K7 nops
uses eax dependencies (arbitrary choice)
@@ -47,13 +55,14 @@
7: leal 0x00000000(,%eax,1),%eax
*/
#define K7_NOP1 GENERIC_NOP1
-#define K7_NOP2 ".byte 0x8b,0xc0\n"
-#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
-#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
-#define K7_NOP5 K7_NOP4 ASM_NOP1
-#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
-#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
-#define K7_NOP8 K7_NOP7 ASM_NOP1
+#define K7_NOP2 0x8b,0xc0
+#define K7_NOP3 0x8d,0x04,0x20
+#define K7_NOP4 0x8d,0x44,0x20,0x00
+#define K7_NOP5 K7_NOP4,K7_NOP1
+#define K7_NOP6 0x8d,0x80,0,0,0,0
+#define K7_NOP7 0x8D,0x04,0x05,0,0,0,0
+#define K7_NOP8 K7_NOP7,K7_NOP1
+#define K7_NOP5_ATOMIC NOP_DS_PREFIX,K7_NOP4
/* P6 nops
uses eax dependencies (Intel-recommended choice)
@@ -69,52 +78,65 @@
There is kernel code that depends on this.
*/
#define P6_NOP1 GENERIC_NOP1
-#define P6_NOP2 ".byte 0x66,0x90\n"
-#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
-#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
-#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
-#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
-#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
-#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
+#define P6_NOP2 0x66,0x90
+#define P6_NOP3 0x0f,0x1f,0x00
+#define P6_NOP4 0x0f,0x1f,0x40,0
+#define P6_NOP5 0x0f,0x1f,0x44,0x00,0
+#define P6_NOP6 0x66,0x0f,0x1f,0x44,0x00,0
+#define P6_NOP7 0x0f,0x1f,0x80,0,0,0,0
+#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0
+#define P6_NOP5_ATOMIC P6_NOP5
+
+#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n"
#if defined(CONFIG_MK7)
-#define ASM_NOP1 K7_NOP1
-#define ASM_NOP2 K7_NOP2
-#define ASM_NOP3 K7_NOP3
-#define ASM_NOP4 K7_NOP4
-#define ASM_NOP5 K7_NOP5
-#define ASM_NOP6 K7_NOP6
-#define ASM_NOP7 K7_NOP7
-#define ASM_NOP8 K7_NOP8
+#define ASM_NOP1 _ASM_MK_NOP(K7_NOP1)
+#define ASM_NOP2 _ASM_MK_NOP(K7_NOP2)
+#define ASM_NOP3 _ASM_MK_NOP(K7_NOP3)
+#define ASM_NOP4 _ASM_MK_NOP(K7_NOP4)
+#define ASM_NOP5 _ASM_MK_NOP(K7_NOP5)
+#define ASM_NOP6 _ASM_MK_NOP(K7_NOP6)
+#define ASM_NOP7 _ASM_MK_NOP(K7_NOP7)
+#define ASM_NOP8 _ASM_MK_NOP(K7_NOP8)
+#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K7_NOP5_ATOMIC)
#elif defined(CONFIG_X86_P6_NOP)
-#define ASM_NOP1 P6_NOP1
-#define ASM_NOP2 P6_NOP2
-#define ASM_NOP3 P6_NOP3
-#define ASM_NOP4 P6_NOP4
-#define ASM_NOP5 P6_NOP5
-#define ASM_NOP6 P6_NOP6
-#define ASM_NOP7 P6_NOP7
-#define ASM_NOP8 P6_NOP8
+#define ASM_NOP1 _ASM_MK_NOP(P6_NOP1)
+#define ASM_NOP2 _ASM_MK_NOP(P6_NOP2)
+#define ASM_NOP3 _ASM_MK_NOP(P6_NOP3)
+#define ASM_NOP4 _ASM_MK_NOP(P6_NOP4)
+#define ASM_NOP5 _ASM_MK_NOP(P6_NOP5)
+#define ASM_NOP6 _ASM_MK_NOP(P6_NOP6)
+#define ASM_NOP7 _ASM_MK_NOP(P6_NOP7)
+#define ASM_NOP8 _ASM_MK_NOP(P6_NOP8)
+#define ASM_NOP5_ATOMIC _ASM_MK_NOP(P6_NOP5_ATOMIC)
#elif defined(CONFIG_X86_64)
-#define ASM_NOP1 K8_NOP1
-#define ASM_NOP2 K8_NOP2
-#define ASM_NOP3 K8_NOP3
-#define ASM_NOP4 K8_NOP4
-#define ASM_NOP5 K8_NOP5
-#define ASM_NOP6 K8_NOP6
-#define ASM_NOP7 K8_NOP7
-#define ASM_NOP8 K8_NOP8
+#define ASM_NOP1 _ASM_MK_NOP(K8_NOP1)
+#define ASM_NOP2 _ASM_MK_NOP(K8_NOP2)
+#define ASM_NOP3 _ASM_MK_NOP(K8_NOP3)
+#define ASM_NOP4 _ASM_MK_NOP(K8_NOP4)
+#define ASM_NOP5 _ASM_MK_NOP(K8_NOP5)
+#define ASM_NOP6 _ASM_MK_NOP(K8_NOP6)
+#define ASM_NOP7 _ASM_MK_NOP(K8_NOP7)
+#define ASM_NOP8 _ASM_MK_NOP(K8_NOP8)
+#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K8_NOP5_ATOMIC)
#else
-#define ASM_NOP1 GENERIC_NOP1
-#define ASM_NOP2 GENERIC_NOP2
-#define ASM_NOP3 GENERIC_NOP3
-#define ASM_NOP4 GENERIC_NOP4
-#define ASM_NOP5 GENERIC_NOP5
-#define ASM_NOP6 GENERIC_NOP6
-#define ASM_NOP7 GENERIC_NOP7
-#define ASM_NOP8 GENERIC_NOP8
+#define ASM_NOP1 _ASM_MK_NOP(GENERIC_NOP1)
+#define ASM_NOP2 _ASM_MK_NOP(GENERIC_NOP2)
+#define ASM_NOP3 _ASM_MK_NOP(GENERIC_NOP3)
+#define ASM_NOP4 _ASM_MK_NOP(GENERIC_NOP4)
+#define ASM_NOP5 _ASM_MK_NOP(GENERIC_NOP5)
+#define ASM_NOP6 _ASM_MK_NOP(GENERIC_NOP6)
+#define ASM_NOP7 _ASM_MK_NOP(GENERIC_NOP7)
+#define ASM_NOP8 _ASM_MK_NOP(GENERIC_NOP8)
+#define ASM_NOP5_ATOMIC _ASM_MK_NOP(GENERIC_NOP5_ATOMIC)
#endif
#define ASM_NOP_MAX 8
+#define NOP_ATOMIC5 (ASM_NOP_MAX+1) /* Entry for the 5-byte atomic NOP */
+
+#ifndef __ASSEMBLY__
+extern const unsigned char * const *ideal_nops;
+extern void arch_init_ideal_nops(void);
+#endif
#endif /* _ASM_X86_NOPS_H */
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index a50fc9f493b..bfacd2ccf65 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -1,12 +1,24 @@
#ifndef _ASM_X86_NUMA_H
#define _ASM_X86_NUMA_H
+#include <linux/nodemask.h>
+
#include <asm/topology.h>
#include <asm/apicdef.h>
#ifdef CONFIG_NUMA
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
+
+/*
+ * Too small node sizes may confuse the VM badly. Usually they
+ * result from BIOS bugs. So dont recognize nodes as standalone
+ * NUMA entities that have less than this amount of RAM listed:
+ */
+#define NODE_MIN_SIZE (4*1024*1024)
+
+extern int numa_off;
/*
* __apicid_to_node[] stores the raw mapping between physical apicid and
@@ -17,15 +29,27 @@
* numa_cpu_node().
*/
extern s16 __apicid_to_node[MAX_LOCAL_APIC];
+extern nodemask_t numa_nodes_parsed __initdata;
+
+extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
+extern void __init numa_set_distance(int from, int to, int distance);
static inline void set_apicid_to_node(int apicid, s16 node)
{
__apicid_to_node[apicid] = node;
}
+
+extern int __cpuinit numa_cpu_node(int cpu);
+
#else /* CONFIG_NUMA */
static inline void set_apicid_to_node(int apicid, s16 node)
{
}
+
+static inline int numa_cpu_node(int cpu)
+{
+ return NUMA_NO_NODE;
+}
#endif /* CONFIG_NUMA */
#ifdef CONFIG_X86_32
@@ -37,14 +61,12 @@ static inline void set_apicid_to_node(int apicid, s16 node)
#ifdef CONFIG_NUMA
extern void __cpuinit numa_set_node(int cpu, int node);
extern void __cpuinit numa_clear_node(int cpu);
-extern void __init numa_init_array(void);
extern void __init init_cpu_to_node(void);
extern void __cpuinit numa_add_cpu(int cpu);
extern void __cpuinit numa_remove_cpu(int cpu);
#else /* CONFIG_NUMA */
static inline void numa_set_node(int cpu, int node) { }
static inline void numa_clear_node(int cpu) { }
-static inline void numa_init_array(void) { }
static inline void init_cpu_to_node(void) { }
static inline void numa_add_cpu(int cpu) { }
static inline void numa_remove_cpu(int cpu) { }
@@ -54,4 +76,10 @@ static inline void numa_remove_cpu(int cpu) { }
void debug_cpumask_set_cpu(int cpu, int node, bool enable);
#endif
+#ifdef CONFIG_NUMA_EMU
+#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
+#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
+void numa_emu_cmdline(char *);
+#endif /* CONFIG_NUMA_EMU */
+
#endif /* _ASM_X86_NUMA_H */
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index c6beed1ef10..e7d6b825474 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -1,16 +1,6 @@
#ifndef _ASM_X86_NUMA_32_H
#define _ASM_X86_NUMA_32_H
-extern int numa_off;
-
-extern int pxm_to_nid(int pxm);
-
-#ifdef CONFIG_NUMA
-extern int __cpuinit numa_cpu_node(int cpu);
-#else /* CONFIG_NUMA */
-static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
-#endif /* CONFIG_NUMA */
-
#ifdef CONFIG_HIGHMEM
extern void set_highmem_pages_init(void);
#else
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 344eb1790b4..0c05f7ae46e 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -1,42 +1,6 @@
#ifndef _ASM_X86_NUMA_64_H
#define _ASM_X86_NUMA_64_H
-#include <linux/nodemask.h>
-
-struct bootnode {
- u64 start;
- u64 end;
-};
-
-#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
-
-extern int numa_off;
-
extern unsigned long numa_free_all_bootmem(void);
-extern void setup_node_bootmem(int nodeid, unsigned long start,
- unsigned long end);
-
-#ifdef CONFIG_NUMA
-/*
- * Too small node sizes may confuse the VM badly. Usually they
- * result from BIOS bugs. So dont recognize nodes as standalone
- * NUMA entities that have less than this amount of RAM listed:
- */
-#define NODE_MIN_SIZE (4*1024*1024)
-
-extern nodemask_t numa_nodes_parsed __initdata;
-
-extern int __cpuinit numa_cpu_node(int cpu);
-extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
-extern void __init numa_set_distance(int from, int to, int distance);
-
-#ifdef CONFIG_NUMA_EMU
-#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
-#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
-void numa_emu_cmdline(char *);
-#endif /* CONFIG_NUMA_EMU */
-#else
-static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
-#endif
#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
index 37c516545ec..c3b3c322fd8 100644
--- a/arch/x86/include/asm/numaq.h
+++ b/arch/x86/include/asm/numaq.h
@@ -29,7 +29,7 @@
#ifdef CONFIG_X86_NUMAQ
extern int found_numaq;
-extern int get_memcfg_numaq(void);
+extern int numaq_numa_init(void);
extern int pci_numaq_init(void);
extern void *xquad_portio;
@@ -166,11 +166,6 @@ struct sys_cfg_data {
void numaq_tsc_disable(void);
-#else
-static inline int get_memcfg_numaq(void)
-{
- return 0;
-}
#endif /* CONFIG_X86_NUMAQ */
#endif /* _ASM_X86_NUMAQ_H */
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index c5d3a5abbb9..24487712e0b 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -26,15 +26,12 @@ extern void setup_olpc_ofw_pgd(void);
/* check if OFW was detected during boot */
extern bool olpc_ofw_present(void);
+extern void olpc_dt_build_devicetree(void);
+
#else /* !CONFIG_OLPC */
static inline void olpc_ofw_detect(void) { }
static inline void setup_olpc_ofw_pgd(void) { }
-#endif /* !CONFIG_OLPC */
-
-#ifdef CONFIG_OF_PROMTREE
-extern void olpc_dt_build_devicetree(void);
-#else
static inline void olpc_dt_build_devicetree(void) { }
-#endif
+#endif /* !CONFIG_OLPC */
#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 67612922963..d498943b906 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -135,8 +135,6 @@ void default_teardown_msi_irqs(struct pci_dev *dev);
#include "pci_64.h"
#endif
-void dma32_reserve_bootmem(void);
-
/* implement the pci_ DMA API in terms of the generic device dma_ one */
#include <asm-generic/pci-dma-compat.h>
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index d475b4398d8..3470c9d0ebb 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -388,12 +388,9 @@ do { \
#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
-/*
- * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much
- * faster than an xchg with forced lock semantics.
- */
-#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
-#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val)
+#define __this_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val)
+#define __this_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val)
#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
@@ -485,6 +482,8 @@ do { \
#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
+#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
@@ -509,6 +508,11 @@ do { \
* it in software. The address used in the cmpxchg16 instruction must be
* aligned to a 16 byte boundary.
*/
+#ifdef CONFIG_SMP
+#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3
+#else
+#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2
+#endif
#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \
({ \
char __ret; \
@@ -517,7 +521,7 @@ do { \
typeof(o2) __o2 = o2; \
typeof(o2) __n2 = n2; \
typeof(o2) __dummy; \
- alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \
+ alternative_io(CMPXCHG16B_EMU_CALL, \
"cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \
X86_FEATURE_CX16, \
ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \
@@ -542,6 +546,33 @@ do { \
old__; \
})
+static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
+ const unsigned long __percpu *addr)
+{
+ unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
+
+ return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0;
+}
+
+static inline int x86_this_cpu_variable_test_bit(int nr,
+ const unsigned long __percpu *addr)
+{
+ int oldbit;
+
+ asm volatile("bt "__percpu_arg(2)",%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit)
+ : "m" (*(unsigned long *)addr), "Ir" (nr));
+
+ return oldbit;
+}
+
+#define x86_this_cpu_test_bit(nr, addr) \
+ (__builtin_constant_p((nr)) \
+ ? x86_this_cpu_constant_test_bit((nr), (addr)) \
+ : x86_this_cpu_variable_test_bit((nr), (addr)))
+
+
#include <asm-generic/percpu.h>
/* We can use this directly for local CPU (faster). */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index d9d4dae305f..094fb30817a 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -152,6 +152,11 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
(regs)->bp = caller_frame_pointer(); \
(regs)->cs = __KERNEL_CS; \
regs->flags = 0; \
+ asm volatile( \
+ _ASM_MOV "%%"_ASM_SP ", %0\n" \
+ : "=m" ((regs)->sp) \
+ :: "memory" \
+ ); \
}
#else
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 56fd9e3abbd..4f7e67e2345 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -102,6 +102,14 @@
#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
/*
+ * If an event has alias it should be marked
+ * with a special bit. (Don't forget to check
+ * P4_PEBS_CONFIG_MASK and related bits on
+ * modification.)
+ */
+#define P4_CONFIG_ALIASABLE (1 << 9)
+
+/*
* The bits we allow to pass for RAW events
*/
#define P4_CONFIG_MASK_ESCR \
@@ -123,6 +131,31 @@
(p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \
(p4_config_pack_cccr(P4_CONFIG_MASK_CCCR))
+/*
+ * In case of event aliasing we need to preserve some
+ * caller bits, otherwise the mapping won't be complete.
+ */
+#define P4_CONFIG_EVENT_ALIAS_MASK \
+ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \
+ p4_config_pack_cccr(P4_CCCR_EDGE | \
+ P4_CCCR_THRESHOLD_MASK | \
+ P4_CCCR_COMPLEMENT | \
+ P4_CCCR_COMPARE))
+
+#define P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS \
+ ((P4_CONFIG_HT) | \
+ p4_config_pack_escr(P4_ESCR_T0_OS | \
+ P4_ESCR_T0_USR | \
+ P4_ESCR_T1_OS | \
+ P4_ESCR_T1_USR) | \
+ p4_config_pack_cccr(P4_CCCR_OVF | \
+ P4_CCCR_CASCADE | \
+ P4_CCCR_FORCE_OVF | \
+ P4_CCCR_THREAD_ANY | \
+ P4_CCCR_OVF_PMI_T0 | \
+ P4_CCCR_OVF_PMI_T1 | \
+ P4_CONFIG_ALIASABLE))
+
static inline bool p4_is_event_cascaded(u64 config)
{
u32 cccr = p4_config_unpack_cccr(config);
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d56187c6b83..013286a10c2 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -107,7 +107,8 @@
#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
-#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
+#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT)
#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
@@ -129,7 +130,8 @@
#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR)
+#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE)
#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h
new file mode 100644
index 00000000000..4950a0b1d09
--- /dev/null
+++ b/arch/x86/include/asm/probe_roms.h
@@ -0,0 +1,8 @@
+#ifndef _PROBE_ROMS_H_
+#define _PROBE_ROMS_H_
+struct pci_dev;
+
+extern void __iomem *pci_map_biosrom(struct pci_dev *pdev);
+extern void pci_unmap_biosrom(void __iomem *rom);
+extern size_t pci_biosrom_size(struct pci_dev *pdev);
+#endif
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index a898a2b6e10..59ab4dffa37 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -60,6 +60,7 @@
#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
+#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */
/*
* x86-64 Task Priority Register, CR8
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4c25ab48257..219371546af 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -754,10 +754,10 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
extern void select_idle_routine(const struct cpuinfo_x86 *c);
-extern void init_c1e_mask(void);
+extern void init_amd_e400_c1e_mask(void);
extern unsigned long boot_option_idle_override;
-extern bool c1e_detected;
+extern bool amd_e400_c1e_detected;
enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
IDLE_POLL, IDLE_FORCE_MWAIT};
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 971e0b46446..df1287019e6 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -30,17 +30,6 @@ extern void add_dtb(u64 data);
extern void x86_add_irq_domains(void);
void __cpuinit x86_of_pci_init(void);
void x86_dtb_init(void);
-
-static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
-{
- return pdev ? pdev->dev.of_node : NULL;
-}
-
-static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
-{
- return pci_device_to_OF_node(bus->self);
-}
-
#else
static inline void add_dtb(u64 data) { }
static inline void x86_add_irq_domains(void) { }
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 1babf8adecd..94e7618fcac 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -136,6 +136,7 @@ struct cpuinfo_x86;
struct task_struct;
extern unsigned long profile_pc(struct pt_regs *regs);
+#define profile_pc profile_pc
extern unsigned long
convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
@@ -202,20 +203,11 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
#endif
}
-static inline unsigned long instruction_pointer(struct pt_regs *regs)
-{
- return regs->ip;
-}
-
-static inline unsigned long frame_pointer(struct pt_regs *regs)
-{
- return regs->bp;
-}
+#define GET_IP(regs) ((regs)->ip)
+#define GET_FP(regs) ((regs)->bp)
+#define GET_USP(regs) ((regs)->sp)
-static inline unsigned long user_stack_pointer(struct pt_regs *regs)
-{
- return regs->sp;
-}
+#include <asm-generic/ptrace.h>
/* Query offset/name of register from its name/offset */
extern int regs_query_register_offset(const char *name);
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 31d84acc151..a518c0a4504 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -22,6 +22,8 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
u64 product;
#ifdef __i386__
u32 tmp1, tmp2;
+#else
+ ulong tmp;
#endif
if (shift < 0)
@@ -42,8 +44,11 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif defined(__x86_64__)
__asm__ (
- "mul %%rdx ; shrd $32,%%rdx,%%rax"
- : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+ "mul %[mul_frac] ; shrd $32, %[hi], %[lo]"
+ : [lo]"=a"(product),
+ [hi]"=d"(tmp)
+ : "0"(delta),
+ [mul_frac]"rm"((u64)mul_frac));
#else
#error implement me!
#endif
diff --git a/arch/x86/include/asm/rwlock.h b/arch/x86/include/asm/rwlock.h
index 6a8c0d64510..a5370a03d90 100644
--- a/arch/x86/include/asm/rwlock.h
+++ b/arch/x86/include/asm/rwlock.h
@@ -1,7 +1,48 @@
#ifndef _ASM_X86_RWLOCK_H
#define _ASM_X86_RWLOCK_H
-#define RW_LOCK_BIAS 0x01000000
+#include <asm/asm.h>
+
+#if CONFIG_NR_CPUS <= 2048
+
+#ifndef __ASSEMBLY__
+typedef union {
+ s32 lock;
+ s32 write;
+} arch_rwlock_t;
+#endif
+
+#define RW_LOCK_BIAS 0x00100000
+#define READ_LOCK_SIZE(insn) __ASM_FORM(insn##l)
+#define READ_LOCK_ATOMIC(n) atomic_##n
+#define WRITE_LOCK_ADD(n) __ASM_FORM_COMMA(addl n)
+#define WRITE_LOCK_SUB(n) __ASM_FORM_COMMA(subl n)
+#define WRITE_LOCK_CMP RW_LOCK_BIAS
+
+#else /* CONFIG_NR_CPUS > 2048 */
+
+#include <linux/const.h>
+
+#ifndef __ASSEMBLY__
+typedef union {
+ s64 lock;
+ struct {
+ u32 read;
+ s32 write;
+ };
+} arch_rwlock_t;
+#endif
+
+#define RW_LOCK_BIAS (_AC(1,L) << 32)
+#define READ_LOCK_SIZE(insn) __ASM_FORM(insn##q)
+#define READ_LOCK_ATOMIC(n) atomic64_##n
+#define WRITE_LOCK_ADD(n) __ASM_FORM(incl)
+#define WRITE_LOCK_SUB(n) __ASM_FORM(decl)
+#define WRITE_LOCK_CMP 1
+
+#endif /* CONFIG_NR_CPUS */
+
+#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index cd84f7208f7..5e641715c3f 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -162,7 +162,7 @@
#define GDT_ENTRY_DEFAULT_USER32_CS 4
#define GDT_ENTRY_DEFAULT_USER_DS 5
#define GDT_ENTRY_DEFAULT_USER_CS 6
-#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
+#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3)
#define __USER32_DS __USER_DS
#define GDT_ENTRY_TSS 8 /* needs two entries */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index db8aa19a08a..9756551ec76 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -88,7 +88,7 @@ void *extend_brk(size_t size, size_t align);
* executable.)
*/
#define RESERVE_BRK(name,sz) \
- static void __section(.discard.text) __used \
+ static void __section(.discard.text) __used notrace \
__brk_reservation_fn_##name##__(void) { \
asm volatile ( \
".pushsection .brk_reservation,\"aw\",@nobits;" \
@@ -104,10 +104,10 @@ void *extend_brk(size_t size, size_t align);
type *name; \
RESERVE_BRK(name, sizeof(type) * entries)
+extern void probe_roms(void);
#ifdef __i386__
void __init i386_start_kernel(void);
-extern void probe_roms(void);
#else
void __init x86_64_start_kernel(char *real_mode);
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 725b7783199..49adfd7bb4a 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -10,7 +10,11 @@ static inline void smpboot_clear_io_apic_irqs(void)
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rtc_lock, flags);
CMOS_WRITE(0xa, 0xf);
+ spin_unlock_irqrestore(&rtc_lock, flags);
local_flush_tlb();
pr_debug("1.\n");
*((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) =
@@ -23,6 +27,8 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
static inline void smpboot_restore_warm_reset_vector(void)
{
+ unsigned long flags;
+
/*
* Install writable page 0 entry to set BIOS data area.
*/
@@ -32,7 +38,9 @@ static inline void smpboot_restore_warm_reset_vector(void)
* Paranoid: Set warm reset code and vector here back
* to default values.
*/
+ spin_lock_irqsave(&rtc_lock, flags);
CMOS_WRITE(0, 0xf);
+ spin_unlock_irqrestore(&rtc_lock, flags);
*((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0;
}
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 3089f70c0c5..e9e51f710e6 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -2,7 +2,6 @@
#define _ASM_X86_SPINLOCK_H
#include <asm/atomic.h>
-#include <asm/rwlock.h>
#include <asm/page.h>
#include <asm/processor.h>
#include <linux/compiler.h>
@@ -234,7 +233,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
*/
static inline int arch_read_can_lock(arch_rwlock_t *lock)
{
- return (int)(lock)->lock > 0;
+ return lock->lock > 0;
}
/**
@@ -243,12 +242,12 @@ static inline int arch_read_can_lock(arch_rwlock_t *lock)
*/
static inline int arch_write_can_lock(arch_rwlock_t *lock)
{
- return (lock)->lock == RW_LOCK_BIAS;
+ return lock->write == WRITE_LOCK_CMP;
}
static inline void arch_read_lock(arch_rwlock_t *rw)
{
- asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
+ asm volatile(LOCK_PREFIX READ_LOCK_SIZE(dec) " (%0)\n\t"
"jns 1f\n"
"call __read_lock_failed\n\t"
"1:\n"
@@ -257,47 +256,55 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
static inline void arch_write_lock(arch_rwlock_t *rw)
{
- asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
+ asm volatile(LOCK_PREFIX WRITE_LOCK_SUB(%1) "(%0)\n\t"
"jz 1f\n"
"call __write_lock_failed\n\t"
"1:\n"
- ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
+ ::LOCK_PTR_REG (&rw->write), "i" (RW_LOCK_BIAS)
+ : "memory");
}
static inline int arch_read_trylock(arch_rwlock_t *lock)
{
- atomic_t *count = (atomic_t *)lock;
+ READ_LOCK_ATOMIC(t) *count = (READ_LOCK_ATOMIC(t) *)lock;
- if (atomic_dec_return(count) >= 0)
+ if (READ_LOCK_ATOMIC(dec_return)(count) >= 0)
return 1;
- atomic_inc(count);
+ READ_LOCK_ATOMIC(inc)(count);
return 0;
}
static inline int arch_write_trylock(arch_rwlock_t *lock)
{
- atomic_t *count = (atomic_t *)lock;
+ atomic_t *count = (atomic_t *)&lock->write;
- if (atomic_sub_and_test(RW_LOCK_BIAS, count))
+ if (atomic_sub_and_test(WRITE_LOCK_CMP, count))
return 1;
- atomic_add(RW_LOCK_BIAS, count);
+ atomic_add(WRITE_LOCK_CMP, count);
return 0;
}
static inline void arch_read_unlock(arch_rwlock_t *rw)
{
- asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
+ asm volatile(LOCK_PREFIX READ_LOCK_SIZE(inc) " %0"
+ :"+m" (rw->lock) : : "memory");
}
static inline void arch_write_unlock(arch_rwlock_t *rw)
{
- asm volatile(LOCK_PREFIX "addl %1, %0"
- : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
+ asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0"
+ : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory");
}
#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
+#undef READ_LOCK_SIZE
+#undef READ_LOCK_ATOMIC
+#undef WRITE_LOCK_ADD
+#undef WRITE_LOCK_SUB
+#undef WRITE_LOCK_CMP
+
#define arch_spin_relax(lock) cpu_relax()
#define arch_read_relax(lock) cpu_relax()
#define arch_write_relax(lock) cpu_relax()
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index dcb48b2edc1..7c7a486fcb6 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -11,10 +11,6 @@ typedef struct arch_spinlock {
#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
-typedef struct {
- unsigned int lock;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
+#include <asm/rwlock.h>
#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h
deleted file mode 100644
index b508d639d1a..00000000000
--- a/arch/x86/include/asm/srat.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Some of the code in this file has been gleaned from the 64 bit
- * discontigmem support code base.
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to Pat Gaughen <gone@us.ibm.com>
- */
-
-#ifndef _ASM_X86_SRAT_H
-#define _ASM_X86_SRAT_H
-
-#ifdef CONFIG_ACPI_NUMA
-extern int get_memcfg_from_srat(void);
-#else
-static inline int get_memcfg_from_srat(void)
-{
- return 0;
-}
-#endif
-
-#endif /* _ASM_X86_SRAT_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index d7e89c83645..70bbe39043a 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -37,9 +37,6 @@ print_context_stack_bp(struct thread_info *tinfo,
/* Generic stack tracer with callbacks */
struct stacktrace_ops {
- void (*warning)(void *data, char *msg);
- /* msg must contain %s for the symbol */
- void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
void (*address)(void *data, unsigned long address, int reliable);
/* On negative return stop dumping */
int (*stack)(void *data, char *name);
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index fd921c3a684..487055c8c1a 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -9,8 +9,6 @@
#include <asm/desc.h>
#include <asm/i387.h>
-static inline int arch_prepare_suspend(void) { return 0; }
-
/* image of the saved processor state */
struct saved_context {
u16 es, fs, gs, ss;
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 8d942afae68..09b0bf10415 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -9,11 +9,6 @@
#include <asm/desc.h>
#include <asm/i387.h>
-static inline int arch_prepare_suspend(void)
-{
- return 0;
-}
-
/*
* Image of the saved processor state, used by the low level ACPI suspend to
* RAM code and by the low level hibernation code.
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 12569e691ce..c2ff2a1d845 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -303,24 +303,81 @@ static inline void native_wbinvd(void)
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
-#define read_cr0() (native_read_cr0())
-#define write_cr0(x) (native_write_cr0(x))
-#define read_cr2() (native_read_cr2())
-#define write_cr2(x) (native_write_cr2(x))
-#define read_cr3() (native_read_cr3())
-#define write_cr3(x) (native_write_cr3(x))
-#define read_cr4() (native_read_cr4())
-#define read_cr4_safe() (native_read_cr4_safe())
-#define write_cr4(x) (native_write_cr4(x))
-#define wbinvd() (native_wbinvd())
+
+static inline unsigned long read_cr0(void)
+{
+ return native_read_cr0();
+}
+
+static inline void write_cr0(unsigned long x)
+{
+ native_write_cr0(x);
+}
+
+static inline unsigned long read_cr2(void)
+{
+ return native_read_cr2();
+}
+
+static inline void write_cr2(unsigned long x)
+{
+ native_write_cr2(x);
+}
+
+static inline unsigned long read_cr3(void)
+{
+ return native_read_cr3();
+}
+
+static inline void write_cr3(unsigned long x)
+{
+ native_write_cr3(x);
+}
+
+static inline unsigned long read_cr4(void)
+{
+ return native_read_cr4();
+}
+
+static inline unsigned long read_cr4_safe(void)
+{
+ return native_read_cr4_safe();
+}
+
+static inline void write_cr4(unsigned long x)
+{
+ native_write_cr4(x);
+}
+
+static inline void wbinvd(void)
+{
+ native_wbinvd();
+}
+
#ifdef CONFIG_X86_64
-#define read_cr8() (native_read_cr8())
-#define write_cr8(x) (native_write_cr8(x))
-#define load_gs_index native_load_gs_index
+
+static inline unsigned long read_cr8(void)
+{
+ return native_read_cr8();
+}
+
+static inline void write_cr8(unsigned long x)
+{
+ native_write_cr8(x);
+}
+
+static inline void load_gs_index(unsigned selector)
+{
+ native_load_gs_index(selector);
+}
+
#endif
/* Clear the 'TS' bit */
-#define clts() (native_clts())
+static inline void clts(void)
+{
+ native_clts();
+}
#endif/* CONFIG_PARAVIRT */
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index 7bdec4e9b73..92b8aec0697 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -1,10 +1,12 @@
#ifndef _ASM_X86_TIME_H
#define _ASM_X86_TIME_H
-extern void hpet_time_init(void);
-
+#include <linux/clocksource.h>
#include <asm/mc146818rtc.h>
+extern void hpet_time_init(void);
extern void time_init(void);
+extern struct clock_event_device *global_clock_event;
+
#endif /* _ASM_X86_TIME_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 910a7084f7f..c00692476e9 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -93,19 +93,11 @@ extern void setup_node_to_cpumask_map(void);
#define pcibus_to_node(bus) __pcibus_to_node(bus)
#ifdef CONFIG_X86_32
-extern unsigned long node_start_pfn[];
-extern unsigned long node_end_pfn[];
-extern unsigned long node_remap_size[];
-#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
-
# define SD_CACHE_NICE_TRIES 1
# define SD_IDLE_IDX 1
-
#else
-
# define SD_CACHE_NICE_TRIES 2
# define SD_IDLE_IDX 2
-
#endif
/* sched_domains SD_NODE_INIT for NUMA machines */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0310da67307..2bae0a513b4 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_TRAPS_H
#define _ASM_X86_TRAPS_H
+#include <linux/kprobes.h>
+
#include <asm/debugreg.h>
#include <asm/siginfo.h> /* TRAP_TRACE, ... */
@@ -38,6 +40,7 @@ asmlinkage void alignment_check(void);
asmlinkage void machine_check(void);
#endif /* CONFIG_X86_MCE */
asmlinkage void simd_coprocessor_error(void);
+asmlinkage void emulate_vsyscall(void);
dotraplinkage void do_divide_error(struct pt_regs *, long);
dotraplinkage void do_debug(struct pt_regs *, long);
@@ -64,6 +67,7 @@ dotraplinkage void do_alignment_check(struct pt_regs *, long);
dotraplinkage void do_machine_check(struct pt_regs *, long);
#endif
dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
+dotraplinkage void do_emulate_vsyscall(struct pt_regs *, long);
#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *, long);
#endif
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index abd3e0ea762..36361bf6fdd 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -6,7 +6,6 @@
#include <linux/errno.h>
#include <linux/compiler.h>
#include <linux/thread_info.h>
-#include <linux/prefetch.h>
#include <linux/string.h>
#include <asm/asm.h>
#include <asm/page.h>
@@ -42,7 +41,7 @@
* Returns 0 if the range is valid, nonzero otherwise.
*
* This is equivalent to the following test:
- * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64)
+ * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
*
* This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
*/
@@ -556,6 +555,9 @@ struct __large_struct { unsigned long buf[100]; };
#endif /* CONFIG_X86_WP_WORKS_OK */
+extern unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
+
/*
* movsl can be slow when source and dest are not both 8-byte aligned
*/
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 088d09fb161..566e803cc60 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -6,7 +6,6 @@
*/
#include <linux/errno.h>
#include <linux/thread_info.h>
-#include <linux/prefetch.h>
#include <linux/string.h>
#include <asm/asm.h>
#include <asm/page.h>
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 316708d5af9..1c66d30971a 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -6,7 +6,6 @@
*/
#include <linux/compiler.h>
#include <linux/errno.h>
-#include <linux/prefetch.h>
#include <linux/lockdep.h>
#include <asm/alternative.h>
#include <asm/cpufeature.h>
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index a755ef5e597..593485b38ab 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -350,10 +350,12 @@
#define __NR_open_by_handle_at 342
#define __NR_clock_adjtime 343
#define __NR_syncfs 344
+#define __NR_sendmmsg 345
+#define __NR_setns 346
#ifdef __KERNEL__
-#define NR_syscalls 345
+#define NR_syscalls 347
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 160fa76bd57..705bf139288 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -677,6 +677,10 @@ __SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
#define __NR_syncfs 306
__SYSCALL(__NR_syncfs, sys_syncfs)
+#define __NR_sendmmsg 307
+__SYSCALL(__NR_sendmmsg, sys_sendmmsg)
+#define __NR_setns 308
+__SYSCALL(__NR_setns, sys_setns)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 130f1eeee5f..37d369859c8 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -5,7 +5,7 @@
*
* SGI UV Broadcast Assist Unit definitions
*
- * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2008-2011 Silicon Graphics, Inc. All rights reserved.
*/
#ifndef _ASM_X86_UV_UV_BAU_H
@@ -35,17 +35,20 @@
#define MAX_CPUS_PER_UVHUB 64
#define MAX_CPUS_PER_SOCKET 32
-#define UV_ADP_SIZE 64 /* hardware-provided max. */
-#define UV_CPUS_PER_ACT_STATUS 32 /* hardware-provided max. */
-#define UV_ITEMS_PER_DESCRIPTOR 8
+#define ADP_SZ 64 /* hardware-provided max. */
+#define UV_CPUS_PER_AS 32 /* hardware-provided max. */
+#define ITEMS_PER_DESC 8
/* the 'throttle' to prevent the hardware stay-busy bug */
#define MAX_BAU_CONCURRENT 3
#define UV_ACT_STATUS_MASK 0x3
#define UV_ACT_STATUS_SIZE 2
#define UV_DISTRIBUTION_SIZE 256
#define UV_SW_ACK_NPENDING 8
-#define UV_NET_ENDPOINT_INTD 0x38
-#define UV_DESC_BASE_PNODE_SHIFT 49
+#define UV1_NET_ENDPOINT_INTD 0x38
+#define UV2_NET_ENDPOINT_INTD 0x28
+#define UV_NET_ENDPOINT_INTD (is_uv1_hub() ? \
+ UV1_NET_ENDPOINT_INTD : UV2_NET_ENDPOINT_INTD)
+#define UV_DESC_PSHIFT 49
#define UV_PAYLOADQ_PNODE_SHIFT 49
#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
#define UV_BAU_BASENAME "sgi_uv/bau_tunables"
@@ -53,29 +56,72 @@
#define UV_BAU_TUNABLES_FILE "bau_tunables"
#define WHITESPACE " \t\n"
#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
-#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL
+#define cpubit_isset(cpu, bau_local_cpumask) \
+ test_bit((cpu), (bau_local_cpumask).bits)
+
/* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */
-#define BAU_MISC_CONTROL_MULT_MASK 3
+/*
+ * UV2: Bit 19 selects between
+ * (0): 10 microsecond timebase and
+ * (1): 80 microseconds
+ * we're using 655us, similar to UV1: 65 units of 10us
+ */
+#define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL)
+#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL)
+
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \
+ UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \
+ UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD)
-#define UVH_AGING_PRESCALE_SEL 0x000000b000UL
+#define BAU_MISC_CONTROL_MULT_MASK 3
+
+#define UVH_AGING_PRESCALE_SEL 0x000000b000UL
/* [30:28] URGENCY_7 an index into a table of times */
-#define BAU_URGENCY_7_SHIFT 28
-#define BAU_URGENCY_7_MASK 7
+#define BAU_URGENCY_7_SHIFT 28
+#define BAU_URGENCY_7_MASK 7
-#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL
+#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL
/* [45:40] BAU - BAU transaction timeout select - a multiplier */
-#define BAU_TRANS_SHIFT 40
-#define BAU_TRANS_MASK 0x3f
+#define BAU_TRANS_SHIFT 40
+#define BAU_TRANS_MASK 0x3f
+
+/*
+ * shorten some awkward names
+ */
+#define AS_PUSH_SHIFT UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT
+#define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT
+#define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
+#define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD
+#define write_gmmr uv_write_global_mmr64
+#define write_lmmr uv_write_local_mmr
+#define read_lmmr uv_read_local_mmr
+#define read_gmmr uv_read_global_mmr64
/*
* bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
*/
-#define DESC_STATUS_IDLE 0
-#define DESC_STATUS_ACTIVE 1
-#define DESC_STATUS_DESTINATION_TIMEOUT 2
-#define DESC_STATUS_SOURCE_TIMEOUT 3
+#define DS_IDLE 0
+#define DS_ACTIVE 1
+#define DS_DESTINATION_TIMEOUT 2
+#define DS_SOURCE_TIMEOUT 3
+/*
+ * bits put together from HRP_LB_BAU_SB_ACTIVATION_STATUS_0/1/2
+ * values 1 and 3 will not occur
+ * Decoded meaning ERROR BUSY AUX ERR
+ * ------------------------------- ---- ----- -------
+ * IDLE 0 0 0
+ * BUSY (active) 0 1 0
+ * SW Ack Timeout (destination) 1 0 0
+ * SW Ack INTD rejected (strong NACK) 1 0 1
+ * Source Side Time Out Detected 1 1 0
+ * Destination Side PUT Failed 1 1 1
+ */
+#define UV2H_DESC_IDLE 0
+#define UV2H_DESC_BUSY 2
+#define UV2H_DESC_DEST_TIMEOUT 4
+#define UV2H_DESC_DEST_STRONG_NACK 5
+#define UV2H_DESC_SOURCE_TIMEOUT 6
+#define UV2H_DESC_DEST_PUT_ERR 7
/*
* delay for 'plugged' timeout retries, in microseconds
@@ -86,15 +132,24 @@
* threshholds at which to use IPI to free resources
*/
/* after this # consecutive 'plugged' timeouts, use IPI to release resources */
-#define PLUGSB4RESET 100
+#define PLUGSB4RESET 100
/* after this many consecutive timeouts, use IPI to release resources */
-#define TIMEOUTSB4RESET 1
+#define TIMEOUTSB4RESET 1
/* at this number uses of IPI to release resources, giveup the request */
-#define IPI_RESET_LIMIT 1
+#define IPI_RESET_LIMIT 1
/* after this # consecutive successes, bump up the throttle if it was lowered */
-#define COMPLETE_THRESHOLD 5
+#define COMPLETE_THRESHOLD 5
+
+#define UV_LB_SUBNODEID 0x10
-#define UV_LB_SUBNODEID 0x10
+/* these two are the same for UV1 and UV2: */
+#define UV_SA_SHFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
+#define UV_SA_MASK UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK
+/* 4 bits of software ack period */
+#define UV2_ACK_MASK 0x7UL
+#define UV2_ACK_UNITS_SHFT 3
+#define UV2_LEG_SHFT UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT
+#define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT
/*
* number of entries in the destination side payload queue
@@ -115,9 +170,16 @@
/*
* tuning the action when the numalink network is extremely delayed
*/
-#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */
-#define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */
-#define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */
+#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in
+ microseconds */
+#define CONGESTED_REPS 10 /* long delays averaged over
+ this many broadcasts */
+#define CONGESTED_PERIOD 30 /* time for the bau to be
+ disabled, in seconds */
+/* see msg_type: */
+#define MSG_NOOP 0
+#define MSG_REGULAR 1
+#define MSG_RETRY 2
/*
* Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
@@ -129,8 +191,8 @@
* 'base_dest_nasid' field of the header corresponds to the
* destination nodeID associated with that specified bit.
*/
-struct bau_target_uvhubmask {
- unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
+struct pnmask {
+ unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
};
/*
@@ -139,7 +201,7 @@ struct bau_target_uvhubmask {
* enough bits for max. cpu's per uvhub)
*/
struct bau_local_cpumask {
- unsigned long bits;
+ unsigned long bits;
};
/*
@@ -160,14 +222,14 @@ struct bau_local_cpumask {
* The payload is software-defined for INTD transactions
*/
struct bau_msg_payload {
- unsigned long address; /* signifies a page or all TLB's
- of the cpu */
+ unsigned long address; /* signifies a page or all
+ TLB's of the cpu */
/* 64 bits */
- unsigned short sending_cpu; /* filled in by sender */
+ unsigned short sending_cpu; /* filled in by sender */
/* 16 bits */
- unsigned short acknowledge_count;/* filled in by destination */
+ unsigned short acknowledge_count; /* filled in by destination */
/* 16 bits */
- unsigned int reserved1:32; /* not usable */
+ unsigned int reserved1:32; /* not usable */
};
@@ -176,93 +238,96 @@ struct bau_msg_payload {
* see table 4.2.3.0.1 in broacast_assist spec.
*/
struct bau_msg_header {
- unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
+ unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
/* bits 5:0 */
- unsigned int base_dest_nasid:15; /* nasid of the */
- /* bits 20:6 */ /* first bit in uvhub map */
- unsigned int command:8; /* message type */
+ unsigned int base_dest_nasid:15; /* nasid of the first bit */
+ /* bits 20:6 */ /* in uvhub map */
+ unsigned int command:8; /* message type */
/* bits 28:21 */
- /* 0x38: SN3net EndPoint Message */
- unsigned int rsvd_1:3; /* must be zero */
+ /* 0x38: SN3net EndPoint Message */
+ unsigned int rsvd_1:3; /* must be zero */
/* bits 31:29 */
- /* int will align on 32 bits */
- unsigned int rsvd_2:9; /* must be zero */
+ /* int will align on 32 bits */
+ unsigned int rsvd_2:9; /* must be zero */
/* bits 40:32 */
- /* Suppl_A is 56-41 */
- unsigned int sequence:16;/* message sequence number */
- /* bits 56:41 */ /* becomes bytes 16-17 of msg */
- /* Address field (96:57) is never used as an
- address (these are address bits 42:3) */
-
- unsigned int rsvd_3:1; /* must be zero */
+ /* Suppl_A is 56-41 */
+ unsigned int sequence:16; /* message sequence number */
+ /* bits 56:41 */ /* becomes bytes 16-17 of msg */
+ /* Address field (96:57) is
+ never used as an address
+ (these are address bits
+ 42:3) */
+
+ unsigned int rsvd_3:1; /* must be zero */
/* bit 57 */
- /* address bits 27:4 are payload */
+ /* address bits 27:4 are payload */
/* these next 24 (58-81) bits become bytes 12-14 of msg */
-
/* bits 65:58 land in byte 12 */
- unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */
+ unsigned int replied_to:1; /* sent as 0 by the source to
+ byte 12 */
/* bit 58 */
- unsigned int msg_type:3; /* software type of the message*/
+ unsigned int msg_type:3; /* software type of the
+ message */
/* bits 61:59 */
- unsigned int canceled:1; /* message canceled, resource to be freed*/
+ unsigned int canceled:1; /* message canceled, resource
+ is to be freed*/
/* bit 62 */
- unsigned int payload_1a:1;/* not currently used */
+ unsigned int payload_1a:1; /* not currently used */
/* bit 63 */
- unsigned int payload_1b:2;/* not currently used */
+ unsigned int payload_1b:2; /* not currently used */
/* bits 65:64 */
/* bits 73:66 land in byte 13 */
- unsigned int payload_1ca:6;/* not currently used */
+ unsigned int payload_1ca:6; /* not currently used */
/* bits 71:66 */
- unsigned int payload_1c:2;/* not currently used */
+ unsigned int payload_1c:2; /* not currently used */
/* bits 73:72 */
/* bits 81:74 land in byte 14 */
- unsigned int payload_1d:6;/* not currently used */
+ unsigned int payload_1d:6; /* not currently used */
/* bits 79:74 */
- unsigned int payload_1e:2;/* not currently used */
+ unsigned int payload_1e:2; /* not currently used */
/* bits 81:80 */
- unsigned int rsvd_4:7; /* must be zero */
+ unsigned int rsvd_4:7; /* must be zero */
/* bits 88:82 */
- unsigned int sw_ack_flag:1;/* software acknowledge flag */
+ unsigned int swack_flag:1; /* software acknowledge flag */
/* bit 89 */
- /* INTD trasactions at destination are to
- wait for software acknowledge */
- unsigned int rsvd_5:6; /* must be zero */
+ /* INTD trasactions at
+ destination are to wait for
+ software acknowledge */
+ unsigned int rsvd_5:6; /* must be zero */
/* bits 95:90 */
- unsigned int rsvd_6:5; /* must be zero */
+ unsigned int rsvd_6:5; /* must be zero */
/* bits 100:96 */
- unsigned int int_both:1;/* if 1, interrupt both sockets on the uvhub */
+ unsigned int int_both:1; /* if 1, interrupt both sockets
+ on the uvhub */
/* bit 101*/
- unsigned int fairness:3;/* usually zero */
+ unsigned int fairness:3; /* usually zero */
/* bits 104:102 */
- unsigned int multilevel:1; /* multi-level multicast format */
+ unsigned int multilevel:1; /* multi-level multicast
+ format */
/* bit 105 */
- /* 0 for TLB: endpoint multi-unicast messages */
- unsigned int chaining:1;/* next descriptor is part of this activation*/
+ /* 0 for TLB: endpoint multi-unicast messages */
+ unsigned int chaining:1; /* next descriptor is part of
+ this activation*/
/* bit 106 */
- unsigned int rsvd_7:21; /* must be zero */
+ unsigned int rsvd_7:21; /* must be zero */
/* bits 127:107 */
};
-/* see msg_type: */
-#define MSG_NOOP 0
-#define MSG_REGULAR 1
-#define MSG_RETRY 2
-
/*
* The activation descriptor:
* The format of the message to send, plus all accompanying control
* Should be 64 bytes
*/
struct bau_desc {
- struct bau_target_uvhubmask distribution;
+ struct pnmask distribution;
/*
* message template, consisting of header and payload:
*/
- struct bau_msg_header header;
- struct bau_msg_payload payload;
+ struct bau_msg_header header;
+ struct bau_msg_payload payload;
};
/*
* -payload-- ---------header------
@@ -281,59 +346,51 @@ struct bau_desc {
* are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17
* bytes of usable data, including the sw ack vector in byte 15 (bits 127:120)
* (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from
- * sw_ack_vector and payload_2)
+ * swack_vec and payload_2)
* "Enabling Software Acknowledgment mode (see Section 4.3.3 Software
* Acknowledge Processing) also selects 32 byte (17 bytes usable) payload
* operation."
*/
-struct bau_payload_queue_entry {
- unsigned long address; /* signifies a page or all TLB's
- of the cpu */
+struct bau_pq_entry {
+ unsigned long address; /* signifies a page or all TLB's
+ of the cpu */
/* 64 bits, bytes 0-7 */
-
- unsigned short sending_cpu; /* cpu that sent the message */
+ unsigned short sending_cpu; /* cpu that sent the message */
/* 16 bits, bytes 8-9 */
-
- unsigned short acknowledge_count; /* filled in by destination */
+ unsigned short acknowledge_count; /* filled in by destination */
/* 16 bits, bytes 10-11 */
-
/* these next 3 bytes come from bits 58-81 of the message header */
- unsigned short replied_to:1; /* sent as 0 by the source */
- unsigned short msg_type:3; /* software message type */
- unsigned short canceled:1; /* sent as 0 by the source */
- unsigned short unused1:3; /* not currently using */
+ unsigned short replied_to:1; /* sent as 0 by the source */
+ unsigned short msg_type:3; /* software message type */
+ unsigned short canceled:1; /* sent as 0 by the source */
+ unsigned short unused1:3; /* not currently using */
/* byte 12 */
-
- unsigned char unused2a; /* not currently using */
+ unsigned char unused2a; /* not currently using */
/* byte 13 */
- unsigned char unused2; /* not currently using */
+ unsigned char unused2; /* not currently using */
/* byte 14 */
-
- unsigned char sw_ack_vector; /* filled in by the hardware */
+ unsigned char swack_vec; /* filled in by the hardware */
/* byte 15 (bits 127:120) */
-
- unsigned short sequence; /* message sequence number */
+ unsigned short sequence; /* message sequence number */
/* bytes 16-17 */
- unsigned char unused4[2]; /* not currently using bytes 18-19 */
+ unsigned char unused4[2]; /* not currently using bytes 18-19 */
/* bytes 18-19 */
-
- int number_of_cpus; /* filled in at destination */
+ int number_of_cpus; /* filled in at destination */
/* 32 bits, bytes 20-23 (aligned) */
-
- unsigned char unused5[8]; /* not using */
+ unsigned char unused5[8]; /* not using */
/* bytes 24-31 */
};
struct msg_desc {
- struct bau_payload_queue_entry *msg;
- int msg_slot;
- int sw_ack_slot;
- struct bau_payload_queue_entry *va_queue_first;
- struct bau_payload_queue_entry *va_queue_last;
+ struct bau_pq_entry *msg;
+ int msg_slot;
+ int swack_slot;
+ struct bau_pq_entry *queue_first;
+ struct bau_pq_entry *queue_last;
};
struct reset_args {
- int sender;
+ int sender;
};
/*
@@ -341,112 +398,227 @@ struct reset_args {
*/
struct ptc_stats {
/* sender statistics */
- unsigned long s_giveup; /* number of fall backs to IPI-style flushes */
- unsigned long s_requestor; /* number of shootdown requests */
- unsigned long s_stimeout; /* source side timeouts */
- unsigned long s_dtimeout; /* destination side timeouts */
- unsigned long s_time; /* time spent in sending side */
- unsigned long s_retriesok; /* successful retries */
- unsigned long s_ntargcpu; /* total number of cpu's targeted */
- unsigned long s_ntargself; /* times the sending cpu was targeted */
- unsigned long s_ntarglocals; /* targets of cpus on the local blade */
- unsigned long s_ntargremotes; /* targets of cpus on remote blades */
- unsigned long s_ntarglocaluvhub; /* targets of the local hub */
- unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */
- unsigned long s_ntarguvhub; /* total number of uvhubs targeted */
- unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/
- unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */
- unsigned long s_ntarguvhub4; /* number of times target hubs >= 4 */
- unsigned long s_ntarguvhub2; /* number of times target hubs >= 2 */
- unsigned long s_ntarguvhub1; /* number of times target hubs == 1 */
- unsigned long s_resets_plug; /* ipi-style resets from plug state */
- unsigned long s_resets_timeout; /* ipi-style resets from timeouts */
- unsigned long s_busy; /* status stayed busy past s/w timer */
- unsigned long s_throttles; /* waits in throttle */
- unsigned long s_retry_messages; /* retry broadcasts */
- unsigned long s_bau_reenabled; /* for bau enable/disable */
- unsigned long s_bau_disabled; /* for bau enable/disable */
+ unsigned long s_giveup; /* number of fall backs to
+ IPI-style flushes */
+ unsigned long s_requestor; /* number of shootdown
+ requests */
+ unsigned long s_stimeout; /* source side timeouts */
+ unsigned long s_dtimeout; /* destination side timeouts */
+ unsigned long s_time; /* time spent in sending side */
+ unsigned long s_retriesok; /* successful retries */
+ unsigned long s_ntargcpu; /* total number of cpu's
+ targeted */
+ unsigned long s_ntargself; /* times the sending cpu was
+ targeted */
+ unsigned long s_ntarglocals; /* targets of cpus on the local
+ blade */
+ unsigned long s_ntargremotes; /* targets of cpus on remote
+ blades */
+ unsigned long s_ntarglocaluvhub; /* targets of the local hub */
+ unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */
+ unsigned long s_ntarguvhub; /* total number of uvhubs
+ targeted */
+ unsigned long s_ntarguvhub16; /* number of times target
+ hubs >= 16*/
+ unsigned long s_ntarguvhub8; /* number of times target
+ hubs >= 8 */
+ unsigned long s_ntarguvhub4; /* number of times target
+ hubs >= 4 */
+ unsigned long s_ntarguvhub2; /* number of times target
+ hubs >= 2 */
+ unsigned long s_ntarguvhub1; /* number of times target
+ hubs == 1 */
+ unsigned long s_resets_plug; /* ipi-style resets from plug
+ state */
+ unsigned long s_resets_timeout; /* ipi-style resets from
+ timeouts */
+ unsigned long s_busy; /* status stayed busy past
+ s/w timer */
+ unsigned long s_throttles; /* waits in throttle */
+ unsigned long s_retry_messages; /* retry broadcasts */
+ unsigned long s_bau_reenabled; /* for bau enable/disable */
+ unsigned long s_bau_disabled; /* for bau enable/disable */
/* destination statistics */
- unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */
- unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
- unsigned long d_multmsg; /* interrupts with multiple messages */
- unsigned long d_nomsg; /* interrupts with no message */
- unsigned long d_time; /* time spent on destination side */
- unsigned long d_requestee; /* number of messages processed */
- unsigned long d_retries; /* number of retry messages processed */
- unsigned long d_canceled; /* number of messages canceled by retries */
- unsigned long d_nocanceled; /* retries that found nothing to cancel */
- unsigned long d_resets; /* number of ipi-style requests processed */
- unsigned long d_rcanceled; /* number of messages canceled by resets */
+ unsigned long d_alltlb; /* times all tlb's on this
+ cpu were flushed */
+ unsigned long d_onetlb; /* times just one tlb on this
+ cpu was flushed */
+ unsigned long d_multmsg; /* interrupts with multiple
+ messages */
+ unsigned long d_nomsg; /* interrupts with no message */
+ unsigned long d_time; /* time spent on destination
+ side */
+ unsigned long d_requestee; /* number of messages
+ processed */
+ unsigned long d_retries; /* number of retry messages
+ processed */
+ unsigned long d_canceled; /* number of messages canceled
+ by retries */
+ unsigned long d_nocanceled; /* retries that found nothing
+ to cancel */
+ unsigned long d_resets; /* number of ipi-style requests
+ processed */
+ unsigned long d_rcanceled; /* number of messages canceled
+ by resets */
+};
+
+struct tunables {
+ int *tunp;
+ int deflt;
};
struct hub_and_pnode {
- short uvhub;
- short pnode;
+ short uvhub;
+ short pnode;
};
+
+struct socket_desc {
+ short num_cpus;
+ short cpu_number[MAX_CPUS_PER_SOCKET];
+};
+
+struct uvhub_desc {
+ unsigned short socket_mask;
+ short num_cpus;
+ short uvhub;
+ short pnode;
+ struct socket_desc socket[2];
+};
+
/*
* one per-cpu; to locate the software tables
*/
struct bau_control {
- struct bau_desc *descriptor_base;
- struct bau_payload_queue_entry *va_queue_first;
- struct bau_payload_queue_entry *va_queue_last;
- struct bau_payload_queue_entry *bau_msg_head;
- struct bau_control *uvhub_master;
- struct bau_control *socket_master;
- struct ptc_stats *statp;
- unsigned long timeout_interval;
- unsigned long set_bau_on_time;
- atomic_t active_descriptor_count;
- int plugged_tries;
- int timeout_tries;
- int ipi_attempts;
- int conseccompletes;
- int baudisabled;
- int set_bau_off;
- short cpu;
- short osnode;
- short uvhub_cpu;
- short uvhub;
- short cpus_in_socket;
- short cpus_in_uvhub;
- short partition_base_pnode;
- unsigned short message_number;
- unsigned short uvhub_quiesce;
- short socket_acknowledge_count[DEST_Q_SIZE];
- cycles_t send_message;
- spinlock_t uvhub_lock;
- spinlock_t queue_lock;
+ struct bau_desc *descriptor_base;
+ struct bau_pq_entry *queue_first;
+ struct bau_pq_entry *queue_last;
+ struct bau_pq_entry *bau_msg_head;
+ struct bau_control *uvhub_master;
+ struct bau_control *socket_master;
+ struct ptc_stats *statp;
+ cpumask_t *cpumask;
+ unsigned long timeout_interval;
+ unsigned long set_bau_on_time;
+ atomic_t active_descriptor_count;
+ int plugged_tries;
+ int timeout_tries;
+ int ipi_attempts;
+ int conseccompletes;
+ int baudisabled;
+ int set_bau_off;
+ short cpu;
+ short osnode;
+ short uvhub_cpu;
+ short uvhub;
+ short cpus_in_socket;
+ short cpus_in_uvhub;
+ short partition_base_pnode;
+ unsigned short message_number;
+ unsigned short uvhub_quiesce;
+ short socket_acknowledge_count[DEST_Q_SIZE];
+ cycles_t send_message;
+ spinlock_t uvhub_lock;
+ spinlock_t queue_lock;
/* tunables */
- int max_bau_concurrent;
- int max_bau_concurrent_constant;
- int plugged_delay;
- int plugsb4reset;
- int timeoutsb4reset;
- int ipi_reset_limit;
- int complete_threshold;
- int congested_response_us;
- int congested_reps;
- int congested_period;
- cycles_t period_time;
- long period_requests;
- struct hub_and_pnode *target_hub_and_pnode;
+ int max_concurr;
+ int max_concurr_const;
+ int plugged_delay;
+ int plugsb4reset;
+ int timeoutsb4reset;
+ int ipi_reset_limit;
+ int complete_threshold;
+ int cong_response_us;
+ int cong_reps;
+ int cong_period;
+ cycles_t period_time;
+ long period_requests;
+ struct hub_and_pnode *thp;
};
-static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
+static inline unsigned long read_mmr_uv2_status(void)
+{
+ return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2);
+}
+
+static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image);
+}
+
+static inline void write_mmr_descriptor_base(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, mmr_image);
+}
+
+static inline void write_mmr_activation(unsigned long index)
+{
+ write_lmmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+}
+
+static inline void write_gmmr_activation(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image);
+}
+
+static inline void write_mmr_payload_first(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image);
+}
+
+static inline void write_mmr_payload_tail(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, mmr_image);
+}
+
+static inline void write_mmr_payload_last(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, mmr_image);
+}
+
+static inline void write_mmr_misc_control(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+}
+
+static inline unsigned long read_mmr_misc_control(int pnode)
+{
+ return read_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL);
+}
+
+static inline void write_mmr_sw_ack(unsigned long mr)
+{
+ uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
+}
+
+static inline unsigned long read_mmr_sw_ack(void)
+{
+ return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+}
+
+static inline unsigned long read_gmmr_sw_ack(int pnode)
+{
+ return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+}
+
+static inline void write_mmr_data_config(int pnode, unsigned long mr)
+{
+ uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr);
+}
+
+static inline int bau_uvhub_isset(int uvhub, struct pnmask *dstp)
{
return constant_test_bit(uvhub, &dstp->bits[0]);
}
-static inline void bau_uvhub_set(int pnode, struct bau_target_uvhubmask *dstp)
+static inline void bau_uvhub_set(int pnode, struct pnmask *dstp)
{
__set_bit(pnode, &dstp->bits[0]);
}
-static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp,
+static inline void bau_uvhubs_clear(struct pnmask *dstp,
int nbits)
{
bitmap_zero(&dstp->bits[0], nbits);
}
-static inline int bau_uvhub_weight(struct bau_target_uvhubmask *dstp)
+static inline int bau_uvhub_weight(struct pnmask *dstp)
{
return bitmap_weight((unsigned long *)&dstp->bits[0],
UV_DISTRIBUTION_SIZE);
@@ -457,9 +629,6 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
bitmap_zero(&dstp->bits, nbits);
}
-#define cpubit_isset(cpu, bau_local_cpumask) \
- test_bit((cpu), (bau_local_cpumask).bits)
-
extern void uv_bau_message_intr1(void);
extern void uv_bau_timeout_intr1(void);
@@ -467,7 +636,7 @@ struct atomic_short {
short counter;
};
-/**
+/*
* atomic_read_short - read a short atomic variable
* @v: pointer of type atomic_short
*
@@ -478,14 +647,14 @@ static inline int atomic_read_short(const struct atomic_short *v)
return v->counter;
}
-/**
- * atomic_add_short_return - add and return a short int
+/*
+ * atom_asr - add and return a short int
* @i: short value to add
* @v: pointer of type atomic_short
*
* Atomically adds @i to @v and returns @i + @v
*/
-static inline int atomic_add_short_return(short i, struct atomic_short *v)
+static inline int atom_asr(short i, struct atomic_short *v)
{
short __i = i;
asm volatile(LOCK_PREFIX "xaddw %0, %1"
@@ -494,4 +663,26 @@ static inline int atomic_add_short_return(short i, struct atomic_short *v)
return i + __i;
}
+/*
+ * conditionally add 1 to *v, unless *v is >= u
+ * return 0 if we cannot add 1 to *v because it is >= u
+ * return 1 if we can add 1 to *v because it is < u
+ * the add is atomic
+ *
+ * This is close to atomic_add_unless(), but this allows the 'u' value
+ * to be lowered below the current 'v'. atomic_add_unless can only stop
+ * on equal.
+ */
+static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
+{
+ spin_lock(lock);
+ if (atomic_read(v) >= u) {
+ spin_unlock(lock);
+ return 0;
+ }
+ atomic_inc(v);
+ spin_unlock(lock);
+ return 1;
+}
+
#endif /* _ASM_X86_UV_UV_BAU_H */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 4298002d0c8..f26544a1521 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -77,8 +77,9 @@
*
* 1111110000000000
* 5432109876543210
- * pppppppppplc0cch Nehalem-EX
- * ppppppppplcc0cch Westmere-EX
+ * pppppppppplc0cch Nehalem-EX (12 bits in hdw reg)
+ * ppppppppplcc0cch Westmere-EX (12 bits in hdw reg)
+ * pppppppppppcccch SandyBridge (15 bits in hdw reg)
* sssssssssss
*
* p = pnode bits
@@ -87,7 +88,7 @@
* h = hyperthread
* s = bits that are in the SOCKET_ID CSR
*
- * Note: Processor only supports 12 bits in the APICID register. The ACPI
+ * Note: Processor may support fewer bits in the APICID register. The ACPI
* tables hold all 16 bits. Software needs to be aware of this.
*
* Unless otherwise specified, all references to APICID refer to
@@ -138,6 +139,8 @@ struct uv_hub_info_s {
unsigned long global_mmr_base;
unsigned long gpa_mask;
unsigned int gnode_extra;
+ unsigned char hub_revision;
+ unsigned char apic_pnode_shift;
unsigned long gnode_upper;
unsigned long lowmem_remap_top;
unsigned long lowmem_remap_base;
@@ -149,13 +152,31 @@ struct uv_hub_info_s {
unsigned char m_val;
unsigned char n_val;
struct uv_scir_s scir;
- unsigned char apic_pnode_shift;
};
DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
+/*
+ * Hub revisions less than UV2_HUB_REVISION_BASE are UV1 hubs. All UV2
+ * hubs have revision numbers greater than or equal to UV2_HUB_REVISION_BASE.
+ * This is a software convention - NOT the hardware revision numbers in
+ * the hub chip.
+ */
+#define UV1_HUB_REVISION_BASE 1
+#define UV2_HUB_REVISION_BASE 3
+
+static inline int is_uv1_hub(void)
+{
+ return uv_hub_info->hub_revision < UV2_HUB_REVISION_BASE;
+}
+
+static inline int is_uv2_hub(void)
+{
+ return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE;
+}
+
union uvh_apicid {
unsigned long v;
struct uvh_apicid_s {
@@ -180,11 +201,25 @@ union uvh_apicid {
#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1)
-#define UV_LOCAL_MMR_BASE 0xf4000000UL
-#define UV_GLOBAL_MMR32_BASE 0xf8000000UL
+#define UV1_LOCAL_MMR_BASE 0xf4000000UL
+#define UV1_GLOBAL_MMR32_BASE 0xf8000000UL
+#define UV1_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
+#define UV1_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
+
+#define UV2_LOCAL_MMR_BASE 0xfa000000UL
+#define UV2_GLOBAL_MMR32_BASE 0xfc000000UL
+#define UV2_LOCAL_MMR_SIZE (32UL * 1024 * 1024)
+#define UV2_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024)
+
+#define UV_LOCAL_MMR_BASE (is_uv1_hub() ? UV1_LOCAL_MMR_BASE \
+ : UV2_LOCAL_MMR_BASE)
+#define UV_GLOBAL_MMR32_BASE (is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE \
+ : UV2_GLOBAL_MMR32_BASE)
+#define UV_LOCAL_MMR_SIZE (is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \
+ UV2_LOCAL_MMR_SIZE)
+#define UV_GLOBAL_MMR32_SIZE (is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE :\
+ UV2_GLOBAL_MMR32_SIZE)
#define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base)
-#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
-#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
#define UV_GLOBAL_GRU_MMR_BASE 0x4000000
@@ -301,6 +336,17 @@ static inline int uv_apicid_to_pnode(int apicid)
}
/*
+ * Convert an apicid to the socket number on the blade
+ */
+static inline int uv_apicid_to_socket(int apicid)
+{
+ if (is_uv1_hub())
+ return (apicid >> (uv_hub_info->apic_pnode_shift - 1)) & 1;
+ else
+ return 0;
+}
+
+/*
* Access global MMRs using the low memory MMR32 space. This region supports
* faster MMR access but not all MMRs are accessible in this space.
*/
@@ -519,14 +565,13 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
/*
* Get the minimum revision number of the hub chips within the partition.
- * 1 - initial rev 1.0 silicon
- * 2 - rev 2.0 production silicon
+ * 1 - UV1 rev 1.0 initial silicon
+ * 2 - UV1 rev 2.0 production silicon
+ * 3 - UV2 rev 1.0 initial silicon
*/
static inline int uv_get_min_hub_revision_id(void)
{
- extern int uv_min_hub_revision_id;
-
- return uv_min_hub_revision_id;
+ return uv_hub_info->hub_revision;
}
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index f5bb64a823d..10474fb1185 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -11,1107 +11,2066 @@
#ifndef _ASM_X86_UV_UV_MMRS_H
#define _ASM_X86_UV_UV_MMRS_H
+/*
+ * This file contains MMR definitions for both UV1 & UV2 hubs.
+ *
+ * In general, MMR addresses and structures are identical on both hubs.
+ * These MMRs are identified as:
+ * #define UVH_xxx <address>
+ * union uvh_xxx {
+ * unsigned long v;
+ * struct uvh_int_cmpd_s {
+ * } s;
+ * };
+ *
+ * If the MMR exists on both hub type but has different addresses or
+ * contents, the MMR definition is similar to:
+ * #define UV1H_xxx <uv1 address>
+ * #define UV2H_xxx <uv2address>
+ * #define UVH_xxx (is_uv1_hub() ? UV1H_xxx : UV2H_xxx)
+ * union uvh_xxx {
+ * unsigned long v;
+ * struct uv1h_int_cmpd_s { (Common fields only)
+ * } s;
+ * struct uv1h_int_cmpd_s { (Full UV1 definition)
+ * } s1;
+ * struct uv2h_int_cmpd_s { (Full UV2 definition)
+ * } s2;
+ * };
+ *
+ * Only essential difference are enumerated. For example, if the address is
+ * the same for both UV1 & UV2, only a single #define is generated. Likewise,
+ * if the contents is the same for both hubs, only the "s" structure is
+ * generated.
+ *
+ * If the MMR exists on ONLY 1 type of hub, no generic definition is
+ * generated:
+ * #define UVnH_xxx <uvn address>
+ * union uvnh_xxx {
+ * unsigned long v;
+ * struct uvh_int_cmpd_s {
+ * } sn;
+ * };
+ */
+
#define UV_MMR_ENABLE (1UL << 63)
+#define UV1_HUB_PART_NUMBER 0x88a5
+#define UV2_HUB_PART_NUMBER 0x8eb8
+
+/* Compat: if this #define is present, UV headers support UV2 */
+#define UV2_HUB_IS_SUPPORTED 1
+
/* ========================================================================= */
/* UVH_BAU_DATA_BROADCAST */
/* ========================================================================= */
-#define UVH_BAU_DATA_BROADCAST 0x61688UL
-#define UVH_BAU_DATA_BROADCAST_32 0x0440
+#define UVH_BAU_DATA_BROADCAST 0x61688UL
+#define UVH_BAU_DATA_BROADCAST_32 0x440
-#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
-#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
+#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
+#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
union uvh_bau_data_broadcast_u {
- unsigned long v;
- struct uvh_bau_data_broadcast_s {
- unsigned long enable : 1; /* RW */
- unsigned long rsvd_1_63: 63; /* */
- } s;
+ unsigned long v;
+ struct uvh_bau_data_broadcast_s {
+ unsigned long enable:1; /* RW */
+ unsigned long rsvd_1_63:63;
+ } s;
};
/* ========================================================================= */
/* UVH_BAU_DATA_CONFIG */
/* ========================================================================= */
-#define UVH_BAU_DATA_CONFIG 0x61680UL
-#define UVH_BAU_DATA_CONFIG_32 0x0438
-
-#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
-#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
-#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
-#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
-#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_BAU_DATA_CONFIG_P_SHFT 13
-#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_BAU_DATA_CONFIG_T_SHFT 15
-#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_BAU_DATA_CONFIG_M_SHFT 16
-#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
-#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_BAU_DATA_CONFIG 0x61680UL
+#define UVH_BAU_DATA_CONFIG_32 0x438
+
+#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
+#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
+#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
+#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
+#define UVH_BAU_DATA_CONFIG_P_SHFT 13
+#define UVH_BAU_DATA_CONFIG_T_SHFT 15
+#define UVH_BAU_DATA_CONFIG_M_SHFT 16
+#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
+#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
union uvh_bau_data_config_u {
- unsigned long v;
- struct uvh_bau_data_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_bau_data_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_EVENT_OCCURRED0 */
/* ========================================================================= */
-#define UVH_EVENT_OCCURRED0 0x70000UL
-#define UVH_EVENT_OCCURRED0_32 0x005e8
-
-#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0
-#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
-#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
-#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
-#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
-#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
-#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3
-#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
-#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4
-#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
-#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5
-#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
-#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6
-#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
-#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
-#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
-#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
-#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
-#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
-#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
-#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
-#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
-#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
-#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
-#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
-#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
-#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
-#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
-#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
-#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
-#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
-#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
-#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
-#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
-#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
-#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
-#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
-#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
-#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
-#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
-#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
-#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
-#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
-#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
-#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
-#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
-#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
-#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
-#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
-#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
-#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
-#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
-#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
-#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
-#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43
-#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
-#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
-#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
-#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45
-#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
-#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
-#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
-#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
-#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
-#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
-#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51
-#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52
-#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53
-#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54
-#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
-#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55
-#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
-#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
-#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
+#define UVH_EVENT_OCCURRED0 0x70000UL
+#define UVH_EVENT_OCCURRED0_32 0x5e8
+
+#define UV1H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
+#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
+#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
+#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3
+#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT 4
+#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT 5
+#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT 6
+#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
+#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
+#define UV1H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
+#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
+#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
+#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
+#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
+#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
+#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
+#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
+#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
+#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
+#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
+#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
+#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
+#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
+#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT 43
+#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
+#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT 45
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
+#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
+#define UV1H_EVENT_OCCURRED0_RTC0_SHFT 51
+#define UV1H_EVENT_OCCURRED0_RTC1_SHFT 52
+#define UV1H_EVENT_OCCURRED0_RTC2_SHFT 53
+#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54
+#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55
+#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
+#define UV1H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
+#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
+#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
+#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
+#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
+#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
+#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
+#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
+#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
+#define UV1H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
+#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
+#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
+#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
+#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
+#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
+#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
+#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
+#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
+#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
+#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
+#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
+#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
+#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
+#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
+#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
+#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
+#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
+#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
+#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
+
+#define UV2H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
+#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1
+#define UV2H_EVENT_OCCURRED0_RH_HCERR_SHFT 2
+#define UV2H_EVENT_OCCURRED0_LH0_HCERR_SHFT 3
+#define UV2H_EVENT_OCCURRED0_LH1_HCERR_SHFT 4
+#define UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT 5
+#define UV2H_EVENT_OCCURRED0_GR1_HCERR_SHFT 6
+#define UV2H_EVENT_OCCURRED0_NI0_HCERR_SHFT 7
+#define UV2H_EVENT_OCCURRED0_NI1_HCERR_SHFT 8
+#define UV2H_EVENT_OCCURRED0_LB_AOERR0_SHFT 9
+#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10
+#define UV2H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15
+#define UV2H_EVENT_OCCURRED0_XB_AOERR0_SHFT 16
+#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19
+#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20
+#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21
+#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26
+#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27
+#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30
+#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47
+#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48
+#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49
+#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50
+#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51
+#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52
+#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57
+#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58
+#define UV2H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
+#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL
+#define UV2H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL
+#define UV2H_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL
+#define UV2H_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL
+#define UV2H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL
+#define UV2H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL
+#define UV2H_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL
+#define UV2H_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL
+#define UV2H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL
+#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL
+#define UV2H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL
+#define UV2H_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL
+#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL
+#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL
+#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL
+#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL
+#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL
+#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL
+#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL
+#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL
+#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL
+#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL
+#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL
+#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL
+#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL
+#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL
+
union uvh_event_occurred0_u {
- unsigned long v;
- struct uvh_event_occurred0_s {
- unsigned long lb_hcerr : 1; /* RW, W1C */
- unsigned long gr0_hcerr : 1; /* RW, W1C */
- unsigned long gr1_hcerr : 1; /* RW, W1C */
- unsigned long lh_hcerr : 1; /* RW, W1C */
- unsigned long rh_hcerr : 1; /* RW, W1C */
- unsigned long xn_hcerr : 1; /* RW, W1C */
- unsigned long si_hcerr : 1; /* RW, W1C */
- unsigned long lb_aoerr0 : 1; /* RW, W1C */
- unsigned long gr0_aoerr0 : 1; /* RW, W1C */
- unsigned long gr1_aoerr0 : 1; /* RW, W1C */
- unsigned long lh_aoerr0 : 1; /* RW, W1C */
- unsigned long rh_aoerr0 : 1; /* RW, W1C */
- unsigned long xn_aoerr0 : 1; /* RW, W1C */
- unsigned long si_aoerr0 : 1; /* RW, W1C */
- unsigned long lb_aoerr1 : 1; /* RW, W1C */
- unsigned long gr0_aoerr1 : 1; /* RW, W1C */
- unsigned long gr1_aoerr1 : 1; /* RW, W1C */
- unsigned long lh_aoerr1 : 1; /* RW, W1C */
- unsigned long rh_aoerr1 : 1; /* RW, W1C */
- unsigned long xn_aoerr1 : 1; /* RW, W1C */
- unsigned long si_aoerr1 : 1; /* RW, W1C */
- unsigned long rh_vpi_int : 1; /* RW, W1C */
- unsigned long system_shutdown_int : 1; /* RW, W1C */
- unsigned long lb_irq_int_0 : 1; /* RW, W1C */
- unsigned long lb_irq_int_1 : 1; /* RW, W1C */
- unsigned long lb_irq_int_2 : 1; /* RW, W1C */
- unsigned long lb_irq_int_3 : 1; /* RW, W1C */
- unsigned long lb_irq_int_4 : 1; /* RW, W1C */
- unsigned long lb_irq_int_5 : 1; /* RW, W1C */
- unsigned long lb_irq_int_6 : 1; /* RW, W1C */
- unsigned long lb_irq_int_7 : 1; /* RW, W1C */
- unsigned long lb_irq_int_8 : 1; /* RW, W1C */
- unsigned long lb_irq_int_9 : 1; /* RW, W1C */
- unsigned long lb_irq_int_10 : 1; /* RW, W1C */
- unsigned long lb_irq_int_11 : 1; /* RW, W1C */
- unsigned long lb_irq_int_12 : 1; /* RW, W1C */
- unsigned long lb_irq_int_13 : 1; /* RW, W1C */
- unsigned long lb_irq_int_14 : 1; /* RW, W1C */
- unsigned long lb_irq_int_15 : 1; /* RW, W1C */
- unsigned long l1_nmi_int : 1; /* RW, W1C */
- unsigned long stop_clock : 1; /* RW, W1C */
- unsigned long asic_to_l1 : 1; /* RW, W1C */
- unsigned long l1_to_asic : 1; /* RW, W1C */
- unsigned long ltc_int : 1; /* RW, W1C */
- unsigned long la_seq_trigger : 1; /* RW, W1C */
- unsigned long ipi_int : 1; /* RW, W1C */
- unsigned long extio_int0 : 1; /* RW, W1C */
- unsigned long extio_int1 : 1; /* RW, W1C */
- unsigned long extio_int2 : 1; /* RW, W1C */
- unsigned long extio_int3 : 1; /* RW, W1C */
- unsigned long profile_int : 1; /* RW, W1C */
- unsigned long rtc0 : 1; /* RW, W1C */
- unsigned long rtc1 : 1; /* RW, W1C */
- unsigned long rtc2 : 1; /* RW, W1C */
- unsigned long rtc3 : 1; /* RW, W1C */
- unsigned long bau_data : 1; /* RW, W1C */
- unsigned long power_management_req : 1; /* RW, W1C */
- unsigned long rsvd_57_63 : 7; /* */
- } s;
+ unsigned long v;
+ struct uv1h_event_occurred0_s {
+ unsigned long lb_hcerr:1; /* RW, W1C */
+ unsigned long gr0_hcerr:1; /* RW, W1C */
+ unsigned long gr1_hcerr:1; /* RW, W1C */
+ unsigned long lh_hcerr:1; /* RW, W1C */
+ unsigned long rh_hcerr:1; /* RW, W1C */
+ unsigned long xn_hcerr:1; /* RW, W1C */
+ unsigned long si_hcerr:1; /* RW, W1C */
+ unsigned long lb_aoerr0:1; /* RW, W1C */
+ unsigned long gr0_aoerr0:1; /* RW, W1C */
+ unsigned long gr1_aoerr0:1; /* RW, W1C */
+ unsigned long lh_aoerr0:1; /* RW, W1C */
+ unsigned long rh_aoerr0:1; /* RW, W1C */
+ unsigned long xn_aoerr0:1; /* RW, W1C */
+ unsigned long si_aoerr0:1; /* RW, W1C */
+ unsigned long lb_aoerr1:1; /* RW, W1C */
+ unsigned long gr0_aoerr1:1; /* RW, W1C */
+ unsigned long gr1_aoerr1:1; /* RW, W1C */
+ unsigned long lh_aoerr1:1; /* RW, W1C */
+ unsigned long rh_aoerr1:1; /* RW, W1C */
+ unsigned long xn_aoerr1:1; /* RW, W1C */
+ unsigned long si_aoerr1:1; /* RW, W1C */
+ unsigned long rh_vpi_int:1; /* RW, W1C */
+ unsigned long system_shutdown_int:1; /* RW, W1C */
+ unsigned long lb_irq_int_0:1; /* RW, W1C */
+ unsigned long lb_irq_int_1:1; /* RW, W1C */
+ unsigned long lb_irq_int_2:1; /* RW, W1C */
+ unsigned long lb_irq_int_3:1; /* RW, W1C */
+ unsigned long lb_irq_int_4:1; /* RW, W1C */
+ unsigned long lb_irq_int_5:1; /* RW, W1C */
+ unsigned long lb_irq_int_6:1; /* RW, W1C */
+ unsigned long lb_irq_int_7:1; /* RW, W1C */
+ unsigned long lb_irq_int_8:1; /* RW, W1C */
+ unsigned long lb_irq_int_9:1; /* RW, W1C */
+ unsigned long lb_irq_int_10:1; /* RW, W1C */
+ unsigned long lb_irq_int_11:1; /* RW, W1C */
+ unsigned long lb_irq_int_12:1; /* RW, W1C */
+ unsigned long lb_irq_int_13:1; /* RW, W1C */
+ unsigned long lb_irq_int_14:1; /* RW, W1C */
+ unsigned long lb_irq_int_15:1; /* RW, W1C */
+ unsigned long l1_nmi_int:1; /* RW, W1C */
+ unsigned long stop_clock:1; /* RW, W1C */
+ unsigned long asic_to_l1:1; /* RW, W1C */
+ unsigned long l1_to_asic:1; /* RW, W1C */
+ unsigned long ltc_int:1; /* RW, W1C */
+ unsigned long la_seq_trigger:1; /* RW, W1C */
+ unsigned long ipi_int:1; /* RW, W1C */
+ unsigned long extio_int0:1; /* RW, W1C */
+ unsigned long extio_int1:1; /* RW, W1C */
+ unsigned long extio_int2:1; /* RW, W1C */
+ unsigned long extio_int3:1; /* RW, W1C */
+ unsigned long profile_int:1; /* RW, W1C */
+ unsigned long rtc0:1; /* RW, W1C */
+ unsigned long rtc1:1; /* RW, W1C */
+ unsigned long rtc2:1; /* RW, W1C */
+ unsigned long rtc3:1; /* RW, W1C */
+ unsigned long bau_data:1; /* RW, W1C */
+ unsigned long power_management_req:1; /* RW, W1C */
+ unsigned long rsvd_57_63:7;
+ } s1;
+ struct uv2h_event_occurred0_s {
+ unsigned long lb_hcerr:1; /* RW */
+ unsigned long qp_hcerr:1; /* RW */
+ unsigned long rh_hcerr:1; /* RW */
+ unsigned long lh0_hcerr:1; /* RW */
+ unsigned long lh1_hcerr:1; /* RW */
+ unsigned long gr0_hcerr:1; /* RW */
+ unsigned long gr1_hcerr:1; /* RW */
+ unsigned long ni0_hcerr:1; /* RW */
+ unsigned long ni1_hcerr:1; /* RW */
+ unsigned long lb_aoerr0:1; /* RW */
+ unsigned long qp_aoerr0:1; /* RW */
+ unsigned long rh_aoerr0:1; /* RW */
+ unsigned long lh0_aoerr0:1; /* RW */
+ unsigned long lh1_aoerr0:1; /* RW */
+ unsigned long gr0_aoerr0:1; /* RW */
+ unsigned long gr1_aoerr0:1; /* RW */
+ unsigned long xb_aoerr0:1; /* RW */
+ unsigned long rt_aoerr0:1; /* RW */
+ unsigned long ni0_aoerr0:1; /* RW */
+ unsigned long ni1_aoerr0:1; /* RW */
+ unsigned long lb_aoerr1:1; /* RW */
+ unsigned long qp_aoerr1:1; /* RW */
+ unsigned long rh_aoerr1:1; /* RW */
+ unsigned long lh0_aoerr1:1; /* RW */
+ unsigned long lh1_aoerr1:1; /* RW */
+ unsigned long gr0_aoerr1:1; /* RW */
+ unsigned long gr1_aoerr1:1; /* RW */
+ unsigned long xb_aoerr1:1; /* RW */
+ unsigned long rt_aoerr1:1; /* RW */
+ unsigned long ni0_aoerr1:1; /* RW */
+ unsigned long ni1_aoerr1:1; /* RW */
+ unsigned long system_shutdown_int:1; /* RW */
+ unsigned long lb_irq_int_0:1; /* RW */
+ unsigned long lb_irq_int_1:1; /* RW */
+ unsigned long lb_irq_int_2:1; /* RW */
+ unsigned long lb_irq_int_3:1; /* RW */
+ unsigned long lb_irq_int_4:1; /* RW */
+ unsigned long lb_irq_int_5:1; /* RW */
+ unsigned long lb_irq_int_6:1; /* RW */
+ unsigned long lb_irq_int_7:1; /* RW */
+ unsigned long lb_irq_int_8:1; /* RW */
+ unsigned long lb_irq_int_9:1; /* RW */
+ unsigned long lb_irq_int_10:1; /* RW */
+ unsigned long lb_irq_int_11:1; /* RW */
+ unsigned long lb_irq_int_12:1; /* RW */
+ unsigned long lb_irq_int_13:1; /* RW */
+ unsigned long lb_irq_int_14:1; /* RW */
+ unsigned long lb_irq_int_15:1; /* RW */
+ unsigned long l1_nmi_int:1; /* RW */
+ unsigned long stop_clock:1; /* RW */
+ unsigned long asic_to_l1:1; /* RW */
+ unsigned long l1_to_asic:1; /* RW */
+ unsigned long la_seq_trigger:1; /* RW */
+ unsigned long ipi_int:1; /* RW */
+ unsigned long extio_int0:1; /* RW */
+ unsigned long extio_int1:1; /* RW */
+ unsigned long extio_int2:1; /* RW */
+ unsigned long extio_int3:1; /* RW */
+ unsigned long profile_int:1; /* RW */
+ unsigned long rsvd_59_63:5;
+ } s2;
};
/* ========================================================================= */
/* UVH_EVENT_OCCURRED0_ALIAS */
/* ========================================================================= */
-#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
-#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0
+#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
+#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0
/* ========================================================================= */
/* UVH_GR0_TLB_INT0_CONFIG */
/* ========================================================================= */
-#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL
-
-#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8
-#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13
-#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15
-#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16
-#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL
+
+#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0
+#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8
+#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12
+#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13
+#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15
+#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16
+#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
union uvh_gr0_tlb_int0_config_u {
- unsigned long v;
- struct uvh_gr0_tlb_int0_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_gr0_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_GR0_TLB_INT1_CONFIG */
/* ========================================================================= */
-#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL
-
-#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0
-#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8
-#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12
-#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13
-#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15
-#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16
-#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL
+
+#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0
+#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8
+#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12
+#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13
+#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15
+#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16
+#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
union uvh_gr0_tlb_int1_config_u {
- unsigned long v;
- struct uvh_gr0_tlb_int1_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_gr0_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_GR0_TLB_MMR_CONTROL */
+/* ========================================================================= */
+#define UV1H_GR0_TLB_MMR_CONTROL 0x401080UL
+#define UV2H_GR0_TLB_MMR_CONTROL 0xc01080UL
+#define UVH_GR0_TLB_MMR_CONTROL (is_uv1_hub() ? \
+ UV1H_GR0_TLB_MMR_CONTROL : \
+ UV2H_GR0_TLB_MMR_CONTROL)
+
+#define UVH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+
+#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT 54
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT 56
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT 60
+#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK 0x0040000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL
+
+#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
+#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
+
+union uvh_gr0_tlb_mmr_control_u {
+ unsigned long v;
+ struct uvh_gr0_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long rsvd_32_63:32;
+ } s;
+ struct uv1h_gr0_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long rsvd_32_47:16;
+ unsigned long mmr_inj_con:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long mmr_inj_tlbram:1; /* RW */
+ unsigned long rsvd_53:1;
+ unsigned long mmr_inj_tlbpgsize:1; /* RW */
+ unsigned long rsvd_55:1;
+ unsigned long mmr_inj_tlbrreg:1; /* RW */
+ unsigned long rsvd_57_59:3;
+ unsigned long mmr_inj_tlblruv:1; /* RW */
+ unsigned long rsvd_61_63:3;
+ } s1;
+ struct uv2h_gr0_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long mmr_inj_con:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long mmr_inj_tlbram:1; /* RW */
+ unsigned long rsvd_53_63:11;
+ } s2;
+};
+
+/* ========================================================================= */
+/* UVH_GR0_TLB_MMR_READ_DATA_HI */
+/* ========================================================================= */
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI 0x4010a0UL
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL
+#define UVH_GR0_TLB_MMR_READ_DATA_HI (is_uv1_hub() ? \
+ UV1H_GR0_TLB_MMR_READ_DATA_HI : \
+ UV2H_GR0_TLB_MMR_READ_DATA_HI)
+
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+
+union uvh_gr0_tlb_mmr_read_data_hi_u {
+ unsigned long v;
+ struct uvh_gr0_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long rsvd_45_63:19;
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_GR0_TLB_MMR_READ_DATA_LO */
+/* ========================================================================= */
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO 0x4010a8UL
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL
+#define UVH_GR0_TLB_MMR_READ_DATA_LO (is_uv1_hub() ? \
+ UV1H_GR0_TLB_MMR_READ_DATA_LO : \
+ UV2H_GR0_TLB_MMR_READ_DATA_LO)
+
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+union uvh_gr0_tlb_mmr_read_data_lo_u {
+ unsigned long v;
+ struct uvh_gr0_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s;
};
/* ========================================================================= */
/* UVH_GR1_TLB_INT0_CONFIG */
/* ========================================================================= */
-#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL
-
-#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8
-#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13
-#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15
-#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16
-#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL
+
+#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0
+#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8
+#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12
+#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13
+#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15
+#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16
+#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
union uvh_gr1_tlb_int0_config_u {
- unsigned long v;
- struct uvh_gr1_tlb_int0_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_gr1_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_GR1_TLB_INT1_CONFIG */
/* ========================================================================= */
-#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL
-
-#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0
-#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8
-#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12
-#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13
-#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15
-#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16
-#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL
+
+#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0
+#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8
+#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12
+#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13
+#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15
+#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16
+#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
union uvh_gr1_tlb_int1_config_u {
- unsigned long v;
- struct uvh_gr1_tlb_int1_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_gr1_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_GR1_TLB_MMR_CONTROL */
+/* ========================================================================= */
+#define UV1H_GR1_TLB_MMR_CONTROL 0x801080UL
+#define UV2H_GR1_TLB_MMR_CONTROL 0x1001080UL
+#define UVH_GR1_TLB_MMR_CONTROL (is_uv1_hub() ? \
+ UV1H_GR1_TLB_MMR_CONTROL : \
+ UV2H_GR1_TLB_MMR_CONTROL)
+
+#define UVH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UVH_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+
+#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT 54
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT 56
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT 60
+#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK 0x0040000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL
+
+#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
+#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
+
+union uvh_gr1_tlb_mmr_control_u {
+ unsigned long v;
+ struct uvh_gr1_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long rsvd_32_63:32;
+ } s;
+ struct uv1h_gr1_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long rsvd_32_47:16;
+ unsigned long mmr_inj_con:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long mmr_inj_tlbram:1; /* RW */
+ unsigned long rsvd_53:1;
+ unsigned long mmr_inj_tlbpgsize:1; /* RW */
+ unsigned long rsvd_55:1;
+ unsigned long mmr_inj_tlbrreg:1; /* RW */
+ unsigned long rsvd_57_59:3;
+ unsigned long mmr_inj_tlblruv:1; /* RW */
+ unsigned long rsvd_61_63:3;
+ } s1;
+ struct uv2h_gr1_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long mmr_inj_con:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long mmr_inj_tlbram:1; /* RW */
+ unsigned long rsvd_53_63:11;
+ } s2;
+};
+
+/* ========================================================================= */
+/* UVH_GR1_TLB_MMR_READ_DATA_HI */
+/* ========================================================================= */
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI 0x8010a0UL
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL
+#define UVH_GR1_TLB_MMR_READ_DATA_HI (is_uv1_hub() ? \
+ UV1H_GR1_TLB_MMR_READ_DATA_HI : \
+ UV2H_GR1_TLB_MMR_READ_DATA_HI)
+
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+
+union uvh_gr1_tlb_mmr_read_data_hi_u {
+ unsigned long v;
+ struct uvh_gr1_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long rsvd_45_63:19;
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_GR1_TLB_MMR_READ_DATA_LO */
+/* ========================================================================= */
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO 0x8010a8UL
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL
+#define UVH_GR1_TLB_MMR_READ_DATA_LO (is_uv1_hub() ? \
+ UV1H_GR1_TLB_MMR_READ_DATA_LO : \
+ UV2H_GR1_TLB_MMR_READ_DATA_LO)
+
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+union uvh_gr1_tlb_mmr_read_data_lo_u {
+ unsigned long v;
+ struct uvh_gr1_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s;
};
/* ========================================================================= */
/* UVH_INT_CMPB */
/* ========================================================================= */
-#define UVH_INT_CMPB 0x22080UL
+#define UVH_INT_CMPB 0x22080UL
-#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
-#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
+#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
+#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
union uvh_int_cmpb_u {
- unsigned long v;
- struct uvh_int_cmpb_s {
- unsigned long real_time_cmpb : 56; /* RW */
- unsigned long rsvd_56_63 : 8; /* */
- } s;
+ unsigned long v;
+ struct uvh_int_cmpb_s {
+ unsigned long real_time_cmpb:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s;
};
/* ========================================================================= */
/* UVH_INT_CMPC */
/* ========================================================================= */
-#define UVH_INT_CMPC 0x22100UL
+#define UVH_INT_CMPC 0x22100UL
-#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0
-#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL
+#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0
+#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL
union uvh_int_cmpc_u {
- unsigned long v;
- struct uvh_int_cmpc_s {
- unsigned long real_time_cmpc : 56; /* RW */
- unsigned long rsvd_56_63 : 8; /* */
- } s;
+ unsigned long v;
+ struct uvh_int_cmpc_s {
+ unsigned long real_time_cmpc:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s;
};
/* ========================================================================= */
/* UVH_INT_CMPD */
/* ========================================================================= */
-#define UVH_INT_CMPD 0x22180UL
+#define UVH_INT_CMPD 0x22180UL
-#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0
-#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL
+#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0
+#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL
union uvh_int_cmpd_u {
- unsigned long v;
- struct uvh_int_cmpd_s {
- unsigned long real_time_cmpd : 56; /* RW */
- unsigned long rsvd_56_63 : 8; /* */
- } s;
+ unsigned long v;
+ struct uvh_int_cmpd_s {
+ unsigned long real_time_cmpd:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s;
};
/* ========================================================================= */
/* UVH_IPI_INT */
/* ========================================================================= */
-#define UVH_IPI_INT 0x60500UL
-#define UVH_IPI_INT_32 0x0348
+#define UVH_IPI_INT 0x60500UL
+#define UVH_IPI_INT_32 0x348
-#define UVH_IPI_INT_VECTOR_SHFT 0
-#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8
-#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL
-#define UVH_IPI_INT_DESTMODE_SHFT 11
-#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_IPI_INT_APIC_ID_SHFT 16
-#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL
-#define UVH_IPI_INT_SEND_SHFT 63
-#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL
+#define UVH_IPI_INT_VECTOR_SHFT 0
+#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8
+#define UVH_IPI_INT_DESTMODE_SHFT 11
+#define UVH_IPI_INT_APIC_ID_SHFT 16
+#define UVH_IPI_INT_SEND_SHFT 63
+#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL
+#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL
+#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL
union uvh_ipi_int_u {
- unsigned long v;
- struct uvh_ipi_int_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long delivery_mode : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long rsvd_12_15 : 4; /* */
- unsigned long apic_id : 32; /* RW */
- unsigned long rsvd_48_62 : 15; /* */
- unsigned long send : 1; /* WP */
- } s;
+ unsigned long v;
+ struct uvh_ipi_int_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long delivery_mode:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long rsvd_12_15:4;
+ unsigned long apic_id:32; /* RW */
+ unsigned long rsvd_48_62:15;
+ unsigned long send:1; /* WP */
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009c0
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0
#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL
union uvh_lb_bau_intd_payload_queue_first_u {
- unsigned long v;
- struct uvh_lb_bau_intd_payload_queue_first_s {
- unsigned long rsvd_0_3: 4; /* */
- unsigned long address : 39; /* RW */
- unsigned long rsvd_43_48: 6; /* */
- unsigned long node_id : 14; /* RW */
- unsigned long rsvd_63 : 1; /* */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_intd_payload_queue_first_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_48:6;
+ unsigned long node_id:14; /* RW */
+ unsigned long rsvd_63:1;
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009c8
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
union uvh_lb_bau_intd_payload_queue_last_u {
- unsigned long v;
- struct uvh_lb_bau_intd_payload_queue_last_s {
- unsigned long rsvd_0_3: 4; /* */
- unsigned long address : 39; /* RW */
- unsigned long rsvd_43_63: 21; /* */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_intd_payload_queue_last_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_63:21;
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x009d0
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
union uvh_lb_bau_intd_payload_queue_tail_u {
- unsigned long v;
- struct uvh_lb_bau_intd_payload_queue_tail_s {
- unsigned long rsvd_0_3: 4; /* */
- unsigned long address : 39; /* RW */
- unsigned long rsvd_43_63: 21; /* */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_intd_payload_queue_tail_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_63:21;
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0a68
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
+
union uvh_lb_bau_intd_software_acknowledge_u {
- unsigned long v;
- struct uvh_lb_bau_intd_software_acknowledge_s {
- unsigned long pending_0 : 1; /* RW, W1C */
- unsigned long pending_1 : 1; /* RW, W1C */
- unsigned long pending_2 : 1; /* RW, W1C */
- unsigned long pending_3 : 1; /* RW, W1C */
- unsigned long pending_4 : 1; /* RW, W1C */
- unsigned long pending_5 : 1; /* RW, W1C */
- unsigned long pending_6 : 1; /* RW, W1C */
- unsigned long pending_7 : 1; /* RW, W1C */
- unsigned long timeout_0 : 1; /* RW, W1C */
- unsigned long timeout_1 : 1; /* RW, W1C */
- unsigned long timeout_2 : 1; /* RW, W1C */
- unsigned long timeout_3 : 1; /* RW, W1C */
- unsigned long timeout_4 : 1; /* RW, W1C */
- unsigned long timeout_5 : 1; /* RW, W1C */
- unsigned long timeout_6 : 1; /* RW, W1C */
- unsigned long timeout_7 : 1; /* RW, W1C */
- unsigned long rsvd_16_63: 48; /* */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_intd_software_acknowledge_s {
+ unsigned long pending_0:1; /* RW, W1C */
+ unsigned long pending_1:1; /* RW, W1C */
+ unsigned long pending_2:1; /* RW, W1C */
+ unsigned long pending_3:1; /* RW, W1C */
+ unsigned long pending_4:1; /* RW, W1C */
+ unsigned long pending_5:1; /* RW, W1C */
+ unsigned long pending_6:1; /* RW, W1C */
+ unsigned long pending_7:1; /* RW, W1C */
+ unsigned long timeout_0:1; /* RW, W1C */
+ unsigned long timeout_1:1; /* RW, W1C */
+ unsigned long timeout_2:1; /* RW, W1C */
+ unsigned long timeout_3:1; /* RW, W1C */
+ unsigned long timeout_4:1; /* RW, W1C */
+ unsigned long timeout_5:1; /* RW, W1C */
+ unsigned long timeout_6:1; /* RW, W1C */
+ unsigned long timeout_7:1; /* RW, W1C */
+ unsigned long rsvd_16_63:48;
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70
/* ========================================================================= */
/* UVH_LB_BAU_MISC_CONTROL */
/* ========================================================================= */
-#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
-#define UVH_LB_BAU_MISC_CONTROL_32 0x00a10
-
-#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
-#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
-#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
-#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
-#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
-#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
-#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
-#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
-#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_SHFT 11
-#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
+#define UVH_LB_BAU_MISC_CONTROL_32 0xa10
+
+#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
-#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
-#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
-#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
-#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
-#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
-#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
-#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
-#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
-#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
-#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48
-#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
+#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
+#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34
+#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35
+#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
union uvh_lb_bau_misc_control_u {
- unsigned long v;
- struct uvh_lb_bau_misc_control_s {
- unsigned long rejection_delay : 8; /* RW */
- unsigned long apic_mode : 1; /* RW */
- unsigned long force_broadcast : 1; /* RW */
- unsigned long force_lock_nop : 1; /* RW */
- unsigned long csi_agent_presence_vector : 3; /* RW */
- unsigned long descriptor_fetch_mode : 1; /* RW */
- unsigned long enable_intd_soft_ack_mode : 1; /* RW */
- unsigned long intd_soft_ack_timeout_period : 4; /* RW */
- unsigned long enable_dual_mapping_mode : 1; /* RW */
- unsigned long vga_io_port_decode_enable : 1; /* RW */
- unsigned long vga_io_port_16_bit_decode : 1; /* RW */
- unsigned long suppress_dest_registration : 1; /* RW */
- unsigned long programmed_initial_priority : 3; /* RW */
- unsigned long use_incoming_priority : 1; /* RW */
- unsigned long enable_programmed_initial_priority : 1; /* RW */
- unsigned long rsvd_29_47 : 19; /* */
- unsigned long fun : 16; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_misc_control_s {
+ unsigned long rejection_delay:8; /* RW */
+ unsigned long apic_mode:1; /* RW */
+ unsigned long force_broadcast:1; /* RW */
+ unsigned long force_lock_nop:1; /* RW */
+ unsigned long qpi_agent_presence_vector:3; /* RW */
+ unsigned long descriptor_fetch_mode:1; /* RW */
+ unsigned long enable_intd_soft_ack_mode:1; /* RW */
+ unsigned long intd_soft_ack_timeout_period:4; /* RW */
+ unsigned long enable_dual_mapping_mode:1; /* RW */
+ unsigned long vga_io_port_decode_enable:1; /* RW */
+ unsigned long vga_io_port_16_bit_decode:1; /* RW */
+ unsigned long suppress_dest_registration:1; /* RW */
+ unsigned long programmed_initial_priority:3; /* RW */
+ unsigned long use_incoming_priority:1; /* RW */
+ unsigned long enable_programmed_initial_priority:1;/* RW */
+ unsigned long rsvd_29_63:35;
+ } s;
+ struct uv1h_lb_bau_misc_control_s {
+ unsigned long rejection_delay:8; /* RW */
+ unsigned long apic_mode:1; /* RW */
+ unsigned long force_broadcast:1; /* RW */
+ unsigned long force_lock_nop:1; /* RW */
+ unsigned long qpi_agent_presence_vector:3; /* RW */
+ unsigned long descriptor_fetch_mode:1; /* RW */
+ unsigned long enable_intd_soft_ack_mode:1; /* RW */
+ unsigned long intd_soft_ack_timeout_period:4; /* RW */
+ unsigned long enable_dual_mapping_mode:1; /* RW */
+ unsigned long vga_io_port_decode_enable:1; /* RW */
+ unsigned long vga_io_port_16_bit_decode:1; /* RW */
+ unsigned long suppress_dest_registration:1; /* RW */
+ unsigned long programmed_initial_priority:3; /* RW */
+ unsigned long use_incoming_priority:1; /* RW */
+ unsigned long enable_programmed_initial_priority:1;/* RW */
+ unsigned long rsvd_29_47:19;
+ unsigned long fun:16; /* RW */
+ } s1;
+ struct uv2h_lb_bau_misc_control_s {
+ unsigned long rejection_delay:8; /* RW */
+ unsigned long apic_mode:1; /* RW */
+ unsigned long force_broadcast:1; /* RW */
+ unsigned long force_lock_nop:1; /* RW */
+ unsigned long qpi_agent_presence_vector:3; /* RW */
+ unsigned long descriptor_fetch_mode:1; /* RW */
+ unsigned long enable_intd_soft_ack_mode:1; /* RW */
+ unsigned long intd_soft_ack_timeout_period:4; /* RW */
+ unsigned long enable_dual_mapping_mode:1; /* RW */
+ unsigned long vga_io_port_decode_enable:1; /* RW */
+ unsigned long vga_io_port_16_bit_decode:1; /* RW */
+ unsigned long suppress_dest_registration:1; /* RW */
+ unsigned long programmed_initial_priority:3; /* RW */
+ unsigned long use_incoming_priority:1; /* RW */
+ unsigned long enable_programmed_initial_priority:1;/* RW */
+ unsigned long enable_automatic_apic_mode_selection:1;/* RW */
+ unsigned long apic_mode_status:1; /* RO */
+ unsigned long suppress_interrupts_to_self:1; /* RW */
+ unsigned long enable_lock_based_system_flush:1;/* RW */
+ unsigned long enable_extended_sb_status:1; /* RW */
+ unsigned long suppress_int_prio_udt_to_self:1;/* RW */
+ unsigned long use_legacy_descriptor_formats:1;/* RW */
+ unsigned long rsvd_36_47:12;
+ unsigned long fun:16; /* RW */
+ } s2;
};
/* ========================================================================= */
/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009a8
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL
union uvh_lb_bau_sb_activation_control_u {
- unsigned long v;
- struct uvh_lb_bau_sb_activation_control_s {
- unsigned long index : 6; /* RW */
- unsigned long rsvd_6_61: 56; /* */
- unsigned long push : 1; /* WP */
- unsigned long init : 1; /* WP */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_control_s {
+ unsigned long index:6; /* RW */
+ unsigned long rsvd_6_61:56;
+ unsigned long push:1; /* WP */
+ unsigned long init:1; /* WP */
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009b0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
union uvh_lb_bau_sb_activation_status_0_u {
- unsigned long v;
- struct uvh_lb_bau_sb_activation_status_0_s {
- unsigned long status : 64; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_status_0_s {
+ unsigned long status:64; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009b8
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
union uvh_lb_bau_sb_activation_status_1_u {
- unsigned long v;
- struct uvh_lb_bau_sb_activation_status_1_s {
- unsigned long status : 64; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_status_1_s {
+ unsigned long status:64; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009a0
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
union uvh_lb_bau_sb_descriptor_base_u {
- unsigned long v;
- struct uvh_lb_bau_sb_descriptor_base_s {
- unsigned long rsvd_0_11 : 12; /* */
- unsigned long page_address : 31; /* RW */
- unsigned long rsvd_43_48 : 6; /* */
- unsigned long node_id : 14; /* RW */
- unsigned long rsvd_63 : 1; /* */
- } s;
-};
-
-/* ========================================================================= */
-/* UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK */
-/* ========================================================================= */
-#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
-#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x009f0
-
-#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
-#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
-
-union uvh_lb_target_physical_apic_id_mask_u {
- unsigned long v;
- struct uvh_lb_target_physical_apic_id_mask_s {
- unsigned long bit_enables : 32; /* RW */
- unsigned long rsvd_32_63 : 32; /* */
+ unsigned long v;
+ struct uvh_lb_bau_sb_descriptor_base_s {
+ unsigned long rsvd_0_11:12;
+ unsigned long page_address:31; /* RW */
+ unsigned long rsvd_43_48:6;
+ unsigned long node_id:14; /* RW */
+ unsigned long rsvd_63:1;
} s;
};
/* ========================================================================= */
/* UVH_NODE_ID */
/* ========================================================================= */
-#define UVH_NODE_ID 0x0UL
-
-#define UVH_NODE_ID_FORCE1_SHFT 0
-#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
-#define UVH_NODE_ID_MANUFACTURER_SHFT 1
-#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
-#define UVH_NODE_ID_PART_NUMBER_SHFT 12
-#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
-#define UVH_NODE_ID_REVISION_SHFT 28
-#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
-#define UVH_NODE_ID_NODE_ID_SHFT 32
-#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
-#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48
-#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
-#define UVH_NODE_ID_NI_PORT_SHFT 56
-#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
+#define UVH_NODE_ID 0x0UL
+
+#define UVH_NODE_ID_FORCE1_SHFT 0
+#define UVH_NODE_ID_MANUFACTURER_SHFT 1
+#define UVH_NODE_ID_PART_NUMBER_SHFT 12
+#define UVH_NODE_ID_REVISION_SHFT 28
+#define UVH_NODE_ID_NODE_ID_SHFT 32
+#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+
+#define UV1H_NODE_ID_FORCE1_SHFT 0
+#define UV1H_NODE_ID_MANUFACTURER_SHFT 1
+#define UV1H_NODE_ID_PART_NUMBER_SHFT 12
+#define UV1H_NODE_ID_REVISION_SHFT 28
+#define UV1H_NODE_ID_NODE_ID_SHFT 32
+#define UV1H_NODE_ID_NODES_PER_BIT_SHFT 48
+#define UV1H_NODE_ID_NI_PORT_SHFT 56
+#define UV1H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UV1H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UV1H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UV1H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UV1H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
+#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
+
+#define UV2H_NODE_ID_FORCE1_SHFT 0
+#define UV2H_NODE_ID_MANUFACTURER_SHFT 1
+#define UV2H_NODE_ID_PART_NUMBER_SHFT 12
+#define UV2H_NODE_ID_REVISION_SHFT 28
+#define UV2H_NODE_ID_NODE_ID_SHFT 32
+#define UV2H_NODE_ID_NODES_PER_BIT_SHFT 50
+#define UV2H_NODE_ID_NI_PORT_SHFT 57
+#define UV2H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UV2H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UV2H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UV2H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UV2H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
+#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
union uvh_node_id_u {
- unsigned long v;
- struct uvh_node_id_s {
- unsigned long force1 : 1; /* RO */
- unsigned long manufacturer : 11; /* RO */
- unsigned long part_number : 16; /* RO */
- unsigned long revision : 4; /* RO */
- unsigned long node_id : 15; /* RW */
- unsigned long rsvd_47 : 1; /* */
- unsigned long nodes_per_bit : 7; /* RW */
- unsigned long rsvd_55 : 1; /* */
- unsigned long ni_port : 4; /* RO */
- unsigned long rsvd_60_63 : 4; /* */
- } s;
+ unsigned long v;
+ struct uvh_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47_63:17;
+ } s;
+ struct uv1h_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47:1;
+ unsigned long nodes_per_bit:7; /* RW */
+ unsigned long rsvd_55:1;
+ unsigned long ni_port:4; /* RO */
+ unsigned long rsvd_60_63:4;
+ } s1;
+ struct uv2h_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47_49:3;
+ unsigned long nodes_per_bit:7; /* RO */
+ unsigned long ni_port:5; /* RO */
+ unsigned long rsvd_62_63:2;
+ } s2;
};
/* ========================================================================= */
/* UVH_NODE_PRESENT_TABLE */
/* ========================================================================= */
-#define UVH_NODE_PRESENT_TABLE 0x1400UL
-#define UVH_NODE_PRESENT_TABLE_DEPTH 16
+#define UVH_NODE_PRESENT_TABLE 0x1400UL
+#define UVH_NODE_PRESENT_TABLE_DEPTH 16
-#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0
-#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
+#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0
+#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
union uvh_node_present_table_u {
- unsigned long v;
- struct uvh_node_present_table_s {
- unsigned long nodes : 64; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_node_present_table_s {
+ unsigned long nodes:64; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL
union uvh_rh_gam_alias210_overlay_config_0_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_overlay_config_0_mmr_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL
union uvh_rh_gam_alias210_overlay_config_1_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_overlay_config_1_mmr_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL
union uvh_rh_gam_alias210_overlay_config_2_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_overlay_config_2_mmr_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
union uvh_rh_gam_alias210_redirect_config_0_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
- unsigned long rsvd_0_23 : 24; /* */
- unsigned long dest_base : 22; /* RW */
- unsigned long rsvd_46_63: 18; /* */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s;
};
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
union uvh_rh_gam_alias210_redirect_config_1_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
- unsigned long rsvd_0_23 : 24; /* */
- unsigned long dest_base : 22; /* RW */
- unsigned long rsvd_46_63: 18; /* */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s;
};
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
- unsigned long rsvd_0_23 : 24; /* */
- unsigned long dest_base : 22; /* RW */
- unsigned long rsvd_46_63: 18; /* */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s;
};
/* ========================================================================= */
/* UVH_RH_GAM_CONFIG_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL
+#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL
+
+#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+
+#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12
+#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
-#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
-#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
-#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
-#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
-#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12
-#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
+#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
union uvh_rh_gam_config_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_config_mmr_s {
- unsigned long m_skt : 6; /* RW */
- unsigned long n_skt : 4; /* RW */
- unsigned long rsvd_10_11: 2; /* */
- unsigned long mmiol_cfg : 1; /* RW */
- unsigned long rsvd_13_63: 51; /* */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_config_mmr_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } s;
+ struct uv1h_rh_gam_config_mmr_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_11:2;
+ unsigned long mmiol_cfg:1; /* RW */
+ unsigned long rsvd_13_63:51;
+ } s1;
+ struct uv2h_rh_gam_config_mmr_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } s2;
};
/* ========================================================================= */
/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
+
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
union uvh_rh_gam_gru_overlay_config_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_gru_overlay_config_mmr_s {
- unsigned long rsvd_0_27: 28; /* */
- unsigned long base : 18; /* RW */
- unsigned long rsvd_46_47: 2; /* */
- unsigned long gr4 : 1; /* RW */
- unsigned long rsvd_49_51: 3; /* */
- unsigned long n_gru : 4; /* RW */
- unsigned long rsvd_56_62: 7; /* */
- unsigned long enable : 1; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_27:28;
+ unsigned long base:18; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s;
+ struct uv1h_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_27:28;
+ unsigned long base:18; /* RW */
+ unsigned long rsvd_46_47:2;
+ unsigned long gr4:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s1;
+ struct uv2h_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_27:28;
+ unsigned long base:18; /* RW */
+ unsigned long rsvd_46_51:6;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s2;
};
/* ========================================================================= */
/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
-
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
+
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
union uvh_rh_gam_mmioh_overlay_config_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_mmioh_overlay_config_mmr_s {
- unsigned long rsvd_0_29: 30; /* */
- unsigned long base : 16; /* RW */
- unsigned long m_io : 6; /* RW */
- unsigned long n_io : 4; /* RW */
- unsigned long rsvd_56_62: 7; /* */
- unsigned long enable : 1; /* RW */
- } s;
+ unsigned long v;
+ struct uv1h_rh_gam_mmioh_overlay_config_mmr_s {
+ unsigned long rsvd_0_29:30;
+ unsigned long base:16; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s1;
+ struct uv2h_rh_gam_mmioh_overlay_config_mmr_s {
+ unsigned long rsvd_0_26:27;
+ unsigned long base:19; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s2;
};
/* ========================================================================= */
/* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
union uvh_rh_gam_mmr_overlay_config_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_mmr_overlay_config_mmr_s {
- unsigned long rsvd_0_25: 26; /* */
- unsigned long base : 20; /* RW */
- unsigned long dual_hub : 1; /* RW */
- unsigned long rsvd_47_62: 16; /* */
- unsigned long enable : 1; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s;
+ struct uv1h_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long dual_hub:1; /* RW */
+ unsigned long rsvd_47_62:16;
+ unsigned long enable:1; /* RW */
+ } s1;
+ struct uv2h_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s2;
};
/* ========================================================================= */
/* UVH_RTC */
/* ========================================================================= */
-#define UVH_RTC 0x340000UL
+#define UVH_RTC 0x340000UL
-#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
-#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
+#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
+#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
union uvh_rtc_u {
- unsigned long v;
- struct uvh_rtc_s {
- unsigned long real_time_clock : 56; /* RW */
- unsigned long rsvd_56_63 : 8; /* */
- } s;
+ unsigned long v;
+ struct uvh_rtc_s {
+ unsigned long real_time_clock:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s;
};
/* ========================================================================= */
/* UVH_RTC1_INT_CONFIG */
/* ========================================================================= */
-#define UVH_RTC1_INT_CONFIG 0x615c0UL
-
-#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC1_INT_CONFIG_P_SHFT 13
-#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC1_INT_CONFIG_T_SHFT 15
-#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC1_INT_CONFIG_M_SHFT 16
-#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_RTC1_INT_CONFIG 0x615c0UL
+
+#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
+#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
+#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
+#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
+#define UVH_RTC1_INT_CONFIG_P_SHFT 13
+#define UVH_RTC1_INT_CONFIG_T_SHFT 15
+#define UVH_RTC1_INT_CONFIG_M_SHFT 16
+#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
+#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
union uvh_rtc1_int_config_u {
- unsigned long v;
- struct uvh_rtc1_int_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_rtc1_int_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_SCRATCH5 */
/* ========================================================================= */
-#define UVH_SCRATCH5 0x2d0200UL
-#define UVH_SCRATCH5_32 0x00778
+#define UVH_SCRATCH5 0x2d0200UL
+#define UVH_SCRATCH5_32 0x778
+
+#define UVH_SCRATCH5_SCRATCH5_SHFT 0
+#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
-#define UVH_SCRATCH5_SCRATCH5_SHFT 0
-#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
union uvh_scratch5_u {
- unsigned long v;
- struct uvh_scratch5_s {
- unsigned long scratch5 : 64; /* RW, W1CS */
- } s;
+ unsigned long v;
+ struct uvh_scratch5_s {
+ unsigned long scratch5:64; /* RW, W1CS */
+ } s;
+};
+
+/* ========================================================================= */
+/* UV2H_EVENT_OCCURRED2 */
+/* ========================================================================= */
+#define UV2H_EVENT_OCCURRED2 0x70100UL
+#define UV2H_EVENT_OCCURRED2_32 0xb68
+
+#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0
+#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1
+#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2
+#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3
+#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4
+#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5
+#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6
+#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7
+#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8
+#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9
+#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10
+#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11
+#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12
+#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13
+#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14
+#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15
+#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16
+#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17
+#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18
+#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19
+#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20
+#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21
+#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22
+#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23
+#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24
+#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25
+#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26
+#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27
+#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28
+#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29
+#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30
+#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31
+#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL
+#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL
+#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL
+#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL
+#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL
+#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL
+#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL
+#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL
+#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL
+#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL
+#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL
+#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL
+#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL
+#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL
+#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL
+#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL
+#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL
+#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL
+#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL
+#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL
+#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL
+#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL
+#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL
+#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL
+#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL
+
+union uv2h_event_occurred2_u {
+ unsigned long v;
+ struct uv2h_event_occurred2_s {
+ unsigned long rtc_0:1; /* RW */
+ unsigned long rtc_1:1; /* RW */
+ unsigned long rtc_2:1; /* RW */
+ unsigned long rtc_3:1; /* RW */
+ unsigned long rtc_4:1; /* RW */
+ unsigned long rtc_5:1; /* RW */
+ unsigned long rtc_6:1; /* RW */
+ unsigned long rtc_7:1; /* RW */
+ unsigned long rtc_8:1; /* RW */
+ unsigned long rtc_9:1; /* RW */
+ unsigned long rtc_10:1; /* RW */
+ unsigned long rtc_11:1; /* RW */
+ unsigned long rtc_12:1; /* RW */
+ unsigned long rtc_13:1; /* RW */
+ unsigned long rtc_14:1; /* RW */
+ unsigned long rtc_15:1; /* RW */
+ unsigned long rtc_16:1; /* RW */
+ unsigned long rtc_17:1; /* RW */
+ unsigned long rtc_18:1; /* RW */
+ unsigned long rtc_19:1; /* RW */
+ unsigned long rtc_20:1; /* RW */
+ unsigned long rtc_21:1; /* RW */
+ unsigned long rtc_22:1; /* RW */
+ unsigned long rtc_23:1; /* RW */
+ unsigned long rtc_24:1; /* RW */
+ unsigned long rtc_25:1; /* RW */
+ unsigned long rtc_26:1; /* RW */
+ unsigned long rtc_27:1; /* RW */
+ unsigned long rtc_28:1; /* RW */
+ unsigned long rtc_29:1; /* RW */
+ unsigned long rtc_30:1; /* RW */
+ unsigned long rtc_31:1; /* RW */
+ unsigned long rsvd_32_63:32;
+ } s1;
};
-#endif /* __ASM_UV_MMRS_X86_H__ */
+/* ========================================================================= */
+/* UV2H_EVENT_OCCURRED2_ALIAS */
+/* ========================================================================= */
+#define UV2H_EVENT_OCCURRED2_ALIAS 0x70108UL
+#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70
+
+/* ========================================================================= */
+/* UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 */
+/* ========================================================================= */
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0
+
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL
+
+union uv2h_lb_bau_sb_activation_status_2_u {
+ unsigned long v;
+ struct uv2h_lb_bau_sb_activation_status_2_s {
+ unsigned long aux_error:64; /* RW */
+ } s1;
+};
+
+/* ========================================================================= */
+/* UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK */
+/* ========================================================================= */
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x9f0
+
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
+
+union uv1h_lb_target_physical_apic_id_mask_u {
+ unsigned long v;
+ struct uv1h_lb_target_physical_apic_id_mask_s {
+ unsigned long bit_enables:32; /* RW */
+ unsigned long rsvd_32_63:32;
+ } s1;
+};
+
+
+#endif /* _ASM_X86_UV_UV_MMRS_H */
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 9064052b73d..bb0522850b7 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -1,20 +1,6 @@
#ifndef _ASM_X86_VDSO_H
#define _ASM_X86_VDSO_H
-#ifdef CONFIG_X86_64
-extern const char VDSO64_PRELINK[];
-
-/*
- * Given a pointer to the vDSO image, find the pointer to VDSO64_name
- * as that symbol is defined in the vDSO sources or linker script.
- */
-#define VDSO64_SYMBOL(base, name) \
-({ \
- extern const char VDSO64_##name[]; \
- (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
-})
-#endif
-
#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
extern const char VDSO32_PRELINK[];
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 3d61e204826..815285bcace 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -11,10 +11,9 @@ struct vsyscall_gtod_data {
time_t wall_time_sec;
u32 wall_time_nsec;
- int sysctl_enabled;
struct timezone sys_tz;
struct { /* extract of a clocksource struct */
- cycle_t (*vread)(void);
+ int vclock_mode;
cycle_t cycle_last;
cycle_t mask;
u32 mult;
@@ -23,8 +22,6 @@ struct vsyscall_gtod_data {
struct timespec wall_to_monotonic;
struct timespec wall_time_coarse;
};
-extern struct vsyscall_gtod_data __vsyscall_gtod_data
-__section_vsyscall_gtod_data;
extern struct vsyscall_gtod_data vsyscall_gtod_data;
#endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d0983d255fb..60107072c28 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -16,27 +16,15 @@ enum vsyscall_num {
#ifdef __KERNEL__
#include <linux/seqlock.h>
-#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
-#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
-
-/* Definitions for CONFIG_GENERIC_TIME definitions */
-#define __section_vsyscall_gtod_data __attribute__ \
- ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
-#define __section_vsyscall_clock __attribute__ \
- ((unused, __section__ (".vsyscall_clock"),aligned(16)))
-#define __vsyscall_fn \
- __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
-
#define VGETCPU_RDTSCP 1
#define VGETCPU_LSL 2
-extern int __vgetcpu_mode;
-extern volatile unsigned long __jiffies;
-
/* kernel space (writeable) */
extern int vgetcpu_mode;
extern struct timezone sys_tz;
+#include <asm/vvar.h>
+
extern void map_vsyscall(void);
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
new file mode 100644
index 00000000000..de656ac2af4
--- /dev/null
+++ b/arch/x86/include/asm/vvar.h
@@ -0,0 +1,50 @@
+/*
+ * vvar.h: Shared vDSO/kernel variable declarations
+ * Copyright (c) 2011 Andy Lutomirski
+ * Subject to the GNU General Public License, version 2
+ *
+ * A handful of variables are accessible (read-only) from userspace
+ * code in the vsyscall page and the vdso. They are declared here.
+ * Some other file must define them with DEFINE_VVAR.
+ *
+ * In normal kernel code, they are used like any other variable.
+ * In user code, they are accessed through the VVAR macro.
+ *
+ * These variables live in a page of kernel data that has an extra RO
+ * mapping for userspace. Each variable needs a unique offset within
+ * that page; specify that offset with the DECLARE_VVAR macro. (If
+ * you mess up, the linker will catch it.)
+ */
+
+/* Base address of vvars. This is not ABI. */
+#define VVAR_ADDRESS (-10*1024*1024 - 4096)
+
+#if defined(__VVAR_KERNEL_LDS)
+
+/* The kernel linker script defines its own magic to put vvars in the
+ * right place.
+ */
+#define DECLARE_VVAR(offset, type, name) \
+ EMIT_VVAR(name, offset)
+
+#else
+
+#define DECLARE_VVAR(offset, type, name) \
+ static type const * const vvaraddr_ ## name = \
+ (void *)(VVAR_ADDRESS + (offset));
+
+#define DEFINE_VVAR(type, name) \
+ type name \
+ __attribute__((section(".vvar_" #name), aligned(16)))
+
+#define VVAR(name) (*vvaraddr_ ## name)
+
+#endif
+
+/* DECLARE_VVAR(offset, type, name) */
+
+DECLARE_VVAR(0, volatile unsigned long, jiffies)
+DECLARE_VVAR(16, int, vgetcpu_mode)
+DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+
+#undef DECLARE_VVAR
diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
new file mode 100644
index 00000000000..6bf5b8e478c
--- /dev/null
+++ b/arch/x86/include/asm/x2apic.h
@@ -0,0 +1,62 @@
+/*
+ * Common bits for X2APIC cluster/physical modes.
+ */
+
+#ifndef _ASM_X86_X2APIC_H
+#define _ASM_X86_X2APIC_H
+
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <linux/cpumask.h>
+
+/*
+ * Need to use more than cpu 0, because we need more vectors
+ * when MSI-X are used.
+ */
+static const struct cpumask *x2apic_target_cpus(void)
+{
+ return cpu_online_mask;
+}
+
+static int x2apic_apic_id_registered(void)
+{
+ return 1;
+}
+
+/*
+ * For now each logical cpu is in its own vector allocation domain.
+ */
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
+static void
+__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+{
+ unsigned long cfg = __prepare_ICR(0, vector, dest);
+ native_x2apic_icr_write(cfg, apicid);
+}
+
+static unsigned int x2apic_get_apic_id(unsigned long id)
+{
+ return id;
+}
+
+static unsigned long x2apic_set_apic_id(unsigned int id)
+{
+ return id;
+}
+
+static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
+{
+ return initial_apicid >> index_msb;
+}
+
+static void x2apic_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+
+#endif /* _ASM_X86_X2APIC_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 8508bfe5229..d240ea95051 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -447,6 +447,13 @@ HYPERVISOR_hvm_op(int op, void *arg)
return _hypercall2(unsigned long, hvm_op, op, arg);
}
+static inline int
+HYPERVISOR_tmem_op(
+ struct tmem_op *op)
+{
+ return _hypercall1(int, tmem_op, op);
+}
+
static inline void
MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
{
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c61934fbf22..64a619d47d3 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -47,8 +47,9 @@ extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
extern unsigned long set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e);
-extern int m2p_add_override(unsigned long mfn, struct page *page);
-extern int m2p_remove_override(struct page *page);
+extern int m2p_add_override(unsigned long mfn, struct page *page,
+ bool clear_pte);
+extern int m2p_remove_override(struct page *page, bool clear_pte);
extern struct page *m2p_find_override(unsigned long mfn);
extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index aa862098916..968d57dd54c 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -14,10 +14,27 @@ static inline int pci_xen_hvm_init(void)
}
#endif
#if defined(CONFIG_XEN_DOM0)
-void __init xen_setup_pirqs(void);
+int __init pci_xen_initial_domain(void);
+int xen_find_device_domain_owner(struct pci_dev *dev);
+int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
+int xen_unregister_device_domain_owner(struct pci_dev *dev);
#else
-static inline void __init xen_setup_pirqs(void)
+static inline int __init pci_xen_initial_domain(void)
{
+ return -1;
+}
+static inline int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+ return -1;
+}
+static inline int xen_register_device_domain_owner(struct pci_dev *dev,
+ uint16_t domain)
+{
+ return -1;
+}
+static inline int xen_unregister_device_domain_owner(struct pci_dev *dev)
+{
+ return -1;
}
#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 7338ef2218b..04105574c8e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,7 +24,6 @@ endif
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
CFLAGS_hpet.o := $(nostackp)
-CFLAGS_tsc.o := $(nostackp)
CFLAGS_paravirt.o := $(nostackp)
GCOV_PROFILE_vsyscall_64.o := n
GCOV_PROFILE_hpet.o := n
@@ -36,10 +35,11 @@ obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
-obj-$(CONFIG_X86_32) += probe_roms_32.o
+obj-y += probe_roms.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
+obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
@@ -117,9 +117,8 @@ obj-$(CONFIG_OF) += devicetree.o
ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_AUDIT) += audit_64.o
- obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
+ obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
- obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 9a966c579af..4558f0d0822 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -970,7 +970,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
mp_irq.irqflag = (trigger << 2) | polarity;
mp_irq.srcbus = MP_ISA_BUS;
mp_irq.srcbusirq = bus_irq; /* IRQ */
- mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
+ mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
mp_irq.dstirq = pin; /* INTIN# */
mp_save_irq(&mp_irq);
@@ -1021,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void)
if (ioapic < 0)
continue;
pin = mp_find_ioapic_pin(ioapic, gsi);
- dstapic = mp_ioapics[ioapic].apicid;
+ dstapic = mpc_ioapic_id(ioapic);
for (idx = 0; idx < mp_irq_entries; idx++) {
struct mpc_intsrc *irq = mp_irqs + idx;
@@ -1082,7 +1082,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
mp_irq.srcbus = number;
mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
ioapic = mp_find_ioapic(gsi);
- mp_irq.dstapic = mp_ioapics[ioapic].apicid;
+ mp_irq.dstapic = mpc_ioapic_id(ioapic);
mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
mp_save_irq(&mp_irq);
@@ -1113,7 +1113,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapics[ioapic].apicid,
+ "%d-%d\n", mpc_ioapic_id(ioapic),
ioapic_pin);
return gsi;
}
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index ead21b66311..b4fd836e405 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -28,6 +28,8 @@ pmode_cr3: .long 0 /* Saved %cr3 */
pmode_cr4: .long 0 /* Saved %cr4 */
pmode_efer: .quad 0 /* Saved EFER */
pmode_gdt: .quad 0
+pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */
+pmode_behavior: .long 0 /* Wakeup behavior flags */
realmode_flags: .long 0
real_magic: .long 0
trampoline_segment: .word 0
@@ -91,6 +93,18 @@ wakeup_code:
/* Call the C code */
calll main
+ /* Restore MISC_ENABLE before entering protected mode, in case
+ BIOS decided to clear XD_DISABLE during S3. */
+ movl pmode_behavior, %eax
+ btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
+ jnc 1f
+
+ movl pmode_misc_en, %eax
+ movl pmode_misc_en + 4, %edx
+ movl $MSR_IA32_MISC_ENABLE, %ecx
+ wrmsr
+1:
+
/* Do any other stuff... */
#ifndef CONFIG_64BIT
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index e1828c07e79..97a29e1430e 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -21,6 +21,9 @@ struct wakeup_header {
u32 pmode_efer_low; /* Protected mode EFER */
u32 pmode_efer_high;
u64 pmode_gdt;
+ u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */
+ u32 pmode_misc_en_high;
+ u32 pmode_behavior; /* Wakeup routine behavior flags */
u32 realmode_flags;
u32 real_magic;
u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
@@ -39,4 +42,7 @@ extern struct wakeup_header wakeup_header;
#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
#define WAKEUP_END_SIGNATURE 0x65a22c82
+/* Wakeup behavior bits */
+#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0
+
#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ff93bc1b09c..103b6ab368d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -77,6 +77,12 @@ int acpi_suspend_lowlevel(void)
header->pmode_cr0 = read_cr0();
header->pmode_cr4 = read_cr4_safe();
+ header->pmode_behavior = 0;
+ if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
+ &header->pmode_misc_en_low,
+ &header->pmode_misc_en_high))
+ header->pmode_behavior |=
+ (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
header->realmode_flags = acpi_realmode_flags;
header->real_magic = 0x12345678;
@@ -112,11 +118,6 @@ static int __init acpi_sleep_setup(char *str)
#ifdef CONFIG_HIBERNATION
if (strncmp(str, "s4_nohwsig", 10) == 0)
acpi_no_s4_hw_signature();
- if (strncmp(str, "s4_nonvs", 8) == 0) {
- pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, "
- "please use acpi_sleep=nonvs instead");
- acpi_nvs_nosave();
- }
#endif
if (strncmp(str, "nonvs", 5) == 0)
acpi_nvs_nosave();
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 4a234677e21..c6382281624 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -14,7 +14,6 @@
#include <asm/pgtable.h>
#include <asm/mce.h>
#include <asm/nmi.h>
-#include <asm/vsyscall.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
@@ -67,17 +66,30 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt);
#define DPRINTK(fmt, args...) if (debug_alternative) \
printk(KERN_DEBUG fmt, args)
+/*
+ * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
+ * that correspond to that nop. Getting from one nop to the next, we
+ * add to the array the offset that is equal to the sum of all sizes of
+ * nops preceding the one we are after.
+ *
+ * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
+ * nice symmetry of sizes of the previous nops.
+ */
#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
-/* Use inline assembly to define this because the nops are defined
- as inline assembly strings in the include files and we cannot
- get them easily into strings. */
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: "
- GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
- GENERIC_NOP7 GENERIC_NOP8
- "\t.previous");
-extern const unsigned char intelnops[];
-static const unsigned char *const __initconst_or_module
-intel_nops[ASM_NOP_MAX+1] = {
+static const unsigned char intelnops[] =
+{
+ GENERIC_NOP1,
+ GENERIC_NOP2,
+ GENERIC_NOP3,
+ GENERIC_NOP4,
+ GENERIC_NOP5,
+ GENERIC_NOP6,
+ GENERIC_NOP7,
+ GENERIC_NOP8,
+ GENERIC_NOP5_ATOMIC
+};
+static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
+{
NULL,
intelnops,
intelnops + 1,
@@ -87,17 +99,25 @@ intel_nops[ASM_NOP_MAX+1] = {
intelnops + 1 + 2 + 3 + 4 + 5,
intelnops + 1 + 2 + 3 + 4 + 5 + 6,
intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
#ifdef K8_NOP1
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: "
- K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
- K8_NOP7 K8_NOP8
- "\t.previous");
-extern const unsigned char k8nops[];
-static const unsigned char *const __initconst_or_module
-k8_nops[ASM_NOP_MAX+1] = {
+static const unsigned char k8nops[] =
+{
+ K8_NOP1,
+ K8_NOP2,
+ K8_NOP3,
+ K8_NOP4,
+ K8_NOP5,
+ K8_NOP6,
+ K8_NOP7,
+ K8_NOP8,
+ K8_NOP5_ATOMIC
+};
+static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
+{
NULL,
k8nops,
k8nops + 1,
@@ -107,17 +127,25 @@ k8_nops[ASM_NOP_MAX+1] = {
k8nops + 1 + 2 + 3 + 4 + 5,
k8nops + 1 + 2 + 3 + 4 + 5 + 6,
k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: "
- K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
- K7_NOP7 K7_NOP8
- "\t.previous");
-extern const unsigned char k7nops[];
-static const unsigned char *const __initconst_or_module
-k7_nops[ASM_NOP_MAX+1] = {
+static const unsigned char k7nops[] =
+{
+ K7_NOP1,
+ K7_NOP2,
+ K7_NOP3,
+ K7_NOP4,
+ K7_NOP5,
+ K7_NOP6,
+ K7_NOP7,
+ K7_NOP8,
+ K7_NOP5_ATOMIC
+};
+static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
+{
NULL,
k7nops,
k7nops + 1,
@@ -127,17 +155,25 @@ k7_nops[ASM_NOP_MAX+1] = {
k7nops + 1 + 2 + 3 + 4 + 5,
k7nops + 1 + 2 + 3 + 4 + 5 + 6,
k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
#ifdef P6_NOP1
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: "
- P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6
- P6_NOP7 P6_NOP8
- "\t.previous");
-extern const unsigned char p6nops[];
-static const unsigned char *const __initconst_or_module
-p6_nops[ASM_NOP_MAX+1] = {
+static const unsigned char __initconst_or_module p6nops[] =
+{
+ P6_NOP1,
+ P6_NOP2,
+ P6_NOP3,
+ P6_NOP4,
+ P6_NOP5,
+ P6_NOP6,
+ P6_NOP7,
+ P6_NOP8,
+ P6_NOP5_ATOMIC
+};
+static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
+{
NULL,
p6nops,
p6nops + 1,
@@ -147,47 +183,65 @@ p6_nops[ASM_NOP_MAX+1] = {
p6nops + 1 + 2 + 3 + 4 + 5,
p6nops + 1 + 2 + 3 + 4 + 5 + 6,
p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
+/* Initialize these to a safe default */
#ifdef CONFIG_X86_64
+const unsigned char * const *ideal_nops = p6_nops;
+#else
+const unsigned char * const *ideal_nops = intel_nops;
+#endif
-extern char __vsyscall_0;
-static const unsigned char *const *__init_or_module find_nop_table(void)
+void __init arch_init_ideal_nops(void)
{
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_has(X86_FEATURE_NOPL))
- return p6_nops;
- else
- return k8_nops;
-}
-
-#else /* CONFIG_X86_64 */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ /*
+ * Due to a decoder implementation quirk, some
+ * specific Intel CPUs actually perform better with
+ * the "k8_nops" than with the SDM-recommended NOPs.
+ */
+ if (boot_cpu_data.x86 == 6 &&
+ boot_cpu_data.x86_model >= 0x0f &&
+ boot_cpu_data.x86_model != 0x1c &&
+ boot_cpu_data.x86_model != 0x26 &&
+ boot_cpu_data.x86_model != 0x27 &&
+ boot_cpu_data.x86_model < 0x30) {
+ ideal_nops = k8_nops;
+ } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
+ ideal_nops = p6_nops;
+ } else {
+#ifdef CONFIG_X86_64
+ ideal_nops = k8_nops;
+#else
+ ideal_nops = intel_nops;
+#endif
+ }
-static const unsigned char *const *__init_or_module find_nop_table(void)
-{
- if (boot_cpu_has(X86_FEATURE_K8))
- return k8_nops;
- else if (boot_cpu_has(X86_FEATURE_K7))
- return k7_nops;
- else if (boot_cpu_has(X86_FEATURE_NOPL))
- return p6_nops;
- else
- return intel_nops;
+ default:
+#ifdef CONFIG_X86_64
+ ideal_nops = k8_nops;
+#else
+ if (boot_cpu_has(X86_FEATURE_K8))
+ ideal_nops = k8_nops;
+ else if (boot_cpu_has(X86_FEATURE_K7))
+ ideal_nops = k7_nops;
+ else
+ ideal_nops = intel_nops;
+#endif
+ }
}
-#endif /* CONFIG_X86_64 */
-
/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
{
- const unsigned char *const *noptable = find_nop_table();
-
while (len > 0) {
unsigned int noplen = len;
if (noplen > ASM_NOP_MAX)
noplen = ASM_NOP_MAX;
- memcpy(insns, noptable[noplen], noplen);
+ memcpy(insns, ideal_nops[noplen], noplen);
insns += noplen;
len -= noplen;
}
@@ -207,29 +261,37 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
struct alt_instr *a;
+ u8 *instr, *replacement;
u8 insnbuf[MAX_PATCH_LEN];
DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+ /*
+ * The scan order should be from start to end. A later scanned
+ * alternative code can overwrite a previous scanned alternative code.
+ * Some kernel functions (e.g. memcpy, memset, etc) use this order to
+ * patch code.
+ *
+ * So be careful if you want to change the scan order to any other
+ * order.
+ */
for (a = start; a < end; a++) {
- u8 *instr = a->instr;
+ instr = (u8 *)&a->instr_offset + a->instr_offset;
+ replacement = (u8 *)&a->repl_offset + a->repl_offset;
BUG_ON(a->replacementlen > a->instrlen);
BUG_ON(a->instrlen > sizeof(insnbuf));
BUG_ON(a->cpuid >= NCAPINTS*32);
if (!boot_cpu_has(a->cpuid))
continue;
-#ifdef CONFIG_X86_64
- /* vsyscall code is not mapped yet. resolve it manually. */
- if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
- instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
- DPRINTK("%s: vsyscall fixup: %p => %p\n",
- __func__, a->instr, instr);
- }
-#endif
- memcpy(insnbuf, a->replacement, a->replacementlen);
+
+ memcpy(insnbuf, replacement, a->replacementlen);
+
+ /* 0xe8 is a relative jump; fix the offset. */
if (*insnbuf == 0xe8 && a->replacementlen == 5)
- *(s32 *)(insnbuf + 1) += a->replacement - a->instr;
+ *(s32 *)(insnbuf + 1) += replacement - instr;
+
add_nops(insnbuf + a->replacementlen,
a->instrlen - a->replacementlen);
+
text_poke_early(instr, insnbuf, a->instrlen);
}
}
@@ -678,29 +740,3 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
wrote_text = 0;
__stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
}
-
-#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
-
-#ifdef CONFIG_X86_64
-unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 };
-#else
-unsigned char ideal_nop5[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 };
-#endif
-
-void __init arch_init_ideal_nop5(void)
-{
- /*
- * There is no good nop for all x86 archs. This selection
- * algorithm should be unified with the one in find_nop_table(),
- * but this should be good enough for now.
- *
- * For cases other than the ones below, use the safe (as in
- * always functional) defaults above.
- */
-#ifdef CONFIG_X86_64
- /* Don't use these on 32 bits due to broken virtualizers */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- memcpy(ideal_nop5, p6_nops[5], 5);
-#endif
-}
-#endif
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/amd_gart_64.c
index b117efd24f7..b117efd24f7 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
deleted file mode 100644
index 57ca7778722..00000000000
--- a/arch/x86/kernel/amd_iommu.c
+++ /dev/null
@@ -1,2635 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/bitmap.h>
-#include <linux/slab.h>
-#include <linux/debugfs.h>
-#include <linux/scatterlist.h>
-#include <linux/dma-mapping.h>
-#include <linux/iommu-helper.h>
-#include <linux/iommu.h>
-#include <asm/proto.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-
-#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
-
-#define EXIT_LOOP_COUNT 10000000
-
-static DEFINE_RWLOCK(amd_iommu_devtable_lock);
-
-/* A list of preallocated protection domains */
-static LIST_HEAD(iommu_pd_list);
-static DEFINE_SPINLOCK(iommu_pd_list_lock);
-
-/*
- * Domain for untranslated devices - only allocated
- * if iommu=pt passed on kernel cmd line.
- */
-static struct protection_domain *pt_domain;
-
-static struct iommu_ops amd_iommu_ops;
-
-/*
- * general struct to manage commands send to an IOMMU
- */
-struct iommu_cmd {
- u32 data[4];
-};
-
-static void reset_iommu_command_buffer(struct amd_iommu *iommu);
-static void update_domain(struct protection_domain *domain);
-
-/****************************************************************************
- *
- * Helper functions
- *
- ****************************************************************************/
-
-static inline u16 get_device_id(struct device *dev)
-{
- struct pci_dev *pdev = to_pci_dev(dev);
-
- return calc_devid(pdev->bus->number, pdev->devfn);
-}
-
-static struct iommu_dev_data *get_dev_data(struct device *dev)
-{
- return dev->archdata.iommu;
-}
-
-/*
- * In this function the list of preallocated protection domains is traversed to
- * find the domain for a specific device
- */
-static struct dma_ops_domain *find_protection_domain(u16 devid)
-{
- struct dma_ops_domain *entry, *ret = NULL;
- unsigned long flags;
- u16 alias = amd_iommu_alias_table[devid];
-
- if (list_empty(&iommu_pd_list))
- return NULL;
-
- spin_lock_irqsave(&iommu_pd_list_lock, flags);
-
- list_for_each_entry(entry, &iommu_pd_list, list) {
- if (entry->target_dev == devid ||
- entry->target_dev == alias) {
- ret = entry;
- break;
- }
- }
-
- spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
- return ret;
-}
-
-/*
- * This function checks if the driver got a valid device from the caller to
- * avoid dereferencing invalid pointers.
- */
-static bool check_device(struct device *dev)
-{
- u16 devid;
-
- if (!dev || !dev->dma_mask)
- return false;
-
- /* No device or no PCI device */
- if (dev->bus != &pci_bus_type)
- return false;
-
- devid = get_device_id(dev);
-
- /* Out of our scope? */
- if (devid > amd_iommu_last_bdf)
- return false;
-
- if (amd_iommu_rlookup_table[devid] == NULL)
- return false;
-
- return true;
-}
-
-static int iommu_init_device(struct device *dev)
-{
- struct iommu_dev_data *dev_data;
- struct pci_dev *pdev;
- u16 devid, alias;
-
- if (dev->archdata.iommu)
- return 0;
-
- dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
- if (!dev_data)
- return -ENOMEM;
-
- dev_data->dev = dev;
-
- devid = get_device_id(dev);
- alias = amd_iommu_alias_table[devid];
- pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
- if (pdev)
- dev_data->alias = &pdev->dev;
-
- atomic_set(&dev_data->bind, 0);
-
- dev->archdata.iommu = dev_data;
-
-
- return 0;
-}
-
-static void iommu_uninit_device(struct device *dev)
-{
- kfree(dev->archdata.iommu);
-}
-
-void __init amd_iommu_uninit_devices(void)
-{
- struct pci_dev *pdev = NULL;
-
- for_each_pci_dev(pdev) {
-
- if (!check_device(&pdev->dev))
- continue;
-
- iommu_uninit_device(&pdev->dev);
- }
-}
-
-int __init amd_iommu_init_devices(void)
-{
- struct pci_dev *pdev = NULL;
- int ret = 0;
-
- for_each_pci_dev(pdev) {
-
- if (!check_device(&pdev->dev))
- continue;
-
- ret = iommu_init_device(&pdev->dev);
- if (ret)
- goto out_free;
- }
-
- return 0;
-
-out_free:
-
- amd_iommu_uninit_devices();
-
- return ret;
-}
-#ifdef CONFIG_AMD_IOMMU_STATS
-
-/*
- * Initialization code for statistics collection
- */
-
-DECLARE_STATS_COUNTER(compl_wait);
-DECLARE_STATS_COUNTER(cnt_map_single);
-DECLARE_STATS_COUNTER(cnt_unmap_single);
-DECLARE_STATS_COUNTER(cnt_map_sg);
-DECLARE_STATS_COUNTER(cnt_unmap_sg);
-DECLARE_STATS_COUNTER(cnt_alloc_coherent);
-DECLARE_STATS_COUNTER(cnt_free_coherent);
-DECLARE_STATS_COUNTER(cross_page);
-DECLARE_STATS_COUNTER(domain_flush_single);
-DECLARE_STATS_COUNTER(domain_flush_all);
-DECLARE_STATS_COUNTER(alloced_io_mem);
-DECLARE_STATS_COUNTER(total_map_requests);
-
-static struct dentry *stats_dir;
-static struct dentry *de_fflush;
-
-static void amd_iommu_stats_add(struct __iommu_counter *cnt)
-{
- if (stats_dir == NULL)
- return;
-
- cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
- &cnt->value);
-}
-
-static void amd_iommu_stats_init(void)
-{
- stats_dir = debugfs_create_dir("amd-iommu", NULL);
- if (stats_dir == NULL)
- return;
-
- de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
- (u32 *)&amd_iommu_unmap_flush);
-
- amd_iommu_stats_add(&compl_wait);
- amd_iommu_stats_add(&cnt_map_single);
- amd_iommu_stats_add(&cnt_unmap_single);
- amd_iommu_stats_add(&cnt_map_sg);
- amd_iommu_stats_add(&cnt_unmap_sg);
- amd_iommu_stats_add(&cnt_alloc_coherent);
- amd_iommu_stats_add(&cnt_free_coherent);
- amd_iommu_stats_add(&cross_page);
- amd_iommu_stats_add(&domain_flush_single);
- amd_iommu_stats_add(&domain_flush_all);
- amd_iommu_stats_add(&alloced_io_mem);
- amd_iommu_stats_add(&total_map_requests);
-}
-
-#endif
-
-/****************************************************************************
- *
- * Interrupt handling functions
- *
- ****************************************************************************/
-
-static void dump_dte_entry(u16 devid)
-{
- int i;
-
- for (i = 0; i < 8; ++i)
- pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
- amd_iommu_dev_table[devid].data[i]);
-}
-
-static void dump_command(unsigned long phys_addr)
-{
- struct iommu_cmd *cmd = phys_to_virt(phys_addr);
- int i;
-
- for (i = 0; i < 4; ++i)
- pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
-}
-
-static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
-{
- u32 *event = __evt;
- int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
- int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
- int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
- int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
- u64 address = (u64)(((u64)event[3]) << 32) | event[2];
-
- printk(KERN_ERR "AMD-Vi: Event logged [");
-
- switch (type) {
- case EVENT_TYPE_ILL_DEV:
- printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
- "address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address, flags);
- dump_dte_entry(devid);
- break;
- case EVENT_TYPE_IO_FAULT:
- printk("IO_PAGE_FAULT device=%02x:%02x.%x "
- "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- domid, address, flags);
- break;
- case EVENT_TYPE_DEV_TAB_ERR:
- printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
- "address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address, flags);
- break;
- case EVENT_TYPE_PAGE_TAB_ERR:
- printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
- "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- domid, address, flags);
- break;
- case EVENT_TYPE_ILL_CMD:
- printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
- iommu->reset_in_progress = true;
- reset_iommu_command_buffer(iommu);
- dump_command(address);
- break;
- case EVENT_TYPE_CMD_HARD_ERR:
- printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
- "flags=0x%04x]\n", address, flags);
- break;
- case EVENT_TYPE_IOTLB_INV_TO:
- printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
- "address=0x%016llx]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address);
- break;
- case EVENT_TYPE_INV_DEV_REQ:
- printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
- "address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address, flags);
- break;
- default:
- printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
- }
-}
-
-static void iommu_poll_events(struct amd_iommu *iommu)
-{
- u32 head, tail;
- unsigned long flags;
-
- spin_lock_irqsave(&iommu->lock, flags);
-
- head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
- tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
- while (head != tail) {
- iommu_print_event(iommu, iommu->evt_buf + head);
- head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
- }
-
- writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
-
- spin_unlock_irqrestore(&iommu->lock, flags);
-}
-
-irqreturn_t amd_iommu_int_handler(int irq, void *data)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu)
- iommu_poll_events(iommu);
-
- return IRQ_HANDLED;
-}
-
-/****************************************************************************
- *
- * IOMMU command queuing functions
- *
- ****************************************************************************/
-
-/*
- * Writes the command to the IOMMUs command buffer and informs the
- * hardware about the new command. Must be called with iommu->lock held.
- */
-static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
-{
- u32 tail, head;
- u8 *target;
-
- WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
- tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
- target = iommu->cmd_buf + tail;
- memcpy_toio(target, cmd, sizeof(*cmd));
- tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
- head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
- if (tail == head)
- return -ENOMEM;
- writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
- return 0;
-}
-
-/*
- * General queuing function for commands. Takes iommu->lock and calls
- * __iommu_queue_command().
- */
-static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
-{
- unsigned long flags;
- int ret;
-
- spin_lock_irqsave(&iommu->lock, flags);
- ret = __iommu_queue_command(iommu, cmd);
- if (!ret)
- iommu->need_sync = true;
- spin_unlock_irqrestore(&iommu->lock, flags);
-
- return ret;
-}
-
-/*
- * This function waits until an IOMMU has completed a completion
- * wait command
- */
-static void __iommu_wait_for_completion(struct amd_iommu *iommu)
-{
- int ready = 0;
- unsigned status = 0;
- unsigned long i = 0;
-
- INC_STATS_COUNTER(compl_wait);
-
- while (!ready && (i < EXIT_LOOP_COUNT)) {
- ++i;
- /* wait for the bit to become one */
- status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
- ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
- }
-
- /* set bit back to zero */
- status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
- writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
-
- if (unlikely(i == EXIT_LOOP_COUNT))
- iommu->reset_in_progress = true;
-}
-
-/*
- * This function queues a completion wait command into the command
- * buffer of an IOMMU
- */
-static int __iommu_completion_wait(struct amd_iommu *iommu)
-{
- struct iommu_cmd cmd;
-
- memset(&cmd, 0, sizeof(cmd));
- cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
- CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
-
- return __iommu_queue_command(iommu, &cmd);
-}
-
-/*
- * This function is called whenever we need to ensure that the IOMMU has
- * completed execution of all commands we sent. It sends a
- * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
- * us about that by writing a value to a physical address we pass with
- * the command.
- */
-static int iommu_completion_wait(struct amd_iommu *iommu)
-{
- int ret = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&iommu->lock, flags);
-
- if (!iommu->need_sync)
- goto out;
-
- ret = __iommu_completion_wait(iommu);
-
- iommu->need_sync = false;
-
- if (ret)
- goto out;
-
- __iommu_wait_for_completion(iommu);
-
-out:
- spin_unlock_irqrestore(&iommu->lock, flags);
-
- if (iommu->reset_in_progress)
- reset_iommu_command_buffer(iommu);
-
- return 0;
-}
-
-static void iommu_flush_complete(struct protection_domain *domain)
-{
- int i;
-
- for (i = 0; i < amd_iommus_present; ++i) {
- if (!domain->dev_iommu[i])
- continue;
-
- /*
- * Devices of this domain are behind this IOMMU
- * We need to wait for completion of all commands.
- */
- iommu_completion_wait(amd_iommus[i]);
- }
-}
-
-/*
- * Command send function for invalidating a device table entry
- */
-static int iommu_flush_device(struct device *dev)
-{
- struct amd_iommu *iommu;
- struct iommu_cmd cmd;
- u16 devid;
-
- devid = get_device_id(dev);
- iommu = amd_iommu_rlookup_table[devid];
-
- /* Build command */
- memset(&cmd, 0, sizeof(cmd));
- CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
- cmd.data[0] = devid;
-
- return iommu_queue_command(iommu, &cmd);
-}
-
-static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
- u16 domid, int pde, int s)
-{
- memset(cmd, 0, sizeof(*cmd));
- address &= PAGE_MASK;
- CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
- cmd->data[1] |= domid;
- cmd->data[2] = lower_32_bits(address);
- cmd->data[3] = upper_32_bits(address);
- if (s) /* size bit - we flush more than one 4kb page */
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
- if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
-}
-
-/*
- * Generic command send function for invalidaing TLB entries
- */
-static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
- u64 address, u16 domid, int pde, int s)
-{
- struct iommu_cmd cmd;
- int ret;
-
- __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s);
-
- ret = iommu_queue_command(iommu, &cmd);
-
- return ret;
-}
-
-/*
- * TLB invalidation function which is called from the mapping functions.
- * It invalidates a single PTE if the range to flush is within a single
- * page. Otherwise it flushes the whole TLB of the IOMMU.
- */
-static void __iommu_flush_pages(struct protection_domain *domain,
- u64 address, size_t size, int pde)
-{
- int s = 0, i;
- unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
-
- address &= PAGE_MASK;
-
- if (pages > 1) {
- /*
- * If we have to flush more than one page, flush all
- * TLB entries for this domain
- */
- address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
- s = 1;
- }
-
-
- for (i = 0; i < amd_iommus_present; ++i) {
- if (!domain->dev_iommu[i])
- continue;
-
- /*
- * Devices of this domain are behind this IOMMU
- * We need a TLB flush
- */
- iommu_queue_inv_iommu_pages(amd_iommus[i], address,
- domain->id, pde, s);
- }
-
- return;
-}
-
-static void iommu_flush_pages(struct protection_domain *domain,
- u64 address, size_t size)
-{
- __iommu_flush_pages(domain, address, size, 0);
-}
-
-/* Flush the whole IO/TLB for a given protection domain */
-static void iommu_flush_tlb(struct protection_domain *domain)
-{
- __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
-}
-
-/* Flush the whole IO/TLB for a given protection domain - including PDE */
-static void iommu_flush_tlb_pde(struct protection_domain *domain)
-{
- __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
-}
-
-
-/*
- * This function flushes the DTEs for all devices in domain
- */
-static void iommu_flush_domain_devices(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
- unsigned long flags;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- list_for_each_entry(dev_data, &domain->dev_list, list)
- iommu_flush_device(dev_data->dev);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-static void iommu_flush_all_domain_devices(void)
-{
- struct protection_domain *domain;
- unsigned long flags;
-
- spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-
- list_for_each_entry(domain, &amd_iommu_pd_list, list) {
- iommu_flush_domain_devices(domain);
- iommu_flush_complete(domain);
- }
-
- spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-void amd_iommu_flush_all_devices(void)
-{
- iommu_flush_all_domain_devices();
-}
-
-/*
- * This function uses heavy locking and may disable irqs for some time. But
- * this is no issue because it is only called during resume.
- */
-void amd_iommu_flush_all_domains(void)
-{
- struct protection_domain *domain;
- unsigned long flags;
-
- spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-
- list_for_each_entry(domain, &amd_iommu_pd_list, list) {
- spin_lock(&domain->lock);
- iommu_flush_tlb_pde(domain);
- iommu_flush_complete(domain);
- spin_unlock(&domain->lock);
- }
-
- spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-static void reset_iommu_command_buffer(struct amd_iommu *iommu)
-{
- pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
-
- if (iommu->reset_in_progress)
- panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
-
- amd_iommu_reset_cmd_buffer(iommu);
- amd_iommu_flush_all_devices();
- amd_iommu_flush_all_domains();
-
- iommu->reset_in_progress = false;
-}
-
-/****************************************************************************
- *
- * The functions below are used the create the page table mappings for
- * unity mapped regions.
- *
- ****************************************************************************/
-
-/*
- * This function is used to add another level to an IO page table. Adding
- * another level increases the size of the address space by 9 bits to a size up
- * to 64 bits.
- */
-static bool increase_address_space(struct protection_domain *domain,
- gfp_t gfp)
-{
- u64 *pte;
-
- if (domain->mode == PAGE_MODE_6_LEVEL)
- /* address space already 64 bit large */
- return false;
-
- pte = (void *)get_zeroed_page(gfp);
- if (!pte)
- return false;
-
- *pte = PM_LEVEL_PDE(domain->mode,
- virt_to_phys(domain->pt_root));
- domain->pt_root = pte;
- domain->mode += 1;
- domain->updated = true;
-
- return true;
-}
-
-static u64 *alloc_pte(struct protection_domain *domain,
- unsigned long address,
- unsigned long page_size,
- u64 **pte_page,
- gfp_t gfp)
-{
- int level, end_lvl;
- u64 *pte, *page;
-
- BUG_ON(!is_power_of_2(page_size));
-
- while (address > PM_LEVEL_SIZE(domain->mode))
- increase_address_space(domain, gfp);
-
- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
- address = PAGE_SIZE_ALIGN(address, page_size);
- end_lvl = PAGE_SIZE_LEVEL(page_size);
-
- while (level > end_lvl) {
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(gfp);
- if (!page)
- return NULL;
- *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
- }
-
- /* No level skipping support yet */
- if (PM_PTE_LEVEL(*pte) != level)
- return NULL;
-
- level -= 1;
-
- pte = IOMMU_PTE_PAGE(*pte);
-
- if (pte_page && level == end_lvl)
- *pte_page = pte;
-
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- }
-
- return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address. If
- * there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
-{
- int level;
- u64 *pte;
-
- if (address > PM_LEVEL_SIZE(domain->mode))
- return NULL;
-
- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
-
- while (level > 0) {
-
- /* Not Present */
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- /* Large PTE */
- if (PM_PTE_LEVEL(*pte) == 0x07) {
- unsigned long pte_mask, __pte;
-
- /*
- * If we have a series of large PTEs, make
- * sure to return a pointer to the first one.
- */
- pte_mask = PTE_PAGE_SIZE(*pte);
- pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
- __pte = ((unsigned long)pte) & pte_mask;
-
- return (u64 *)__pte;
- }
-
- /* No level skipping support yet */
- if (PM_PTE_LEVEL(*pte) != level)
- return NULL;
-
- level -= 1;
-
- /* Walk to the next level */
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- }
-
- return pte;
-}
-
-/*
- * Generic mapping functions. It maps a physical address into a DMA
- * address space. It allocates the page table pages if necessary.
- * In the future it can be extended to a generic mapping function
- * supporting all features of AMD IOMMU page tables like level skipping
- * and full 64 bit address spaces.
- */
-static int iommu_map_page(struct protection_domain *dom,
- unsigned long bus_addr,
- unsigned long phys_addr,
- int prot,
- unsigned long page_size)
-{
- u64 __pte, *pte;
- int i, count;
-
- if (!(prot & IOMMU_PROT_MASK))
- return -EINVAL;
-
- bus_addr = PAGE_ALIGN(bus_addr);
- phys_addr = PAGE_ALIGN(phys_addr);
- count = PAGE_SIZE_PTE_COUNT(page_size);
- pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
-
- for (i = 0; i < count; ++i)
- if (IOMMU_PTE_PRESENT(pte[i]))
- return -EBUSY;
-
- if (page_size > PAGE_SIZE) {
- __pte = PAGE_SIZE_PTE(phys_addr, page_size);
- __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
- } else
- __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
-
- if (prot & IOMMU_PROT_IR)
- __pte |= IOMMU_PTE_IR;
- if (prot & IOMMU_PROT_IW)
- __pte |= IOMMU_PTE_IW;
-
- for (i = 0; i < count; ++i)
- pte[i] = __pte;
-
- update_domain(dom);
-
- return 0;
-}
-
-static unsigned long iommu_unmap_page(struct protection_domain *dom,
- unsigned long bus_addr,
- unsigned long page_size)
-{
- unsigned long long unmap_size, unmapped;
- u64 *pte;
-
- BUG_ON(!is_power_of_2(page_size));
-
- unmapped = 0;
-
- while (unmapped < page_size) {
-
- pte = fetch_pte(dom, bus_addr);
-
- if (!pte) {
- /*
- * No PTE for this address
- * move forward in 4kb steps
- */
- unmap_size = PAGE_SIZE;
- } else if (PM_PTE_LEVEL(*pte) == 0) {
- /* 4kb PTE found for this address */
- unmap_size = PAGE_SIZE;
- *pte = 0ULL;
- } else {
- int count, i;
-
- /* Large PTE found which maps this address */
- unmap_size = PTE_PAGE_SIZE(*pte);
- count = PAGE_SIZE_PTE_COUNT(unmap_size);
- for (i = 0; i < count; i++)
- pte[i] = 0ULL;
- }
-
- bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size;
- unmapped += unmap_size;
- }
-
- BUG_ON(!is_power_of_2(unmapped));
-
- return unmapped;
-}
-
-/*
- * This function checks if a specific unity mapping entry is needed for
- * this specific IOMMU.
- */
-static int iommu_for_unity_map(struct amd_iommu *iommu,
- struct unity_map_entry *entry)
-{
- u16 bdf, i;
-
- for (i = entry->devid_start; i <= entry->devid_end; ++i) {
- bdf = amd_iommu_alias_table[i];
- if (amd_iommu_rlookup_table[bdf] == iommu)
- return 1;
- }
-
- return 0;
-}
-
-/*
- * This function actually applies the mapping to the page table of the
- * dma_ops domain.
- */
-static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
- struct unity_map_entry *e)
-{
- u64 addr;
- int ret;
-
- for (addr = e->address_start; addr < e->address_end;
- addr += PAGE_SIZE) {
- ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
- PAGE_SIZE);
- if (ret)
- return ret;
- /*
- * if unity mapping is in aperture range mark the page
- * as allocated in the aperture
- */
- if (addr < dma_dom->aperture_size)
- __set_bit(addr >> PAGE_SHIFT,
- dma_dom->aperture[0]->bitmap);
- }
-
- return 0;
-}
-
-/*
- * Init the unity mappings for a specific IOMMU in the system
- *
- * Basically iterates over all unity mapping entries and applies them to
- * the default domain DMA of that IOMMU if necessary.
- */
-static int iommu_init_unity_mappings(struct amd_iommu *iommu)
-{
- struct unity_map_entry *entry;
- int ret;
-
- list_for_each_entry(entry, &amd_iommu_unity_map, list) {
- if (!iommu_for_unity_map(iommu, entry))
- continue;
- ret = dma_ops_unity_map(iommu->default_dom, entry);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/*
- * Inits the unity mappings required for a specific device
- */
-static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
- u16 devid)
-{
- struct unity_map_entry *e;
- int ret;
-
- list_for_each_entry(e, &amd_iommu_unity_map, list) {
- if (!(devid >= e->devid_start && devid <= e->devid_end))
- continue;
- ret = dma_ops_unity_map(dma_dom, e);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the address allocator for the dma_ops
- * interface functions. They work like the allocators in the other IOMMU
- * drivers. Its basically a bitmap which marks the allocated pages in
- * the aperture. Maybe it could be enhanced in the future to a more
- * efficient allocator.
- *
- ****************************************************************************/
-
-/*
- * The address allocator core functions.
- *
- * called with domain->lock held
- */
-
-/*
- * Used to reserve address ranges in the aperture (e.g. for exclusion
- * ranges.
- */
-static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
- unsigned long start_page,
- unsigned int pages)
-{
- unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
-
- if (start_page + pages > last_page)
- pages = last_page - start_page;
-
- for (i = start_page; i < start_page + pages; ++i) {
- int index = i / APERTURE_RANGE_PAGES;
- int page = i % APERTURE_RANGE_PAGES;
- __set_bit(page, dom->aperture[index]->bitmap);
- }
-}
-
-/*
- * This function is used to add a new aperture range to an existing
- * aperture in case of dma_ops domain allocation or address allocation
- * failure.
- */
-static int alloc_new_range(struct dma_ops_domain *dma_dom,
- bool populate, gfp_t gfp)
-{
- int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
- struct amd_iommu *iommu;
- unsigned long i;
-
-#ifdef CONFIG_IOMMU_STRESS
- populate = false;
-#endif
-
- if (index >= APERTURE_MAX_RANGES)
- return -ENOMEM;
-
- dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
- if (!dma_dom->aperture[index])
- return -ENOMEM;
-
- dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
- if (!dma_dom->aperture[index]->bitmap)
- goto out_free;
-
- dma_dom->aperture[index]->offset = dma_dom->aperture_size;
-
- if (populate) {
- unsigned long address = dma_dom->aperture_size;
- int i, num_ptes = APERTURE_RANGE_PAGES / 512;
- u64 *pte, *pte_page;
-
- for (i = 0; i < num_ptes; ++i) {
- pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
- &pte_page, gfp);
- if (!pte)
- goto out_free;
-
- dma_dom->aperture[index]->pte_pages[i] = pte_page;
-
- address += APERTURE_RANGE_SIZE / 64;
- }
- }
-
- dma_dom->aperture_size += APERTURE_RANGE_SIZE;
-
- /* Initialize the exclusion range if necessary */
- for_each_iommu(iommu) {
- if (iommu->exclusion_start &&
- iommu->exclusion_start >= dma_dom->aperture[index]->offset
- && iommu->exclusion_start < dma_dom->aperture_size) {
- unsigned long startpage;
- int pages = iommu_num_pages(iommu->exclusion_start,
- iommu->exclusion_length,
- PAGE_SIZE);
- startpage = iommu->exclusion_start >> PAGE_SHIFT;
- dma_ops_reserve_addresses(dma_dom, startpage, pages);
- }
- }
-
- /*
- * Check for areas already mapped as present in the new aperture
- * range and mark those pages as reserved in the allocator. Such
- * mappings may already exist as a result of requested unity
- * mappings for devices.
- */
- for (i = dma_dom->aperture[index]->offset;
- i < dma_dom->aperture_size;
- i += PAGE_SIZE) {
- u64 *pte = fetch_pte(&dma_dom->domain, i);
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- continue;
-
- dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
- }
-
- update_domain(&dma_dom->domain);
-
- return 0;
-
-out_free:
- update_domain(&dma_dom->domain);
-
- free_page((unsigned long)dma_dom->aperture[index]->bitmap);
-
- kfree(dma_dom->aperture[index]);
- dma_dom->aperture[index] = NULL;
-
- return -ENOMEM;
-}
-
-static unsigned long dma_ops_area_alloc(struct device *dev,
- struct dma_ops_domain *dom,
- unsigned int pages,
- unsigned long align_mask,
- u64 dma_mask,
- unsigned long start)
-{
- unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
- int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
- int i = start >> APERTURE_RANGE_SHIFT;
- unsigned long boundary_size;
- unsigned long address = -1;
- unsigned long limit;
-
- next_bit >>= PAGE_SHIFT;
-
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
-
- for (;i < max_index; ++i) {
- unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
-
- if (dom->aperture[i]->offset >= dma_mask)
- break;
-
- limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
- dma_mask >> PAGE_SHIFT);
-
- address = iommu_area_alloc(dom->aperture[i]->bitmap,
- limit, next_bit, pages, 0,
- boundary_size, align_mask);
- if (address != -1) {
- address = dom->aperture[i]->offset +
- (address << PAGE_SHIFT);
- dom->next_address = address + (pages << PAGE_SHIFT);
- break;
- }
-
- next_bit = 0;
- }
-
- return address;
-}
-
-static unsigned long dma_ops_alloc_addresses(struct device *dev,
- struct dma_ops_domain *dom,
- unsigned int pages,
- unsigned long align_mask,
- u64 dma_mask)
-{
- unsigned long address;
-
-#ifdef CONFIG_IOMMU_STRESS
- dom->next_address = 0;
- dom->need_flush = true;
-#endif
-
- address = dma_ops_area_alloc(dev, dom, pages, align_mask,
- dma_mask, dom->next_address);
-
- if (address == -1) {
- dom->next_address = 0;
- address = dma_ops_area_alloc(dev, dom, pages, align_mask,
- dma_mask, 0);
- dom->need_flush = true;
- }
-
- if (unlikely(address == -1))
- address = DMA_ERROR_CODE;
-
- WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
-
- return address;
-}
-
-/*
- * The address free function.
- *
- * called with domain->lock held
- */
-static void dma_ops_free_addresses(struct dma_ops_domain *dom,
- unsigned long address,
- unsigned int pages)
-{
- unsigned i = address >> APERTURE_RANGE_SHIFT;
- struct aperture_range *range = dom->aperture[i];
-
- BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
-
-#ifdef CONFIG_IOMMU_STRESS
- if (i < 4)
- return;
-#endif
-
- if (address >= dom->next_address)
- dom->need_flush = true;
-
- address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
-
- bitmap_clear(range->bitmap, address, pages);
-
-}
-
-/****************************************************************************
- *
- * The next functions belong to the domain allocation. A domain is
- * allocated for every IOMMU as the default domain. If device isolation
- * is enabled, every device get its own domain. The most important thing
- * about domains is the page table mapping the DMA address space they
- * contain.
- *
- ****************************************************************************/
-
-/*
- * This function adds a protection domain to the global protection domain list
- */
-static void add_domain_to_list(struct protection_domain *domain)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&amd_iommu_pd_lock, flags);
- list_add(&domain->list, &amd_iommu_pd_list);
- spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-/*
- * This function removes a protection domain to the global
- * protection domain list
- */
-static void del_domain_from_list(struct protection_domain *domain)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&amd_iommu_pd_lock, flags);
- list_del(&domain->list);
- spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-static u16 domain_id_alloc(void)
-{
- unsigned long flags;
- int id;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
- BUG_ON(id == 0);
- if (id > 0 && id < MAX_DOMAIN_ID)
- __set_bit(id, amd_iommu_pd_alloc_bitmap);
- else
- id = 0;
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- return id;
-}
-
-static void domain_id_free(int id)
-{
- unsigned long flags;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- if (id > 0 && id < MAX_DOMAIN_ID)
- __clear_bit(id, amd_iommu_pd_alloc_bitmap);
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-static void free_pagetable(struct protection_domain *domain)
-{
- int i, j;
- u64 *p1, *p2, *p3;
-
- p1 = domain->pt_root;
-
- if (!p1)
- return;
-
- for (i = 0; i < 512; ++i) {
- if (!IOMMU_PTE_PRESENT(p1[i]))
- continue;
-
- p2 = IOMMU_PTE_PAGE(p1[i]);
- for (j = 0; j < 512; ++j) {
- if (!IOMMU_PTE_PRESENT(p2[j]))
- continue;
- p3 = IOMMU_PTE_PAGE(p2[j]);
- free_page((unsigned long)p3);
- }
-
- free_page((unsigned long)p2);
- }
-
- free_page((unsigned long)p1);
-
- domain->pt_root = NULL;
-}
-
-/*
- * Free a domain, only used if something went wrong in the
- * allocation path and we need to free an already allocated page table
- */
-static void dma_ops_domain_free(struct dma_ops_domain *dom)
-{
- int i;
-
- if (!dom)
- return;
-
- del_domain_from_list(&dom->domain);
-
- free_pagetable(&dom->domain);
-
- for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
- if (!dom->aperture[i])
- continue;
- free_page((unsigned long)dom->aperture[i]->bitmap);
- kfree(dom->aperture[i]);
- }
-
- kfree(dom);
-}
-
-/*
- * Allocates a new protection domain usable for the dma_ops functions.
- * It also initializes the page table and the address allocator data
- * structures required for the dma_ops interface
- */
-static struct dma_ops_domain *dma_ops_domain_alloc(void)
-{
- struct dma_ops_domain *dma_dom;
-
- dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
- if (!dma_dom)
- return NULL;
-
- spin_lock_init(&dma_dom->domain.lock);
-
- dma_dom->domain.id = domain_id_alloc();
- if (dma_dom->domain.id == 0)
- goto free_dma_dom;
- INIT_LIST_HEAD(&dma_dom->domain.dev_list);
- dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
- dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
- dma_dom->domain.flags = PD_DMA_OPS_MASK;
- dma_dom->domain.priv = dma_dom;
- if (!dma_dom->domain.pt_root)
- goto free_dma_dom;
-
- dma_dom->need_flush = false;
- dma_dom->target_dev = 0xffff;
-
- add_domain_to_list(&dma_dom->domain);
-
- if (alloc_new_range(dma_dom, true, GFP_KERNEL))
- goto free_dma_dom;
-
- /*
- * mark the first page as allocated so we never return 0 as
- * a valid dma-address. So we can use 0 as error value
- */
- dma_dom->aperture[0]->bitmap[0] = 1;
- dma_dom->next_address = 0;
-
-
- return dma_dom;
-
-free_dma_dom:
- dma_ops_domain_free(dma_dom);
-
- return NULL;
-}
-
-/*
- * little helper function to check whether a given protection domain is a
- * dma_ops domain
- */
-static bool dma_ops_domain(struct protection_domain *domain)
-{
- return domain->flags & PD_DMA_OPS_MASK;
-}
-
-static void set_dte_entry(u16 devid, struct protection_domain *domain)
-{
- u64 pte_root = virt_to_phys(domain->pt_root);
-
- pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
- << DEV_ENTRY_MODE_SHIFT;
- pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
-
- amd_iommu_dev_table[devid].data[2] = domain->id;
- amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
- amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
-}
-
-static void clear_dte_entry(u16 devid)
-{
- /* remove entry from the device table seen by the hardware */
- amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
- amd_iommu_dev_table[devid].data[1] = 0;
- amd_iommu_dev_table[devid].data[2] = 0;
-
- amd_iommu_apply_erratum_63(devid);
-}
-
-static void do_attach(struct device *dev, struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
- struct amd_iommu *iommu;
- u16 devid;
-
- devid = get_device_id(dev);
- iommu = amd_iommu_rlookup_table[devid];
- dev_data = get_dev_data(dev);
-
- /* Update data structures */
- dev_data->domain = domain;
- list_add(&dev_data->list, &domain->dev_list);
- set_dte_entry(devid, domain);
-
- /* Do reference counting */
- domain->dev_iommu[iommu->index] += 1;
- domain->dev_cnt += 1;
-
- /* Flush the DTE entry */
- iommu_flush_device(dev);
-}
-
-static void do_detach(struct device *dev)
-{
- struct iommu_dev_data *dev_data;
- struct amd_iommu *iommu;
- u16 devid;
-
- devid = get_device_id(dev);
- iommu = amd_iommu_rlookup_table[devid];
- dev_data = get_dev_data(dev);
-
- /* decrease reference counters */
- dev_data->domain->dev_iommu[iommu->index] -= 1;
- dev_data->domain->dev_cnt -= 1;
-
- /* Update data structures */
- dev_data->domain = NULL;
- list_del(&dev_data->list);
- clear_dte_entry(devid);
-
- /* Flush the DTE entry */
- iommu_flush_device(dev);
-}
-
-/*
- * If a device is not yet associated with a domain, this function does
- * assigns it visible for the hardware
- */
-static int __attach_device(struct device *dev,
- struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data, *alias_data;
- int ret;
-
- dev_data = get_dev_data(dev);
- alias_data = get_dev_data(dev_data->alias);
-
- if (!alias_data)
- return -EINVAL;
-
- /* lock domain */
- spin_lock(&domain->lock);
-
- /* Some sanity checks */
- ret = -EBUSY;
- if (alias_data->domain != NULL &&
- alias_data->domain != domain)
- goto out_unlock;
-
- if (dev_data->domain != NULL &&
- dev_data->domain != domain)
- goto out_unlock;
-
- /* Do real assignment */
- if (dev_data->alias != dev) {
- alias_data = get_dev_data(dev_data->alias);
- if (alias_data->domain == NULL)
- do_attach(dev_data->alias, domain);
-
- atomic_inc(&alias_data->bind);
- }
-
- if (dev_data->domain == NULL)
- do_attach(dev, domain);
-
- atomic_inc(&dev_data->bind);
-
- ret = 0;
-
-out_unlock:
-
- /* ready */
- spin_unlock(&domain->lock);
-
- return ret;
-}
-
-/*
- * If a device is not yet associated with a domain, this function does
- * assigns it visible for the hardware
- */
-static int attach_device(struct device *dev,
- struct protection_domain *domain)
-{
- unsigned long flags;
- int ret;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- ret = __attach_device(dev, domain);
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- /*
- * We might boot into a crash-kernel here. The crashed kernel
- * left the caches in the IOMMU dirty. So we have to flush
- * here to evict all dirty stuff.
- */
- iommu_flush_tlb_pde(domain);
-
- return ret;
-}
-
-/*
- * Removes a device from a protection domain (unlocked)
- */
-static void __detach_device(struct device *dev)
-{
- struct iommu_dev_data *dev_data = get_dev_data(dev);
- struct iommu_dev_data *alias_data;
- struct protection_domain *domain;
- unsigned long flags;
-
- BUG_ON(!dev_data->domain);
-
- domain = dev_data->domain;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- if (dev_data->alias != dev) {
- alias_data = get_dev_data(dev_data->alias);
- if (atomic_dec_and_test(&alias_data->bind))
- do_detach(dev_data->alias);
- }
-
- if (atomic_dec_and_test(&dev_data->bind))
- do_detach(dev);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-
- /*
- * If we run in passthrough mode the device must be assigned to the
- * passthrough domain if it is detached from any other domain.
- * Make sure we can deassign from the pt_domain itself.
- */
- if (iommu_pass_through &&
- (dev_data->domain == NULL && domain != pt_domain))
- __attach_device(dev, pt_domain);
-}
-
-/*
- * Removes a device from a protection domain (with devtable_lock held)
- */
-static void detach_device(struct device *dev)
-{
- unsigned long flags;
-
- /* lock device table */
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- __detach_device(dev);
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-/*
- * Find out the protection domain structure for a given PCI device. This
- * will give us the pointer to the page table root for example.
- */
-static struct protection_domain *domain_for_device(struct device *dev)
-{
- struct protection_domain *dom;
- struct iommu_dev_data *dev_data, *alias_data;
- unsigned long flags;
- u16 devid, alias;
-
- devid = get_device_id(dev);
- alias = amd_iommu_alias_table[devid];
- dev_data = get_dev_data(dev);
- alias_data = get_dev_data(dev_data->alias);
- if (!alias_data)
- return NULL;
-
- read_lock_irqsave(&amd_iommu_devtable_lock, flags);
- dom = dev_data->domain;
- if (dom == NULL &&
- alias_data->domain != NULL) {
- __attach_device(dev, alias_data->domain);
- dom = alias_data->domain;
- }
-
- read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- return dom;
-}
-
-static int device_change_notifier(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct device *dev = data;
- u16 devid;
- struct protection_domain *domain;
- struct dma_ops_domain *dma_domain;
- struct amd_iommu *iommu;
- unsigned long flags;
-
- if (!check_device(dev))
- return 0;
-
- devid = get_device_id(dev);
- iommu = amd_iommu_rlookup_table[devid];
-
- switch (action) {
- case BUS_NOTIFY_UNBOUND_DRIVER:
-
- domain = domain_for_device(dev);
-
- if (!domain)
- goto out;
- if (iommu_pass_through)
- break;
- detach_device(dev);
- break;
- case BUS_NOTIFY_ADD_DEVICE:
-
- iommu_init_device(dev);
-
- domain = domain_for_device(dev);
-
- /* allocate a protection domain if a device is added */
- dma_domain = find_protection_domain(devid);
- if (dma_domain)
- goto out;
- dma_domain = dma_ops_domain_alloc();
- if (!dma_domain)
- goto out;
- dma_domain->target_dev = devid;
-
- spin_lock_irqsave(&iommu_pd_list_lock, flags);
- list_add_tail(&dma_domain->list, &iommu_pd_list);
- spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
- break;
- case BUS_NOTIFY_DEL_DEVICE:
-
- iommu_uninit_device(dev);
-
- default:
- goto out;
- }
-
- iommu_flush_device(dev);
- iommu_completion_wait(iommu);
-
-out:
- return 0;
-}
-
-static struct notifier_block device_nb = {
- .notifier_call = device_change_notifier,
-};
-
-void amd_iommu_init_notifier(void)
-{
- bus_register_notifier(&pci_bus_type, &device_nb);
-}
-
-/*****************************************************************************
- *
- * The next functions belong to the dma_ops mapping/unmapping code.
- *
- *****************************************************************************/
-
-/*
- * In the dma_ops path we only have the struct device. This function
- * finds the corresponding IOMMU, the protection domain and the
- * requestor id for a given device.
- * If the device is not yet associated with a domain this is also done
- * in this function.
- */
-static struct protection_domain *get_domain(struct device *dev)
-{
- struct protection_domain *domain;
- struct dma_ops_domain *dma_dom;
- u16 devid = get_device_id(dev);
-
- if (!check_device(dev))
- return ERR_PTR(-EINVAL);
-
- domain = domain_for_device(dev);
- if (domain != NULL && !dma_ops_domain(domain))
- return ERR_PTR(-EBUSY);
-
- if (domain != NULL)
- return domain;
-
- /* Device not bount yet - bind it */
- dma_dom = find_protection_domain(devid);
- if (!dma_dom)
- dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
- attach_device(dev, &dma_dom->domain);
- DUMP_printk("Using protection domain %d for device %s\n",
- dma_dom->domain.id, dev_name(dev));
-
- return &dma_dom->domain;
-}
-
-static void update_device_table(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
-
- list_for_each_entry(dev_data, &domain->dev_list, list) {
- u16 devid = get_device_id(dev_data->dev);
- set_dte_entry(devid, domain);
- }
-}
-
-static void update_domain(struct protection_domain *domain)
-{
- if (!domain->updated)
- return;
-
- update_device_table(domain);
- iommu_flush_domain_devices(domain);
- iommu_flush_tlb_pde(domain);
-
- domain->updated = false;
-}
-
-/*
- * This function fetches the PTE for a given address in the aperture
- */
-static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
- unsigned long address)
-{
- struct aperture_range *aperture;
- u64 *pte, *pte_page;
-
- aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
- if (!aperture)
- return NULL;
-
- pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
- if (!pte) {
- pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
- GFP_ATOMIC);
- aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
- } else
- pte += PM_LEVEL_INDEX(0, address);
-
- update_domain(&dom->domain);
-
- return pte;
-}
-
-/*
- * This is the generic map function. It maps one 4kb page at paddr to
- * the given address in the DMA address space for the domain.
- */
-static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
- unsigned long address,
- phys_addr_t paddr,
- int direction)
-{
- u64 *pte, __pte;
-
- WARN_ON(address > dom->aperture_size);
-
- paddr &= PAGE_MASK;
-
- pte = dma_ops_get_pte(dom, address);
- if (!pte)
- return DMA_ERROR_CODE;
-
- __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
-
- if (direction == DMA_TO_DEVICE)
- __pte |= IOMMU_PTE_IR;
- else if (direction == DMA_FROM_DEVICE)
- __pte |= IOMMU_PTE_IW;
- else if (direction == DMA_BIDIRECTIONAL)
- __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
-
- WARN_ON(*pte);
-
- *pte = __pte;
-
- return (dma_addr_t)address;
-}
-
-/*
- * The generic unmapping function for on page in the DMA address space.
- */
-static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
- unsigned long address)
-{
- struct aperture_range *aperture;
- u64 *pte;
-
- if (address >= dom->aperture_size)
- return;
-
- aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
- if (!aperture)
- return;
-
- pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
- if (!pte)
- return;
-
- pte += PM_LEVEL_INDEX(0, address);
-
- WARN_ON(!*pte);
-
- *pte = 0ULL;
-}
-
-/*
- * This function contains common code for mapping of a physically
- * contiguous memory region into DMA address space. It is used by all
- * mapping functions provided with this IOMMU driver.
- * Must be called with the domain lock held.
- */
-static dma_addr_t __map_single(struct device *dev,
- struct dma_ops_domain *dma_dom,
- phys_addr_t paddr,
- size_t size,
- int dir,
- bool align,
- u64 dma_mask)
-{
- dma_addr_t offset = paddr & ~PAGE_MASK;
- dma_addr_t address, start, ret;
- unsigned int pages;
- unsigned long align_mask = 0;
- int i;
-
- pages = iommu_num_pages(paddr, size, PAGE_SIZE);
- paddr &= PAGE_MASK;
-
- INC_STATS_COUNTER(total_map_requests);
-
- if (pages > 1)
- INC_STATS_COUNTER(cross_page);
-
- if (align)
- align_mask = (1UL << get_order(size)) - 1;
-
-retry:
- address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
- dma_mask);
- if (unlikely(address == DMA_ERROR_CODE)) {
- /*
- * setting next_address here will let the address
- * allocator only scan the new allocated range in the
- * first run. This is a small optimization.
- */
- dma_dom->next_address = dma_dom->aperture_size;
-
- if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
- goto out;
-
- /*
- * aperture was successfully enlarged by 128 MB, try
- * allocation again
- */
- goto retry;
- }
-
- start = address;
- for (i = 0; i < pages; ++i) {
- ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
- if (ret == DMA_ERROR_CODE)
- goto out_unmap;
-
- paddr += PAGE_SIZE;
- start += PAGE_SIZE;
- }
- address += offset;
-
- ADD_STATS_COUNTER(alloced_io_mem, size);
-
- if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
- iommu_flush_tlb(&dma_dom->domain);
- dma_dom->need_flush = false;
- } else if (unlikely(amd_iommu_np_cache))
- iommu_flush_pages(&dma_dom->domain, address, size);
-
-out:
- return address;
-
-out_unmap:
-
- for (--i; i >= 0; --i) {
- start -= PAGE_SIZE;
- dma_ops_domain_unmap(dma_dom, start);
- }
-
- dma_ops_free_addresses(dma_dom, address, pages);
-
- return DMA_ERROR_CODE;
-}
-
-/*
- * Does the reverse of the __map_single function. Must be called with
- * the domain lock held too
- */
-static void __unmap_single(struct dma_ops_domain *dma_dom,
- dma_addr_t dma_addr,
- size_t size,
- int dir)
-{
- dma_addr_t flush_addr;
- dma_addr_t i, start;
- unsigned int pages;
-
- if ((dma_addr == DMA_ERROR_CODE) ||
- (dma_addr + size > dma_dom->aperture_size))
- return;
-
- flush_addr = dma_addr;
- pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
- dma_addr &= PAGE_MASK;
- start = dma_addr;
-
- for (i = 0; i < pages; ++i) {
- dma_ops_domain_unmap(dma_dom, start);
- start += PAGE_SIZE;
- }
-
- SUB_STATS_COUNTER(alloced_io_mem, size);
-
- dma_ops_free_addresses(dma_dom, dma_addr, pages);
-
- if (amd_iommu_unmap_flush || dma_dom->need_flush) {
- iommu_flush_pages(&dma_dom->domain, flush_addr, size);
- dma_dom->need_flush = false;
- }
-}
-
-/*
- * The exported map_single function for dma_ops.
- */
-static dma_addr_t map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
- dma_addr_t addr;
- u64 dma_mask;
- phys_addr_t paddr = page_to_phys(page) + offset;
-
- INC_STATS_COUNTER(cnt_map_single);
-
- domain = get_domain(dev);
- if (PTR_ERR(domain) == -EINVAL)
- return (dma_addr_t)paddr;
- else if (IS_ERR(domain))
- return DMA_ERROR_CODE;
-
- dma_mask = *dev->dma_mask;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- addr = __map_single(dev, domain->priv, paddr, size, dir, false,
- dma_mask);
- if (addr == DMA_ERROR_CODE)
- goto out;
-
- iommu_flush_complete(domain);
-
-out:
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return addr;
-}
-
-/*
- * The exported unmap_single function for dma_ops.
- */
-static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
- enum dma_data_direction dir, struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
-
- INC_STATS_COUNTER(cnt_unmap_single);
-
- domain = get_domain(dev);
- if (IS_ERR(domain))
- return;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- __unmap_single(domain->priv, dma_addr, size, dir);
-
- iommu_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/*
- * This is a special map_sg function which is used if we should map a
- * device which is not handled by an AMD IOMMU in the system.
- */
-static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
- int nelems, int dir)
-{
- struct scatterlist *s;
- int i;
-
- for_each_sg(sglist, s, nelems, i) {
- s->dma_address = (dma_addr_t)sg_phys(s);
- s->dma_length = s->length;
- }
-
- return nelems;
-}
-
-/*
- * The exported map_sg function for dma_ops (handles scatter-gather
- * lists).
- */
-static int map_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
- int i;
- struct scatterlist *s;
- phys_addr_t paddr;
- int mapped_elems = 0;
- u64 dma_mask;
-
- INC_STATS_COUNTER(cnt_map_sg);
-
- domain = get_domain(dev);
- if (PTR_ERR(domain) == -EINVAL)
- return map_sg_no_iommu(dev, sglist, nelems, dir);
- else if (IS_ERR(domain))
- return 0;
-
- dma_mask = *dev->dma_mask;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- for_each_sg(sglist, s, nelems, i) {
- paddr = sg_phys(s);
-
- s->dma_address = __map_single(dev, domain->priv,
- paddr, s->length, dir, false,
- dma_mask);
-
- if (s->dma_address) {
- s->dma_length = s->length;
- mapped_elems++;
- } else
- goto unmap;
- }
-
- iommu_flush_complete(domain);
-
-out:
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return mapped_elems;
-unmap:
- for_each_sg(sglist, s, mapped_elems, i) {
- if (s->dma_address)
- __unmap_single(domain->priv, s->dma_address,
- s->dma_length, dir);
- s->dma_address = s->dma_length = 0;
- }
-
- mapped_elems = 0;
-
- goto out;
-}
-
-/*
- * The exported map_sg function for dma_ops (handles scatter-gather
- * lists).
- */
-static void unmap_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
- struct scatterlist *s;
- int i;
-
- INC_STATS_COUNTER(cnt_unmap_sg);
-
- domain = get_domain(dev);
- if (IS_ERR(domain))
- return;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- for_each_sg(sglist, s, nelems, i) {
- __unmap_single(domain->priv, s->dma_address,
- s->dma_length, dir);
- s->dma_address = s->dma_length = 0;
- }
-
- iommu_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/*
- * The exported alloc_coherent function for dma_ops.
- */
-static void *alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_addr, gfp_t flag)
-{
- unsigned long flags;
- void *virt_addr;
- struct protection_domain *domain;
- phys_addr_t paddr;
- u64 dma_mask = dev->coherent_dma_mask;
-
- INC_STATS_COUNTER(cnt_alloc_coherent);
-
- domain = get_domain(dev);
- if (PTR_ERR(domain) == -EINVAL) {
- virt_addr = (void *)__get_free_pages(flag, get_order(size));
- *dma_addr = __pa(virt_addr);
- return virt_addr;
- } else if (IS_ERR(domain))
- return NULL;
-
- dma_mask = dev->coherent_dma_mask;
- flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
- flag |= __GFP_ZERO;
-
- virt_addr = (void *)__get_free_pages(flag, get_order(size));
- if (!virt_addr)
- return NULL;
-
- paddr = virt_to_phys(virt_addr);
-
- if (!dma_mask)
- dma_mask = *dev->dma_mask;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- *dma_addr = __map_single(dev, domain->priv, paddr,
- size, DMA_BIDIRECTIONAL, true, dma_mask);
-
- if (*dma_addr == DMA_ERROR_CODE) {
- spin_unlock_irqrestore(&domain->lock, flags);
- goto out_free;
- }
-
- iommu_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return virt_addr;
-
-out_free:
-
- free_pages((unsigned long)virt_addr, get_order(size));
-
- return NULL;
-}
-
-/*
- * The exported free_coherent function for dma_ops.
- */
-static void free_coherent(struct device *dev, size_t size,
- void *virt_addr, dma_addr_t dma_addr)
-{
- unsigned long flags;
- struct protection_domain *domain;
-
- INC_STATS_COUNTER(cnt_free_coherent);
-
- domain = get_domain(dev);
- if (IS_ERR(domain))
- goto free_mem;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
-
- iommu_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-
-free_mem:
- free_pages((unsigned long)virt_addr, get_order(size));
-}
-
-/*
- * This function is called by the DMA layer to find out if we can handle a
- * particular device. It is part of the dma_ops.
- */
-static int amd_iommu_dma_supported(struct device *dev, u64 mask)
-{
- return check_device(dev);
-}
-
-/*
- * The function for pre-allocating protection domains.
- *
- * If the driver core informs the DMA layer if a driver grabs a device
- * we don't need to preallocate the protection domains anymore.
- * For now we have to.
- */
-static void prealloc_protection_domains(void)
-{
- struct pci_dev *dev = NULL;
- struct dma_ops_domain *dma_dom;
- u16 devid;
-
- for_each_pci_dev(dev) {
-
- /* Do we handle this device? */
- if (!check_device(&dev->dev))
- continue;
-
- /* Is there already any domain for it? */
- if (domain_for_device(&dev->dev))
- continue;
-
- devid = get_device_id(&dev->dev);
-
- dma_dom = dma_ops_domain_alloc();
- if (!dma_dom)
- continue;
- init_unity_mappings_for_device(dma_dom, devid);
- dma_dom->target_dev = devid;
-
- attach_device(&dev->dev, &dma_dom->domain);
-
- list_add_tail(&dma_dom->list, &iommu_pd_list);
- }
-}
-
-static struct dma_map_ops amd_iommu_dma_ops = {
- .alloc_coherent = alloc_coherent,
- .free_coherent = free_coherent,
- .map_page = map_page,
- .unmap_page = unmap_page,
- .map_sg = map_sg,
- .unmap_sg = unmap_sg,
- .dma_supported = amd_iommu_dma_supported,
-};
-
-/*
- * The function which clues the AMD IOMMU driver into dma_ops.
- */
-
-void __init amd_iommu_init_api(void)
-{
- register_iommu(&amd_iommu_ops);
-}
-
-int __init amd_iommu_init_dma_ops(void)
-{
- struct amd_iommu *iommu;
- int ret;
-
- /*
- * first allocate a default protection domain for every IOMMU we
- * found in the system. Devices not assigned to any other
- * protection domain will be assigned to the default one.
- */
- for_each_iommu(iommu) {
- iommu->default_dom = dma_ops_domain_alloc();
- if (iommu->default_dom == NULL)
- return -ENOMEM;
- iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
- ret = iommu_init_unity_mappings(iommu);
- if (ret)
- goto free_domains;
- }
-
- /*
- * Pre-allocate the protection domains for each device.
- */
- prealloc_protection_domains();
-
- iommu_detected = 1;
- swiotlb = 0;
-
- /* Make the driver finally visible to the drivers */
- dma_ops = &amd_iommu_dma_ops;
-
- amd_iommu_stats_init();
-
- return 0;
-
-free_domains:
-
- for_each_iommu(iommu) {
- if (iommu->default_dom)
- dma_ops_domain_free(iommu->default_dom);
- }
-
- return ret;
-}
-
-/*****************************************************************************
- *
- * The following functions belong to the exported interface of AMD IOMMU
- *
- * This interface allows access to lower level functions of the IOMMU
- * like protection domain handling and assignement of devices to domains
- * which is not possible with the dma_ops interface.
- *
- *****************************************************************************/
-
-static void cleanup_domain(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data, *next;
- unsigned long flags;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-
- list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
- struct device *dev = dev_data->dev;
-
- __detach_device(dev);
- atomic_set(&dev_data->bind, 0);
- }
-
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-static void protection_domain_free(struct protection_domain *domain)
-{
- if (!domain)
- return;
-
- del_domain_from_list(domain);
-
- if (domain->id)
- domain_id_free(domain->id);
-
- kfree(domain);
-}
-
-static struct protection_domain *protection_domain_alloc(void)
-{
- struct protection_domain *domain;
-
- domain = kzalloc(sizeof(*domain), GFP_KERNEL);
- if (!domain)
- return NULL;
-
- spin_lock_init(&domain->lock);
- mutex_init(&domain->api_lock);
- domain->id = domain_id_alloc();
- if (!domain->id)
- goto out_err;
- INIT_LIST_HEAD(&domain->dev_list);
-
- add_domain_to_list(domain);
-
- return domain;
-
-out_err:
- kfree(domain);
-
- return NULL;
-}
-
-static int amd_iommu_domain_init(struct iommu_domain *dom)
-{
- struct protection_domain *domain;
-
- domain = protection_domain_alloc();
- if (!domain)
- goto out_free;
-
- domain->mode = PAGE_MODE_3_LEVEL;
- domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
- if (!domain->pt_root)
- goto out_free;
-
- dom->priv = domain;
-
- return 0;
-
-out_free:
- protection_domain_free(domain);
-
- return -ENOMEM;
-}
-
-static void amd_iommu_domain_destroy(struct iommu_domain *dom)
-{
- struct protection_domain *domain = dom->priv;
-
- if (!domain)
- return;
-
- if (domain->dev_cnt > 0)
- cleanup_domain(domain);
-
- BUG_ON(domain->dev_cnt != 0);
-
- free_pagetable(domain);
-
- protection_domain_free(domain);
-
- dom->priv = NULL;
-}
-
-static void amd_iommu_detach_device(struct iommu_domain *dom,
- struct device *dev)
-{
- struct iommu_dev_data *dev_data = dev->archdata.iommu;
- struct amd_iommu *iommu;
- u16 devid;
-
- if (!check_device(dev))
- return;
-
- devid = get_device_id(dev);
-
- if (dev_data->domain != NULL)
- detach_device(dev);
-
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- return;
-
- iommu_flush_device(dev);
- iommu_completion_wait(iommu);
-}
-
-static int amd_iommu_attach_device(struct iommu_domain *dom,
- struct device *dev)
-{
- struct protection_domain *domain = dom->priv;
- struct iommu_dev_data *dev_data;
- struct amd_iommu *iommu;
- int ret;
- u16 devid;
-
- if (!check_device(dev))
- return -EINVAL;
-
- dev_data = dev->archdata.iommu;
-
- devid = get_device_id(dev);
-
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- return -EINVAL;
-
- if (dev_data->domain)
- detach_device(dev);
-
- ret = attach_device(dev, domain);
-
- iommu_completion_wait(iommu);
-
- return ret;
-}
-
-static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
- phys_addr_t paddr, int gfp_order, int iommu_prot)
-{
- unsigned long page_size = 0x1000UL << gfp_order;
- struct protection_domain *domain = dom->priv;
- int prot = 0;
- int ret;
-
- if (iommu_prot & IOMMU_READ)
- prot |= IOMMU_PROT_IR;
- if (iommu_prot & IOMMU_WRITE)
- prot |= IOMMU_PROT_IW;
-
- mutex_lock(&domain->api_lock);
- ret = iommu_map_page(domain, iova, paddr, prot, page_size);
- mutex_unlock(&domain->api_lock);
-
- return ret;
-}
-
-static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
- int gfp_order)
-{
- struct protection_domain *domain = dom->priv;
- unsigned long page_size, unmap_size;
-
- page_size = 0x1000UL << gfp_order;
-
- mutex_lock(&domain->api_lock);
- unmap_size = iommu_unmap_page(domain, iova, page_size);
- mutex_unlock(&domain->api_lock);
-
- iommu_flush_tlb_pde(domain);
-
- return get_order(unmap_size);
-}
-
-static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
- unsigned long iova)
-{
- struct protection_domain *domain = dom->priv;
- unsigned long offset_mask;
- phys_addr_t paddr;
- u64 *pte, __pte;
-
- pte = fetch_pte(domain, iova);
-
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- if (PM_PTE_LEVEL(*pte) == 0)
- offset_mask = PAGE_SIZE - 1;
- else
- offset_mask = PTE_PAGE_SIZE(*pte) - 1;
-
- __pte = *pte & PM_ADDR_MASK;
- paddr = (__pte & ~offset_mask) | (iova & offset_mask);
-
- return paddr;
-}
-
-static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
- unsigned long cap)
-{
- switch (cap) {
- case IOMMU_CAP_CACHE_COHERENCY:
- return 1;
- }
-
- return 0;
-}
-
-static struct iommu_ops amd_iommu_ops = {
- .domain_init = amd_iommu_domain_init,
- .domain_destroy = amd_iommu_domain_destroy,
- .attach_dev = amd_iommu_attach_device,
- .detach_dev = amd_iommu_detach_device,
- .map = amd_iommu_map,
- .unmap = amd_iommu_unmap,
- .iova_to_phys = amd_iommu_iova_to_phys,
- .domain_has_cap = amd_iommu_domain_has_cap,
-};
-
-/*****************************************************************************
- *
- * The next functions do a basic initialization of IOMMU for pass through
- * mode
- *
- * In passthrough mode the IOMMU is initialized and enabled but not used for
- * DMA-API translation.
- *
- *****************************************************************************/
-
-int __init amd_iommu_init_passthrough(void)
-{
- struct amd_iommu *iommu;
- struct pci_dev *dev = NULL;
- u16 devid;
-
- /* allocate passthrough domain */
- pt_domain = protection_domain_alloc();
- if (!pt_domain)
- return -ENOMEM;
-
- pt_domain->mode |= PAGE_MODE_NONE;
-
- for_each_pci_dev(dev) {
- if (!check_device(&dev->dev))
- continue;
-
- devid = get_device_id(&dev->dev);
-
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- continue;
-
- attach_device(&dev->dev, pt_domain);
- }
-
- pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
-
- return 0;
-}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
deleted file mode 100644
index 246d727b65b..00000000000
--- a/arch/x86/kernel/amd_iommu_init.c
+++ /dev/null
@@ -1,1540 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/acpi.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/syscore_ops.h>
-#include <linux/interrupt.h>
-#include <linux/msi.h>
-#include <asm/pci-direct.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-#include <asm/x86_init.h>
-#include <asm/iommu_table.h>
-/*
- * definitions for the ACPI scanning code
- */
-#define IVRS_HEADER_LENGTH 48
-
-#define ACPI_IVHD_TYPE 0x10
-#define ACPI_IVMD_TYPE_ALL 0x20
-#define ACPI_IVMD_TYPE 0x21
-#define ACPI_IVMD_TYPE_RANGE 0x22
-
-#define IVHD_DEV_ALL 0x01
-#define IVHD_DEV_SELECT 0x02
-#define IVHD_DEV_SELECT_RANGE_START 0x03
-#define IVHD_DEV_RANGE_END 0x04
-#define IVHD_DEV_ALIAS 0x42
-#define IVHD_DEV_ALIAS_RANGE 0x43
-#define IVHD_DEV_EXT_SELECT 0x46
-#define IVHD_DEV_EXT_SELECT_RANGE 0x47
-
-#define IVHD_FLAG_HT_TUN_EN_MASK 0x01
-#define IVHD_FLAG_PASSPW_EN_MASK 0x02
-#define IVHD_FLAG_RESPASSPW_EN_MASK 0x04
-#define IVHD_FLAG_ISOC_EN_MASK 0x08
-
-#define IVMD_FLAG_EXCL_RANGE 0x08
-#define IVMD_FLAG_UNITY_MAP 0x01
-
-#define ACPI_DEVFLAG_INITPASS 0x01
-#define ACPI_DEVFLAG_EXTINT 0x02
-#define ACPI_DEVFLAG_NMI 0x04
-#define ACPI_DEVFLAG_SYSMGT1 0x10
-#define ACPI_DEVFLAG_SYSMGT2 0x20
-#define ACPI_DEVFLAG_LINT0 0x40
-#define ACPI_DEVFLAG_LINT1 0x80
-#define ACPI_DEVFLAG_ATSDIS 0x10000000
-
-/*
- * ACPI table definitions
- *
- * These data structures are laid over the table to parse the important values
- * out of it.
- */
-
-/*
- * structure describing one IOMMU in the ACPI table. Typically followed by one
- * or more ivhd_entrys.
- */
-struct ivhd_header {
- u8 type;
- u8 flags;
- u16 length;
- u16 devid;
- u16 cap_ptr;
- u64 mmio_phys;
- u16 pci_seg;
- u16 info;
- u32 reserved;
-} __attribute__((packed));
-
-/*
- * A device entry describing which devices a specific IOMMU translates and
- * which requestor ids they use.
- */
-struct ivhd_entry {
- u8 type;
- u16 devid;
- u8 flags;
- u32 ext;
-} __attribute__((packed));
-
-/*
- * An AMD IOMMU memory definition structure. It defines things like exclusion
- * ranges for devices and regions that should be unity mapped.
- */
-struct ivmd_header {
- u8 type;
- u8 flags;
- u16 length;
- u16 devid;
- u16 aux;
- u64 resv;
- u64 range_start;
- u64 range_length;
-} __attribute__((packed));
-
-bool amd_iommu_dump;
-
-static int __initdata amd_iommu_detected;
-static bool __initdata amd_iommu_disabled;
-
-u16 amd_iommu_last_bdf; /* largest PCI device id we have
- to handle */
-LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
- we find in ACPI */
-bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
-
-LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
- system */
-
-/* Array to assign indices to IOMMUs*/
-struct amd_iommu *amd_iommus[MAX_IOMMUS];
-int amd_iommus_present;
-
-/* IOMMUs have a non-present cache? */
-bool amd_iommu_np_cache __read_mostly;
-
-/*
- * The ACPI table parsing functions set this variable on an error
- */
-static int __initdata amd_iommu_init_err;
-
-/*
- * List of protection domains - used during resume
- */
-LIST_HEAD(amd_iommu_pd_list);
-spinlock_t amd_iommu_pd_lock;
-
-/*
- * Pointer to the device table which is shared by all AMD IOMMUs
- * it is indexed by the PCI device id or the HT unit id and contains
- * information about the domain the device belongs to as well as the
- * page table root pointer.
- */
-struct dev_table_entry *amd_iommu_dev_table;
-
-/*
- * The alias table is a driver specific data structure which contains the
- * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
- * More than one device can share the same requestor id.
- */
-u16 *amd_iommu_alias_table;
-
-/*
- * The rlookup table is used to find the IOMMU which is responsible
- * for a specific device. It is also indexed by the PCI device id.
- */
-struct amd_iommu **amd_iommu_rlookup_table;
-
-/*
- * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
- * to know which ones are already in use.
- */
-unsigned long *amd_iommu_pd_alloc_bitmap;
-
-static u32 dev_table_size; /* size of the device table */
-static u32 alias_table_size; /* size of the alias table */
-static u32 rlookup_table_size; /* size if the rlookup table */
-
-static inline void update_last_devid(u16 devid)
-{
- if (devid > amd_iommu_last_bdf)
- amd_iommu_last_bdf = devid;
-}
-
-static inline unsigned long tbl_size(int entry_size)
-{
- unsigned shift = PAGE_SHIFT +
- get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
-
- return 1UL << shift;
-}
-
-/* Access to l1 and l2 indexed register spaces */
-
-static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
-{
- u32 val;
-
- pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
- pci_read_config_dword(iommu->dev, 0xfc, &val);
- return val;
-}
-
-static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
-{
- pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
- pci_write_config_dword(iommu->dev, 0xfc, val);
- pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
-}
-
-static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
-{
- u32 val;
-
- pci_write_config_dword(iommu->dev, 0xf0, address);
- pci_read_config_dword(iommu->dev, 0xf4, &val);
- return val;
-}
-
-static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
-{
- pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
- pci_write_config_dword(iommu->dev, 0xf4, val);
-}
-
-/****************************************************************************
- *
- * AMD IOMMU MMIO register space handling functions
- *
- * These functions are used to program the IOMMU device registers in
- * MMIO space required for that driver.
- *
- ****************************************************************************/
-
-/*
- * This function set the exclusion range in the IOMMU. DMA accesses to the
- * exclusion range are passed through untranslated
- */
-static void iommu_set_exclusion_range(struct amd_iommu *iommu)
-{
- u64 start = iommu->exclusion_start & PAGE_MASK;
- u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
- u64 entry;
-
- if (!iommu->exclusion_start)
- return;
-
- entry = start | MMIO_EXCL_ENABLE_MASK;
- memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
- &entry, sizeof(entry));
-
- entry = limit;
- memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
- &entry, sizeof(entry));
-}
-
-/* Programs the physical address of the device table into the IOMMU hardware */
-static void __init iommu_set_device_table(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->mmio_base == NULL);
-
- entry = virt_to_phys(amd_iommu_dev_table);
- entry |= (dev_table_size >> 12) - 1;
- memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
- &entry, sizeof(entry));
-}
-
-/* Generic functions to enable/disable certain features of the IOMMU. */
-static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
-{
- u32 ctrl;
-
- ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
- ctrl |= (1 << bit);
- writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
-{
- u32 ctrl;
-
- ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
- ctrl &= ~(1 << bit);
- writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-/* Function to enable the hardware */
-static void iommu_enable(struct amd_iommu *iommu)
-{
- printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n",
- dev_name(&iommu->dev->dev), iommu->cap_ptr);
-
- iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
-}
-
-static void iommu_disable(struct amd_iommu *iommu)
-{
- /* Disable command buffer */
- iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
- /* Disable event logging and event interrupts */
- iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
- iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
-
- /* Disable IOMMU hardware itself */
- iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
-}
-
-/*
- * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
- * the system has one.
- */
-static u8 * __init iommu_map_mmio_space(u64 address)
-{
- u8 *ret;
-
- if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
- pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
- address);
- pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
- return NULL;
- }
-
- ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
- if (ret != NULL)
- return ret;
-
- release_mem_region(address, MMIO_REGION_LENGTH);
-
- return NULL;
-}
-
-static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
-{
- if (iommu->mmio_base)
- iounmap(iommu->mmio_base);
- release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
-}
-
-/****************************************************************************
- *
- * The functions below belong to the first pass of AMD IOMMU ACPI table
- * parsing. In this pass we try to find out the highest device id this
- * code has to handle. Upon this information the size of the shared data
- * structures is determined later.
- *
- ****************************************************************************/
-
-/*
- * This function calculates the length of a given IVHD entry
- */
-static inline int ivhd_entry_length(u8 *ivhd)
-{
- return 0x04 << (*ivhd >> 6);
-}
-
-/*
- * This function reads the last device id the IOMMU has to handle from the PCI
- * capability header for this IOMMU
- */
-static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
-{
- u32 cap;
-
- cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
- update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
-
- return 0;
-}
-
-/*
- * After reading the highest device id from the IOMMU PCI capability header
- * this function looks if there is a higher device id defined in the ACPI table
- */
-static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
-{
- u8 *p = (void *)h, *end = (void *)h;
- struct ivhd_entry *dev;
-
- p += sizeof(*h);
- end += h->length;
-
- find_last_devid_on_pci(PCI_BUS(h->devid),
- PCI_SLOT(h->devid),
- PCI_FUNC(h->devid),
- h->cap_ptr);
-
- while (p < end) {
- dev = (struct ivhd_entry *)p;
- switch (dev->type) {
- case IVHD_DEV_SELECT:
- case IVHD_DEV_RANGE_END:
- case IVHD_DEV_ALIAS:
- case IVHD_DEV_EXT_SELECT:
- /* all the above subfield types refer to device ids */
- update_last_devid(dev->devid);
- break;
- default:
- break;
- }
- p += ivhd_entry_length(p);
- }
-
- WARN_ON(p != end);
-
- return 0;
-}
-
-/*
- * Iterate over all IVHD entries in the ACPI table and find the highest device
- * id which we need to handle. This is the first of three functions which parse
- * the ACPI table. So we check the checksum here.
- */
-static int __init find_last_devid_acpi(struct acpi_table_header *table)
-{
- int i;
- u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
- struct ivhd_header *h;
-
- /*
- * Validate checksum here so we don't need to do it when
- * we actually parse the table
- */
- for (i = 0; i < table->length; ++i)
- checksum += p[i];
- if (checksum != 0) {
- /* ACPI table corrupt */
- amd_iommu_init_err = -ENODEV;
- return 0;
- }
-
- p += IVRS_HEADER_LENGTH;
-
- end += table->length;
- while (p < end) {
- h = (struct ivhd_header *)p;
- switch (h->type) {
- case ACPI_IVHD_TYPE:
- find_last_devid_from_ivhd(h);
- break;
- default:
- break;
- }
- p += h->length;
- }
- WARN_ON(p != end);
-
- return 0;
-}
-
-/****************************************************************************
- *
- * The following functions belong the the code path which parses the ACPI table
- * the second time. In this ACPI parsing iteration we allocate IOMMU specific
- * data structures, initialize the device/alias/rlookup table and also
- * basically initialize the hardware.
- *
- ****************************************************************************/
-
-/*
- * Allocates the command buffer. This buffer is per AMD IOMMU. We can
- * write commands to that buffer later and the IOMMU will execute them
- * asynchronously
- */
-static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
-{
- u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(CMD_BUFFER_SIZE));
-
- if (cmd_buf == NULL)
- return NULL;
-
- iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
-
- return cmd_buf;
-}
-
-/*
- * This function resets the command buffer if the IOMMU stopped fetching
- * commands from it.
- */
-void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
-{
- iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
- writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
- writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
- iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-}
-
-/*
- * This function writes the command buffer address to the hardware and
- * enables it.
- */
-static void iommu_enable_command_buffer(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->cmd_buf == NULL);
-
- entry = (u64)virt_to_phys(iommu->cmd_buf);
- entry |= MMIO_CMD_SIZE_512;
-
- memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
- &entry, sizeof(entry));
-
- amd_iommu_reset_cmd_buffer(iommu);
- iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
-}
-
-static void __init free_command_buffer(struct amd_iommu *iommu)
-{
- free_pages((unsigned long)iommu->cmd_buf,
- get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
-}
-
-/* allocates the memory where the IOMMU will log its events to */
-static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
-{
- iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(EVT_BUFFER_SIZE));
-
- if (iommu->evt_buf == NULL)
- return NULL;
-
- iommu->evt_buf_size = EVT_BUFFER_SIZE;
-
- return iommu->evt_buf;
-}
-
-static void iommu_enable_event_buffer(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->evt_buf == NULL);
-
- entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
-
- memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
- &entry, sizeof(entry));
-
- /* set head and tail to zero manually */
- writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
- writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
- iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
-}
-
-static void __init free_event_buffer(struct amd_iommu *iommu)
-{
- free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
-}
-
-/* sets a specific bit in the device table entry. */
-static void set_dev_entry_bit(u16 devid, u8 bit)
-{
- int i = (bit >> 5) & 0x07;
- int _bit = bit & 0x1f;
-
- amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
-}
-
-static int get_dev_entry_bit(u16 devid, u8 bit)
-{
- int i = (bit >> 5) & 0x07;
- int _bit = bit & 0x1f;
-
- return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
-}
-
-
-void amd_iommu_apply_erratum_63(u16 devid)
-{
- int sysmgt;
-
- sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
- (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
-
- if (sysmgt == 0x01)
- set_dev_entry_bit(devid, DEV_ENTRY_IW);
-}
-
-/* Writes the specific IOMMU for a device into the rlookup table */
-static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
-{
- amd_iommu_rlookup_table[devid] = iommu;
-}
-
-/*
- * This function takes the device specific flags read from the ACPI
- * table and sets up the device table entry with that information
- */
-static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
- u16 devid, u32 flags, u32 ext_flags)
-{
- if (flags & ACPI_DEVFLAG_INITPASS)
- set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
- if (flags & ACPI_DEVFLAG_EXTINT)
- set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
- if (flags & ACPI_DEVFLAG_NMI)
- set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
- if (flags & ACPI_DEVFLAG_SYSMGT1)
- set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
- if (flags & ACPI_DEVFLAG_SYSMGT2)
- set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
- if (flags & ACPI_DEVFLAG_LINT0)
- set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
- if (flags & ACPI_DEVFLAG_LINT1)
- set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
-
- amd_iommu_apply_erratum_63(devid);
-
- set_iommu_for_device(iommu, devid);
-}
-
-/*
- * Reads the device exclusion range from ACPI and initialize IOMMU with
- * it
- */
-static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
-{
- struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
-
- if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
- return;
-
- if (iommu) {
- /*
- * We only can configure exclusion ranges per IOMMU, not
- * per device. But we can enable the exclusion range per
- * device. This is done here
- */
- set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
- iommu->exclusion_start = m->range_start;
- iommu->exclusion_length = m->range_length;
- }
-}
-
-/*
- * This function reads some important data from the IOMMU PCI space and
- * initializes the driver data structure with it. It reads the hardware
- * capabilities and the first/last device entries
- */
-static void __init init_iommu_from_pci(struct amd_iommu *iommu)
-{
- int cap_ptr = iommu->cap_ptr;
- u32 range, misc;
- int i, j;
-
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
- &iommu->cap);
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
- &range);
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
- &misc);
-
- iommu->first_device = calc_devid(MMIO_GET_BUS(range),
- MMIO_GET_FD(range));
- iommu->last_device = calc_devid(MMIO_GET_BUS(range),
- MMIO_GET_LD(range));
- iommu->evt_msi_num = MMIO_MSI_NUM(misc);
-
- if (!is_rd890_iommu(iommu->dev))
- return;
-
- /*
- * Some rd890 systems may not be fully reconfigured by the BIOS, so
- * it's necessary for us to store this information so it can be
- * reprogrammed on resume
- */
-
- pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
- &iommu->stored_addr_lo);
- pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
- &iommu->stored_addr_hi);
-
- /* Low bit locks writes to configuration space */
- iommu->stored_addr_lo &= ~1;
-
- for (i = 0; i < 6; i++)
- for (j = 0; j < 0x12; j++)
- iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
-
- for (i = 0; i < 0x83; i++)
- iommu->stored_l2[i] = iommu_read_l2(iommu, i);
-}
-
-/*
- * Takes a pointer to an AMD IOMMU entry in the ACPI table and
- * initializes the hardware and our data structures with it.
- */
-static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
- struct ivhd_header *h)
-{
- u8 *p = (u8 *)h;
- u8 *end = p, flags = 0;
- u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
- u32 ext_flags = 0;
- bool alias = false;
- struct ivhd_entry *e;
-
- /*
- * First save the recommended feature enable bits from ACPI
- */
- iommu->acpi_flags = h->flags;
-
- /*
- * Done. Now parse the device entries
- */
- p += sizeof(struct ivhd_header);
- end += h->length;
-
-
- while (p < end) {
- e = (struct ivhd_entry *)p;
- switch (e->type) {
- case IVHD_DEV_ALL:
-
- DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
- " last device %02x:%02x.%x flags: %02x\n",
- PCI_BUS(iommu->first_device),
- PCI_SLOT(iommu->first_device),
- PCI_FUNC(iommu->first_device),
- PCI_BUS(iommu->last_device),
- PCI_SLOT(iommu->last_device),
- PCI_FUNC(iommu->last_device),
- e->flags);
-
- for (dev_i = iommu->first_device;
- dev_i <= iommu->last_device; ++dev_i)
- set_dev_entry_from_acpi(iommu, dev_i,
- e->flags, 0);
- break;
- case IVHD_DEV_SELECT:
-
- DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
- "flags: %02x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags);
-
- devid = e->devid;
- set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
- break;
- case IVHD_DEV_SELECT_RANGE_START:
-
- DUMP_printk(" DEV_SELECT_RANGE_START\t "
- "devid: %02x:%02x.%x flags: %02x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags);
-
- devid_start = e->devid;
- flags = e->flags;
- ext_flags = 0;
- alias = false;
- break;
- case IVHD_DEV_ALIAS:
-
- DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
- "flags: %02x devid_to: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags,
- PCI_BUS(e->ext >> 8),
- PCI_SLOT(e->ext >> 8),
- PCI_FUNC(e->ext >> 8));
-
- devid = e->devid;
- devid_to = e->ext >> 8;
- set_dev_entry_from_acpi(iommu, devid , e->flags, 0);
- set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
- amd_iommu_alias_table[devid] = devid_to;
- break;
- case IVHD_DEV_ALIAS_RANGE:
-
- DUMP_printk(" DEV_ALIAS_RANGE\t\t "
- "devid: %02x:%02x.%x flags: %02x "
- "devid_to: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags,
- PCI_BUS(e->ext >> 8),
- PCI_SLOT(e->ext >> 8),
- PCI_FUNC(e->ext >> 8));
-
- devid_start = e->devid;
- flags = e->flags;
- devid_to = e->ext >> 8;
- ext_flags = 0;
- alias = true;
- break;
- case IVHD_DEV_EXT_SELECT:
-
- DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
- "flags: %02x ext: %08x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags, e->ext);
-
- devid = e->devid;
- set_dev_entry_from_acpi(iommu, devid, e->flags,
- e->ext);
- break;
- case IVHD_DEV_EXT_SELECT_RANGE:
-
- DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
- "%02x:%02x.%x flags: %02x ext: %08x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags, e->ext);
-
- devid_start = e->devid;
- flags = e->flags;
- ext_flags = e->ext;
- alias = false;
- break;
- case IVHD_DEV_RANGE_END:
-
- DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid));
-
- devid = e->devid;
- for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
- if (alias) {
- amd_iommu_alias_table[dev_i] = devid_to;
- set_dev_entry_from_acpi(iommu,
- devid_to, flags, ext_flags);
- }
- set_dev_entry_from_acpi(iommu, dev_i,
- flags, ext_flags);
- }
- break;
- default:
- break;
- }
-
- p += ivhd_entry_length(p);
- }
-}
-
-/* Initializes the device->iommu mapping for the driver */
-static int __init init_iommu_devices(struct amd_iommu *iommu)
-{
- u16 i;
-
- for (i = iommu->first_device; i <= iommu->last_device; ++i)
- set_iommu_for_device(iommu, i);
-
- return 0;
-}
-
-static void __init free_iommu_one(struct amd_iommu *iommu)
-{
- free_command_buffer(iommu);
- free_event_buffer(iommu);
- iommu_unmap_mmio_space(iommu);
-}
-
-static void __init free_iommu_all(void)
-{
- struct amd_iommu *iommu, *next;
-
- for_each_iommu_safe(iommu, next) {
- list_del(&iommu->list);
- free_iommu_one(iommu);
- kfree(iommu);
- }
-}
-
-/*
- * This function clues the initialization function for one IOMMU
- * together and also allocates the command buffer and programs the
- * hardware. It does NOT enable the IOMMU. This is done afterwards.
- */
-static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
-{
- spin_lock_init(&iommu->lock);
-
- /* Add IOMMU to internal data structures */
- list_add_tail(&iommu->list, &amd_iommu_list);
- iommu->index = amd_iommus_present++;
-
- if (unlikely(iommu->index >= MAX_IOMMUS)) {
- WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
- return -ENOSYS;
- }
-
- /* Index is fine - add IOMMU to the array */
- amd_iommus[iommu->index] = iommu;
-
- /*
- * Copy data from ACPI table entry to the iommu struct
- */
- iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
- if (!iommu->dev)
- return 1;
-
- iommu->cap_ptr = h->cap_ptr;
- iommu->pci_seg = h->pci_seg;
- iommu->mmio_phys = h->mmio_phys;
- iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
- if (!iommu->mmio_base)
- return -ENOMEM;
-
- iommu->cmd_buf = alloc_command_buffer(iommu);
- if (!iommu->cmd_buf)
- return -ENOMEM;
-
- iommu->evt_buf = alloc_event_buffer(iommu);
- if (!iommu->evt_buf)
- return -ENOMEM;
-
- iommu->int_enabled = false;
-
- init_iommu_from_pci(iommu);
- init_iommu_from_acpi(iommu, h);
- init_iommu_devices(iommu);
-
- if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
- amd_iommu_np_cache = true;
-
- return pci_enable_device(iommu->dev);
-}
-
-/*
- * Iterates over all IOMMU entries in the ACPI table, allocates the
- * IOMMU structure and initializes it with init_iommu_one()
- */
-static int __init init_iommu_all(struct acpi_table_header *table)
-{
- u8 *p = (u8 *)table, *end = (u8 *)table;
- struct ivhd_header *h;
- struct amd_iommu *iommu;
- int ret;
-
- end += table->length;
- p += IVRS_HEADER_LENGTH;
-
- while (p < end) {
- h = (struct ivhd_header *)p;
- switch (*p) {
- case ACPI_IVHD_TYPE:
-
- DUMP_printk("device: %02x:%02x.%01x cap: %04x "
- "seg: %d flags: %01x info %04x\n",
- PCI_BUS(h->devid), PCI_SLOT(h->devid),
- PCI_FUNC(h->devid), h->cap_ptr,
- h->pci_seg, h->flags, h->info);
- DUMP_printk(" mmio-addr: %016llx\n",
- h->mmio_phys);
-
- iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
- if (iommu == NULL) {
- amd_iommu_init_err = -ENOMEM;
- return 0;
- }
-
- ret = init_iommu_one(iommu, h);
- if (ret) {
- amd_iommu_init_err = ret;
- return 0;
- }
- break;
- default:
- break;
- }
- p += h->length;
-
- }
- WARN_ON(p != end);
-
- return 0;
-}
-
-/****************************************************************************
- *
- * The following functions initialize the MSI interrupts for all IOMMUs
- * in the system. Its a bit challenging because there could be multiple
- * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
- * pci_dev.
- *
- ****************************************************************************/
-
-static int iommu_setup_msi(struct amd_iommu *iommu)
-{
- int r;
-
- if (pci_enable_msi(iommu->dev))
- return 1;
-
- r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
- IRQF_SAMPLE_RANDOM,
- "AMD-Vi",
- NULL);
-
- if (r) {
- pci_disable_msi(iommu->dev);
- return 1;
- }
-
- iommu->int_enabled = true;
- iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
-
- return 0;
-}
-
-static int iommu_init_msi(struct amd_iommu *iommu)
-{
- if (iommu->int_enabled)
- return 0;
-
- if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
- return iommu_setup_msi(iommu);
-
- return 1;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the third pass of parsing the ACPI
- * table. In this last pass the memory mapping requirements are
- * gathered (like exclusion and unity mapping reanges).
- *
- ****************************************************************************/
-
-static void __init free_unity_maps(void)
-{
- struct unity_map_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
- list_del(&entry->list);
- kfree(entry);
- }
-}
-
-/* called when we find an exclusion range definition in ACPI */
-static int __init init_exclusion_range(struct ivmd_header *m)
-{
- int i;
-
- switch (m->type) {
- case ACPI_IVMD_TYPE:
- set_device_exclusion_range(m->devid, m);
- break;
- case ACPI_IVMD_TYPE_ALL:
- for (i = 0; i <= amd_iommu_last_bdf; ++i)
- set_device_exclusion_range(i, m);
- break;
- case ACPI_IVMD_TYPE_RANGE:
- for (i = m->devid; i <= m->aux; ++i)
- set_device_exclusion_range(i, m);
- break;
- default:
- break;
- }
-
- return 0;
-}
-
-/* called for unity map ACPI definition */
-static int __init init_unity_map_range(struct ivmd_header *m)
-{
- struct unity_map_entry *e = 0;
- char *s;
-
- e = kzalloc(sizeof(*e), GFP_KERNEL);
- if (e == NULL)
- return -ENOMEM;
-
- switch (m->type) {
- default:
- kfree(e);
- return 0;
- case ACPI_IVMD_TYPE:
- s = "IVMD_TYPEi\t\t\t";
- e->devid_start = e->devid_end = m->devid;
- break;
- case ACPI_IVMD_TYPE_ALL:
- s = "IVMD_TYPE_ALL\t\t";
- e->devid_start = 0;
- e->devid_end = amd_iommu_last_bdf;
- break;
- case ACPI_IVMD_TYPE_RANGE:
- s = "IVMD_TYPE_RANGE\t\t";
- e->devid_start = m->devid;
- e->devid_end = m->aux;
- break;
- }
- e->address_start = PAGE_ALIGN(m->range_start);
- e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
- e->prot = m->flags >> 1;
-
- DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
- " range_start: %016llx range_end: %016llx flags: %x\n", s,
- PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
- PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
- PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
- e->address_start, e->address_end, m->flags);
-
- list_add_tail(&e->list, &amd_iommu_unity_map);
-
- return 0;
-}
-
-/* iterates over all memory definitions we find in the ACPI table */
-static int __init init_memory_definitions(struct acpi_table_header *table)
-{
- u8 *p = (u8 *)table, *end = (u8 *)table;
- struct ivmd_header *m;
-
- end += table->length;
- p += IVRS_HEADER_LENGTH;
-
- while (p < end) {
- m = (struct ivmd_header *)p;
- if (m->flags & IVMD_FLAG_EXCL_RANGE)
- init_exclusion_range(m);
- else if (m->flags & IVMD_FLAG_UNITY_MAP)
- init_unity_map_range(m);
-
- p += m->length;
- }
-
- return 0;
-}
-
-/*
- * Init the device table to not allow DMA access for devices and
- * suppress all page faults
- */
-static void init_device_table(void)
-{
- u16 devid;
-
- for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
- set_dev_entry_bit(devid, DEV_ENTRY_VALID);
- set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
- }
-}
-
-static void iommu_init_flags(struct amd_iommu *iommu)
-{
- iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
- iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
-
- iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
- iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
-
- iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
- iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
-
- iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
- iommu_feature_disable(iommu, CONTROL_ISOC_EN);
-
- /*
- * make IOMMU memory accesses cache coherent
- */
- iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
-}
-
-static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
-{
- int i, j;
- u32 ioc_feature_control;
- struct pci_dev *pdev = NULL;
-
- /* RD890 BIOSes may not have completely reconfigured the iommu */
- if (!is_rd890_iommu(iommu->dev))
- return;
-
- /*
- * First, we need to ensure that the iommu is enabled. This is
- * controlled by a register in the northbridge
- */
- pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
-
- if (!pdev)
- return;
-
- /* Select Northbridge indirect register 0x75 and enable writing */
- pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
- pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
-
- /* Enable the iommu */
- if (!(ioc_feature_control & 0x1))
- pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
-
- pci_dev_put(pdev);
-
- /* Restore the iommu BAR */
- pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
- iommu->stored_addr_lo);
- pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
- iommu->stored_addr_hi);
-
- /* Restore the l1 indirect regs for each of the 6 l1s */
- for (i = 0; i < 6; i++)
- for (j = 0; j < 0x12; j++)
- iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
-
- /* Restore the l2 indirect regs */
- for (i = 0; i < 0x83; i++)
- iommu_write_l2(iommu, i, iommu->stored_l2[i]);
-
- /* Lock PCI setup registers */
- pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
- iommu->stored_addr_lo | 1);
-}
-
-/*
- * This function finally enables all IOMMUs found in the system after
- * they have been initialized
- */
-static void enable_iommus(void)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu) {
- iommu_disable(iommu);
- iommu_init_flags(iommu);
- iommu_set_device_table(iommu);
- iommu_enable_command_buffer(iommu);
- iommu_enable_event_buffer(iommu);
- iommu_set_exclusion_range(iommu);
- iommu_init_msi(iommu);
- iommu_enable(iommu);
- }
-}
-
-static void disable_iommus(void)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu)
- iommu_disable(iommu);
-}
-
-/*
- * Suspend/Resume support
- * disable suspend until real resume implemented
- */
-
-static void amd_iommu_resume(void)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu)
- iommu_apply_resume_quirks(iommu);
-
- /* re-load the hardware */
- enable_iommus();
-
- /*
- * we have to flush after the IOMMUs are enabled because a
- * disabled IOMMU will never execute the commands we send
- */
- amd_iommu_flush_all_devices();
- amd_iommu_flush_all_domains();
-}
-
-static int amd_iommu_suspend(void)
-{
- /* disable IOMMUs to go out of the way for BIOS */
- disable_iommus();
-
- return 0;
-}
-
-static struct syscore_ops amd_iommu_syscore_ops = {
- .suspend = amd_iommu_suspend,
- .resume = amd_iommu_resume,
-};
-
-/*
- * This is the core init function for AMD IOMMU hardware in the system.
- * This function is called from the generic x86 DMA layer initialization
- * code.
- *
- * This function basically parses the ACPI table for AMD IOMMU (IVRS)
- * three times:
- *
- * 1 pass) Find the highest PCI device id the driver has to handle.
- * Upon this information the size of the data structures is
- * determined that needs to be allocated.
- *
- * 2 pass) Initialize the data structures just allocated with the
- * information in the ACPI table about available AMD IOMMUs
- * in the system. It also maps the PCI devices in the
- * system to specific IOMMUs
- *
- * 3 pass) After the basic data structures are allocated and
- * initialized we update them with information about memory
- * remapping requirements parsed out of the ACPI table in
- * this last pass.
- *
- * After that the hardware is initialized and ready to go. In the last
- * step we do some Linux specific things like registering the driver in
- * the dma_ops interface and initializing the suspend/resume support
- * functions. Finally it prints some information about AMD IOMMUs and
- * the driver state and enables the hardware.
- */
-static int __init amd_iommu_init(void)
-{
- int i, ret = 0;
-
- /*
- * First parse ACPI tables to find the largest Bus/Dev/Func
- * we need to handle. Upon this information the shared data
- * structures for the IOMMUs in the system will be allocated
- */
- if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
- return -ENODEV;
-
- ret = amd_iommu_init_err;
- if (ret)
- goto out;
-
- dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
- alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
- rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
-
- ret = -ENOMEM;
-
- /* Device table - directly used by all IOMMUs */
- amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(dev_table_size));
- if (amd_iommu_dev_table == NULL)
- goto out;
-
- /*
- * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
- * IOMMU see for that device
- */
- amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
- get_order(alias_table_size));
- if (amd_iommu_alias_table == NULL)
- goto free;
-
- /* IOMMU rlookup table - find the IOMMU for a specific device */
- amd_iommu_rlookup_table = (void *)__get_free_pages(
- GFP_KERNEL | __GFP_ZERO,
- get_order(rlookup_table_size));
- if (amd_iommu_rlookup_table == NULL)
- goto free;
-
- amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
- GFP_KERNEL | __GFP_ZERO,
- get_order(MAX_DOMAIN_ID/8));
- if (amd_iommu_pd_alloc_bitmap == NULL)
- goto free;
-
- /* init the device table */
- init_device_table();
-
- /*
- * let all alias entries point to itself
- */
- for (i = 0; i <= amd_iommu_last_bdf; ++i)
- amd_iommu_alias_table[i] = i;
-
- /*
- * never allocate domain 0 because its used as the non-allocated and
- * error value placeholder
- */
- amd_iommu_pd_alloc_bitmap[0] = 1;
-
- spin_lock_init(&amd_iommu_pd_lock);
-
- /*
- * now the data structures are allocated and basically initialized
- * start the real acpi table scan
- */
- ret = -ENODEV;
- if (acpi_table_parse("IVRS", init_iommu_all) != 0)
- goto free;
-
- if (amd_iommu_init_err) {
- ret = amd_iommu_init_err;
- goto free;
- }
-
- if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
- goto free;
-
- if (amd_iommu_init_err) {
- ret = amd_iommu_init_err;
- goto free;
- }
-
- ret = amd_iommu_init_devices();
- if (ret)
- goto free;
-
- enable_iommus();
-
- if (iommu_pass_through)
- ret = amd_iommu_init_passthrough();
- else
- ret = amd_iommu_init_dma_ops();
-
- if (ret)
- goto free_disable;
-
- amd_iommu_init_api();
-
- amd_iommu_init_notifier();
-
- register_syscore_ops(&amd_iommu_syscore_ops);
-
- if (iommu_pass_through)
- goto out;
-
- if (amd_iommu_unmap_flush)
- printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
- else
- printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
-
- x86_platform.iommu_shutdown = disable_iommus;
-out:
- return ret;
-
-free_disable:
- disable_iommus();
-
-free:
- amd_iommu_uninit_devices();
-
- free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
- get_order(MAX_DOMAIN_ID/8));
-
- free_pages((unsigned long)amd_iommu_rlookup_table,
- get_order(rlookup_table_size));
-
- free_pages((unsigned long)amd_iommu_alias_table,
- get_order(alias_table_size));
-
- free_pages((unsigned long)amd_iommu_dev_table,
- get_order(dev_table_size));
-
- free_iommu_all();
-
- free_unity_maps();
-
-#ifdef CONFIG_GART_IOMMU
- /*
- * We failed to initialize the AMD IOMMU - try fallback to GART
- * if possible.
- */
- gart_iommu_init();
-
-#endif
-
- goto out;
-}
-
-/****************************************************************************
- *
- * Early detect code. This code runs at IOMMU detection time in the DMA
- * layer. It just looks if there is an IVRS ACPI table to detect AMD
- * IOMMUs
- *
- ****************************************************************************/
-static int __init early_amd_iommu_detect(struct acpi_table_header *table)
-{
- return 0;
-}
-
-int __init amd_iommu_detect(void)
-{
- if (no_iommu || (iommu_detected && !gart_iommu_aperture))
- return -ENODEV;
-
- if (amd_iommu_disabled)
- return -ENODEV;
-
- if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
- iommu_detected = 1;
- amd_iommu_detected = 1;
- x86_init.iommu.iommu_init = amd_iommu_init;
-
- /* Make sure ACS will be enabled */
- pci_request_acs();
- return 1;
- }
- return -ENODEV;
-}
-
-/****************************************************************************
- *
- * Parsing functions for the AMD IOMMU specific kernel command line
- * options.
- *
- ****************************************************************************/
-
-static int __init parse_amd_iommu_dump(char *str)
-{
- amd_iommu_dump = true;
-
- return 1;
-}
-
-static int __init parse_amd_iommu_options(char *str)
-{
- for (; *str; ++str) {
- if (strncmp(str, "fullflush", 9) == 0)
- amd_iommu_unmap_flush = true;
- if (strncmp(str, "off", 3) == 0)
- amd_iommu_disabled = true;
- }
-
- return 1;
-}
-
-__setup("amd_iommu_dump", parse_amd_iommu_dump);
-__setup("amd_iommu=", parse_amd_iommu_options);
-
-IOMMU_INIT_FINISH(amd_iommu_detect,
- gart_iommu_hole_init,
- 0,
- 0);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index cd1ffed4ee2..afdc3f756de 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -27,15 +27,12 @@
* timer, but by default APB timer has higher rating than local APIC timers.
*/
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
#include <linux/delay.h>
+#include <linux/dw_apb_timer.h>
#include <linux/errno.h>
#include <linux/init.h>
-#include <linux/sysdev.h>
#include <linux/slab.h>
#include <linux/pm.h>
-#include <linux/pci.h>
#include <linux/sfi.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
@@ -44,76 +41,48 @@
#include <asm/fixmap.h>
#include <asm/apb_timer.h>
#include <asm/mrst.h>
+#include <asm/time.h>
-#define APBT_MASK CLOCKSOURCE_MASK(32)
-#define APBT_SHIFT 22
#define APBT_CLOCKEVENT_RATING 110
#define APBT_CLOCKSOURCE_RATING 250
-#define APBT_MIN_DELTA_USEC 200
-#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
#define APBT_CLOCKEVENT0_NUM (0)
-#define APBT_CLOCKEVENT1_NUM (1)
#define APBT_CLOCKSOURCE_NUM (2)
-static unsigned long apbt_address;
+static phys_addr_t apbt_address;
static int apb_timer_block_enabled;
static void __iomem *apbt_virt_address;
-static int phy_cs_timer_id;
/*
* Common DW APB timer info
*/
-static uint64_t apbt_freq;
-
-static void apbt_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt);
-static int apbt_next_event(unsigned long delta,
- struct clock_event_device *evt);
-static cycle_t apbt_read_clocksource(struct clocksource *cs);
-static void apbt_restart_clocksource(struct clocksource *cs);
+static unsigned long apbt_freq;
struct apbt_dev {
- struct clock_event_device evt;
- unsigned int num;
- int cpu;
- unsigned int irq;
- unsigned int tick;
- unsigned int count;
- unsigned int flags;
- char name[10];
+ struct dw_apb_clock_event_device *timer;
+ unsigned int num;
+ int cpu;
+ unsigned int irq;
+ char name[10];
};
-static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
+static struct dw_apb_clocksource *clocksource_apbt;
-#ifdef CONFIG_SMP
-static unsigned int apbt_num_timers_used;
-static struct apbt_dev *apbt_devs;
-#endif
-
-static inline unsigned long apbt_readl_reg(unsigned long a)
+static inline void __iomem *adev_virt_addr(struct apbt_dev *adev)
{
- return readl(apbt_virt_address + a);
+ return apbt_virt_address + adev->num * APBTMRS_REG_SIZE;
}
-static inline void apbt_writel_reg(unsigned long d, unsigned long a)
-{
- writel(d, apbt_virt_address + a);
-}
-
-static inline unsigned long apbt_readl(int n, unsigned long a)
-{
- return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
-}
+static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
-static inline void apbt_writel(int n, unsigned long d, unsigned long a)
-{
- writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE);
-}
+#ifdef CONFIG_SMP
+static unsigned int apbt_num_timers_used;
+#endif
static inline void apbt_set_mapping(void)
{
struct sfi_timer_table_entry *mtmr;
+ int phy_cs_timer_id = 0;
if (apbt_virt_address) {
pr_debug("APBT base already mapped\n");
@@ -125,21 +94,18 @@ static inline void apbt_set_mapping(void)
APBT_CLOCKEVENT0_NUM);
return;
}
- apbt_address = (unsigned long)mtmr->phys_addr;
+ apbt_address = (phys_addr_t)mtmr->phys_addr;
if (!apbt_address) {
printk(KERN_WARNING "No timer base from SFI, use default\n");
apbt_address = APBT_DEFAULT_BASE;
}
apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
- if (apbt_virt_address) {
- pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\
- (void *)apbt_address, (void *)apbt_virt_address);
- } else {
- pr_debug("Failed mapping APBT phy address at %p\n",\
- (void *)apbt_address);
+ if (!apbt_virt_address) {
+ pr_debug("Failed mapping APBT phy address at %lu\n",\
+ (unsigned long)apbt_address);
goto panic_noapbt;
}
- apbt_freq = mtmr->freq_hz / USEC_PER_SEC;
+ apbt_freq = mtmr->freq_hz;
sfi_free_mtmr(mtmr);
/* Now figure out the physical timer id for clocksource device */
@@ -148,9 +114,14 @@ static inline void apbt_set_mapping(void)
goto panic_noapbt;
/* Now figure out the physical timer id */
- phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff)
- / APBTMRS_REG_SIZE;
- pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id);
+ pr_debug("Use timer %d for clocksource\n",
+ (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE);
+ phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) /
+ APBTMRS_REG_SIZE;
+
+ clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING,
+ "apbt0", apbt_virt_address + phy_cs_timer_id *
+ APBTMRS_REG_SIZE, apbt_freq);
return;
panic_noapbt:
@@ -172,83 +143,6 @@ static inline int is_apbt_capable(void)
return apbt_virt_address ? 1 : 0;
}
-static struct clocksource clocksource_apbt = {
- .name = "apbt",
- .rating = APBT_CLOCKSOURCE_RATING,
- .read = apbt_read_clocksource,
- .mask = APBT_MASK,
- .shift = APBT_SHIFT,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
- .resume = apbt_restart_clocksource,
-};
-
-/* boot APB clock event device */
-static struct clock_event_device apbt_clockevent = {
- .name = "apbt0",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = apbt_set_mode,
- .set_next_event = apbt_next_event,
- .shift = APBT_SHIFT,
- .irq = 0,
- .rating = APBT_CLOCKEVENT_RATING,
-};
-
-/*
- * start count down from 0xffff_ffff. this is done by toggling the enable bit
- * then load initial load count to ~0.
- */
-static void apbt_start_counter(int n)
-{
- unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
- apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
- /* enable, mask interrupt */
- ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
- ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
- /* read it once to get cached counter value initialized */
- apbt_read_clocksource(&clocksource_apbt);
-}
-
-static irqreturn_t apbt_interrupt_handler(int irq, void *data)
-{
- struct apbt_dev *dev = (struct apbt_dev *)data;
- struct clock_event_device *aevt = &dev->evt;
-
- if (!aevt->event_handler) {
- printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
- dev->num);
- return IRQ_NONE;
- }
- aevt->event_handler(aevt);
- return IRQ_HANDLED;
-}
-
-static void apbt_restart_clocksource(struct clocksource *cs)
-{
- apbt_start_counter(phy_cs_timer_id);
-}
-
-static void apbt_enable_int(int n)
-{
- unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
- /* clear pending intr */
- apbt_readl(n, APBTMR_N_EOI);
- ctrl &= ~APBTMR_CONTROL_INT;
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-}
-
-static void apbt_disable_int(int n)
-{
- unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-
- ctrl |= APBTMR_CONTROL_INT;
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-}
-
-
static int __init apbt_clockevent_register(void)
{
struct sfi_timer_table_entry *mtmr;
@@ -261,45 +155,21 @@ static int __init apbt_clockevent_register(void)
return -ENODEV;
}
- /*
- * We need to calculate the scaled math multiplication factor for
- * nanosecond to apbt tick conversion.
- * mult = (nsec/cycle)*2^APBT_SHIFT
- */
- apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
- , NSEC_PER_SEC, APBT_SHIFT);
-
- /* Calculate the min / max delta */
- apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
- &apbt_clockevent);
- apbt_clockevent.min_delta_ns = clockevent_delta2ns(
- APBT_MIN_DELTA_USEC*apbt_freq,
- &apbt_clockevent);
- /*
- * Start apbt with the boot cpu mask and make it
- * global if not used for per cpu timer.
- */
- apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
adev->num = smp_processor_id();
- memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
+ adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
+ mrst_timer_options == MRST_TIMER_LAPIC_APBT ?
+ APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
+ adev_virt_addr(adev), 0, apbt_freq);
+ /* Firmware does EOI handling for us. */
+ adev->timer->eoi = NULL;
if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
- adev->evt.rating = APBT_CLOCKEVENT_RATING - 100;
- global_clock_event = &adev->evt;
+ global_clock_event = &adev->timer->ced;
printk(KERN_DEBUG "%s clockevent registered as global\n",
global_clock_event->name);
}
- if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
- apbt_clockevent.name, adev)) {
- printk(KERN_ERR "Failed request IRQ for APBT%d\n",
- apbt_clockevent.irq);
- }
-
- clockevents_register_device(&adev->evt);
- /* Start APBT 0 interrupts */
- apbt_enable_int(APBT_CLOCKEVENT0_NUM);
+ dw_apb_clockevent_register(adev->timer);
sfi_free_mtmr(mtmr);
return 0;
@@ -317,52 +187,34 @@ static void apbt_setup_irq(struct apbt_dev *adev)
irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
/* APB timer irqs are set up as mp_irqs, timer is edge type */
__irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
-
- if (system_state == SYSTEM_BOOTING) {
- if (request_irq(adev->irq, apbt_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED |
- IRQF_NOBALANCING,
- adev->name, adev)) {
- printk(KERN_ERR "Failed request IRQ for APBT%d\n",
- adev->num);
- }
- } else
- enable_irq(adev->irq);
}
/* Should be called with per cpu */
void apbt_setup_secondary_clock(void)
{
struct apbt_dev *adev;
- struct clock_event_device *aevt;
int cpu;
/* Don't register boot CPU clockevent */
cpu = smp_processor_id();
if (!cpu)
return;
- /*
- * We need to calculate the scaled math multiplication factor for
- * nanosecond to apbt tick conversion.
- * mult = (nsec/cycle)*2^APBT_SHIFT
- */
- printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
- adev = &per_cpu(cpu_apbt_dev, cpu);
- aevt = &adev->evt;
- memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
- aevt->cpumask = cpumask_of(cpu);
- aevt->name = adev->name;
- aevt->mode = CLOCK_EVT_MODE_UNUSED;
+ adev = &__get_cpu_var(cpu_apbt_dev);
+ if (!adev->timer) {
+ adev->timer = dw_apb_clockevent_init(cpu, adev->name,
+ APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
+ adev->irq, apbt_freq);
+ adev->timer->eoi = NULL;
+ } else {
+ dw_apb_clockevent_resume(adev->timer);
+ }
- printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
- cpu, aevt->name, *(u32 *)aevt->cpumask);
+ printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
+ cpu, adev->name, adev->cpu);
apbt_setup_irq(adev);
-
- clockevents_register_device(aevt);
-
- apbt_enable_int(cpu);
+ dw_apb_clockevent_register(adev->timer);
return;
}
@@ -385,13 +237,12 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
switch (action & 0xf) {
case CPU_DEAD:
- disable_irq(adev->irq);
- apbt_disable_int(cpu);
+ dw_apb_clockevent_pause(adev->timer);
if (system_state == SYSTEM_RUNNING) {
pr_debug("skipping APBT CPU %lu offline\n", cpu);
} else if (adev) {
pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
- free_irq(adev->irq, adev);
+ dw_apb_clockevent_stop(adev->timer);
}
break;
default:
@@ -416,116 +267,16 @@ void apbt_setup_secondary_clock(void) {}
#endif /* CONFIG_SMP */
-static void apbt_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- unsigned long ctrl;
- uint64_t delta;
- int timer_num;
- struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
-
- BUG_ON(!apbt_virt_address);
-
- timer_num = adev->num;
- pr_debug("%s CPU %d timer %d mode=%d\n",
- __func__, first_cpu(*evt->cpumask), timer_num, mode);
-
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
- delta >>= apbt_clockevent.shift;
- ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
- ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- /*
- * DW APB p. 46, have to disable timer before load counter,
- * may cause sync problem.
- */
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- udelay(1);
- pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
- apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
- ctrl |= APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- break;
- /* APB timer does not have one-shot mode, use free running mode */
- case CLOCK_EVT_MODE_ONESHOT:
- ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
- /*
- * set free running mode, this mode will let timer reload max
- * timeout which will give time (3min on 25MHz clock) to rearm
- * the next event, therefore emulate the one-shot mode.
- */
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
-
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- /* write again to set free running mode */
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-
- /*
- * DW APB p. 46, load counter with all 1s before starting free
- * running mode.
- */
- apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
- ctrl &= ~APBTMR_CONTROL_INT;
- ctrl |= APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- break;
-
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- apbt_disable_int(timer_num);
- ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- break;
-
- case CLOCK_EVT_MODE_RESUME:
- apbt_enable_int(timer_num);
- break;
- }
-}
-
-static int apbt_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- unsigned long ctrl;
- int timer_num;
-
- struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
-
- timer_num = adev->num;
- /* Disable timer */
- ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- /* write new count */
- apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
- ctrl |= APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- return 0;
-}
-
-static cycle_t apbt_read_clocksource(struct clocksource *cs)
-{
- unsigned long current_count;
-
- current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
- return (cycle_t)~current_count;
-}
-
static int apbt_clocksource_register(void)
{
u64 start, now;
cycle_t t1;
/* Start the counter, use timer 2 as source, timer 0/1 for event */
- apbt_start_counter(phy_cs_timer_id);
+ dw_apb_clocksource_start(clocksource_apbt);
/* Verify whether apbt counter works */
- t1 = apbt_read_clocksource(&clocksource_apbt);
+ t1 = dw_apb_clocksource_read(clocksource_apbt);
rdtscll(start);
/*
@@ -540,17 +291,10 @@ static int apbt_clocksource_register(void)
} while ((now - start) < 200000UL);
/* APBT is the only always on clocksource, it has to work! */
- if (t1 == apbt_read_clocksource(&clocksource_apbt))
+ if (t1 == dw_apb_clocksource_read(clocksource_apbt))
panic("APBT counter not counting. APBT disabled\n");
- /*
- * initialize and register APBT clocksource
- * convert that to ns/clock cycle
- * mult = (ns/c) * 2^APBT_SHIFT
- */
- clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
- (unsigned long) apbt_freq, APBT_SHIFT);
- clocksource_register(&clocksource_apbt);
+ dw_apb_clocksource_register(clocksource_apbt);
return 0;
}
@@ -574,10 +318,7 @@ void __init apbt_time_init(void)
if (apb_timer_block_enabled)
return;
apbt_set_mapping();
- if (apbt_virt_address) {
- pr_debug("Found APBT version 0x%lx\n",\
- apbt_readl_reg(APBTMRS_COMP_VERSION));
- } else
+ if (!apbt_virt_address)
goto out_noapbt;
/*
* Read the frequency and check for a sane value, for ESL model
@@ -585,7 +326,7 @@ void __init apbt_time_init(void)
*/
if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
- pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
+ pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq);
goto out_noapbt;
}
if (apbt_clocksource_register()) {
@@ -611,30 +352,20 @@ void __init apbt_time_init(void)
} else {
percpu_timer = 0;
apbt_num_timers_used = 1;
- adev = &per_cpu(cpu_apbt_dev, 0);
- adev->flags &= ~APBT_DEV_USED;
}
pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
/* here we set up per CPU timer data structure */
- apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
- GFP_KERNEL);
- if (!apbt_devs) {
- printk(KERN_ERR "Failed to allocate APB timer devices\n");
- return;
- }
for (i = 0; i < apbt_num_timers_used; i++) {
adev = &per_cpu(cpu_apbt_dev, i);
adev->num = i;
adev->cpu = i;
p_mtmr = sfi_get_mtmr(i);
- if (p_mtmr) {
- adev->tick = p_mtmr->freq_hz;
+ if (p_mtmr)
adev->irq = p_mtmr->irq;
- } else
+ else
printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
- adev->count = 0;
- sprintf(adev->name, "apbt%d", i);
+ snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i);
}
#endif
@@ -646,17 +377,8 @@ out_noapbt:
panic("failed to enable APB timer\n");
}
-static inline void apbt_disable(int n)
-{
- if (is_apbt_capable()) {
- unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
- }
-}
-
/* called before apb_timer_enable, use early map */
-unsigned long apbt_quick_calibrate()
+unsigned long apbt_quick_calibrate(void)
{
int i, scale;
u64 old, new;
@@ -665,31 +387,31 @@ unsigned long apbt_quick_calibrate()
u32 loop, shift;
apbt_set_mapping();
- apbt_start_counter(phy_cs_timer_id);
+ dw_apb_clocksource_start(clocksource_apbt);
/* check if the timer can count down, otherwise return */
- old = apbt_read_clocksource(&clocksource_apbt);
+ old = dw_apb_clocksource_read(clocksource_apbt);
i = 10000;
while (--i) {
- if (old != apbt_read_clocksource(&clocksource_apbt))
+ if (old != dw_apb_clocksource_read(clocksource_apbt))
break;
}
if (!i)
goto failed;
/* count 16 ms */
- loop = (apbt_freq * 1000) << 4;
+ loop = (apbt_freq / 1000) << 4;
/* restart the timer to ensure it won't get to 0 in the calibration */
- apbt_start_counter(phy_cs_timer_id);
+ dw_apb_clocksource_start(clocksource_apbt);
- old = apbt_read_clocksource(&clocksource_apbt);
+ old = dw_apb_clocksource_read(clocksource_apbt);
old += loop;
t1 = __native_read_tsc();
do {
- new = apbt_read_clocksource(&clocksource_apbt);
+ new = dw_apb_clocksource_read(clocksource_apbt);
} while (new < old);
t2 = __native_read_tsc();
@@ -701,7 +423,7 @@ unsigned long apbt_quick_calibrate()
return 0;
}
scale = (int)div_u64((t2 - t1), loop >> shift);
- khz = (scale * apbt_freq * 1000) >> shift;
+ khz = (scale * (apbt_freq / 1000)) >> shift;
printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
return khz;
failed:
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 73fb469908c..3d2661ca654 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -30,6 +30,22 @@
#include <asm/amd_nb.h>
#include <asm/x86_init.h>
+/*
+ * Using 512M as goal, in case kexec will load kernel_big
+ * that will do the on-position decompress, and could overlap with
+ * with the gart aperture that is used.
+ * Sequence:
+ * kernel_small
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kernel_small (gart area become e820_reserved)
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kerne_big (uncompressed size will be big than 64M or 128M)
+ * So don't use 512M below as gart iommu, leave the space for kernel
+ * code for safe.
+ */
+#define GART_MIN_ADDR (512ULL << 20)
+#define GART_MAX_ADDR (1ULL << 32)
+
int gart_iommu_aperture;
int gart_iommu_aperture_disabled __initdata;
int gart_iommu_aperture_allowed __initdata;
@@ -70,21 +86,9 @@ static u32 __init allocate_aperture(void)
* memory. Unfortunately we cannot move it up because that would
* make the IOMMU useless.
*/
- /*
- * using 512M as goal, in case kexec will load kernel_big
- * that will do the on position decompress, and could overlap with
- * that position with gart that is used.
- * sequende:
- * kernel_small
- * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
- * ==> kernel_small(gart area become e820_reserved)
- * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
- * ==> kerne_big (uncompressed size will be big than 64M or 128M)
- * so don't use 512M below as gart iommu, leave the space for kernel
- * code for safe
- */
- addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20);
- if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) {
+ addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
+ aper_size, aper_size);
+ if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) {
printk(KERN_ERR
"Cannot allocate aperture memory hole (%lx,%uK)\n",
addr, aper_size>>10);
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 3966b564ea4..767fd04f284 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,20 +2,25 @@
# Makefile for local APIC drivers and for the IO-APIC code
#
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o
obj-y += hw_nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SMP) += ipi.o
ifeq ($(CONFIG_X86_64),y)
-obj-y += apic_flat_64.o
-obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
-obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
+# APIC probe will depend on the listing order here
obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
+obj-y += apic_flat_64.o
endif
-obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
+# APIC probe will depend on the listing order here
obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
-obj-$(CONFIG_X86_ES7000) += es7000_32.o
obj-$(CONFIG_X86_SUMMIT) += summit_32.o
+obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
+obj-$(CONFIG_X86_ES7000) += es7000_32.o
+
+# For 32bit, probe_32 need to be listed last
+obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index fabf01eff77..9498b844518 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -27,6 +27,7 @@
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/timex.h>
+#include <linux/i8253.h>
#include <linux/dmar.h>
#include <linux/init.h>
#include <linux/cpu.h>
@@ -39,7 +40,6 @@
#include <asm/pgalloc.h>
#include <asm/atomic.h>
#include <asm/mpspec.h>
-#include <asm/i8253.h>
#include <asm/i8259.h>
#include <asm/proto.h>
#include <asm/apic.h>
@@ -48,6 +48,7 @@
#include <asm/hpet.h>
#include <asm/idle.h>
#include <asm/mtrr.h>
+#include <asm/time.h>
#include <asm/smp.h>
#include <asm/mce.h>
#include <asm/tsc.h>
@@ -390,7 +391,8 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
/*
* If mask=1, the LVT entry does not generate interrupts while mask=0
- * enables the vector. See also the BKDGs.
+ * enables the vector. See also the BKDGs. Must be called with
+ * preemption disabled.
*/
int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
@@ -505,7 +507,7 @@ static void __cpuinit setup_APIC_timer(void)
{
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
- if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) {
+ if (this_cpu_has(X86_FEATURE_ARAT)) {
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
/* Make LAPIC timer preferrable over percpu HPET */
lapic_clockevent.rating = 150;
@@ -1237,6 +1239,17 @@ void __cpuinit setup_local_APIC(void)
/* always use the value from LDR */
early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
logical_smp_processor_id();
+
+ /*
+ * Some NUMA implementations (NUMAQ) don't initialize apicid to
+ * node mapping during NUMA init. Now that logical apicid is
+ * guaranteed to be known, give it another chance. This is already
+ * a bit too late - percpu allocation has already happened without
+ * proper NUMA affinity.
+ */
+ if (apic->x86_32_numa_cpu_node)
+ set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
+ apic->x86_32_numa_cpu_node(cpu));
#endif
/*
@@ -1417,7 +1430,7 @@ void enable_x2apic(void)
rdmsr(MSR_IA32_APICBASE, msr, msr2);
if (!(msr & X2APIC_ENABLE)) {
printk_once(KERN_INFO "Enabling x2apic\n");
- wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
+ wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2);
}
}
#endif /* CONFIG_X86_X2APIC */
@@ -1450,7 +1463,6 @@ int __init enable_IR(void)
void __init enable_IR_x2apic(void)
{
unsigned long flags;
- struct IO_APIC_route_entry **ioapic_entries;
int ret, x2apic_enabled = 0;
int dmar_table_init_ret;
@@ -1458,13 +1470,7 @@ void __init enable_IR_x2apic(void)
if (dmar_table_init_ret && !x2apic_supported())
return;
- ioapic_entries = alloc_ioapic_entries();
- if (!ioapic_entries) {
- pr_err("Allocate ioapic_entries failed\n");
- goto out;
- }
-
- ret = save_IO_APIC_setup(ioapic_entries);
+ ret = save_ioapic_entries();
if (ret) {
pr_info("Saving IO-APIC state failed: %d\n", ret);
goto out;
@@ -1472,7 +1478,7 @@ void __init enable_IR_x2apic(void)
local_irq_save(flags);
legacy_pic->mask_all();
- mask_IO_APIC_setup(ioapic_entries);
+ mask_ioapic_entries();
if (dmar_table_init_ret)
ret = 0;
@@ -1503,14 +1509,11 @@ void __init enable_IR_x2apic(void)
nox2apic:
if (!ret) /* IR enabling failed */
- restore_IO_APIC_setup(ioapic_entries);
+ restore_ioapic_entries();
legacy_pic->restore_mask();
local_irq_restore(flags);
out:
- if (ioapic_entries)
- free_ioapic_entries(ioapic_entries);
-
if (x2apic_enabled)
return;
@@ -1812,30 +1815,41 @@ void smp_spurious_interrupt(struct pt_regs *regs)
*/
void smp_error_interrupt(struct pt_regs *regs)
{
- u32 v, v1;
+ u32 v0, v1;
+ u32 i = 0;
+ static const char * const error_interrupt_reason[] = {
+ "Send CS error", /* APIC Error Bit 0 */
+ "Receive CS error", /* APIC Error Bit 1 */
+ "Send accept error", /* APIC Error Bit 2 */
+ "Receive accept error", /* APIC Error Bit 3 */
+ "Redirectable IPI", /* APIC Error Bit 4 */
+ "Send illegal vector", /* APIC Error Bit 5 */
+ "Received illegal vector", /* APIC Error Bit 6 */
+ "Illegal register address", /* APIC Error Bit 7 */
+ };
exit_idle();
irq_enter();
/* First tickle the hardware, only then report what went on. -- REW */
- v = apic_read(APIC_ESR);
+ v0 = apic_read(APIC_ESR);
apic_write(APIC_ESR, 0);
v1 = apic_read(APIC_ESR);
ack_APIC_irq();
atomic_inc(&irq_err_count);
- /*
- * Here is what the APIC error bits mean:
- * 0: Send CS error
- * 1: Receive CS error
- * 2: Send accept error
- * 3: Receive accept error
- * 4: Reserved
- * 5: Send illegal vector
- * 6: Received illegal vector
- * 7: Illegal register address
- */
- pr_debug("APIC error on CPU%d: %02x(%02x)\n",
- smp_processor_id(), v , v1);
+ apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
+ smp_processor_id(), v0 , v1);
+
+ v1 = v1 & 0xff;
+ while (v1) {
+ if (v1 & 0x1)
+ apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
+ i++;
+ v1 >>= 1;
+ };
+
+ apic_printk(APIC_DEBUG, KERN_CONT "\n");
+
irq_exit();
}
@@ -2003,21 +2017,6 @@ void default_init_apic_ldr(void)
apic_write(APIC_LDR, val);
}
-#ifdef CONFIG_X86_32
-int default_x86_32_numa_cpu_node(int cpu)
-{
-#ifdef CONFIG_NUMA
- int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
-
- if (apicid != BAD_APICID)
- return __apicid_to_node[apicid];
- return NUMA_NO_NODE;
-#else
- return 0;
-#endif
-}
-#endif
-
/*
* Power management
*/
@@ -2088,28 +2087,20 @@ static void lapic_resume(void)
{
unsigned int l, h;
unsigned long flags;
- int maxlvt, ret;
- struct IO_APIC_route_entry **ioapic_entries = NULL;
+ int maxlvt;
if (!apic_pm_state.active)
return;
local_irq_save(flags);
if (intr_remapping_enabled) {
- ioapic_entries = alloc_ioapic_entries();
- if (!ioapic_entries) {
- WARN(1, "Alloc ioapic_entries in lapic resume failed.");
- goto restore;
- }
-
- ret = save_IO_APIC_setup(ioapic_entries);
- if (ret) {
- WARN(1, "Saving IO-APIC state failed: %d\n", ret);
- free_ioapic_entries(ioapic_entries);
- goto restore;
- }
-
- mask_IO_APIC_setup(ioapic_entries);
+ /*
+ * IO-APIC and PIC have their own resume routines.
+ * We just mask them here to make sure the interrupt
+ * subsystem is completely quiet while we enable x2apic
+ * and interrupt-remapping.
+ */
+ mask_ioapic_entries();
legacy_pic->mask_all();
}
@@ -2152,13 +2143,9 @@ static void lapic_resume(void)
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
- if (intr_remapping_enabled) {
+ if (intr_remapping_enabled)
reenable_intr_remapping(x2apic_mode);
- legacy_pic->restore_mask();
- restore_IO_APIC_setup(ioapic_entries);
- free_ioapic_entries(ioapic_entries);
- }
-restore:
+
local_irq_restore(flags);
}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 5652d31fe10..f7a41e4cae4 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -16,6 +16,7 @@
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/hardirq.h>
+#include <linux/module.h>
#include <asm/smp.h>
#include <asm/apic.h>
#include <asm/ipi.h>
@@ -24,6 +25,12 @@
#include <acpi/acpi_bus.h>
#endif
+static struct apic apic_physflat;
+static struct apic apic_flat;
+
+struct apic __read_mostly *apic = &apic_flat;
+EXPORT_SYMBOL_GPL(apic);
+
static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
return 1;
@@ -164,7 +171,7 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
return initial_apic_id >> index_msb;
}
-struct apic apic_flat = {
+static struct apic apic_flat = {
.name = "flat",
.probe = NULL,
.acpi_madt_oem_check = flat_acpi_madt_oem_check,
@@ -312,10 +319,18 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
return per_cpu(x86_cpu_to_apicid, cpu);
}
-struct apic apic_physflat = {
+static int physflat_probe(void)
+{
+ if (apic == &apic_physflat || num_possible_cpus() > 8)
+ return 1;
+
+ return 0;
+}
+
+static struct apic apic_physflat = {
.name = "physical flat",
- .probe = NULL,
+ .probe = physflat_probe,
.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
.apic_id_registered = flat_apic_id_registered,
@@ -369,3 +384,8 @@ struct apic apic_physflat = {
.wait_icr_idle = native_apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
};
+
+/*
+ * We need to check for physflat first, so this order is important.
+ */
+apic_drivers(apic_physflat, apic_flat);
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index f1baa2dc087..775b82bc655 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -119,14 +119,6 @@ static void noop_apic_write(u32 reg, u32 v)
WARN_ON_ONCE(cpu_has_apic && !disable_apic);
}
-#ifdef CONFIG_X86_32
-static int noop_x86_32_numa_cpu_node(int cpu)
-{
- /* we're always on node 0 */
- return 0;
-}
-#endif
-
struct apic apic_noop = {
.name = "noop",
.probe = noop_probe,
@@ -195,6 +187,5 @@ struct apic apic_noop = {
#ifdef CONFIG_X86_32
.x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
- .x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node,
#endif
};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 541a2e43165..efd737e827f 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -193,7 +193,7 @@ static int probe_bigsmp(void)
return dmi_bigsmp;
}
-struct apic apic_bigsmp = {
+static struct apic apic_bigsmp = {
.name = "bigsmp",
.probe = probe_bigsmp,
@@ -253,5 +253,14 @@ struct apic apic_bigsmp = {
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
.x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
- .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
};
+
+struct apic * __init generic_bigsmp_probe(void)
+{
+ if (probe_bigsmp())
+ return &apic_bigsmp;
+
+ return NULL;
+}
+
+apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 3e9de4854c5..9536b3fe43f 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -510,11 +510,6 @@ static void es7000_setup_apic_routing(void)
nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
}
-static int es7000_numa_cpu_node(int cpu)
-{
- return 0;
-}
-
static int es7000_cpu_present_to_apicid(int mps_cpu)
{
if (!mps_cpu)
@@ -625,7 +620,7 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
}
/* We've been warned by a false positive warning.Use __refdata to keep calm. */
-struct apic __refdata apic_es7000_cluster = {
+static struct apic __refdata apic_es7000_cluster = {
.name = "es7000",
.probe = probe_es7000,
@@ -688,10 +683,9 @@ struct apic __refdata apic_es7000_cluster = {
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
.x86_32_early_logical_apicid = es7000_early_logical_apicid,
- .x86_32_numa_cpu_node = es7000_numa_cpu_node,
};
-struct apic __refdata apic_es7000 = {
+static struct apic __refdata apic_es7000 = {
.name = "es7000",
.probe = probe_es7000,
@@ -752,5 +746,10 @@ struct apic __refdata apic_es7000 = {
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
.x86_32_early_logical_apicid = es7000_early_logical_apicid,
- .x86_32_numa_cpu_node = es7000_numa_cpu_node,
};
+
+/*
+ * Need to check for es7000 followed by es7000_cluster, so this order
+ * in apic_drivers is important.
+ */
+apic_drivers(apic_es7000, apic_es7000_cluster);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 5260fe91bcb..d5e57db0f7b 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -19,9 +19,9 @@
#include <linux/delay.h>
#ifdef CONFIG_HARDLOCKUP_DETECTOR
-u64 hw_nmi_get_sample_period(void)
+u64 hw_nmi_get_sample_period(int watchdog_thresh)
{
- return (u64)(cpu_khz) * 1000 * 60;
+ return (u64)(cpu_khz) * 1000 * watchdog_thresh;
}
#endif
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 45fd33d1fd3..8eb863e27ea 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -76,17 +76,40 @@ int sis_apic_bug = -1;
static DEFINE_RAW_SPINLOCK(ioapic_lock);
static DEFINE_RAW_SPINLOCK(vector_lock);
-/*
- * # of IRQ routing registers
- */
-int nr_ioapic_registers[MAX_IO_APICS];
+static struct ioapic {
+ /*
+ * # of IRQ routing registers
+ */
+ int nr_registers;
+ /*
+ * Saved state during suspend/resume, or while enabling intr-remap.
+ */
+ struct IO_APIC_route_entry *saved_registers;
+ /* I/O APIC config */
+ struct mpc_ioapic mp_config;
+ /* IO APIC gsi routing info */
+ struct mp_ioapic_gsi gsi_config;
+ DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} ioapics[MAX_IO_APICS];
-/* I/O APIC entries */
-struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
-int nr_ioapics;
+#define mpc_ioapic_ver(id) ioapics[id].mp_config.apicver
+
+int mpc_ioapic_id(int id)
+{
+ return ioapics[id].mp_config.apicid;
+}
-/* IO APIC gsi routing info */
-struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS];
+unsigned int mpc_ioapic_addr(int id)
+{
+ return ioapics[id].mp_config.apicaddr;
+}
+
+struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int id)
+{
+ return &ioapics[id].gsi_config;
+}
+
+int nr_ioapics;
/* The one past the highest gsi number used */
u32 gsi_top;
@@ -179,6 +202,14 @@ int __init arch_early_irq_init(void)
io_apic_irqs = ~0UL;
}
+ for (i = 0; i < nr_ioapics; i++) {
+ ioapics[i].saved_registers =
+ kzalloc(sizeof(struct IO_APIC_route_entry) *
+ ioapics[i].nr_registers, GFP_KERNEL);
+ if (!ioapics[i].saved_registers)
+ pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
+ }
+
cfg = irq_cfgx;
count = ARRAY_SIZE(irq_cfgx);
node = cpu_to_node(0);
@@ -297,7 +328,7 @@ struct io_apic {
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
+ + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
}
static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
@@ -573,7 +604,7 @@ static void clear_IO_APIC (void)
int apic, pin;
for (apic = 0; apic < nr_ioapics; apic++)
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
clear_IO_APIC_pin(apic, pin);
}
@@ -615,74 +646,43 @@ static int __init ioapic_pirq_setup(char *str)
__setup("pirq=", ioapic_pirq_setup);
#endif /* CONFIG_X86_32 */
-struct IO_APIC_route_entry **alloc_ioapic_entries(void)
-{
- int apic;
- struct IO_APIC_route_entry **ioapic_entries;
-
- ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
- GFP_KERNEL);
- if (!ioapic_entries)
- return 0;
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- ioapic_entries[apic] =
- kzalloc(sizeof(struct IO_APIC_route_entry) *
- nr_ioapic_registers[apic], GFP_KERNEL);
- if (!ioapic_entries[apic])
- goto nomem;
- }
-
- return ioapic_entries;
-
-nomem:
- while (--apic >= 0)
- kfree(ioapic_entries[apic]);
- kfree(ioapic_entries);
-
- return 0;
-}
-
/*
* Saves all the IO-APIC RTE's
*/
-int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+int save_ioapic_entries(void)
{
int apic, pin;
-
- if (!ioapic_entries)
- return -ENOMEM;
+ int err = 0;
for (apic = 0; apic < nr_ioapics; apic++) {
- if (!ioapic_entries[apic])
- return -ENOMEM;
+ if (!ioapics[apic].saved_registers) {
+ err = -ENOMEM;
+ continue;
+ }
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
- ioapic_entries[apic][pin] =
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+ ioapics[apic].saved_registers[pin] =
ioapic_read_entry(apic, pin);
}
- return 0;
+ return err;
}
/*
* Mask all IO APIC entries.
*/
-void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+void mask_ioapic_entries(void)
{
int apic, pin;
- if (!ioapic_entries)
- return;
-
for (apic = 0; apic < nr_ioapics; apic++) {
- if (!ioapic_entries[apic])
- break;
+ if (!ioapics[apic].saved_registers)
+ continue;
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
struct IO_APIC_route_entry entry;
- entry = ioapic_entries[apic][pin];
+ entry = ioapics[apic].saved_registers[pin];
if (!entry.mask) {
entry.mask = 1;
ioapic_write_entry(apic, pin, entry);
@@ -692,36 +692,23 @@ void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
}
/*
- * Restore IO APIC entries which was saved in ioapic_entries.
+ * Restore IO APIC entries which was saved in the ioapic structure.
*/
-int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+int restore_ioapic_entries(void)
{
int apic, pin;
- if (!ioapic_entries)
- return -ENOMEM;
-
for (apic = 0; apic < nr_ioapics; apic++) {
- if (!ioapic_entries[apic])
- return -ENOMEM;
+ if (!ioapics[apic].saved_registers)
+ continue;
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
ioapic_write_entry(apic, pin,
- ioapic_entries[apic][pin]);
+ ioapics[apic].saved_registers[pin]);
}
return 0;
}
-void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
-{
- int apic;
-
- for (apic = 0; apic < nr_ioapics; apic++)
- kfree(ioapic_entries[apic]);
-
- kfree(ioapic_entries);
-}
-
/*
* Find the IRQ entry number of a certain pin.
*/
@@ -731,7 +718,7 @@ static int find_irq_entry(int apic, int pin, int type)
for (i = 0; i < mp_irq_entries; i++)
if (mp_irqs[i].irqtype == type &&
- (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
+ (mp_irqs[i].dstapic == mpc_ioapic_id(apic) ||
mp_irqs[i].dstapic == MP_APIC_ALL) &&
mp_irqs[i].dstirq == pin)
return i;
@@ -773,7 +760,7 @@ static int __init find_isa_irq_apic(int irq, int type)
if (i < mp_irq_entries) {
int apic;
for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
+ if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic)
return apic;
}
}
@@ -942,6 +929,7 @@ static int pin_2_irq(int idx, int apic, int pin)
{
int irq;
int bus = mp_irqs[idx].srcbus;
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
/*
* Debugging check, we are in big trouble if this message pops up!
@@ -952,7 +940,7 @@ static int pin_2_irq(int idx, int apic, int pin)
if (test_bit(bus, mp_bus_not_pci)) {
irq = mp_irqs[idx].srcbusirq;
} else {
- u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
+ u32 gsi = gsi_cfg->gsi_base + pin;
if (gsi >= NR_IRQS_LEGACY)
irq = gsi;
@@ -1003,7 +991,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
int lbus = mp_irqs[i].srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+ if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic ||
mp_irqs[i].dstapic == MP_APIC_ALL)
break;
@@ -1222,7 +1210,7 @@ static inline int IO_APIC_irq_trigger(int irq)
int apic, idx, pin;
for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
idx = find_irq_entry(apic, pin, mp_INT);
if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
return irq_trigger(idx);
@@ -1307,6 +1295,16 @@ static int setup_ioapic_entry(int apic_id, int irq,
* irq handler will do the explicit EOI to the io-apic.
*/
ir_entry->vector = pin;
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
+ "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
+ "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
+ "Avail:%X Vector:%02X Dest:%08X "
+ "SID:%04X SQ:%X SVT:%X)\n",
+ apic_id, irte.present, irte.fpd, irte.dst_mode,
+ irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
+ irte.avail, irte.vector, irte.dest_id,
+ irte.sid, irte.sq, irte.svt);
} else {
entry->delivery_mode = apic->irq_delivery_mode;
entry->dest_mode = apic->irq_dest_mode;
@@ -1349,15 +1347,15 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
- "IRQ %d Mode:%i Active:%i)\n",
- apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector,
- irq, trigger, polarity);
+ "IRQ %d Mode:%i Active:%i Dest:%d)\n",
+ apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
+ irq, trigger, polarity, dest);
- if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
+ if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
dest, trigger, polarity, cfg->vector, pin)) {
printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
- mp_ioapics[apic_id].apicid, pin);
+ mpc_ioapic_id(apic_id), pin);
__clear_irq_vector(irq, cfg);
return;
}
@@ -1369,17 +1367,13 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
ioapic_write_entry(apic_id, pin, entry);
}
-static struct {
- DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
-} mp_ioapic_routing[MAX_IO_APICS];
-
static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
{
if (idx != -1)
return false;
apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
- mp_ioapics[apic_id].apicid, pin);
+ mpc_ioapic_id(apic_id), pin);
return true;
}
@@ -1389,7 +1383,7 @@ static void __init __io_apic_setup_irqs(unsigned int apic_id)
struct io_apic_irq_attr attr;
unsigned int pin, irq;
- for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
+ for (pin = 0; pin < ioapics[apic_id].nr_registers; pin++) {
idx = find_irq_entry(apic_id, pin, mp_INT);
if (io_apic_pin_not_connected(idx, apic_id, pin))
continue;
@@ -1511,7 +1505,7 @@ __apicdebuginit(void) print_IO_APIC(void)
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].apicid, nr_ioapic_registers[i]);
+ mpc_ioapic_id(i), ioapics[i].nr_registers);
/*
* We are a bit conservative about what we expect. We have to
@@ -1531,17 +1525,19 @@ __apicdebuginit(void) print_IO_APIC(void)
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(apic));
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
- printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
+ printk(KERN_DEBUG "....... : max redirection entries: %02X\n",
+ reg_01.bits.entries);
printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
- printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
+ printk(KERN_DEBUG "....... : IO APIC version: %02X\n",
+ reg_01.bits.version);
/*
* Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1566,31 +1562,60 @@ __apicdebuginit(void) print_IO_APIC(void)
printk(KERN_DEBUG ".... IRQ redirection table:\n");
- printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
- " Stat Dmod Deli Vect:\n");
+ if (intr_remapping_enabled) {
+ printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
+ " Pol Stat Indx2 Zero Vect:\n");
+ } else {
+ printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
+ " Stat Dmod Deli Vect:\n");
+ }
for (i = 0; i <= reg_01.bits.entries; i++) {
- struct IO_APIC_route_entry entry;
-
- entry = ioapic_read_entry(apic, i);
-
- printk(KERN_DEBUG " %02x %03X ",
- i,
- entry.dest
- );
+ if (intr_remapping_enabled) {
+ struct IO_APIC_route_entry entry;
+ struct IR_IO_APIC_route_entry *ir_entry;
+
+ entry = ioapic_read_entry(apic, i);
+ ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
+ printk(KERN_DEBUG " %02x %04X ",
+ i,
+ ir_entry->index
+ );
+ printk("%1d %1d %1d %1d %1d "
+ "%1d %1d %X %02X\n",
+ ir_entry->format,
+ ir_entry->mask,
+ ir_entry->trigger,
+ ir_entry->irr,
+ ir_entry->polarity,
+ ir_entry->delivery_status,
+ ir_entry->index2,
+ ir_entry->zero,
+ ir_entry->vector
+ );
+ } else {
+ struct IO_APIC_route_entry entry;
- printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
- entry.mask,
- entry.trigger,
- entry.irr,
- entry.polarity,
- entry.delivery_status,
- entry.dest_mode,
- entry.delivery_mode,
- entry.vector
- );
+ entry = ioapic_read_entry(apic, i);
+ printk(KERN_DEBUG " %02x %02X ",
+ i,
+ entry.dest
+ );
+ printk("%1d %1d %1d %1d %1d "
+ "%1d %1d %02X\n",
+ entry.mask,
+ entry.trigger,
+ entry.irr,
+ entry.polarity,
+ entry.delivery_status,
+ entry.dest_mode,
+ entry.delivery_mode,
+ entry.vector
+ );
+ }
}
}
+
printk(KERN_DEBUG "IRQ to pin mappings:\n");
for_each_active_irq(irq) {
struct irq_pin_list *entry;
@@ -1808,7 +1833,7 @@ __apicdebuginit(int) print_ICs(void)
return 0;
}
-fs_initcall(print_ICs);
+late_initcall(print_ICs);
/* Where if anywhere is the i8259 connect in external int mode */
@@ -1825,7 +1850,7 @@ void __init enable_IO_APIC(void)
for(apic = 0; apic < nr_ioapics; apic++) {
int pin;
/* See if any of the pins is in ExtINT mode */
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
struct IO_APIC_route_entry entry;
entry = ioapic_read_entry(apic, pin);
@@ -1949,14 +1974,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
reg_00.raw = io_apic_read(apic_id, 0);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- old_id = mp_ioapics[apic_id].apicid;
+ old_id = mpc_ioapic_id(apic_id);
- if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) {
+ if (mpc_ioapic_id(apic_id) >= get_physical_broadcast()) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic_id, mp_ioapics[apic_id].apicid);
+ apic_id, mpc_ioapic_id(apic_id));
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
reg_00.bits.ID);
- mp_ioapics[apic_id].apicid = reg_00.bits.ID;
+ ioapics[apic_id].mp_config.apicid = reg_00.bits.ID;
}
/*
@@ -1965,9 +1990,9 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
if (apic->check_apicid_used(&phys_id_present_map,
- mp_ioapics[apic_id].apicid)) {
+ mpc_ioapic_id(apic_id))) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic_id, mp_ioapics[apic_id].apicid);
+ apic_id, mpc_ioapic_id(apic_id));
for (i = 0; i < get_physical_broadcast(); i++)
if (!physid_isset(i, phys_id_present_map))
break;
@@ -1976,13 +2001,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
i);
physid_set(i, phys_id_present_map);
- mp_ioapics[apic_id].apicid = i;
+ ioapics[apic_id].mp_config.apicid = i;
} else {
physid_mask_t tmp;
- apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
+ apic->apicid_to_cpu_present(mpc_ioapic_id(apic_id),
+ &tmp);
apic_printk(APIC_VERBOSE, "Setting %d in the "
"phys_id_present_map\n",
- mp_ioapics[apic_id].apicid);
+ mpc_ioapic_id(apic_id));
physids_or(phys_id_present_map, phys_id_present_map, tmp);
}
@@ -1990,24 +2016,24 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
* We need to adjust the IRQ routing table
* if the ID changed.
*/
- if (old_id != mp_ioapics[apic_id].apicid)
+ if (old_id != mpc_ioapic_id(apic_id))
for (i = 0; i < mp_irq_entries; i++)
if (mp_irqs[i].dstapic == old_id)
mp_irqs[i].dstapic
- = mp_ioapics[apic_id].apicid;
+ = mpc_ioapic_id(apic_id);
/*
* Update the ID register according to the right value
* from the MPC table if they are different.
*/
- if (mp_ioapics[apic_id].apicid == reg_00.bits.ID)
+ if (mpc_ioapic_id(apic_id) == reg_00.bits.ID)
continue;
apic_printk(APIC_VERBOSE, KERN_INFO
"...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic_id].apicid);
+ mpc_ioapic_id(apic_id));
- reg_00.bits.ID = mp_ioapics[apic_id].apicid;
+ reg_00.bits.ID = mpc_ioapic_id(apic_id);
raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic_id, 0, reg_00.raw);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2018,7 +2044,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic_id, 0);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
+ if (reg_00.bits.ID != mpc_ioapic_id(apic_id))
printk("could not set ID!\n");
else
apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2404,7 +2430,7 @@ static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
raw_spin_lock_irqsave(&ioapic_lock, flags);
for_each_irq_pin(entry, cfg->irq_2_pin) {
- if (mp_ioapics[entry->apic].apicver >= 0x20) {
+ if (mpc_ioapic_ver(entry->apic) >= 0x20) {
/*
* Intr-remapping uses pin number as the virtual vector
* in the RTE. Actual vector is programmed in
@@ -2918,49 +2944,19 @@ static int __init io_apic_bug_finalize(void)
late_initcall(io_apic_bug_finalize);
-static struct IO_APIC_route_entry *ioapic_saved_data[MAX_IO_APICS];
-
-static void suspend_ioapic(int ioapic_id)
+static void resume_ioapic_id(int ioapic_id)
{
- struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
- int i;
-
- if (!saved_data)
- return;
-
- for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
- saved_data[i] = ioapic_read_entry(ioapic_id, i);
-}
-
-static int ioapic_suspend(void)
-{
- int ioapic_id;
-
- for (ioapic_id = 0; ioapic_id < nr_ioapics; ioapic_id++)
- suspend_ioapic(ioapic_id);
-
- return 0;
-}
-
-static void resume_ioapic(int ioapic_id)
-{
- struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
unsigned long flags;
union IO_APIC_reg_00 reg_00;
- int i;
- if (!saved_data)
- return;
raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(ioapic_id, 0);
- if (reg_00.bits.ID != mp_ioapics[ioapic_id].apicid) {
- reg_00.bits.ID = mp_ioapics[ioapic_id].apicid;
+ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_id)) {
+ reg_00.bits.ID = mpc_ioapic_id(ioapic_id);
io_apic_write(ioapic_id, 0, reg_00.raw);
}
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
- ioapic_write_entry(ioapic_id, i, saved_data[i]);
}
static void ioapic_resume(void)
@@ -2968,28 +2964,18 @@ static void ioapic_resume(void)
int ioapic_id;
for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--)
- resume_ioapic(ioapic_id);
+ resume_ioapic_id(ioapic_id);
+
+ restore_ioapic_entries();
}
static struct syscore_ops ioapic_syscore_ops = {
- .suspend = ioapic_suspend,
+ .suspend = save_ioapic_entries,
.resume = ioapic_resume,
};
static int __init ioapic_init_ops(void)
{
- int i;
-
- for (i = 0; i < nr_ioapics; i++) {
- unsigned int size;
-
- size = nr_ioapic_registers[i]
- * sizeof(struct IO_APIC_route_entry);
- ioapic_saved_data[i] = kzalloc(size, GFP_KERNEL);
- if (!ioapic_saved_data[i])
- pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
- }
-
register_syscore_ops(&ioapic_syscore_ops);
return 0;
@@ -3592,14 +3578,14 @@ int io_apic_setup_irq_pin_once(unsigned int irq, int node,
int ret;
/* Avoid redundant programming */
- if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) {
+ if (test_bit(pin, ioapics[id].pin_programmed)) {
pr_debug("Pin %d-%d already programmed\n",
- mp_ioapics[id].apicid, pin);
+ mpc_ioapic_id(id), pin);
return 0;
}
ret = io_apic_setup_irq_pin(irq, node, attr);
if (!ret)
- set_bit(pin, mp_ioapic_routing[id].pin_programmed);
+ set_bit(pin, ioapics[id].pin_programmed);
return ret;
}
@@ -3764,8 +3750,7 @@ static u8 __init io_apic_unique_id(u8 id)
bitmap_zero(used, 256);
for (i = 0; i < nr_ioapics; i++) {
- struct mpc_ioapic *ia = &mp_ioapics[i];
- __set_bit(ia->apicid, used);
+ __set_bit(mpc_ioapic_id(i), used);
}
if (!test_bit(id, used))
return id;
@@ -3825,7 +3810,7 @@ void __init setup_ioapic_dest(void)
return;
for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
- for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+ for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) {
irq_entry = find_irq_entry(ioapic, pin, mp_INT);
if (irq_entry == -1)
continue;
@@ -3896,7 +3881,7 @@ void __init ioapic_and_gsi_init(void)
ioapic_res = ioapic_setup_resources(nr_ioapics);
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].apicaddr;
+ ioapic_phys = mpc_ioapic_addr(i);
#ifdef CONFIG_X86_32
if (!ioapic_phys) {
printk(KERN_ERR
@@ -3956,8 +3941,9 @@ int mp_find_ioapic(u32 gsi)
/* Find the IOAPIC that manages this GSI. */
for (i = 0; i < nr_ioapics; i++) {
- if ((gsi >= mp_gsi_routing[i].gsi_base)
- && (gsi <= mp_gsi_routing[i].gsi_end))
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
+ if ((gsi >= gsi_cfg->gsi_base)
+ && (gsi <= gsi_cfg->gsi_end))
return i;
}
@@ -3967,12 +3953,16 @@ int mp_find_ioapic(u32 gsi)
int mp_find_ioapic_pin(int ioapic, u32 gsi)
{
+ struct mp_ioapic_gsi *gsi_cfg;
+
if (WARN_ON(ioapic == -1))
return -1;
- if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end))
+
+ gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ if (WARN_ON(gsi > gsi_cfg->gsi_end))
return -1;
- return gsi - mp_gsi_routing[ioapic].gsi_base;
+ return gsi - gsi_cfg->gsi_base;
}
static __init int bad_ioapic(unsigned long address)
@@ -3994,40 +3984,42 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
{
int idx = 0;
int entries;
+ struct mp_ioapic_gsi *gsi_cfg;
if (bad_ioapic(address))
return;
idx = nr_ioapics;
- mp_ioapics[idx].type = MP_IOAPIC;
- mp_ioapics[idx].flags = MPC_APIC_USABLE;
- mp_ioapics[idx].apicaddr = address;
+ ioapics[idx].mp_config.type = MP_IOAPIC;
+ ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
+ ioapics[idx].mp_config.apicaddr = address;
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
- mp_ioapics[idx].apicid = io_apic_unique_id(id);
- mp_ioapics[idx].apicver = io_apic_get_version(idx);
+ ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
+ ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
/*
* Build basic GSI lookup table to facilitate gsi->io_apic lookups
* and to prevent reprogramming of IOAPIC pins (PCI GSIs).
*/
entries = io_apic_get_redir_entries(idx);
- mp_gsi_routing[idx].gsi_base = gsi_base;
- mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
+ gsi_cfg = mp_ioapic_gsi_routing(idx);
+ gsi_cfg->gsi_base = gsi_base;
+ gsi_cfg->gsi_end = gsi_base + entries - 1;
/*
* The number of IO-APIC IRQ registers (== #pins):
*/
- nr_ioapic_registers[idx] = entries;
+ ioapics[idx].nr_registers = entries;
- if (mp_gsi_routing[idx].gsi_end >= gsi_top)
- gsi_top = mp_gsi_routing[idx].gsi_end + 1;
+ if (gsi_cfg->gsi_end >= gsi_top)
+ gsi_top = gsi_cfg->gsi_end + 1;
printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
- "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
- mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
- mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end);
+ "GSI %d-%d\n", idx, mpc_ioapic_id(idx),
+ mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+ gsi_cfg->gsi_base, gsi_cfg->gsi_end);
nr_ioapics++;
}
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 6273eee5134..c4a61ca1349 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -48,8 +48,6 @@
#include <asm/e820.h>
#include <asm/ipi.h>
-#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
-
int found_numaq;
/*
@@ -79,31 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4];
static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
{
struct eachquadmem *eq = scd->eq + node;
+ u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20;
+ u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20;
+ int ret;
- node_set_online(node);
-
- /* Convert to pages */
- node_start_pfn[node] =
- MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
-
- node_end_pfn[node] =
- MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
-
- memblock_x86_register_active_regions(node, node_start_pfn[node],
- node_end_pfn[node]);
-
- memory_present(node, node_start_pfn[node], node_end_pfn[node]);
-
- node_remap_size[node] = node_memmap_size_bytes(node,
- node_start_pfn[node],
- node_end_pfn[node]);
+ node_set(node, numa_nodes_parsed);
+ ret = numa_add_memblk(node, start, end);
+ BUG_ON(ret < 0);
}
/*
* Function: smp_dump_qct()
*
* Description: gets memory layout from the quad config table. This
- * function also updates node_online_map with the nodes (quads) present.
+ * function also updates numa_nodes_parsed with the nodes (quads) present.
*/
static void __init smp_dump_qct(void)
{
@@ -112,7 +99,6 @@ static void __init smp_dump_qct(void)
scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
- nodes_clear(node_online_map);
for_each_node(node) {
if (scd->quads_present31_0 & (1 << node))
numaq_register_node(node, scd);
@@ -282,14 +268,14 @@ static __init void early_check_numaq(void)
}
}
-int __init get_memcfg_numaq(void)
+int __init numaq_numa_init(void)
{
early_check_numaq();
if (!found_numaq)
- return 0;
+ return -ENOENT;
smp_dump_qct();
- return 1;
+ return 0;
}
#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
@@ -486,8 +472,8 @@ static void numaq_setup_portio_remap(void)
(u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
}
-/* Use __refdata to keep false positive warning calm. */
-struct apic __refdata apic_numaq = {
+/* Use __refdata to keep false positive warning calm. */
+static struct apic __refdata apic_numaq = {
.name = "NUMAQ",
.probe = probe_numaq,
@@ -551,3 +537,5 @@ struct apic __refdata apic_numaq = {
.x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
.x86_32_numa_cpu_node = numaq_numa_cpu_node,
};
+
+apic_driver(apic_numaq);
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index fc84c7b6110..b5254ad044a 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,31 +52,6 @@ static int __init print_ipi_mode(void)
}
late_initcall(print_ipi_mode);
-void __init default_setup_apic_routing(void)
-{
- int version = apic_version[boot_cpu_physical_apicid];
-
- if (num_possible_cpus() > 8) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (!APIC_XAPIC(version)) {
- def_to_bigsmp = 0;
- break;
- }
- /* If P4 and above fall through */
- case X86_VENDOR_AMD:
- def_to_bigsmp = 1;
- }
- }
-
-#ifdef CONFIG_X86_BIGSMP
- generic_bigsmp_probe();
-#endif
-
- if (apic->setup_apic_routing)
- apic->setup_apic_routing();
-}
-
static int default_x86_32_early_logical_apicid(int cpu)
{
return 1 << cpu;
@@ -112,7 +87,7 @@ static int probe_default(void)
return 1;
}
-struct apic apic_default = {
+static struct apic apic_default = {
.name = "default",
.probe = probe_default,
@@ -172,47 +147,24 @@ struct apic apic_default = {
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
.x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
- .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
};
-extern struct apic apic_numaq;
-extern struct apic apic_summit;
-extern struct apic apic_bigsmp;
-extern struct apic apic_es7000;
-extern struct apic apic_es7000_cluster;
+apic_driver(apic_default);
struct apic *apic = &apic_default;
EXPORT_SYMBOL_GPL(apic);
-static struct apic *apic_probe[] __initdata = {
-#ifdef CONFIG_X86_NUMAQ
- &apic_numaq,
-#endif
-#ifdef CONFIG_X86_SUMMIT
- &apic_summit,
-#endif
-#ifdef CONFIG_X86_BIGSMP
- &apic_bigsmp,
-#endif
-#ifdef CONFIG_X86_ES7000
- &apic_es7000,
- &apic_es7000_cluster,
-#endif
- &apic_default, /* must be last */
- NULL,
-};
-
static int cmdline_apic __initdata;
static int __init parse_apic(char *arg)
{
- int i;
+ struct apic **drv;
if (!arg)
return -EINVAL;
- for (i = 0; apic_probe[i]; i++) {
- if (!strcmp(apic_probe[i]->name, arg)) {
- apic = apic_probe[i];
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if (!strcmp((*drv)->name, arg)) {
+ apic = *drv;
cmdline_apic = 1;
return 0;
}
@@ -223,38 +175,58 @@ static int __init parse_apic(char *arg)
}
early_param("apic", parse_apic);
-void __init generic_bigsmp_probe(void)
+void __init default_setup_apic_routing(void)
{
+ int version = apic_version[boot_cpu_physical_apicid];
+
+ if (num_possible_cpus() > 8) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (!APIC_XAPIC(version)) {
+ def_to_bigsmp = 0;
+ break;
+ }
+ /* If P4 and above fall through */
+ case X86_VENDOR_AMD:
+ def_to_bigsmp = 1;
+ }
+ }
+
#ifdef CONFIG_X86_BIGSMP
/*
- * This routine is used to switch to bigsmp mode when
+ * This is used to switch to bigsmp mode when
* - There is no apic= option specified by the user
* - generic_apic_probe() has chosen apic_default as the sub_arch
* - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
*/
if (!cmdline_apic && apic == &apic_default) {
- if (apic_bigsmp.probe()) {
- apic = &apic_bigsmp;
+ struct apic *bigsmp = generic_bigsmp_probe();
+ if (bigsmp) {
+ apic = bigsmp;
printk(KERN_INFO "Overriding APIC driver with %s\n",
apic->name);
}
}
#endif
+
+ if (apic->setup_apic_routing)
+ apic->setup_apic_routing();
}
void __init generic_apic_probe(void)
{
if (!cmdline_apic) {
- int i;
- for (i = 0; apic_probe[i]; i++) {
- if (apic_probe[i]->probe()) {
- apic = apic_probe[i];
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->probe()) {
+ apic = *drv;
break;
}
}
/* Not visible without early console */
- if (!apic_probe[i])
+ if (drv == __apicdrivers_end)
panic("Didn't find an APIC driver");
}
printk(KERN_INFO "Using APIC driver %s\n", apic->name);
@@ -265,16 +237,16 @@ void __init generic_apic_probe(void)
int __init
generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
{
- int i;
+ struct apic **drv;
- for (i = 0; apic_probe[i]; ++i) {
- if (!apic_probe[i]->mps_oem_check)
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if (!((*drv)->mps_oem_check))
continue;
- if (!apic_probe[i]->mps_oem_check(mpc, oem, productid))
+ if (!(*drv)->mps_oem_check(mpc, oem, productid))
continue;
if (!cmdline_apic) {
- apic = apic_probe[i];
+ apic = *drv;
printk(KERN_INFO "Switched to APIC driver `%s'.\n",
apic->name);
}
@@ -285,16 +257,16 @@ generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- int i;
+ struct apic **drv;
- for (i = 0; apic_probe[i]; ++i) {
- if (!apic_probe[i]->acpi_madt_oem_check)
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if (!(*drv)->acpi_madt_oem_check)
continue;
- if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id))
+ if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
continue;
if (!cmdline_apic) {
- apic = apic_probe[i];
+ apic = *drv;
printk(KERN_INFO "Switched to APIC driver `%s'.\n",
apic->name);
}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index d8c4a6feb28..3fe98669892 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,27 +23,6 @@
#include <asm/ipi.h>
#include <asm/setup.h>
-extern struct apic apic_flat;
-extern struct apic apic_physflat;
-extern struct apic apic_x2xpic_uv_x;
-extern struct apic apic_x2apic_phys;
-extern struct apic apic_x2apic_cluster;
-
-struct apic __read_mostly *apic = &apic_flat;
-EXPORT_SYMBOL_GPL(apic);
-
-static struct apic *apic_probe[] __initdata = {
-#ifdef CONFIG_X86_UV
- &apic_x2apic_uv_x,
-#endif
-#ifdef CONFIG_X86_X2APIC
- &apic_x2apic_phys,
- &apic_x2apic_cluster,
-#endif
- &apic_physflat,
- NULL,
-};
-
static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
{
return hard_smp_processor_id() >> index_msb;
@@ -54,26 +33,20 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
*/
void __init default_setup_apic_routing(void)
{
+ struct apic **drv;
enable_IR_x2apic();
-#ifdef CONFIG_X86_X2APIC
- if (x2apic_mode
-#ifdef CONFIG_X86_UV
- && apic != &apic_x2apic_uv_x
-#endif
- ) {
- if (x2apic_phys)
- apic = &apic_x2apic_phys;
- else
- apic = &apic_x2apic_cluster;
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->probe && (*drv)->probe()) {
+ if (apic != *drv) {
+ apic = *drv;
+ pr_info("Switched APIC routing to %s.\n",
+ apic->name);
+ }
+ break;
+ }
}
-#endif
-
- if (apic == &apic_flat && num_possible_cpus() > 8)
- apic = &apic_physflat;
-
- printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
if (is_vsmp_box()) {
/* need to update phys_pkg_id */
@@ -90,13 +63,15 @@ void apic_send_IPI_self(int vector)
int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- int i;
+ struct apic **drv;
- for (i = 0; apic_probe[i]; ++i) {
- if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
- apic = apic_probe[i];
- printk(KERN_INFO "Setting APIC routing to %s.\n",
- apic->name);
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
+ if (apic != *drv) {
+ apic = *drv;
+ pr_info("Setting APIC routing to %s.\n",
+ apic->name);
+ }
return 1;
}
}
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index e4b8059b414..19114423c58 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -491,7 +491,7 @@ void setup_summit(void)
}
#endif
-struct apic apic_summit = {
+static struct apic apic_summit = {
.name = "summit",
.probe = probe_summit,
@@ -551,5 +551,6 @@ struct apic apic_summit = {
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
.x86_32_early_logical_apicid = summit_early_logical_apicid,
- .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
};
+
+apic_driver(apic_summit);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 90949bbd566..50079587582 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -5,118 +5,95 @@
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/dmar.h>
+#include <linux/cpu.h>
#include <asm/smp.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
+#include <asm/x2apic.h>
static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
+static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
return x2apic_enabled();
}
-/*
- * need to use more than cpu 0, because we need more vectors when
- * MSI-X are used.
- */
-static const struct cpumask *x2apic_target_cpus(void)
+static inline u32 x2apic_cluster(int cpu)
{
- return cpu_online_mask;
-}
-
-/*
- * for now each logical cpu is in its own vector allocation domain.
- */
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
+ return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
}
static void
- __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
- unsigned long cfg;
+ struct cpumask *cpus_in_cluster_ptr;
+ struct cpumask *ipi_mask_ptr;
+ unsigned int cpu, this_cpu;
+ unsigned long flags;
+ u32 dest;
+
+ x2apic_wrmsr_fence();
+
+ local_irq_save(flags);
- cfg = __prepare_ICR(0, vector, dest);
+ this_cpu = smp_processor_id();
/*
- * send the IPI.
+ * We are to modify mask, so we need an own copy
+ * and be sure it's manipulated with irq off.
*/
- native_x2apic_icr_write(cfg, apicid);
-}
+ ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
+ cpumask_copy(ipi_mask_ptr, mask);
-/*
- * for now, we send the IPI's one by one in the cpumask.
- * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
- * at once. We have 16 cpu's in a cluster. This will minimize IPI register
- * writes.
- */
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- unsigned long query_cpu;
- unsigned long flags;
+ /*
+ * The idea is to send one IPI per cluster.
+ */
+ for_each_cpu(cpu, ipi_mask_ptr) {
+ unsigned long i;
- x2apic_wrmsr_fence();
+ cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
+ dest = 0;
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
+ /* Collect cpus in cluster. */
+ for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
+ if (apic_dest == APIC_DEST_ALLINC || i != this_cpu)
+ dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+ }
+
+ if (!dest)
+ continue;
+
+ __x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+ /*
+ * Cluster sibling cpus should be discared now so
+ * we would not send IPI them second time.
+ */
+ cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
}
+
local_irq_restore(flags);
}
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
+}
+
static void
x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
- unsigned long this_cpu = smp_processor_id();
- unsigned long query_cpu;
- unsigned long flags;
-
- x2apic_wrmsr_fence();
-
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu == this_cpu)
- continue;
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
- }
- local_irq_restore(flags);
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_allbutself(int vector)
{
- unsigned long this_cpu = smp_processor_id();
- unsigned long query_cpu;
- unsigned long flags;
-
- x2apic_wrmsr_fence();
-
- local_irq_save(flags);
- for_each_online_cpu(query_cpu) {
- if (query_cpu == this_cpu)
- continue;
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
- }
- local_irq_restore(flags);
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_all(int vector)
{
- x2apic_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static int x2apic_apic_id_registered(void)
-{
- return 1;
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
}
static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -151,43 +128,90 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
return per_cpu(x86_cpu_to_logical_apicid, cpu);
}
-static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
+static void init_x2apic_ldr(void)
{
- unsigned int id;
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
- id = x;
- return id;
+ per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
+
+ __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
+ for_each_online_cpu(cpu) {
+ if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+ continue;
+ __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
+ __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
+ }
}
-static unsigned long set_apic_id(unsigned int id)
+ /*
+ * At CPU state changes, update the x2apic cluster sibling info.
+ */
+static int __cpuinit
+update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
- unsigned long x;
+ unsigned int this_cpu = (unsigned long)hcpu;
+ unsigned int cpu;
+ int err = 0;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
+ GFP_KERNEL)) {
+ err = -ENOMEM;
+ } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
+ GFP_KERNEL)) {
+ free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+ err = -ENOMEM;
+ }
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ case CPU_DEAD:
+ for_each_online_cpu(cpu) {
+ if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+ continue;
+ __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
+ __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
+ }
+ free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+ free_cpumask_var(per_cpu(ipi_mask, this_cpu));
+ break;
+ }
- x = id;
- return x;
+ return notifier_from_errno(err);
}
-static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
-{
- return initial_apicid >> index_msb;
-}
+static struct notifier_block __refdata x2apic_cpu_notifier = {
+ .notifier_call = update_clusterinfo,
+};
-static void x2apic_send_IPI_self(int vector)
+static int x2apic_init_cpu_notifier(void)
{
- apic_write(APIC_SELF_IPI, vector);
+ int cpu = smp_processor_id();
+
+ zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
+
+ BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
+
+ __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
+ register_hotcpu_notifier(&x2apic_cpu_notifier);
+ return 1;
}
-static void init_x2apic_ldr(void)
+static int x2apic_cluster_probe(void)
{
- int cpu = smp_processor_id();
-
- per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
+ if (x2apic_mode)
+ return x2apic_init_cpu_notifier();
+ else
+ return 0;
}
-struct apic apic_x2apic_cluster = {
+static struct apic apic_x2apic_cluster = {
.name = "cluster x2apic",
- .probe = NULL,
+ .probe = x2apic_cluster_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
.apic_id_registered = x2apic_apic_id_registered,
@@ -211,11 +235,11 @@ struct apic apic_x2apic_cluster = {
.setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
.enable_apic_mode = NULL,
- .phys_pkg_id = x2apic_cluster_phys_pkg_id,
+ .phys_pkg_id = x2apic_phys_pkg_id,
.mps_oem_check = NULL,
- .get_apic_id = x2apic_cluster_phys_get_apic_id,
- .set_apic_id = set_apic_id,
+ .get_apic_id = x2apic_get_apic_id,
+ .set_apic_id = x2apic_set_apic_id,
.apic_id_mask = 0xFFFFFFFFu,
.cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
@@ -240,3 +264,5 @@ struct apic apic_x2apic_cluster = {
.wait_icr_idle = native_x2apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
+
+apic_driver(apic_x2apic_cluster);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index c7e6d6645bf..f5373dfde21 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -7,11 +7,12 @@
#include <linux/dmar.h>
#include <asm/smp.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
+#include <asm/x2apic.h>
int x2apic_phys;
+static struct apic apic_x2apic_phys;
+
static int set_x2apic_phys_mode(char *arg)
{
x2apic_phys = 1;
@@ -27,94 +28,46 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
-/*
- * need to use more than cpu 0, because we need more vectors when
- * MSI-X are used.
- */
-static const struct cpumask *x2apic_target_cpus(void)
-{
- return cpu_online_mask;
-}
-
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
-static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
- unsigned int dest)
-{
- unsigned long cfg;
-
- cfg = __prepare_ICR(0, vector, dest);
-
- /*
- * send the IPI.
- */
- native_x2apic_icr_write(cfg, apicid);
-}
-
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+static void
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
unsigned long query_cpu;
+ unsigned long this_cpu;
unsigned long flags;
x2apic_wrmsr_fence();
local_irq_save(flags);
+
+ this_cpu = smp_processor_id();
for_each_cpu(query_cpu, mask) {
+ if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu)
+ continue;
__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
vector, APIC_DEST_PHYSICAL);
}
local_irq_restore(flags);
}
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
+}
+
static void
x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
- unsigned long this_cpu = smp_processor_id();
- unsigned long query_cpu;
- unsigned long flags;
-
- x2apic_wrmsr_fence();
-
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu != this_cpu)
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_apicid, query_cpu),
- vector, APIC_DEST_PHYSICAL);
- }
- local_irq_restore(flags);
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_allbutself(int vector)
{
- unsigned long this_cpu = smp_processor_id();
- unsigned long query_cpu;
- unsigned long flags;
-
- x2apic_wrmsr_fence();
-
- local_irq_save(flags);
- for_each_online_cpu(query_cpu) {
- if (query_cpu == this_cpu)
- continue;
- __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
- vector, APIC_DEST_PHYSICAL);
- }
- local_irq_restore(flags);
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_all(int vector)
{
- x2apic_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static int x2apic_apic_id_registered(void)
-{
- return 1;
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
}
static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -149,34 +102,22 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
return per_cpu(x86_cpu_to_apicid, cpu);
}
-static unsigned int x2apic_phys_get_apic_id(unsigned long x)
-{
- return x;
-}
-
-static unsigned long set_apic_id(unsigned int id)
-{
- return id;
-}
-
-static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
+static void init_x2apic_ldr(void)
{
- return initial_apicid >> index_msb;
}
-static void x2apic_send_IPI_self(int vector)
+static int x2apic_phys_probe(void)
{
- apic_write(APIC_SELF_IPI, vector);
-}
+ if (x2apic_mode && x2apic_phys)
+ return 1;
-static void init_x2apic_ldr(void)
-{
+ return apic == &apic_x2apic_phys;
}
-struct apic apic_x2apic_phys = {
+static struct apic apic_x2apic_phys = {
.name = "physical x2apic",
- .probe = NULL,
+ .probe = x2apic_phys_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
.apic_id_registered = x2apic_apic_id_registered,
@@ -203,8 +144,8 @@ struct apic apic_x2apic_phys = {
.phys_pkg_id = x2apic_phys_pkg_id,
.mps_oem_check = NULL,
- .get_apic_id = x2apic_phys_get_apic_id,
- .set_apic_id = set_apic_id,
+ .get_apic_id = x2apic_get_apic_id,
+ .set_apic_id = x2apic_set_apic_id,
.apic_id_mask = 0xFFFFFFFFu,
.cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
@@ -229,3 +170,5 @@ struct apic apic_x2apic_phys = {
.wait_icr_idle = native_x2apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
+
+apic_driver(apic_x2apic_phys);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 7acd2d2ac96..adc66c3a1fe 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -58,6 +58,8 @@ unsigned int uv_apicid_hibits;
EXPORT_SYMBOL_GPL(uv_apicid_hibits);
static DEFINE_SPINLOCK(uv_nmi_lock);
+static struct apic apic_x2apic_uv_x;
+
static unsigned long __init uv_early_read_mmr(unsigned long addr)
{
unsigned long val, *mmr;
@@ -89,6 +91,10 @@ static int __init early_get_pnodeid(void)
m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
uv_min_hub_revision_id = node_id.s.revision;
+ if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
+ uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
+
+ uv_hub_info->hub_revision = uv_min_hub_revision_id;
pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
return pnode;
}
@@ -110,17 +116,25 @@ static void __init early_get_apic_pnode_shift(void)
*/
static void __init uv_set_apicid_hibit(void)
{
- union uvh_lb_target_physical_apic_id_mask_u apicid_mask;
+ union uv1h_lb_target_physical_apic_id_mask_u apicid_mask;
- apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK);
- uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK;
+ if (is_uv1_hub()) {
+ apicid_mask.v =
+ uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
+ uv_apicid_hibits =
+ apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
+ }
}
static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- int pnodeid;
+ int pnodeid, is_uv1, is_uv2;
- if (!strcmp(oem_id, "SGI")) {
+ is_uv1 = !strcmp(oem_id, "SGI");
+ is_uv2 = !strcmp(oem_id, "SGI2");
+ if (is_uv1 || is_uv2) {
+ uv_hub_info->hub_revision =
+ is_uv1 ? UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE;
pnodeid = early_get_pnodeid();
early_get_apic_pnode_shift();
x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
@@ -326,10 +340,15 @@ static void uv_send_IPI_self(int vector)
apic_write(APIC_SELF_IPI, vector);
}
-struct apic __refdata apic_x2apic_uv_x = {
+static int uv_probe(void)
+{
+ return apic == &apic_x2apic_uv_x;
+}
+
+static struct apic __refdata apic_x2apic_uv_x = {
.name = "UV large system",
- .probe = NULL,
+ .probe = uv_probe,
.acpi_madt_oem_check = uv_acpi_madt_oem_check,
.apic_id_registered = uv_apic_id_registered,
@@ -477,12 +496,19 @@ static __init void map_mmr_high(int max_pnode)
static __init void map_mmioh_high(int max_pnode)
{
union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
- int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ int shift;
mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
- if (mmioh.s.enable)
- map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io,
+ if (is_uv1_hub() && mmioh.s1.enable) {
+ shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ map_high("MMIOH", mmioh.s1.base, shift, mmioh.s1.m_io,
max_pnode, map_uc);
+ }
+ if (is_uv2_hub() && mmioh.s2.enable) {
+ shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ map_high("MMIOH", mmioh.s2.base, shift, mmioh.s2.m_io,
+ max_pnode, map_uc);
+ }
}
static __init void map_low_mmrs(void)
@@ -606,14 +632,14 @@ late_initcall(uv_init_heartbeat);
/* Direct Legacy VGA I/O traffic to designated IOH */
int uv_set_vga_state(struct pci_dev *pdev, bool decode,
- unsigned int command_bits, bool change_bridge)
+ unsigned int command_bits, u32 flags)
{
int domain, bus, rc;
- PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n",
- pdev->devfn, decode, command_bits, change_bridge);
+ PR_DEVEL("devfn %x decode %d cmd %x flags %d\n",
+ pdev->devfn, decode, command_bits, flags);
- if (!change_bridge)
+ if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
return 0;
if ((command_bits & PCI_COMMAND_IO) == 0)
@@ -729,13 +755,14 @@ void __init uv_system_init(void)
unsigned long mmr_base, present, paddr;
unsigned short pnode_mask, pnode_io_mask;
+ printk(KERN_INFO "UV: Found %s hub\n", is_uv1_hub() ? "UV1" : "UV2");
map_low_mmrs();
m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
m_val = m_n_config.s.m_skt;
n_val = m_n_config.s.n_skt;
mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
- n_io = mmioh.s.n_io;
+ n_io = is_uv1_hub() ? mmioh.s1.n_io : mmioh.s2.n_io;
mmr_base =
uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
~UV_MMR_ENABLE;
@@ -804,6 +831,8 @@ void __init uv_system_init(void)
*/
uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
+ uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision;
+
pnode = uv_apicid_to_pnode(apicid);
blade = boot_pnode_to_blade(pnode);
lcpu = uv_blade_info[blade].nr_possible_cpus;
@@ -859,3 +888,5 @@ void __init uv_system_init(void)
if (is_kdump_kernel())
reboot_type = BOOT_ACPI;
}
+
+apic_driver(apic_x2apic_uv_x);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index adee12e0da1..0371c484bb8 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -229,11 +229,11 @@
#include <linux/jiffies.h>
#include <linux/acpi.h>
#include <linux/syscore_ops.h>
+#include <linux/i8253.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
-#include <asm/i8253.h>
#include <asm/olpc.h>
#include <asm/paravirt.h>
#include <asm/reboot.h>
@@ -361,6 +361,7 @@ struct apm_user {
* idle percentage above which bios idle calls are done
*/
#ifdef CONFIG_APM_CPU_IDLE
+#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012
#define DEFAULT_IDLE_THRESHOLD 95
#else
#define DEFAULT_IDLE_THRESHOLD 100
@@ -904,6 +905,7 @@ static void apm_cpu_idle(void)
unsigned int jiffies_since_last_check = jiffies - last_jiffies;
unsigned int bucket;
+ WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012");
recalc:
if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
use_apm_idle = 0;
@@ -1218,11 +1220,11 @@ static void reinit_timer(void)
raw_spin_lock_irqsave(&i8253_lock, flags);
/* set the clock to HZ */
- outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
+ outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
udelay(10);
- outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */
+ outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
udelay(10);
- outb_pit(LATCH >> 8, PIT_CH0); /* MSB */
+ outb_p(LATCH >> 8, PIT_CH0); /* MSB */
udelay(10);
raw_spin_unlock_irqrestore(&i8253_lock, flags);
#endif
@@ -1238,7 +1240,6 @@ static int suspend(int vetoable)
dpm_suspend_noirq(PMSG_SUSPEND);
local_irq_disable();
- sysdev_suspend(PMSG_SUSPEND);
syscore_suspend();
local_irq_enable();
@@ -1258,7 +1259,6 @@ static int suspend(int vetoable)
err = (err == APM_SUCCESS) ? 0 : -EIO;
syscore_resume();
- sysdev_resume();
local_irq_enable();
dpm_resume_noirq(PMSG_RESUME);
@@ -1282,7 +1282,6 @@ static void standby(void)
dpm_suspend_noirq(PMSG_SUSPEND);
local_irq_disable();
- sysdev_suspend(PMSG_SUSPEND);
syscore_suspend();
local_irq_enable();
@@ -1292,7 +1291,6 @@ static void standby(void)
local_irq_disable();
syscore_resume();
- sysdev_resume();
local_irq_enable();
dpm_resume_noirq(PMSG_RESUME);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index c29d631af6f..395a10e6806 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -63,7 +63,6 @@ void foo(void)
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
- OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
BLANK();
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3f0ebe429a0..6042981d030 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
-obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 6f9d1f6063e..b13ed393dfc 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -612,8 +612,11 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
}
#endif
- /* As a rule processors have APIC timer running in deep C states */
- if (c->x86 > 0xf && !cpu_has_amd_erratum(amd_erratum_400))
+ /*
+ * Family 0x12 and above processors have APIC timer
+ * running in deep C states.
+ */
+ if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
/*
@@ -629,10 +632,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
* Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
*/
u64 mask;
+ int err;
- rdmsrl(MSR_AMD64_MCx_MASK(4), mask);
- mask |= (1 << 10);
- wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
+ err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
+ if (err == 0) {
+ mask |= (1 << 10);
+ checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
+ }
}
}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c39576cb301..525514cf33c 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -19,6 +19,7 @@
static int __init no_halt(char *s)
{
+ WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n");
boot_cpu_data.hlt_works_ok = 0;
return 1;
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e2ced0074a4..22a073d7fbf 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -254,6 +254,25 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
}
#endif
+static int disable_smep __cpuinitdata;
+static __init int setup_disable_smep(char *arg)
+{
+ disable_smep = 1;
+ return 1;
+}
+__setup("nosmep", setup_disable_smep);
+
+static __cpuinit void setup_smep(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_SMEP)) {
+ if (unlikely(disable_smep)) {
+ setup_clear_cpu_cap(X86_FEATURE_SMEP);
+ clear_in_cr4(X86_CR4_SMEP);
+ } else
+ set_in_cr4(X86_CR4_SMEP);
+ }
+}
+
/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
@@ -458,13 +477,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
if (smp_num_siblings <= 1)
goto out;
- if (smp_num_siblings > nr_cpu_ids) {
- pr_warning("CPU: Unsupported number of siblings %d",
- smp_num_siblings);
- smp_num_siblings = 1;
- return;
- }
-
index_msb = get_count_order(smp_num_siblings);
c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
@@ -565,8 +577,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
- if (eax > 0)
- c->x86_capability[9] = ebx;
+ c->x86_capability[9] = ebx;
}
/* AMD-defined flags: level 0x80000001 */
@@ -668,6 +679,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
c->cpu_index = 0;
#endif
filter_cpuid_features(c, false);
+
+ setup_smep(c);
}
void __init early_cpu_init(void)
@@ -753,6 +766,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
#endif
}
+ setup_smep(c);
+
get_model_name(c); /* Default name */
detect_nopl(c);
@@ -887,7 +902,7 @@ static void vgetcpu_set_mode(void)
void __init identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
- init_c1e_mask();
+ init_amd_e400_c1e_mask();
#ifdef CONFIG_X86_32
sysenter_setup();
enable_sep_cpu();
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
deleted file mode 100644
index 870e6cc6ad2..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ /dev/null
@@ -1,266 +0,0 @@
-#
-# CPU Frequency scaling
-#
-
-menu "CPU Frequency scaling"
-
-source "drivers/cpufreq/Kconfig"
-
-if CPU_FREQ
-
-comment "CPUFreq processor drivers"
-
-config X86_PCC_CPUFREQ
- tristate "Processor Clocking Control interface driver"
- depends on ACPI && ACPI_PROCESSOR
- help
- This driver adds support for the PCC interface.
-
- For details, take a look at:
- <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
-
- To compile this driver as a module, choose M here: the
- module will be called pcc-cpufreq.
-
- If in doubt, say N.
-
-config X86_ACPI_CPUFREQ
- tristate "ACPI Processor P-States driver"
- select CPU_FREQ_TABLE
- depends on ACPI_PROCESSOR
- help
- This driver adds a CPUFreq driver which utilizes the ACPI
- Processor Performance States.
- This driver also supports Intel Enhanced Speedstep.
-
- To compile this driver as a module, choose M here: the
- module will be called acpi-cpufreq.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config ELAN_CPUFREQ
- tristate "AMD Elan SC400 and SC410"
- select CPU_FREQ_TABLE
- depends on X86_ELAN
- ---help---
- This adds the CPUFreq driver for AMD Elan SC400 and SC410
- processors.
-
- You need to specify the processor maximum speed as boot
- parameter: elanfreq=maxspeed (in kHz) or as module
- parameter "max_freq".
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config SC520_CPUFREQ
- tristate "AMD Elan SC520"
- select CPU_FREQ_TABLE
- depends on X86_ELAN
- ---help---
- This adds the CPUFreq driver for AMD Elan SC520 processor.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-
-config X86_POWERNOW_K6
- tristate "AMD Mobile K6-2/K6-3 PowerNow!"
- select CPU_FREQ_TABLE
- depends on X86_32
- help
- This adds the CPUFreq driver for mobile AMD K6-2+ and mobile
- AMD K6-3+ processors.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_POWERNOW_K7
- tristate "AMD Mobile Athlon/Duron PowerNow!"
- select CPU_FREQ_TABLE
- depends on X86_32
- help
- This adds the CPUFreq driver for mobile AMD K7 mobile processors.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_POWERNOW_K7_ACPI
- bool
- depends on X86_POWERNOW_K7 && ACPI_PROCESSOR
- depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m)
- depends on X86_32
- default y
-
-config X86_POWERNOW_K8
- tristate "AMD Opteron/Athlon64 PowerNow!"
- select CPU_FREQ_TABLE
- depends on ACPI && ACPI_PROCESSOR
- help
- This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors.
-
- To compile this driver as a module, choose M here: the
- module will be called powernow-k8.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
-config X86_GX_SUSPMOD
- tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
- depends on X86_32 && PCI
- help
- This add the CPUFreq driver for NatSemi Geode processors which
- support suspend modulation.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_CENTRINO
- tristate "Intel Enhanced SpeedStep (deprecated)"
- select CPU_FREQ_TABLE
- select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32
- depends on X86_32 || (X86_64 && ACPI_PROCESSOR)
- help
- This is deprecated and this functionality is now merged into
- acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of
- speedstep_centrino.
- This adds the CPUFreq driver for Enhanced SpeedStep enabled
- mobile CPUs. This means Intel Pentium M (Centrino) CPUs
- or 64bit enabled Intel Xeons.
-
- To compile this driver as a module, choose M here: the
- module will be called speedstep-centrino.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_CENTRINO_TABLE
- bool "Built-in tables for Banias CPUs"
- depends on X86_32 && X86_SPEEDSTEP_CENTRINO
- default y
- help
- Use built-in tables for Banias CPUs if ACPI encoding
- is not available.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_ICH
- tristate "Intel Speedstep on ICH-M chipsets (ioport interface)"
- select CPU_FREQ_TABLE
- depends on X86_32
- help
- This adds the CPUFreq driver for certain mobile Intel Pentium III
- (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all
- mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2,
- ICH3 or ICH4 southbridge.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_SMI
- tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)"
- select CPU_FREQ_TABLE
- depends on X86_32 && EXPERIMENTAL
- help
- This adds the CPUFreq driver for certain mobile Intel Pentium III
- (Coppermine), all mobile Intel Pentium III-M (Tualatin)
- on systems which have an Intel 440BX/ZX/MX southbridge.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_P4_CLOCKMOD
- tristate "Intel Pentium 4 clock modulation"
- select CPU_FREQ_TABLE
- help
- This adds the CPUFreq driver for Intel Pentium 4 / XEON
- processors. When enabled it will lower CPU temperature by skipping
- clocks.
-
- This driver should be only used in exceptional
- circumstances when very low power is needed because it causes severe
- slowdowns and noticeable latencies. Normally Speedstep should be used
- instead.
-
- To compile this driver as a module, choose M here: the
- module will be called p4-clockmod.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- Unless you are absolutely sure say N.
-
-config X86_CPUFREQ_NFORCE2
- tristate "nVidia nForce2 FSB changing"
- depends on X86_32 && EXPERIMENTAL
- help
- This adds the CPUFreq driver for FSB changing on nVidia nForce2
- platforms.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_LONGRUN
- tristate "Transmeta LongRun"
- depends on X86_32
- help
- This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors
- which support LongRun.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_LONGHAUL
- tristate "VIA Cyrix III Longhaul"
- select CPU_FREQ_TABLE
- depends on X86_32 && ACPI_PROCESSOR
- help
- This adds the CPUFreq driver for VIA Samuel/CyrixIII,
- VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
- processors.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_E_POWERSAVER
- tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
- select CPU_FREQ_TABLE
- depends on X86_32 && EXPERIMENTAL
- help
- This adds the CPUFreq driver for VIA C7 processors. However, this driver
- does not have any safeguards to prevent operating the CPU out of spec
- and is thus considered dangerous. Please use the regular ACPI cpufreq
- driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
-
- If in doubt, say N.
-
-comment "shared options"
-
-config X86_SPEEDSTEP_LIB
- tristate
- default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD)
-
-config X86_SPEEDSTEP_RELAXED_CAP_CHECK
- bool "Relaxed speedstep capability checks"
- depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH)
- help
- Don't perform all checks for a speedstep capable system which would
- normally be done. Some ancient or strange systems, though speedstep
- capable, don't always indicate that they are speedstep capable. This
- option lets the probing code bypass some of those checks if the
- parameter "relaxed_check=1" is passed to the module.
-
-endif # CPU_FREQ
-
-endmenu
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
deleted file mode 100644
index bd54bf67e6f..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-# Link order matters. K8 is preferred to ACPI because of firmware bugs in early
-# K8 systems. ACPI is preferred to all other hardware-specific drivers.
-# speedstep-* is preferred over p4-clockmod.
-
-obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
-obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
-obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
-obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
-obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
-obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
-obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
-obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
-obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o
-obj-$(CONFIG_X86_LONGRUN) += longrun.o
-obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o
-obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o
-obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
-obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o
-obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
-obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
-obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
deleted file mode 100644
index a2baafb2fe6..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ /dev/null
@@ -1,776 +0,0 @@
-/*
- * acpi-cpufreq.c - ACPI Processor P-States Driver
- *
- * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
- * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
- * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
- * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com>
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/cpufreq.h>
-#include <linux/compiler.h>
-#include <linux/dmi.h>
-#include <linux/slab.h>
-
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/uaccess.h>
-
-#include <acpi/processor.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-#include "mperf.h"
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "acpi-cpufreq", msg)
-
-MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
-MODULE_DESCRIPTION("ACPI Processor P-States Driver");
-MODULE_LICENSE("GPL");
-
-enum {
- UNDEFINED_CAPABLE = 0,
- SYSTEM_INTEL_MSR_CAPABLE,
- SYSTEM_IO_CAPABLE,
-};
-
-#define INTEL_MSR_RANGE (0xffff)
-
-struct acpi_cpufreq_data {
- struct acpi_processor_performance *acpi_data;
- struct cpufreq_frequency_table *freq_table;
- unsigned int resume;
- unsigned int cpu_feature;
-};
-
-static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
-
-/* acpi_perf_data is a pointer to percpu data. */
-static struct acpi_processor_performance __percpu *acpi_perf_data;
-
-static struct cpufreq_driver acpi_cpufreq_driver;
-
-static unsigned int acpi_pstate_strict;
-
-static int check_est_cpu(unsigned int cpuid)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
-
- return cpu_has(cpu, X86_FEATURE_EST);
-}
-
-static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
-{
- struct acpi_processor_performance *perf;
- int i;
-
- perf = data->acpi_data;
-
- for (i = 0; i < perf->state_count; i++) {
- if (value == perf->states[i].status)
- return data->freq_table[i].frequency;
- }
- return 0;
-}
-
-static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
-{
- int i;
- struct acpi_processor_performance *perf;
-
- msr &= INTEL_MSR_RANGE;
- perf = data->acpi_data;
-
- for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
- if (msr == perf->states[data->freq_table[i].index].status)
- return data->freq_table[i].frequency;
- }
- return data->freq_table[0].frequency;
-}
-
-static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
-{
- switch (data->cpu_feature) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- return extract_msr(val, data);
- case SYSTEM_IO_CAPABLE:
- return extract_io(val, data);
- default:
- return 0;
- }
-}
-
-struct msr_addr {
- u32 reg;
-};
-
-struct io_addr {
- u16 port;
- u8 bit_width;
-};
-
-struct drv_cmd {
- unsigned int type;
- const struct cpumask *mask;
- union {
- struct msr_addr msr;
- struct io_addr io;
- } addr;
- u32 val;
-};
-
-/* Called via smp_call_function_single(), on the target CPU */
-static void do_drv_read(void *_cmd)
-{
- struct drv_cmd *cmd = _cmd;
- u32 h;
-
- switch (cmd->type) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- rdmsr(cmd->addr.msr.reg, cmd->val, h);
- break;
- case SYSTEM_IO_CAPABLE:
- acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
- &cmd->val,
- (u32)cmd->addr.io.bit_width);
- break;
- default:
- break;
- }
-}
-
-/* Called via smp_call_function_many(), on the target CPUs */
-static void do_drv_write(void *_cmd)
-{
- struct drv_cmd *cmd = _cmd;
- u32 lo, hi;
-
- switch (cmd->type) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- rdmsr(cmd->addr.msr.reg, lo, hi);
- lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
- wrmsr(cmd->addr.msr.reg, lo, hi);
- break;
- case SYSTEM_IO_CAPABLE:
- acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
- cmd->val,
- (u32)cmd->addr.io.bit_width);
- break;
- default:
- break;
- }
-}
-
-static void drv_read(struct drv_cmd *cmd)
-{
- int err;
- cmd->val = 0;
-
- err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
- WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
-}
-
-static void drv_write(struct drv_cmd *cmd)
-{
- int this_cpu;
-
- this_cpu = get_cpu();
- if (cpumask_test_cpu(this_cpu, cmd->mask))
- do_drv_write(cmd);
- smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
- put_cpu();
-}
-
-static u32 get_cur_val(const struct cpumask *mask)
-{
- struct acpi_processor_performance *perf;
- struct drv_cmd cmd;
-
- if (unlikely(cpumask_empty(mask)))
- return 0;
-
- switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
- cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
- break;
- case SYSTEM_IO_CAPABLE:
- cmd.type = SYSTEM_IO_CAPABLE;
- perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
- cmd.addr.io.port = perf->control_register.address;
- cmd.addr.io.bit_width = perf->control_register.bit_width;
- break;
- default:
- return 0;
- }
-
- cmd.mask = mask;
- drv_read(&cmd);
-
- dprintk("get_cur_val = %u\n", cmd.val);
-
- return cmd.val;
-}
-
-static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
- unsigned int freq;
- unsigned int cached_freq;
-
- dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
-
- if (unlikely(data == NULL ||
- data->acpi_data == NULL || data->freq_table == NULL)) {
- return 0;
- }
-
- cached_freq = data->freq_table[data->acpi_data->state].frequency;
- freq = extract_freq(get_cur_val(cpumask_of(cpu)), data);
- if (freq != cached_freq) {
- /*
- * The dreaded BIOS frequency change behind our back.
- * Force set the frequency on next target call.
- */
- data->resume = 1;
- }
-
- dprintk("cur freq = %u\n", freq);
-
- return freq;
-}
-
-static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
- struct acpi_cpufreq_data *data)
-{
- unsigned int cur_freq;
- unsigned int i;
-
- for (i = 0; i < 100; i++) {
- cur_freq = extract_freq(get_cur_val(mask), data);
- if (cur_freq == freq)
- return 1;
- udelay(10);
- }
- return 0;
-}
-
-static int acpi_cpufreq_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
- struct acpi_processor_performance *perf;
- struct cpufreq_freqs freqs;
- struct drv_cmd cmd;
- unsigned int next_state = 0; /* Index into freq_table */
- unsigned int next_perf_state = 0; /* Index into perf table */
- unsigned int i;
- int result = 0;
-
- dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
-
- if (unlikely(data == NULL ||
- data->acpi_data == NULL || data->freq_table == NULL)) {
- return -ENODEV;
- }
-
- perf = data->acpi_data;
- result = cpufreq_frequency_table_target(policy,
- data->freq_table,
- target_freq,
- relation, &next_state);
- if (unlikely(result)) {
- result = -ENODEV;
- goto out;
- }
-
- next_perf_state = data->freq_table[next_state].index;
- if (perf->state == next_perf_state) {
- if (unlikely(data->resume)) {
- dprintk("Called after resume, resetting to P%d\n",
- next_perf_state);
- data->resume = 0;
- } else {
- dprintk("Already at target state (P%d)\n",
- next_perf_state);
- goto out;
- }
- }
-
- switch (data->cpu_feature) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
- cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
- cmd.val = (u32) perf->states[next_perf_state].control;
- break;
- case SYSTEM_IO_CAPABLE:
- cmd.type = SYSTEM_IO_CAPABLE;
- cmd.addr.io.port = perf->control_register.address;
- cmd.addr.io.bit_width = perf->control_register.bit_width;
- cmd.val = (u32) perf->states[next_perf_state].control;
- break;
- default:
- result = -ENODEV;
- goto out;
- }
-
- /* cpufreq holds the hotplug lock, so we are safe from here on */
- if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
- cmd.mask = policy->cpus;
- else
- cmd.mask = cpumask_of(policy->cpu);
-
- freqs.old = perf->states[perf->state].core_frequency * 1000;
- freqs.new = data->freq_table[next_state].frequency;
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- drv_write(&cmd);
-
- if (acpi_pstate_strict) {
- if (!check_freqs(cmd.mask, freqs.new, data)) {
- dprintk("acpi_cpufreq_target failed (%d)\n",
- policy->cpu);
- result = -EAGAIN;
- goto out;
- }
- }
-
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- perf->state = next_perf_state;
-
-out:
- return result;
-}
-
-static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
-
- dprintk("acpi_cpufreq_verify\n");
-
- return cpufreq_frequency_table_verify(policy, data->freq_table);
-}
-
-static unsigned long
-acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
-{
- struct acpi_processor_performance *perf = data->acpi_data;
-
- if (cpu_khz) {
- /* search the closest match to cpu_khz */
- unsigned int i;
- unsigned long freq;
- unsigned long freqn = perf->states[0].core_frequency * 1000;
-
- for (i = 0; i < (perf->state_count-1); i++) {
- freq = freqn;
- freqn = perf->states[i+1].core_frequency * 1000;
- if ((2 * cpu_khz) > (freqn + freq)) {
- perf->state = i;
- return freq;
- }
- }
- perf->state = perf->state_count-1;
- return freqn;
- } else {
- /* assume CPU is at P0... */
- perf->state = 0;
- return perf->states[0].core_frequency * 1000;
- }
-}
-
-static void free_acpi_perf_data(void)
-{
- unsigned int i;
-
- /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
- for_each_possible_cpu(i)
- free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
- ->shared_cpu_map);
- free_percpu(acpi_perf_data);
-}
-
-/*
- * acpi_cpufreq_early_init - initialize ACPI P-States library
- *
- * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c)
- * in order to determine correct frequency and voltage pairings. We can
- * do _PDC and _PSD and find out the processor dependency for the
- * actual init that will happen later...
- */
-static int __init acpi_cpufreq_early_init(void)
-{
- unsigned int i;
- dprintk("acpi_cpufreq_early_init\n");
-
- acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
- if (!acpi_perf_data) {
- dprintk("Memory allocation error for acpi_perf_data.\n");
- return -ENOMEM;
- }
- for_each_possible_cpu(i) {
- if (!zalloc_cpumask_var_node(
- &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
- GFP_KERNEL, cpu_to_node(i))) {
-
- /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
- free_acpi_perf_data();
- return -ENOMEM;
- }
- }
-
- /* Do initialization in ACPI core */
- acpi_processor_preregister_performance(acpi_perf_data);
- return 0;
-}
-
-#ifdef CONFIG_SMP
-/*
- * Some BIOSes do SW_ANY coordination internally, either set it up in hw
- * or do it in BIOS firmware and won't inform about it to OS. If not
- * detected, this has a side effect of making CPU run at a different speed
- * than OS intended it to run at. Detect it and handle it cleanly.
- */
-static int bios_with_sw_any_bug;
-
-static int sw_any_bug_found(const struct dmi_system_id *d)
-{
- bios_with_sw_any_bug = 1;
- return 0;
-}
-
-static const struct dmi_system_id sw_any_bug_dmi_table[] = {
- {
- .callback = sw_any_bug_found,
- .ident = "Supermicro Server X6DLP",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
- DMI_MATCH(DMI_BIOS_VERSION, "080010"),
- DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
- },
- },
- { }
-};
-
-static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
-{
- /* Intel Xeon Processor 7100 Series Specification Update
- * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
- * AL30: A Machine Check Exception (MCE) Occurring during an
- * Enhanced Intel SpeedStep Technology Ratio Change May Cause
- * Both Processor Cores to Lock Up. */
- if (c->x86_vendor == X86_VENDOR_INTEL) {
- if ((c->x86 == 15) &&
- (c->x86_model == 6) &&
- (c->x86_mask == 8)) {
- printk(KERN_INFO "acpi-cpufreq: Intel(R) "
- "Xeon(R) 7100 Errata AL30, processors may "
- "lock up on frequency changes: disabling "
- "acpi-cpufreq.\n");
- return -ENODEV;
- }
- }
- return 0;
-}
-#endif
-
-static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int i;
- unsigned int valid_states = 0;
- unsigned int cpu = policy->cpu;
- struct acpi_cpufreq_data *data;
- unsigned int result = 0;
- struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
- struct acpi_processor_performance *perf;
-#ifdef CONFIG_SMP
- static int blacklisted;
-#endif
-
- dprintk("acpi_cpufreq_cpu_init\n");
-
-#ifdef CONFIG_SMP
- if (blacklisted)
- return blacklisted;
- blacklisted = acpi_cpufreq_blacklist(c);
- if (blacklisted)
- return blacklisted;
-#endif
-
- data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
- data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
- per_cpu(acfreq_data, cpu) = data;
-
- if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
- acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
-
- result = acpi_processor_register_performance(data->acpi_data, cpu);
- if (result)
- goto err_free;
-
- perf = data->acpi_data;
- policy->shared_type = perf->shared_type;
-
- /*
- * Will let policy->cpus know about dependency only when software
- * coordination is required.
- */
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
- policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
- cpumask_copy(policy->cpus, perf->shared_cpu_map);
- }
- cpumask_copy(policy->related_cpus, perf->shared_cpu_map);
-
-#ifdef CONFIG_SMP
- dmi_check_system(sw_any_bug_dmi_table);
- if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) {
- policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
- cpumask_copy(policy->cpus, cpu_core_mask(cpu));
- }
-#endif
-
- /* capability check */
- if (perf->state_count <= 1) {
- dprintk("No P-States\n");
- result = -ENODEV;
- goto err_unreg;
- }
-
- if (perf->control_register.space_id != perf->status_register.space_id) {
- result = -ENODEV;
- goto err_unreg;
- }
-
- switch (perf->control_register.space_id) {
- case ACPI_ADR_SPACE_SYSTEM_IO:
- dprintk("SYSTEM IO addr space\n");
- data->cpu_feature = SYSTEM_IO_CAPABLE;
- break;
- case ACPI_ADR_SPACE_FIXED_HARDWARE:
- dprintk("HARDWARE addr space\n");
- if (!check_est_cpu(cpu)) {
- result = -ENODEV;
- goto err_unreg;
- }
- data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
- break;
- default:
- dprintk("Unknown addr space %d\n",
- (u32) (perf->control_register.space_id));
- result = -ENODEV;
- goto err_unreg;
- }
-
- data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
- (perf->state_count+1), GFP_KERNEL);
- if (!data->freq_table) {
- result = -ENOMEM;
- goto err_unreg;
- }
-
- /* detect transition latency */
- policy->cpuinfo.transition_latency = 0;
- for (i = 0; i < perf->state_count; i++) {
- if ((perf->states[i].transition_latency * 1000) >
- policy->cpuinfo.transition_latency)
- policy->cpuinfo.transition_latency =
- perf->states[i].transition_latency * 1000;
- }
-
- /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
- if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
- policy->cpuinfo.transition_latency > 20 * 1000) {
- policy->cpuinfo.transition_latency = 20 * 1000;
- printk_once(KERN_INFO
- "P-state transition latency capped at 20 uS\n");
- }
-
- /* table init */
- for (i = 0; i < perf->state_count; i++) {
- if (i > 0 && perf->states[i].core_frequency >=
- data->freq_table[valid_states-1].frequency / 1000)
- continue;
-
- data->freq_table[valid_states].index = i;
- data->freq_table[valid_states].frequency =
- perf->states[i].core_frequency * 1000;
- valid_states++;
- }
- data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
- perf->state = 0;
-
- result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
- if (result)
- goto err_freqfree;
-
- if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq)
- printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n");
-
- switch (perf->control_register.space_id) {
- case ACPI_ADR_SPACE_SYSTEM_IO:
- /* Current speed is unknown and not detectable by IO port */
- policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
- break;
- case ACPI_ADR_SPACE_FIXED_HARDWARE:
- acpi_cpufreq_driver.get = get_cur_freq_on_cpu;
- policy->cur = get_cur_freq_on_cpu(cpu);
- break;
- default:
- break;
- }
-
- /* notify BIOS that we exist */
- acpi_processor_notify_smm(THIS_MODULE);
-
- /* Check for APERF/MPERF support in hardware */
- if (cpu_has(c, X86_FEATURE_APERFMPERF))
- acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
-
- dprintk("CPU%u - ACPI performance management activated.\n", cpu);
- for (i = 0; i < perf->state_count; i++)
- dprintk(" %cP%d: %d MHz, %d mW, %d uS\n",
- (i == perf->state ? '*' : ' '), i,
- (u32) perf->states[i].core_frequency,
- (u32) perf->states[i].power,
- (u32) perf->states[i].transition_latency);
-
- cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
-
- /*
- * the first call to ->target() should result in us actually
- * writing something to the appropriate registers.
- */
- data->resume = 1;
-
- return result;
-
-err_freqfree:
- kfree(data->freq_table);
-err_unreg:
- acpi_processor_unregister_performance(perf, cpu);
-err_free:
- kfree(data);
- per_cpu(acfreq_data, cpu) = NULL;
-
- return result;
-}
-
-static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
-
- dprintk("acpi_cpufreq_cpu_exit\n");
-
- if (data) {
- cpufreq_frequency_table_put_attr(policy->cpu);
- per_cpu(acfreq_data, policy->cpu) = NULL;
- acpi_processor_unregister_performance(data->acpi_data,
- policy->cpu);
- kfree(data->freq_table);
- kfree(data);
- }
-
- return 0;
-}
-
-static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
-
- dprintk("acpi_cpufreq_resume\n");
-
- data->resume = 1;
-
- return 0;
-}
-
-static struct freq_attr *acpi_cpufreq_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver acpi_cpufreq_driver = {
- .verify = acpi_cpufreq_verify,
- .target = acpi_cpufreq_target,
- .bios_limit = acpi_processor_get_bios_limit,
- .init = acpi_cpufreq_cpu_init,
- .exit = acpi_cpufreq_cpu_exit,
- .resume = acpi_cpufreq_resume,
- .name = "acpi-cpufreq",
- .owner = THIS_MODULE,
- .attr = acpi_cpufreq_attr,
-};
-
-static int __init acpi_cpufreq_init(void)
-{
- int ret;
-
- if (acpi_disabled)
- return 0;
-
- dprintk("acpi_cpufreq_init\n");
-
- ret = acpi_cpufreq_early_init();
- if (ret)
- return ret;
-
- ret = cpufreq_register_driver(&acpi_cpufreq_driver);
- if (ret)
- free_acpi_perf_data();
-
- return ret;
-}
-
-static void __exit acpi_cpufreq_exit(void)
-{
- dprintk("acpi_cpufreq_exit\n");
-
- cpufreq_unregister_driver(&acpi_cpufreq_driver);
-
- free_percpu(acpi_perf_data);
-}
-
-module_param(acpi_pstate_strict, uint, 0644);
-MODULE_PARM_DESC(acpi_pstate_strict,
- "value 0 or non-zero. non-zero -> strict ACPI checks are "
- "performed during frequency changes.");
-
-late_initcall(acpi_cpufreq_init);
-module_exit(acpi_cpufreq_exit);
-
-MODULE_ALIAS("acpi");
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
deleted file mode 100644
index 141abebc451..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ /dev/null
@@ -1,446 +0,0 @@
-/*
- * (C) 2004-2006 Sebastian Witt <se.witt@gmx.net>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon reverse engineered information
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-
-#define NFORCE2_XTAL 25
-#define NFORCE2_BOOTFSB 0x48
-#define NFORCE2_PLLENABLE 0xa8
-#define NFORCE2_PLLREG 0xa4
-#define NFORCE2_PLLADR 0xa0
-#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div)
-
-#define NFORCE2_MIN_FSB 50
-#define NFORCE2_SAFE_DISTANCE 50
-
-/* Delay in ms between FSB changes */
-/* #define NFORCE2_DELAY 10 */
-
-/*
- * nforce2_chipset:
- * FSB is changed using the chipset
- */
-static struct pci_dev *nforce2_dev;
-
-/* fid:
- * multiplier * 10
- */
-static int fid;
-
-/* min_fsb, max_fsb:
- * minimum and maximum FSB (= FSB at boot time)
- */
-static int min_fsb;
-static int max_fsb;
-
-MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
-MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
-MODULE_LICENSE("GPL");
-
-module_param(fid, int, 0444);
-module_param(min_fsb, int, 0444);
-
-MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
-MODULE_PARM_DESC(min_fsb,
- "Minimum FSB to use, if not defined: current FSB - 50");
-
-#define PFX "cpufreq-nforce2: "
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "cpufreq-nforce2", msg)
-
-/**
- * nforce2_calc_fsb - calculate FSB
- * @pll: PLL value
- *
- * Calculates FSB from PLL value
- */
-static int nforce2_calc_fsb(int pll)
-{
- unsigned char mul, div;
-
- mul = (pll >> 8) & 0xff;
- div = pll & 0xff;
-
- if (div > 0)
- return NFORCE2_XTAL * mul / div;
-
- return 0;
-}
-
-/**
- * nforce2_calc_pll - calculate PLL value
- * @fsb: FSB
- *
- * Calculate PLL value for given FSB
- */
-static int nforce2_calc_pll(unsigned int fsb)
-{
- unsigned char xmul, xdiv;
- unsigned char mul = 0, div = 0;
- int tried = 0;
-
- /* Try to calculate multiplier and divider up to 4 times */
- while (((mul == 0) || (div == 0)) && (tried <= 3)) {
- for (xdiv = 2; xdiv <= 0x80; xdiv++)
- for (xmul = 1; xmul <= 0xfe; xmul++)
- if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) ==
- fsb + tried) {
- mul = xmul;
- div = xdiv;
- }
- tried++;
- }
-
- if ((mul == 0) || (div == 0))
- return -1;
-
- return NFORCE2_PLL(mul, div);
-}
-
-/**
- * nforce2_write_pll - write PLL value to chipset
- * @pll: PLL value
- *
- * Writes new FSB PLL value to chipset
- */
-static void nforce2_write_pll(int pll)
-{
- int temp;
-
- /* Set the pll addr. to 0x00 */
- pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0);
-
- /* Now write the value in all 64 registers */
- for (temp = 0; temp <= 0x3f; temp++)
- pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll);
-
- return;
-}
-
-/**
- * nforce2_fsb_read - Read FSB
- *
- * Read FSB from chipset
- * If bootfsb != 0, return FSB at boot-time
- */
-static unsigned int nforce2_fsb_read(int bootfsb)
-{
- struct pci_dev *nforce2_sub5;
- u32 fsb, temp = 0;
-
- /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
- nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF,
- PCI_ANY_ID, PCI_ANY_ID, NULL);
- if (!nforce2_sub5)
- return 0;
-
- pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb);
- fsb /= 1000000;
-
- /* Check if PLL register is already set */
- pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
-
- if (bootfsb || !temp)
- return fsb;
-
- /* Use PLL register FSB value */
- pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp);
- fsb = nforce2_calc_fsb(temp);
-
- return fsb;
-}
-
-/**
- * nforce2_set_fsb - set new FSB
- * @fsb: New FSB
- *
- * Sets new FSB
- */
-static int nforce2_set_fsb(unsigned int fsb)
-{
- u32 temp = 0;
- unsigned int tfsb;
- int diff;
- int pll = 0;
-
- if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
- printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb);
- return -EINVAL;
- }
-
- tfsb = nforce2_fsb_read(0);
- if (!tfsb) {
- printk(KERN_ERR PFX "Error while reading the FSB\n");
- return -EINVAL;
- }
-
- /* First write? Then set actual value */
- pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
- if (!temp) {
- pll = nforce2_calc_pll(tfsb);
-
- if (pll < 0)
- return -EINVAL;
-
- nforce2_write_pll(pll);
- }
-
- /* Enable write access */
- temp = 0x01;
- pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp);
-
- diff = tfsb - fsb;
-
- if (!diff)
- return 0;
-
- while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) {
- if (diff < 0)
- tfsb++;
- else
- tfsb--;
-
- /* Calculate the PLL reg. value */
- pll = nforce2_calc_pll(tfsb);
- if (pll == -1)
- return -EINVAL;
-
- nforce2_write_pll(pll);
-#ifdef NFORCE2_DELAY
- mdelay(NFORCE2_DELAY);
-#endif
- }
-
- temp = 0x40;
- pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp);
-
- return 0;
-}
-
-/**
- * nforce2_get - get the CPU frequency
- * @cpu: CPU number
- *
- * Returns the CPU frequency
- */
-static unsigned int nforce2_get(unsigned int cpu)
-{
- if (cpu)
- return 0;
- return nforce2_fsb_read(0) * fid * 100;
-}
-
-/**
- * nforce2_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int nforce2_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
-/* unsigned long flags; */
- struct cpufreq_freqs freqs;
- unsigned int target_fsb;
-
- if ((target_freq > policy->max) || (target_freq < policy->min))
- return -EINVAL;
-
- target_fsb = target_freq / (fid * 100);
-
- freqs.old = nforce2_get(policy->cpu);
- freqs.new = target_fsb * fid * 100;
- freqs.cpu = 0; /* Only one CPU on nForce2 platforms */
-
- if (freqs.old == freqs.new)
- return 0;
-
- dprintk("Old CPU frequency %d kHz, new %d kHz\n",
- freqs.old, freqs.new);
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* Disable IRQs */
- /* local_irq_save(flags); */
-
- if (nforce2_set_fsb(target_fsb) < 0)
- printk(KERN_ERR PFX "Changing FSB to %d failed\n",
- target_fsb);
- else
- dprintk("Changed FSB successfully to %d\n",
- target_fsb);
-
- /* Enable IRQs */
- /* local_irq_restore(flags); */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- return 0;
-}
-
-/**
- * nforce2_verify - verifies a new CPUFreq policy
- * @policy: new policy
- */
-static int nforce2_verify(struct cpufreq_policy *policy)
-{
- unsigned int fsb_pol_max;
-
- fsb_pol_max = policy->max / (fid * 100);
-
- if (policy->min < (fsb_pol_max * fid * 100))
- policy->max = (fsb_pol_max + 1) * fid * 100;
-
- cpufreq_verify_within_limits(policy,
- policy->cpuinfo.min_freq,
- policy->cpuinfo.max_freq);
- return 0;
-}
-
-static int nforce2_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int fsb;
- unsigned int rfid;
-
- /* capability check */
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* Get current FSB */
- fsb = nforce2_fsb_read(0);
-
- if (!fsb)
- return -EIO;
-
- /* FIX: Get FID from CPU */
- if (!fid) {
- if (!cpu_khz) {
- printk(KERN_WARNING PFX
- "cpu_khz not set, can't calculate multiplier!\n");
- return -ENODEV;
- }
-
- fid = cpu_khz / (fsb * 100);
- rfid = fid % 5;
-
- if (rfid) {
- if (rfid > 2)
- fid += 5 - rfid;
- else
- fid -= rfid;
- }
- }
-
- printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb,
- fid / 10, fid % 10);
-
- /* Set maximum FSB to FSB at boot time */
- max_fsb = nforce2_fsb_read(1);
-
- if (!max_fsb)
- return -EIO;
-
- if (!min_fsb)
- min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE;
-
- if (min_fsb < NFORCE2_MIN_FSB)
- min_fsb = NFORCE2_MIN_FSB;
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.min_freq = min_fsb * fid * 100;
- policy->cpuinfo.max_freq = max_fsb * fid * 100;
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- policy->cur = nforce2_get(policy->cpu);
- policy->min = policy->cpuinfo.min_freq;
- policy->max = policy->cpuinfo.max_freq;
-
- return 0;
-}
-
-static int nforce2_cpu_exit(struct cpufreq_policy *policy)
-{
- return 0;
-}
-
-static struct cpufreq_driver nforce2_driver = {
- .name = "nforce2",
- .verify = nforce2_verify,
- .target = nforce2_target,
- .get = nforce2_get,
- .init = nforce2_cpu_init,
- .exit = nforce2_cpu_exit,
- .owner = THIS_MODULE,
-};
-
-/**
- * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic
- *
- * Detects nForce2 A2 and C1 stepping
- *
- */
-static int nforce2_detect_chipset(void)
-{
- nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
- PCI_DEVICE_ID_NVIDIA_NFORCE2,
- PCI_ANY_ID, PCI_ANY_ID, NULL);
-
- if (nforce2_dev == NULL)
- return -ENODEV;
-
- printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n",
- nforce2_dev->revision);
- printk(KERN_INFO PFX
- "FSB changing is maybe unstable and can lead to "
- "crashes and data loss.\n");
-
- return 0;
-}
-
-/**
- * nforce2_init - initializes the nForce2 CPUFreq driver
- *
- * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported
- * devices, -EINVAL on problems during initiatization, and zero on
- * success.
- */
-static int __init nforce2_init(void)
-{
- /* TODO: do we need to detect the processor? */
-
- /* detect chipset */
- if (nforce2_detect_chipset()) {
- printk(KERN_INFO PFX "No nForce2 chipset.\n");
- return -ENODEV;
- }
-
- return cpufreq_register_driver(&nforce2_driver);
-}
-
-/**
- * nforce2_exit - unregisters cpufreq module
- *
- * Unregisters nForce2 FSB change support.
- */
-static void __exit nforce2_exit(void)
-{
- cpufreq_unregister_driver(&nforce2_driver);
-}
-
-module_init(nforce2_init);
-module_exit(nforce2_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
deleted file mode 100644
index 35a257dd4bb..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- * Based on documentation provided by Dave Jones. Thanks!
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/ioport.h>
-#include <linux/slab.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/msr.h>
-#include <asm/tsc.h>
-
-#define EPS_BRAND_C7M 0
-#define EPS_BRAND_C7 1
-#define EPS_BRAND_EDEN 2
-#define EPS_BRAND_C3 3
-#define EPS_BRAND_C7D 4
-
-struct eps_cpu_data {
- u32 fsb;
- struct cpufreq_frequency_table freq_table[];
-};
-
-static struct eps_cpu_data *eps_cpu[NR_CPUS];
-
-
-static unsigned int eps_get(unsigned int cpu)
-{
- struct eps_cpu_data *centaur;
- u32 lo, hi;
-
- if (cpu)
- return 0;
- centaur = eps_cpu[cpu];
- if (centaur == NULL)
- return 0;
-
- /* Return current frequency */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- return centaur->fsb * ((lo >> 8) & 0xff);
-}
-
-static int eps_set_state(struct eps_cpu_data *centaur,
- unsigned int cpu,
- u32 dest_state)
-{
- struct cpufreq_freqs freqs;
- u32 lo, hi;
- int err = 0;
- int i;
-
- freqs.old = eps_get(cpu);
- freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
- freqs.cpu = cpu;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* Wait while CPU is busy */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- i = 0;
- while (lo & ((1 << 16) | (1 << 17))) {
- udelay(16);
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- i++;
- if (unlikely(i > 64)) {
- err = -ENODEV;
- goto postchange;
- }
- }
- /* Set new multiplier and voltage */
- wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
- /* Wait until transition end */
- i = 0;
- do {
- udelay(16);
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- i++;
- if (unlikely(i > 64)) {
- err = -ENODEV;
- goto postchange;
- }
- } while (lo & ((1 << 16) | (1 << 17)));
-
- /* Return current frequency */
-postchange:
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
-
-#ifdef DEBUG
- {
- u8 current_multiplier, current_voltage;
-
- /* Print voltage and multiplier */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- current_voltage = lo & 0xff;
- printk(KERN_INFO "eps: Current voltage = %dmV\n",
- current_voltage * 16 + 700);
- current_multiplier = (lo >> 8) & 0xff;
- printk(KERN_INFO "eps: Current multiplier = %d\n",
- current_multiplier);
- }
-#endif
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- return err;
-}
-
-static int eps_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- struct eps_cpu_data *centaur;
- unsigned int newstate = 0;
- unsigned int cpu = policy->cpu;
- unsigned int dest_state;
- int ret;
-
- if (unlikely(eps_cpu[cpu] == NULL))
- return -ENODEV;
- centaur = eps_cpu[cpu];
-
- if (unlikely(cpufreq_frequency_table_target(policy,
- &eps_cpu[cpu]->freq_table[0],
- target_freq,
- relation,
- &newstate))) {
- return -EINVAL;
- }
-
- /* Make frequency transition */
- dest_state = centaur->freq_table[newstate].index & 0xffff;
- ret = eps_set_state(centaur, cpu, dest_state);
- if (ret)
- printk(KERN_ERR "eps: Timeout!\n");
- return ret;
-}
-
-static int eps_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy,
- &eps_cpu[policy->cpu]->freq_table[0]);
-}
-
-static int eps_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int i;
- u32 lo, hi;
- u64 val;
- u8 current_multiplier, current_voltage;
- u8 max_multiplier, max_voltage;
- u8 min_multiplier, min_voltage;
- u8 brand = 0;
- u32 fsb;
- struct eps_cpu_data *centaur;
- struct cpuinfo_x86 *c = &cpu_data(0);
- struct cpufreq_frequency_table *f_table;
- int k, step, voltage;
- int ret;
- int states;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* Check brand */
- printk(KERN_INFO "eps: Detected VIA ");
-
- switch (c->x86_model) {
- case 10:
- rdmsr(0x1153, lo, hi);
- brand = (((lo >> 2) ^ lo) >> 18) & 3;
- printk(KERN_CONT "Model A ");
- break;
- case 13:
- rdmsr(0x1154, lo, hi);
- brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff;
- printk(KERN_CONT "Model D ");
- break;
- }
-
- switch (brand) {
- case EPS_BRAND_C7M:
- printk(KERN_CONT "C7-M\n");
- break;
- case EPS_BRAND_C7:
- printk(KERN_CONT "C7\n");
- break;
- case EPS_BRAND_EDEN:
- printk(KERN_CONT "Eden\n");
- break;
- case EPS_BRAND_C7D:
- printk(KERN_CONT "C7-D\n");
- break;
- case EPS_BRAND_C3:
- printk(KERN_CONT "C3\n");
- return -ENODEV;
- break;
- }
- /* Enable Enhanced PowerSaver */
- rdmsrl(MSR_IA32_MISC_ENABLE, val);
- if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
- wrmsrl(MSR_IA32_MISC_ENABLE, val);
- /* Can be locked at 0 */
- rdmsrl(MSR_IA32_MISC_ENABLE, val);
- if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
- return -ENODEV;
- }
- }
-
- /* Print voltage and multiplier */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- current_voltage = lo & 0xff;
- printk(KERN_INFO "eps: Current voltage = %dmV\n",
- current_voltage * 16 + 700);
- current_multiplier = (lo >> 8) & 0xff;
- printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
-
- /* Print limits */
- max_voltage = hi & 0xff;
- printk(KERN_INFO "eps: Highest voltage = %dmV\n",
- max_voltage * 16 + 700);
- max_multiplier = (hi >> 8) & 0xff;
- printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
- min_voltage = (hi >> 16) & 0xff;
- printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
- min_voltage * 16 + 700);
- min_multiplier = (hi >> 24) & 0xff;
- printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
-
- /* Sanity checks */
- if (current_multiplier == 0 || max_multiplier == 0
- || min_multiplier == 0)
- return -EINVAL;
- if (current_multiplier > max_multiplier
- || max_multiplier <= min_multiplier)
- return -EINVAL;
- if (current_voltage > 0x1f || max_voltage > 0x1f)
- return -EINVAL;
- if (max_voltage < min_voltage)
- return -EINVAL;
-
- /* Calc FSB speed */
- fsb = cpu_khz / current_multiplier;
- /* Calc number of p-states supported */
- if (brand == EPS_BRAND_C7M)
- states = max_multiplier - min_multiplier + 1;
- else
- states = 2;
-
- /* Allocate private data and frequency table for current cpu */
- centaur = kzalloc(sizeof(struct eps_cpu_data)
- + (states + 1) * sizeof(struct cpufreq_frequency_table),
- GFP_KERNEL);
- if (!centaur)
- return -ENOMEM;
- eps_cpu[0] = centaur;
-
- /* Copy basic values */
- centaur->fsb = fsb;
-
- /* Fill frequency and MSR value table */
- f_table = &centaur->freq_table[0];
- if (brand != EPS_BRAND_C7M) {
- f_table[0].frequency = fsb * min_multiplier;
- f_table[0].index = (min_multiplier << 8) | min_voltage;
- f_table[1].frequency = fsb * max_multiplier;
- f_table[1].index = (max_multiplier << 8) | max_voltage;
- f_table[2].frequency = CPUFREQ_TABLE_END;
- } else {
- k = 0;
- step = ((max_voltage - min_voltage) * 256)
- / (max_multiplier - min_multiplier);
- for (i = min_multiplier; i <= max_multiplier; i++) {
- voltage = (k * step) / 256 + min_voltage;
- f_table[k].frequency = fsb * i;
- f_table[k].index = (i << 8) | voltage;
- k++;
- }
- f_table[k].frequency = CPUFREQ_TABLE_END;
- }
-
- policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */
- policy->cur = fsb * current_multiplier;
-
- ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
- if (ret) {
- kfree(centaur);
- return ret;
- }
-
- cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
- return 0;
-}
-
-static int eps_cpu_exit(struct cpufreq_policy *policy)
-{
- unsigned int cpu = policy->cpu;
- struct eps_cpu_data *centaur;
- u32 lo, hi;
-
- if (eps_cpu[cpu] == NULL)
- return -ENODEV;
- centaur = eps_cpu[cpu];
-
- /* Get max frequency */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- /* Set max frequency */
- eps_set_state(centaur, cpu, hi & 0xffff);
- /* Bye */
- cpufreq_frequency_table_put_attr(policy->cpu);
- kfree(eps_cpu[cpu]);
- eps_cpu[cpu] = NULL;
- return 0;
-}
-
-static struct freq_attr *eps_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver eps_driver = {
- .verify = eps_verify,
- .target = eps_target,
- .init = eps_cpu_init,
- .exit = eps_cpu_exit,
- .get = eps_get,
- .name = "e_powersaver",
- .owner = THIS_MODULE,
- .attr = eps_attr,
-};
-
-static int __init eps_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- /* This driver will work only on Centaur C7 processors with
- * Enhanced SpeedStep/PowerSaver registers */
- if (c->x86_vendor != X86_VENDOR_CENTAUR
- || c->x86 != 6 || c->x86_model < 10)
- return -ENODEV;
- if (!cpu_has(c, X86_FEATURE_EST))
- return -ENODEV;
-
- if (cpufreq_register_driver(&eps_driver))
- return -EINVAL;
- return 0;
-}
-
-static void __exit eps_exit(void)
-{
- cpufreq_unregister_driver(&eps_driver);
-}
-
-MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>");
-MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's.");
-MODULE_LICENSE("GPL");
-
-module_init(eps_init);
-module_exit(eps_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
deleted file mode 100644
index c587db472a7..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- * elanfreq: cpufreq driver for the AMD ELAN family
- *
- * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de>
- *
- * Parts of this code are (c) Sven Geggus <sven@geggus.net>
- *
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-
-#include <linux/delay.h>
-#include <linux/cpufreq.h>
-
-#include <asm/msr.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
-#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
-
-/* Module parameter */
-static int max_freq;
-
-struct s_elan_multiplier {
- int clock; /* frequency in kHz */
- int val40h; /* PMU Force Mode register */
- int val80h; /* CPU Clock Speed Register */
-};
-
-/*
- * It is important that the frequencies
- * are listed in ascending order here!
- */
-static struct s_elan_multiplier elan_multiplier[] = {
- {1000, 0x02, 0x18},
- {2000, 0x02, 0x10},
- {4000, 0x02, 0x08},
- {8000, 0x00, 0x00},
- {16000, 0x00, 0x02},
- {33000, 0x00, 0x04},
- {66000, 0x01, 0x04},
- {99000, 0x01, 0x05}
-};
-
-static struct cpufreq_frequency_table elanfreq_table[] = {
- {0, 1000},
- {1, 2000},
- {2, 4000},
- {3, 8000},
- {4, 16000},
- {5, 33000},
- {6, 66000},
- {7, 99000},
- {0, CPUFREQ_TABLE_END},
-};
-
-
-/**
- * elanfreq_get_cpu_frequency: determine current cpu speed
- *
- * Finds out at which frequency the CPU of the Elan SOC runs
- * at the moment. Frequencies from 1 to 33 MHz are generated
- * the normal way, 66 and 99 MHz are called "Hyperspeed Mode"
- * and have the rest of the chip running with 33 MHz.
- */
-
-static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
-{
- u8 clockspeed_reg; /* Clock Speed Register */
-
- local_irq_disable();
- outb_p(0x80, REG_CSCIR);
- clockspeed_reg = inb_p(REG_CSCDR);
- local_irq_enable();
-
- if ((clockspeed_reg & 0xE0) == 0xE0)
- return 0;
-
- /* Are we in CPU clock multiplied mode (66/99 MHz)? */
- if ((clockspeed_reg & 0xE0) == 0xC0) {
- if ((clockspeed_reg & 0x01) == 0)
- return 66000;
- else
- return 99000;
- }
-
- /* 33 MHz is not 32 MHz... */
- if ((clockspeed_reg & 0xE0) == 0xA0)
- return 33000;
-
- return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
-}
-
-
-/**
- * elanfreq_set_cpu_frequency: Change the CPU core frequency
- * @cpu: cpu number
- * @freq: frequency in kHz
- *
- * This function takes a frequency value and changes the CPU frequency
- * according to this. Note that the frequency has to be checked by
- * elanfreq_validatespeed() for correctness!
- *
- * There is no return value.
- */
-
-static void elanfreq_set_cpu_state(unsigned int state)
-{
- struct cpufreq_freqs freqs;
-
- freqs.old = elanfreq_get_cpu_frequency(0);
- freqs.new = elan_multiplier[state].clock;
- freqs.cpu = 0; /* elanfreq.c is UP only driver */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n",
- elan_multiplier[state].clock);
-
-
- /*
- * Access to the Elan's internal registers is indexed via
- * 0x22: Chip Setup & Control Register Index Register (CSCI)
- * 0x23: Chip Setup & Control Register Data Register (CSCD)
- *
- */
-
- /*
- * 0x40 is the Power Management Unit's Force Mode Register.
- * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency)
- */
-
- local_irq_disable();
- outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
- outb_p(0x00, REG_CSCDR);
- local_irq_enable(); /* wait till internal pipelines and */
- udelay(1000); /* buffers have cleaned up */
-
- local_irq_disable();
-
- /* now, set the CPU clock speed register (0x80) */
- outb_p(0x80, REG_CSCIR);
- outb_p(elan_multiplier[state].val80h, REG_CSCDR);
-
- /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
- outb_p(0x40, REG_CSCIR);
- outb_p(elan_multiplier[state].val40h, REG_CSCDR);
- udelay(10000);
- local_irq_enable();
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-};
-
-
-/**
- * elanfreq_validatespeed: test if frequency range is valid
- * @policy: the policy to validate
- *
- * This function checks if a given frequency range in kHz is valid
- * for the hardware supported by the driver.
- */
-
-static int elanfreq_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
-}
-
-static int elanfreq_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
-
- if (cpufreq_frequency_table_target(policy, &elanfreq_table[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- elanfreq_set_cpu_state(newstate);
-
- return 0;
-}
-
-
-/*
- * Module init and exit code
- */
-
-static int elanfreq_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- unsigned int i;
- int result;
-
- /* capability check */
- if ((c->x86_vendor != X86_VENDOR_AMD) ||
- (c->x86 != 4) || (c->x86_model != 10))
- return -ENODEV;
-
- /* max freq */
- if (!max_freq)
- max_freq = elanfreq_get_cpu_frequency(0);
-
- /* table init */
- for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
- if (elanfreq_table[i].frequency > max_freq)
- elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
- }
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- policy->cur = elanfreq_get_cpu_frequency(0);
-
- result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
- return 0;
-}
-
-
-static int elanfreq_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-
-#ifndef MODULE
-/**
- * elanfreq_setup - elanfreq command line parameter parsing
- *
- * elanfreq command line parameter. Use:
- * elanfreq=66000
- * to set the maximum CPU frequency to 66 MHz. Note that in
- * case you do not give this boot parameter, the maximum
- * frequency will fall back to _current_ CPU frequency which
- * might be lower. If you build this as a module, use the
- * max_freq module parameter instead.
- */
-static int __init elanfreq_setup(char *str)
-{
- max_freq = simple_strtoul(str, &str, 0);
- printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n");
- return 1;
-}
-__setup("elanfreq=", elanfreq_setup);
-#endif
-
-
-static struct freq_attr *elanfreq_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-
-static struct cpufreq_driver elanfreq_driver = {
- .get = elanfreq_get_cpu_frequency,
- .verify = elanfreq_verify,
- .target = elanfreq_target,
- .init = elanfreq_cpu_init,
- .exit = elanfreq_cpu_exit,
- .name = "elanfreq",
- .owner = THIS_MODULE,
- .attr = elanfreq_attr,
-};
-
-
-static int __init elanfreq_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- /* Test if we have the right hardware */
- if ((c->x86_vendor != X86_VENDOR_AMD) ||
- (c->x86 != 4) || (c->x86_model != 10)) {
- printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
- return -ENODEV;
- }
- return cpufreq_register_driver(&elanfreq_driver);
-}
-
-
-static void __exit elanfreq_exit(void)
-{
- cpufreq_unregister_driver(&elanfreq_driver);
-}
-
-
-module_param(max_freq, int, 0444);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, "
- "Sven Geggus <sven@geggus.net>");
-MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
-
-module_init(elanfreq_init);
-module_exit(elanfreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
deleted file mode 100644
index 32974cf8423..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ /dev/null
@@ -1,517 +0,0 @@
-/*
- * Cyrix MediaGX and NatSemi Geode Suspend Modulation
- * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
- * (C) 2002 Hiroshi Miura <miura@da-cha.org>
- * All Rights Reserved
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation
- *
- * The author(s) of this software shall not be held liable for damages
- * of any nature resulting due to the use of this software. This
- * software is provided AS-IS with no warranties.
- *
- * Theoretical note:
- *
- * (see Geode(tm) CS5530 manual (rev.4.1) page.56)
- *
- * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0
- * are based on Suspend Modulation.
- *
- * Suspend Modulation works by asserting and de-asserting the SUSP# pin
- * to CPU(GX1/GXLV) for configurable durations. When asserting SUSP#
- * the CPU enters an idle state. GX1 stops its core clock when SUSP# is
- * asserted then power consumption is reduced.
- *
- * Suspend Modulation's OFF/ON duration are configurable
- * with 'Suspend Modulation OFF Count Register'
- * and 'Suspend Modulation ON Count Register'.
- * These registers are 8bit counters that represent the number of
- * 32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF)
- * to the processor.
- *
- * These counters define a ratio which is the effective frequency
- * of operation of the system.
- *
- * OFF Count
- * F_eff = Fgx * ----------------------
- * OFF Count + ON Count
- *
- * 0 <= On Count, Off Count <= 255
- *
- * From these limits, we can get register values
- *
- * off_duration + on_duration <= MAX_DURATION
- * on_duration = off_duration * (stock_freq - freq) / freq
- *
- * off_duration = (freq * DURATION) / stock_freq
- * on_duration = DURATION - off_duration
- *
- *
- *---------------------------------------------------------------------------
- *
- * ChangeLog:
- * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org>
- * - fix on/off register mistake
- * - fix cpu_khz calc when it stops cpu modulation.
- *
- * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org>
- * - rewrite for Cyrix MediaGX Cx5510/5520 and
- * NatSemi Geode Cs5530(A).
- *
- * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com>
- * - cs5530_mod patch for 2.4.19-rc1.
- *
- *---------------------------------------------------------------------------
- *
- * Todo
- * Test on machines with 5510, 5530, 5530A
- */
-
-/************************************************************************
- * Suspend Modulation - Definitions *
- ************************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-
-#include <asm/processor-cyrix.h>
-
-/* PCI config registers, all at F0 */
-#define PCI_PMER1 0x80 /* power management enable register 1 */
-#define PCI_PMER2 0x81 /* power management enable register 2 */
-#define PCI_PMER3 0x82 /* power management enable register 3 */
-#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */
-#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */
-#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */
-#define PCI_MODON 0x95 /* suspend modulation ON counter register */
-#define PCI_SUSCFG 0x96 /* suspend configuration register */
-
-/* PMER1 bits */
-#define GPM (1<<0) /* global power management */
-#define GIT (1<<1) /* globally enable PM device idle timers */
-#define GTR (1<<2) /* globally enable IO traps */
-#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */
-#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */
-
-/* SUSCFG bits */
-#define SUSMOD (1<<0) /* enable/disable suspend modulation */
-/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */
-#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */
- /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */
-#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */
-/* the below is supported only with cs5530A */
-#define PWRSVE_ISA (1<<3) /* stop ISA clock */
-#define PWRSVE (1<<4) /* active idle */
-
-struct gxfreq_params {
- u8 on_duration;
- u8 off_duration;
- u8 pci_suscfg;
- u8 pci_pmer1;
- u8 pci_pmer2;
- struct pci_dev *cs55x0;
-};
-
-static struct gxfreq_params *gx_params;
-static int stock_freq;
-
-/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */
-static int pci_busclk;
-module_param(pci_busclk, int, 0444);
-
-/* maximum duration for which the cpu may be suspended
- * (32us * MAX_DURATION). If no parameter is given, this defaults
- * to 255.
- * Note that this leads to a maximum of 8 ms(!) where the CPU clock
- * is suspended -- processing power is just 0.39% of what it used to be,
- * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
-static int max_duration = 255;
-module_param(max_duration, int, 0444);
-
-/* For the default policy, we want at least some processing power
- * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
- */
-#define POLICY_MIN_DIV 20
-
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "gx-suspmod", msg)
-
-/**
- * we can detect a core multipiler from dir0_lsb
- * from GX1 datasheet p.56,
- * MULT[3:0]:
- * 0000 = SYSCLK multiplied by 4 (test only)
- * 0001 = SYSCLK multiplied by 10
- * 0010 = SYSCLK multiplied by 4
- * 0011 = SYSCLK multiplied by 6
- * 0100 = SYSCLK multiplied by 9
- * 0101 = SYSCLK multiplied by 5
- * 0110 = SYSCLK multiplied by 7
- * 0111 = SYSCLK multiplied by 8
- * of 33.3MHz
- **/
-static int gx_freq_mult[16] = {
- 4, 10, 4, 6, 9, 5, 7, 8,
- 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-
-/****************************************************************
- * Low Level chipset interface *
- ****************************************************************/
-static struct pci_device_id gx_chipset_tbl[] __initdata = {
- { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
- { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
- { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
- { 0, },
-};
-
-static void gx_write_byte(int reg, int value)
-{
- pci_write_config_byte(gx_params->cs55x0, reg, value);
-}
-
-/**
- * gx_detect_chipset:
- *
- **/
-static __init struct pci_dev *gx_detect_chipset(void)
-{
- struct pci_dev *gx_pci = NULL;
-
- /* check if CPU is a MediaGX or a Geode. */
- if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) &&
- (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) {
- dprintk("error: no MediaGX/Geode processor found!\n");
- return NULL;
- }
-
- /* detect which companion chip is used */
- for_each_pci_dev(gx_pci) {
- if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
- return gx_pci;
- }
-
- dprintk("error: no supported chipset found!\n");
- return NULL;
-}
-
-/**
- * gx_get_cpuspeed:
- *
- * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi
- * Geode CPU runs.
- */
-static unsigned int gx_get_cpuspeed(unsigned int cpu)
-{
- if ((gx_params->pci_suscfg & SUSMOD) == 0)
- return stock_freq;
-
- return (stock_freq * gx_params->off_duration)
- / (gx_params->on_duration + gx_params->off_duration);
-}
-
-/**
- * gx_validate_speed:
- * determine current cpu speed
- *
- **/
-
-static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration,
- u8 *off_duration)
-{
- unsigned int i;
- u8 tmp_on, tmp_off;
- int old_tmp_freq = stock_freq;
- int tmp_freq;
-
- *off_duration = 1;
- *on_duration = 0;
-
- for (i = max_duration; i > 0; i--) {
- tmp_off = ((khz * i) / stock_freq) & 0xff;
- tmp_on = i - tmp_off;
- tmp_freq = (stock_freq * tmp_off) / i;
- /* if this relation is closer to khz, use this. If it's equal,
- * prefer it, too - lower latency */
- if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) {
- *on_duration = tmp_on;
- *off_duration = tmp_off;
- old_tmp_freq = tmp_freq;
- }
- }
-
- return old_tmp_freq;
-}
-
-
-/**
- * gx_set_cpuspeed:
- * set cpu speed in khz.
- **/
-
-static void gx_set_cpuspeed(unsigned int khz)
-{
- u8 suscfg, pmer1;
- unsigned int new_khz;
- unsigned long flags;
- struct cpufreq_freqs freqs;
-
- freqs.cpu = 0;
- freqs.old = gx_get_cpuspeed(0);
-
- new_khz = gx_validate_speed(khz, &gx_params->on_duration,
- &gx_params->off_duration);
-
- freqs.new = new_khz;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- local_irq_save(flags);
-
-
-
- if (new_khz != stock_freq) {
- /* if new khz == 100% of CPU speed, it is special case */
- switch (gx_params->cs55x0->device) {
- case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
- pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
- /* FIXME: need to test other values -- Zwane,Miura */
- /* typical 2 to 4ms */
- gx_write_byte(PCI_IRQTC, 4);
- /* typical 50 to 100ms */
- gx_write_byte(PCI_VIDTC, 100);
- gx_write_byte(PCI_PMER1, pmer1);
-
- if (gx_params->cs55x0->revision < 0x10) {
- /* CS5530(rev 1.2, 1.3) */
- suscfg = gx_params->pci_suscfg|SUSMOD;
- } else {
- /* CS5530A,B.. */
- suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE;
- }
- break;
- case PCI_DEVICE_ID_CYRIX_5520:
- case PCI_DEVICE_ID_CYRIX_5510:
- suscfg = gx_params->pci_suscfg | SUSMOD;
- break;
- default:
- local_irq_restore(flags);
- dprintk("fatal: try to set unknown chipset.\n");
- return;
- }
- } else {
- suscfg = gx_params->pci_suscfg & ~(SUSMOD);
- gx_params->off_duration = 0;
- gx_params->on_duration = 0;
- dprintk("suspend modulation disabled: cpu runs 100%% speed.\n");
- }
-
- gx_write_byte(PCI_MODOFF, gx_params->off_duration);
- gx_write_byte(PCI_MODON, gx_params->on_duration);
-
- gx_write_byte(PCI_SUSCFG, suscfg);
- pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
-
- local_irq_restore(flags);
-
- gx_params->pci_suscfg = suscfg;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n",
- gx_params->on_duration * 32, gx_params->off_duration * 32);
- dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new);
-}
-
-/****************************************************************
- * High level functions *
- ****************************************************************/
-
-/*
- * cpufreq_gx_verify: test if frequency range is valid
- *
- * This function checks if a given frequency range in kHz is valid
- * for the hardware supported by the driver.
- */
-
-static int cpufreq_gx_verify(struct cpufreq_policy *policy)
-{
- unsigned int tmp_freq = 0;
- u8 tmp1, tmp2;
-
- if (!stock_freq || !policy)
- return -EINVAL;
-
- policy->cpu = 0;
- cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
- stock_freq);
-
- /* it needs to be assured that at least one supported frequency is
- * within policy->min and policy->max. If it is not, policy->max
- * needs to be increased until one freuqency is supported.
- * policy->min may not be decreased, though. This way we guarantee a
- * specific processing capacity.
- */
- tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2);
- if (tmp_freq < policy->min)
- tmp_freq += stock_freq / max_duration;
- policy->min = tmp_freq;
- if (policy->min > policy->max)
- policy->max = tmp_freq;
- tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2);
- if (tmp_freq > policy->max)
- tmp_freq -= stock_freq / max_duration;
- policy->max = tmp_freq;
- if (policy->max < policy->min)
- policy->max = policy->min;
- cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
- stock_freq);
-
- return 0;
-}
-
-/*
- * cpufreq_gx_target:
- *
- */
-static int cpufreq_gx_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- u8 tmp1, tmp2;
- unsigned int tmp_freq;
-
- if (!stock_freq || !policy)
- return -EINVAL;
-
- policy->cpu = 0;
-
- tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2);
- while (tmp_freq < policy->min) {
- tmp_freq += stock_freq / max_duration;
- tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
- }
- while (tmp_freq > policy->max) {
- tmp_freq -= stock_freq / max_duration;
- tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
- }
-
- gx_set_cpuspeed(tmp_freq);
-
- return 0;
-}
-
-static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int maxfreq, curfreq;
-
- if (!policy || policy->cpu != 0)
- return -ENODEV;
-
- /* determine maximum frequency */
- if (pci_busclk)
- maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
- else if (cpu_khz)
- maxfreq = cpu_khz;
- else
- maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
-
- stock_freq = maxfreq;
- curfreq = gx_get_cpuspeed(0);
-
- dprintk("cpu max frequency is %d.\n", maxfreq);
- dprintk("cpu current frequency is %dkHz.\n", curfreq);
-
- /* setup basic struct for cpufreq API */
- policy->cpu = 0;
-
- if (max_duration < POLICY_MIN_DIV)
- policy->min = maxfreq / max_duration;
- else
- policy->min = maxfreq / POLICY_MIN_DIV;
- policy->max = maxfreq;
- policy->cur = curfreq;
- policy->cpuinfo.min_freq = maxfreq / max_duration;
- policy->cpuinfo.max_freq = maxfreq;
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
-
- return 0;
-}
-
-/*
- * cpufreq_gx_init:
- * MediaGX/Geode GX initialize cpufreq driver
- */
-static struct cpufreq_driver gx_suspmod_driver = {
- .get = gx_get_cpuspeed,
- .verify = cpufreq_gx_verify,
- .target = cpufreq_gx_target,
- .init = cpufreq_gx_cpu_init,
- .name = "gx-suspmod",
- .owner = THIS_MODULE,
-};
-
-static int __init cpufreq_gx_init(void)
-{
- int ret;
- struct gxfreq_params *params;
- struct pci_dev *gx_pci;
-
- /* Test if we have the right hardware */
- gx_pci = gx_detect_chipset();
- if (gx_pci == NULL)
- return -ENODEV;
-
- /* check whether module parameters are sane */
- if (max_duration > 0xff)
- max_duration = 0xff;
-
- dprintk("geode suspend modulation available.\n");
-
- params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL);
- if (params == NULL)
- return -ENOMEM;
-
- params->cs55x0 = gx_pci;
- gx_params = params;
-
- /* keep cs55x0 configurations */
- pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg));
- pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
- pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
- pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
- pci_read_config_byte(params->cs55x0, PCI_MODOFF,
- &(params->off_duration));
-
- ret = cpufreq_register_driver(&gx_suspmod_driver);
- if (ret) {
- kfree(params);
- return ret; /* register error! */
- }
-
- return 0;
-}
-
-static void __exit cpufreq_gx_exit(void)
-{
- cpufreq_unregister_driver(&gx_suspmod_driver);
- pci_dev_put(gx_params->cs55x0);
- kfree(gx_params);
-}
-
-MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>");
-MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
-MODULE_LICENSE("GPL");
-
-module_init(cpufreq_gx_init);
-module_exit(cpufreq_gx_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
deleted file mode 100644
index cf48cdd6907..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ /dev/null
@@ -1,1029 +0,0 @@
-/*
- * (C) 2001-2004 Dave Jones. <davej@redhat.com>
- * (C) 2002 Padraig Brady. <padraig@antefacto.com>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon datasheets & sample CPUs kindly provided by VIA.
- *
- * VIA have currently 3 different versions of Longhaul.
- * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
- * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
- * Version 2 of longhaul is backward compatible with v1, but adds
- * LONGHAUL MSR for purpose of both frequency and voltage scaling.
- * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
- * Version 3 of longhaul got renamed to Powersaver and redesigned
- * to use only the POWERSAVER MSR at 0x110a.
- * It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
- * It's pretty much the same feature wise to longhaul v2, though
- * there is provision for scaling FSB too, but this doesn't work
- * too well in practice so we don't even try to use this.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/delay.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-#include <linux/acpi.h>
-
-#include <asm/msr.h>
-#include <acpi/processor.h>
-
-#include "longhaul.h"
-
-#define PFX "longhaul: "
-
-#define TYPE_LONGHAUL_V1 1
-#define TYPE_LONGHAUL_V2 2
-#define TYPE_POWERSAVER 3
-
-#define CPU_SAMUEL 1
-#define CPU_SAMUEL2 2
-#define CPU_EZRA 3
-#define CPU_EZRA_T 4
-#define CPU_NEHEMIAH 5
-#define CPU_NEHEMIAH_C 6
-
-/* Flags */
-#define USE_ACPI_C3 (1 << 1)
-#define USE_NORTHBRIDGE (1 << 2)
-
-static int cpu_model;
-static unsigned int numscales = 16;
-static unsigned int fsb;
-
-static const struct mV_pos *vrm_mV_table;
-static const unsigned char *mV_vrm_table;
-
-static unsigned int highest_speed, lowest_speed; /* kHz */
-static unsigned int minmult, maxmult;
-static int can_scale_voltage;
-static struct acpi_processor *pr;
-static struct acpi_processor_cx *cx;
-static u32 acpi_regs_addr;
-static u8 longhaul_flags;
-static unsigned int longhaul_index;
-
-/* Module parameters */
-static int scale_voltage;
-static int disable_acpi_c3;
-static int revid_errata;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "longhaul", msg)
-
-
-/* Clock ratios multiplied by 10 */
-static int mults[32];
-static int eblcr[32];
-static int longhaul_version;
-static struct cpufreq_frequency_table *longhaul_table;
-
-#ifdef CONFIG_CPU_FREQ_DEBUG
-static char speedbuffer[8];
-
-static char *print_speed(int speed)
-{
- if (speed < 1000) {
- snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed);
- return speedbuffer;
- }
-
- if (speed%1000 == 0)
- snprintf(speedbuffer, sizeof(speedbuffer),
- "%dGHz", speed/1000);
- else
- snprintf(speedbuffer, sizeof(speedbuffer),
- "%d.%dGHz", speed/1000, (speed%1000)/100);
-
- return speedbuffer;
-}
-#endif
-
-
-static unsigned int calc_speed(int mult)
-{
- int khz;
- khz = (mult/10)*fsb;
- if (mult%10)
- khz += fsb/2;
- khz *= 1000;
- return khz;
-}
-
-
-static int longhaul_get_cpu_mult(void)
-{
- unsigned long invalue = 0, lo, hi;
-
- rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi);
- invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22;
- if (longhaul_version == TYPE_LONGHAUL_V2 ||
- longhaul_version == TYPE_POWERSAVER) {
- if (lo & (1<<27))
- invalue += 16;
- }
- return eblcr[invalue];
-}
-
-/* For processor with BCR2 MSR */
-
-static void do_longhaul1(unsigned int mults_index)
-{
- union msr_bcr2 bcr2;
-
- rdmsrl(MSR_VIA_BCR2, bcr2.val);
- /* Enable software clock multiplier */
- bcr2.bits.ESOFTBF = 1;
- bcr2.bits.CLOCKMUL = mults_index & 0xff;
-
- /* Sync to timer tick */
- safe_halt();
- /* Change frequency on next halt or sleep */
- wrmsrl(MSR_VIA_BCR2, bcr2.val);
- /* Invoke transition */
- ACPI_FLUSH_CPU_CACHE();
- halt();
-
- /* Disable software clock multiplier */
- local_irq_disable();
- rdmsrl(MSR_VIA_BCR2, bcr2.val);
- bcr2.bits.ESOFTBF = 0;
- wrmsrl(MSR_VIA_BCR2, bcr2.val);
-}
-
-/* For processor with Longhaul MSR */
-
-static void do_powersaver(int cx_address, unsigned int mults_index,
- unsigned int dir)
-{
- union msr_longhaul longhaul;
- u32 t;
-
- rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- /* Setup new frequency */
- if (!revid_errata)
- longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
- else
- longhaul.bits.RevisionKey = 0;
- longhaul.bits.SoftBusRatio = mults_index & 0xf;
- longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4;
- /* Setup new voltage */
- if (can_scale_voltage)
- longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f;
- /* Sync to timer tick */
- safe_halt();
- /* Raise voltage if necessary */
- if (can_scale_voltage && dir) {
- longhaul.bits.EnableSoftVID = 1;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- /* Change voltage */
- if (!cx_address) {
- ACPI_FLUSH_CPU_CACHE();
- halt();
- } else {
- ACPI_FLUSH_CPU_CACHE();
- /* Invoke C3 */
- inb(cx_address);
- /* Dummy op - must do something useless after P_LVL3
- * read */
- t = inl(acpi_gbl_FADT.xpm_timer_block.address);
- }
- longhaul.bits.EnableSoftVID = 0;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- }
-
- /* Change frequency on next halt or sleep */
- longhaul.bits.EnableSoftBusRatio = 1;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- if (!cx_address) {
- ACPI_FLUSH_CPU_CACHE();
- halt();
- } else {
- ACPI_FLUSH_CPU_CACHE();
- /* Invoke C3 */
- inb(cx_address);
- /* Dummy op - must do something useless after P_LVL3 read */
- t = inl(acpi_gbl_FADT.xpm_timer_block.address);
- }
- /* Disable bus ratio bit */
- longhaul.bits.EnableSoftBusRatio = 0;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
-
- /* Reduce voltage if necessary */
- if (can_scale_voltage && !dir) {
- longhaul.bits.EnableSoftVID = 1;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- /* Change voltage */
- if (!cx_address) {
- ACPI_FLUSH_CPU_CACHE();
- halt();
- } else {
- ACPI_FLUSH_CPU_CACHE();
- /* Invoke C3 */
- inb(cx_address);
- /* Dummy op - must do something useless after P_LVL3
- * read */
- t = inl(acpi_gbl_FADT.xpm_timer_block.address);
- }
- longhaul.bits.EnableSoftVID = 0;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- }
-}
-
-/**
- * longhaul_set_cpu_frequency()
- * @mults_index : bitpattern of the new multiplier.
- *
- * Sets a new clock ratio.
- */
-
-static void longhaul_setstate(unsigned int table_index)
-{
- unsigned int mults_index;
- int speed, mult;
- struct cpufreq_freqs freqs;
- unsigned long flags;
- unsigned int pic1_mask, pic2_mask;
- u16 bm_status = 0;
- u32 bm_timeout = 1000;
- unsigned int dir = 0;
-
- mults_index = longhaul_table[table_index].index;
- /* Safety precautions */
- mult = mults[mults_index & 0x1f];
- if (mult == -1)
- return;
- speed = calc_speed(mult);
- if ((speed > highest_speed) || (speed < lowest_speed))
- return;
- /* Voltage transition before frequency transition? */
- if (can_scale_voltage && longhaul_index < table_index)
- dir = 1;
-
- freqs.old = calc_speed(longhaul_get_cpu_mult());
- freqs.new = speed;
- freqs.cpu = 0; /* longhaul.c is UP only driver */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
- fsb, mult/10, mult%10, print_speed(speed/1000));
-retry_loop:
- preempt_disable();
- local_irq_save(flags);
-
- pic2_mask = inb(0xA1);
- pic1_mask = inb(0x21); /* works on C3. save mask. */
- outb(0xFF, 0xA1); /* Overkill */
- outb(0xFE, 0x21); /* TMR0 only */
-
- /* Wait while PCI bus is busy. */
- if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
- || ((pr != NULL) && pr->flags.bm_control))) {
- bm_status = inw(acpi_regs_addr);
- bm_status &= 1 << 4;
- while (bm_status && bm_timeout) {
- outw(1 << 4, acpi_regs_addr);
- bm_timeout--;
- bm_status = inw(acpi_regs_addr);
- bm_status &= 1 << 4;
- }
- }
-
- if (longhaul_flags & USE_NORTHBRIDGE) {
- /* Disable AGP and PCI arbiters */
- outb(3, 0x22);
- } else if ((pr != NULL) && pr->flags.bm_control) {
- /* Disable bus master arbitration */
- acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
- }
- switch (longhaul_version) {
-
- /*
- * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
- * Software controlled multipliers only.
- */
- case TYPE_LONGHAUL_V1:
- do_longhaul1(mults_index);
- break;
-
- /*
- * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
- *
- * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
- * Nehemiah can do FSB scaling too, but this has never been proven
- * to work in practice.
- */
- case TYPE_LONGHAUL_V2:
- case TYPE_POWERSAVER:
- if (longhaul_flags & USE_ACPI_C3) {
- /* Don't allow wakeup */
- acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
- do_powersaver(cx->address, mults_index, dir);
- } else {
- do_powersaver(0, mults_index, dir);
- }
- break;
- }
-
- if (longhaul_flags & USE_NORTHBRIDGE) {
- /* Enable arbiters */
- outb(0, 0x22);
- } else if ((pr != NULL) && pr->flags.bm_control) {
- /* Enable bus master arbitration */
- acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
- }
- outb(pic2_mask, 0xA1); /* restore mask */
- outb(pic1_mask, 0x21);
-
- local_irq_restore(flags);
- preempt_enable();
-
- freqs.new = calc_speed(longhaul_get_cpu_mult());
- /* Check if requested frequency is set. */
- if (unlikely(freqs.new != speed)) {
- printk(KERN_INFO PFX "Failed to set requested frequency!\n");
- /* Revision ID = 1 but processor is expecting revision key
- * equal to 0. Jumpers at the bottom of processor will change
- * multiplier and FSB, but will not change bits in Longhaul
- * MSR nor enable voltage scaling. */
- if (!revid_errata) {
- printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" "
- "option.\n");
- revid_errata = 1;
- msleep(200);
- goto retry_loop;
- }
- /* Why ACPI C3 sometimes doesn't work is a mystery for me.
- * But it does happen. Processor is entering ACPI C3 state,
- * but it doesn't change frequency. I tried poking various
- * bits in northbridge registers, but without success. */
- if (longhaul_flags & USE_ACPI_C3) {
- printk(KERN_INFO PFX "Disabling ACPI C3 support.\n");
- longhaul_flags &= ~USE_ACPI_C3;
- if (revid_errata) {
- printk(KERN_INFO PFX "Disabling \"Ignore "
- "Revision ID\" option.\n");
- revid_errata = 0;
- }
- msleep(200);
- goto retry_loop;
- }
- /* This shouldn't happen. Longhaul ver. 2 was reported not
- * working on processors without voltage scaling, but with
- * RevID = 1. RevID errata will make things right. Just
- * to be 100% sure. */
- if (longhaul_version == TYPE_LONGHAUL_V2) {
- printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n");
- longhaul_version = TYPE_LONGHAUL_V1;
- msleep(200);
- goto retry_loop;
- }
- }
- /* Report true CPU frequency */
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- if (!bm_timeout)
- printk(KERN_INFO PFX "Warning: Timeout while waiting for "
- "idle PCI bus.\n");
-}
-
-/*
- * Centaur decided to make life a little more tricky.
- * Only longhaul v1 is allowed to read EBLCR BSEL[0:1].
- * Samuel2 and above have to try and guess what the FSB is.
- * We do this by assuming we booted at maximum multiplier, and interpolate
- * between that value multiplied by possible FSBs and cpu_mhz which
- * was calculated at boot time. Really ugly, but no other way to do this.
- */
-
-#define ROUNDING 0xf
-
-static int guess_fsb(int mult)
-{
- int speed = cpu_khz / 1000;
- int i;
- int speeds[] = { 666, 1000, 1333, 2000 };
- int f_max, f_min;
-
- for (i = 0; i < 4; i++) {
- f_max = ((speeds[i] * mult) + 50) / 100;
- f_max += (ROUNDING / 2);
- f_min = f_max - ROUNDING;
- if ((speed <= f_max) && (speed >= f_min))
- return speeds[i] / 10;
- }
- return 0;
-}
-
-
-static int __cpuinit longhaul_get_ranges(void)
-{
- unsigned int i, j, k = 0;
- unsigned int ratio;
- int mult;
-
- /* Get current frequency */
- mult = longhaul_get_cpu_mult();
- if (mult == -1) {
- printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
- return -EINVAL;
- }
- fsb = guess_fsb(mult);
- if (fsb == 0) {
- printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
- return -EINVAL;
- }
- /* Get max multiplier - as we always did.
- * Longhaul MSR is useful only when voltage scaling is enabled.
- * C3 is booting at max anyway. */
- maxmult = mult;
- /* Get min multiplier */
- switch (cpu_model) {
- case CPU_NEHEMIAH:
- minmult = 50;
- break;
- case CPU_NEHEMIAH_C:
- minmult = 40;
- break;
- default:
- minmult = 30;
- break;
- }
-
- dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n",
- minmult/10, minmult%10, maxmult/10, maxmult%10);
-
- highest_speed = calc_speed(maxmult);
- lowest_speed = calc_speed(minmult);
- dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
- print_speed(lowest_speed/1000),
- print_speed(highest_speed/1000));
-
- if (lowest_speed == highest_speed) {
- printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n");
- return -EINVAL;
- }
- if (lowest_speed > highest_speed) {
- printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
- lowest_speed, highest_speed);
- return -EINVAL;
- }
-
- longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table),
- GFP_KERNEL);
- if (!longhaul_table)
- return -ENOMEM;
-
- for (j = 0; j < numscales; j++) {
- ratio = mults[j];
- if (ratio == -1)
- continue;
- if (ratio > maxmult || ratio < minmult)
- continue;
- longhaul_table[k].frequency = calc_speed(ratio);
- longhaul_table[k].index = j;
- k++;
- }
- if (k <= 1) {
- kfree(longhaul_table);
- return -ENODEV;
- }
- /* Sort */
- for (j = 0; j < k - 1; j++) {
- unsigned int min_f, min_i;
- min_f = longhaul_table[j].frequency;
- min_i = j;
- for (i = j + 1; i < k; i++) {
- if (longhaul_table[i].frequency < min_f) {
- min_f = longhaul_table[i].frequency;
- min_i = i;
- }
- }
- if (min_i != j) {
- swap(longhaul_table[j].frequency,
- longhaul_table[min_i].frequency);
- swap(longhaul_table[j].index,
- longhaul_table[min_i].index);
- }
- }
-
- longhaul_table[k].frequency = CPUFREQ_TABLE_END;
-
- /* Find index we are running on */
- for (j = 0; j < k; j++) {
- if (mults[longhaul_table[j].index & 0x1f] == mult) {
- longhaul_index = j;
- break;
- }
- }
- return 0;
-}
-
-
-static void __cpuinit longhaul_setup_voltagescaling(void)
-{
- union msr_longhaul longhaul;
- struct mV_pos minvid, maxvid, vid;
- unsigned int j, speed, pos, kHz_step, numvscales;
- int min_vid_speed;
-
- rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- if (!(longhaul.bits.RevisionID & 1)) {
- printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n");
- return;
- }
-
- if (!longhaul.bits.VRMRev) {
- printk(KERN_INFO PFX "VRM 8.5\n");
- vrm_mV_table = &vrm85_mV[0];
- mV_vrm_table = &mV_vrm85[0];
- } else {
- printk(KERN_INFO PFX "Mobile VRM\n");
- if (cpu_model < CPU_NEHEMIAH)
- return;
- vrm_mV_table = &mobilevrm_mV[0];
- mV_vrm_table = &mV_mobilevrm[0];
- }
-
- minvid = vrm_mV_table[longhaul.bits.MinimumVID];
- maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
-
- if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
- printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
- "Voltage scaling disabled.\n",
- minvid.mV/1000, minvid.mV%1000,
- maxvid.mV/1000, maxvid.mV%1000);
- return;
- }
-
- if (minvid.mV == maxvid.mV) {
- printk(KERN_INFO PFX "Claims to support voltage scaling but "
- "min & max are both %d.%03d. "
- "Voltage scaling disabled\n",
- maxvid.mV/1000, maxvid.mV%1000);
- return;
- }
-
- /* How many voltage steps*/
- numvscales = maxvid.pos - minvid.pos + 1;
- printk(KERN_INFO PFX
- "Max VID=%d.%03d "
- "Min VID=%d.%03d, "
- "%d possible voltage scales\n",
- maxvid.mV/1000, maxvid.mV%1000,
- minvid.mV/1000, minvid.mV%1000,
- numvscales);
-
- /* Calculate max frequency at min voltage */
- j = longhaul.bits.MinMHzBR;
- if (longhaul.bits.MinMHzBR4)
- j += 16;
- min_vid_speed = eblcr[j];
- if (min_vid_speed == -1)
- return;
- switch (longhaul.bits.MinMHzFSB) {
- case 0:
- min_vid_speed *= 13333;
- break;
- case 1:
- min_vid_speed *= 10000;
- break;
- case 3:
- min_vid_speed *= 6666;
- break;
- default:
- return;
- break;
- }
- if (min_vid_speed >= highest_speed)
- return;
- /* Calculate kHz for one voltage step */
- kHz_step = (highest_speed - min_vid_speed) / numvscales;
-
- j = 0;
- while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
- speed = longhaul_table[j].frequency;
- if (speed > min_vid_speed)
- pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
- else
- pos = minvid.pos;
- longhaul_table[j].index |= mV_vrm_table[pos] << 8;
- vid = vrm_mV_table[mV_vrm_table[pos]];
- printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n",
- speed, j, vid.mV);
- j++;
- }
-
- can_scale_voltage = 1;
- printk(KERN_INFO PFX "Voltage scaling enabled.\n");
-}
-
-
-static int longhaul_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, longhaul_table);
-}
-
-
-static int longhaul_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
- unsigned int table_index = 0;
- unsigned int i;
- unsigned int dir = 0;
- u8 vid, current_vid;
-
- if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq,
- relation, &table_index))
- return -EINVAL;
-
- /* Don't set same frequency again */
- if (longhaul_index == table_index)
- return 0;
-
- if (!can_scale_voltage)
- longhaul_setstate(table_index);
- else {
- /* On test system voltage transitions exceeding single
- * step up or down were turning motherboard off. Both
- * "ondemand" and "userspace" are unsafe. C7 is doing
- * this in hardware, C3 is old and we need to do this
- * in software. */
- i = longhaul_index;
- current_vid = (longhaul_table[longhaul_index].index >> 8);
- current_vid &= 0x1f;
- if (table_index > longhaul_index)
- dir = 1;
- while (i != table_index) {
- vid = (longhaul_table[i].index >> 8) & 0x1f;
- if (vid != current_vid) {
- longhaul_setstate(i);
- current_vid = vid;
- msleep(200);
- }
- if (dir)
- i++;
- else
- i--;
- }
- longhaul_setstate(table_index);
- }
- longhaul_index = table_index;
- return 0;
-}
-
-
-static unsigned int longhaul_get(unsigned int cpu)
-{
- if (cpu)
- return 0;
- return calc_speed(longhaul_get_cpu_mult());
-}
-
-static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
- u32 nesting_level,
- void *context, void **return_value)
-{
- struct acpi_device *d;
-
- if (acpi_bus_get_device(obj_handle, &d))
- return 0;
-
- *return_value = acpi_driver_data(d);
- return 1;
-}
-
-/* VIA don't support PM2 reg, but have something similar */
-static int enable_arbiter_disable(void)
-{
- struct pci_dev *dev;
- int status = 1;
- int reg;
- u8 pci_cmd;
-
- /* Find PLE133 host bridge */
- reg = 0x78;
- dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0,
- NULL);
- /* Find PM133/VT8605 host bridge */
- if (dev == NULL)
- dev = pci_get_device(PCI_VENDOR_ID_VIA,
- PCI_DEVICE_ID_VIA_8605_0, NULL);
- /* Find CLE266 host bridge */
- if (dev == NULL) {
- reg = 0x76;
- dev = pci_get_device(PCI_VENDOR_ID_VIA,
- PCI_DEVICE_ID_VIA_862X_0, NULL);
- /* Find CN400 V-Link host bridge */
- if (dev == NULL)
- dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);
- }
- if (dev != NULL) {
- /* Enable access to port 0x22 */
- pci_read_config_byte(dev, reg, &pci_cmd);
- if (!(pci_cmd & 1<<7)) {
- pci_cmd |= 1<<7;
- pci_write_config_byte(dev, reg, pci_cmd);
- pci_read_config_byte(dev, reg, &pci_cmd);
- if (!(pci_cmd & 1<<7)) {
- printk(KERN_ERR PFX
- "Can't enable access to port 0x22.\n");
- status = 0;
- }
- }
- pci_dev_put(dev);
- return status;
- }
- return 0;
-}
-
-static int longhaul_setup_southbridge(void)
-{
- struct pci_dev *dev;
- u8 pci_cmd;
-
- /* Find VT8235 southbridge */
- dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
- if (dev == NULL)
- /* Find VT8237 southbridge */
- dev = pci_get_device(PCI_VENDOR_ID_VIA,
- PCI_DEVICE_ID_VIA_8237, NULL);
- if (dev != NULL) {
- /* Set transition time to max */
- pci_read_config_byte(dev, 0xec, &pci_cmd);
- pci_cmd &= ~(1 << 2);
- pci_write_config_byte(dev, 0xec, pci_cmd);
- pci_read_config_byte(dev, 0xe4, &pci_cmd);
- pci_cmd &= ~(1 << 7);
- pci_write_config_byte(dev, 0xe4, pci_cmd);
- pci_read_config_byte(dev, 0xe5, &pci_cmd);
- pci_cmd |= 1 << 7;
- pci_write_config_byte(dev, 0xe5, pci_cmd);
- /* Get address of ACPI registers block*/
- pci_read_config_byte(dev, 0x81, &pci_cmd);
- if (pci_cmd & 1 << 7) {
- pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
- acpi_regs_addr &= 0xff00;
- printk(KERN_INFO PFX "ACPI I/O at 0x%x\n",
- acpi_regs_addr);
- }
-
- pci_dev_put(dev);
- return 1;
- }
- return 0;
-}
-
-static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- char *cpuname = NULL;
- int ret;
- u32 lo, hi;
-
- /* Check what we have on this motherboard */
- switch (c->x86_model) {
- case 6:
- cpu_model = CPU_SAMUEL;
- cpuname = "C3 'Samuel' [C5A]";
- longhaul_version = TYPE_LONGHAUL_V1;
- memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
- memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr));
- break;
-
- case 7:
- switch (c->x86_mask) {
- case 0:
- longhaul_version = TYPE_LONGHAUL_V1;
- cpu_model = CPU_SAMUEL2;
- cpuname = "C3 'Samuel 2' [C5B]";
- /* Note, this is not a typo, early Samuel2's had
- * Samuel1 ratios. */
- memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
- memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
- break;
- case 1 ... 15:
- longhaul_version = TYPE_LONGHAUL_V2;
- if (c->x86_mask < 8) {
- cpu_model = CPU_SAMUEL2;
- cpuname = "C3 'Samuel 2' [C5B]";
- } else {
- cpu_model = CPU_EZRA;
- cpuname = "C3 'Ezra' [C5C]";
- }
- memcpy(mults, ezra_mults, sizeof(ezra_mults));
- memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr));
- break;
- }
- break;
-
- case 8:
- cpu_model = CPU_EZRA_T;
- cpuname = "C3 'Ezra-T' [C5M]";
- longhaul_version = TYPE_POWERSAVER;
- numscales = 32;
- memcpy(mults, ezrat_mults, sizeof(ezrat_mults));
- memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr));
- break;
-
- case 9:
- longhaul_version = TYPE_POWERSAVER;
- numscales = 32;
- memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults));
- memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr));
- switch (c->x86_mask) {
- case 0 ... 1:
- cpu_model = CPU_NEHEMIAH;
- cpuname = "C3 'Nehemiah A' [C5XLOE]";
- break;
- case 2 ... 4:
- cpu_model = CPU_NEHEMIAH;
- cpuname = "C3 'Nehemiah B' [C5XLOH]";
- break;
- case 5 ... 15:
- cpu_model = CPU_NEHEMIAH_C;
- cpuname = "C3 'Nehemiah C' [C5P]";
- break;
- }
- break;
-
- default:
- cpuname = "Unknown";
- break;
- }
- /* Check Longhaul ver. 2 */
- if (longhaul_version == TYPE_LONGHAUL_V2) {
- rdmsr(MSR_VIA_LONGHAUL, lo, hi);
- if (lo == 0 && hi == 0)
- /* Looks like MSR isn't present */
- longhaul_version = TYPE_LONGHAUL_V1;
- }
-
- printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
- switch (longhaul_version) {
- case TYPE_LONGHAUL_V1:
- case TYPE_LONGHAUL_V2:
- printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version);
- break;
- case TYPE_POWERSAVER:
- printk(KERN_CONT "Powersaver supported.\n");
- break;
- };
-
- /* Doesn't hurt */
- longhaul_setup_southbridge();
-
- /* Find ACPI data for processor */
- acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
- ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
- NULL, (void *)&pr);
-
- /* Check ACPI support for C3 state */
- if (pr != NULL && longhaul_version == TYPE_POWERSAVER) {
- cx = &pr->power.states[ACPI_STATE_C3];
- if (cx->address > 0 && cx->latency <= 1000)
- longhaul_flags |= USE_ACPI_C3;
- }
- /* Disable if it isn't working */
- if (disable_acpi_c3)
- longhaul_flags &= ~USE_ACPI_C3;
- /* Check if northbridge is friendly */
- if (enable_arbiter_disable())
- longhaul_flags |= USE_NORTHBRIDGE;
-
- /* Check ACPI support for bus master arbiter disable */
- if (!(longhaul_flags & USE_ACPI_C3
- || longhaul_flags & USE_NORTHBRIDGE)
- && ((pr == NULL) || !(pr->flags.bm_control))) {
- printk(KERN_ERR PFX
- "No ACPI support. Unsupported northbridge.\n");
- return -ENODEV;
- }
-
- if (longhaul_flags & USE_NORTHBRIDGE)
- printk(KERN_INFO PFX "Using northbridge support.\n");
- if (longhaul_flags & USE_ACPI_C3)
- printk(KERN_INFO PFX "Using ACPI support.\n");
-
- ret = longhaul_get_ranges();
- if (ret != 0)
- return ret;
-
- if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
- longhaul_setup_voltagescaling();
-
- policy->cpuinfo.transition_latency = 200000; /* nsec */
- policy->cur = calc_speed(longhaul_get_cpu_mult());
-
- ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table);
- if (ret)
- return ret;
-
- cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
-
- return 0;
-}
-
-static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static struct freq_attr *longhaul_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver longhaul_driver = {
- .verify = longhaul_verify,
- .target = longhaul_target,
- .get = longhaul_get,
- .init = longhaul_cpu_init,
- .exit = __devexit_p(longhaul_cpu_exit),
- .name = "longhaul",
- .owner = THIS_MODULE,
- .attr = longhaul_attr,
-};
-
-
-static int __init longhaul_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6)
- return -ENODEV;
-
-#ifdef CONFIG_SMP
- if (num_online_cpus() > 1) {
- printk(KERN_ERR PFX "More than 1 CPU detected, "
- "longhaul disabled.\n");
- return -ENODEV;
- }
-#endif
-#ifdef CONFIG_X86_IO_APIC
- if (cpu_has_apic) {
- printk(KERN_ERR PFX "APIC detected. Longhaul is currently "
- "broken in this configuration.\n");
- return -ENODEV;
- }
-#endif
- switch (c->x86_model) {
- case 6 ... 9:
- return cpufreq_register_driver(&longhaul_driver);
- case 10:
- printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
- default:
- ;
- }
-
- return -ENODEV;
-}
-
-
-static void __exit longhaul_exit(void)
-{
- int i;
-
- for (i = 0; i < numscales; i++) {
- if (mults[i] == maxmult) {
- longhaul_setstate(i);
- break;
- }
- }
-
- cpufreq_unregister_driver(&longhaul_driver);
- kfree(longhaul_table);
-}
-
-/* Even if BIOS is exporting ACPI C3 state, and it is used
- * with success when CPU is idle, this state doesn't
- * trigger frequency transition in some cases. */
-module_param(disable_acpi_c3, int, 0644);
-MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
-/* Change CPU voltage with frequency. Very useful to save
- * power, but most VIA C3 processors aren't supporting it. */
-module_param(scale_voltage, int, 0644);
-MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
-/* Force revision key to 0 for processors which doesn't
- * support voltage scaling, but are introducing itself as
- * such. */
-module_param(revid_errata, int, 0644);
-MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
-MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors.");
-MODULE_LICENSE("GPL");
-
-late_initcall(longhaul_init);
-module_exit(longhaul_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
deleted file mode 100644
index cbf48fbca88..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ /dev/null
@@ -1,353 +0,0 @@
-/*
- * longhaul.h
- * (C) 2003 Dave Jones.
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * VIA-specific information
- */
-
-union msr_bcr2 {
- struct {
- unsigned Reseved:19, // 18:0
- ESOFTBF:1, // 19
- Reserved2:3, // 22:20
- CLOCKMUL:4, // 26:23
- Reserved3:5; // 31:27
- } bits;
- unsigned long val;
-};
-
-union msr_longhaul {
- struct {
- unsigned RevisionID:4, // 3:0
- RevisionKey:4, // 7:4
- EnableSoftBusRatio:1, // 8
- EnableSoftVID:1, // 9
- EnableSoftBSEL:1, // 10
- Reserved:3, // 11:13
- SoftBusRatio4:1, // 14
- VRMRev:1, // 15
- SoftBusRatio:4, // 19:16
- SoftVID:5, // 24:20
- Reserved2:3, // 27:25
- SoftBSEL:2, // 29:28
- Reserved3:2, // 31:30
- MaxMHzBR:4, // 35:32
- MaximumVID:5, // 40:36
- MaxMHzFSB:2, // 42:41
- MaxMHzBR4:1, // 43
- Reserved4:4, // 47:44
- MinMHzBR:4, // 51:48
- MinimumVID:5, // 56:52
- MinMHzFSB:2, // 58:57
- MinMHzBR4:1, // 59
- Reserved5:4; // 63:60
- } bits;
- unsigned long long val;
-};
-
-/*
- * Clock ratio tables. Div/Mod by 10 to get ratio.
- * The eblcr values specify the ratio read from the CPU.
- * The mults values specify what to write to the CPU.
- */
-
-/*
- * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
- */
-static const int __cpuinitdata samuel1_mults[16] = {
- -1, /* 0000 -> RESERVED */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- -1, /* 0011 -> RESERVED */
- -1, /* 0100 -> RESERVED */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- -1, /* 1110 -> RESERVED */
- -1, /* 1111 -> RESERVED */
-};
-
-static const int __cpuinitdata samuel1_eblcr[16] = {
- 50, /* 0000 -> RESERVED */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- -1, /* 0011 -> RESERVED */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- -1, /* 0111 -> RESERVED */
- -1, /* 1000 -> RESERVED */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- -1, /* 1100 -> RESERVED */
- 75, /* 1101 -> 7.5x */
- -1, /* 1110 -> RESERVED */
- 65, /* 1111 -> 6.5x */
-};
-
-/*
- * VIA C3 Samuel2 Stepping 1->15
- */
-static const int __cpuinitdata samuel2_eblcr[16] = {
- 50, /* 0000 -> 5.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 110, /* 0111 -> 11.0x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 130, /* 1110 -> 13.0x */
- 65, /* 1111 -> 6.5x */
-};
-
-/*
- * VIA C3 Ezra
- */
-static const int __cpuinitdata ezra_mults[16] = {
- 100, /* 0000 -> 10.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 90, /* 0011 -> 9.0x */
- 95, /* 0100 -> 9.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 120, /* 1111 -> 12.0x */
-};
-
-static const int __cpuinitdata ezra_eblcr[16] = {
- 50, /* 0000 -> 5.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 95, /* 0111 -> 9.5x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 65, /* 1111 -> 6.5x */
-};
-
-/*
- * VIA C3 (Ezra-T) [C5M].
- */
-static const int __cpuinitdata ezrat_mults[32] = {
- 100, /* 0000 -> 10.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 90, /* 0011 -> 9.0x */
- 95, /* 0100 -> 9.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 120, /* 1111 -> 12.0x */
-
- -1, /* 0000 -> RESERVED (10.0x) */
- 110, /* 0001 -> 11.0x */
- -1, /* 0010 -> 12.0x */
- -1, /* 0011 -> RESERVED (9.0x)*/
- 105, /* 0100 -> 10.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 135, /* 0111 -> 13.5x */
- 140, /* 1000 -> 14.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 130, /* 1011 -> 13.0x */
- 145, /* 1100 -> 14.5x */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- -1, /* 1111 -> RESERVED (12.0x) */
-};
-
-static const int __cpuinitdata ezrat_eblcr[32] = {
- 50, /* 0000 -> 5.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 95, /* 0111 -> 9.5x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 65, /* 1111 -> 6.5x */
-
- -1, /* 0000 -> RESERVED (9.0x) */
- 110, /* 0001 -> 11.0x */
- 120, /* 0010 -> 12.0x */
- -1, /* 0011 -> RESERVED (10.0x)*/
- 135, /* 0100 -> 13.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 105, /* 0111 -> 10.5x */
- 130, /* 1000 -> 13.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 140, /* 1011 -> 14.0x */
- -1, /* 1100 -> RESERVED (12.0x) */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- 145, /* 1111 -> 14.5x */
-};
-
-/*
- * VIA C3 Nehemiah */
-
-static const int __cpuinitdata nehemiah_mults[32] = {
- 100, /* 0000 -> 10.0x */
- -1, /* 0001 -> 16.0x */
- 40, /* 0010 -> 4.0x */
- 90, /* 0011 -> 9.0x */
- 95, /* 0100 -> 9.5x */
- -1, /* 0101 -> RESERVED */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 120, /* 1111 -> 12.0x */
- -1, /* 0000 -> 10.0x */
- 110, /* 0001 -> 11.0x */
- -1, /* 0010 -> 12.0x */
- -1, /* 0011 -> 9.0x */
- 105, /* 0100 -> 10.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 135, /* 0111 -> 13.5x */
- 140, /* 1000 -> 14.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 130, /* 1011 -> 13.0x */
- 145, /* 1100 -> 14.5x */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- -1, /* 1111 -> 12.0x */
-};
-
-static const int __cpuinitdata nehemiah_eblcr[32] = {
- 50, /* 0000 -> 5.0x */
- 160, /* 0001 -> 16.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- -1, /* 0101 -> RESERVED */
- 45, /* 0110 -> 4.5x */
- 95, /* 0111 -> 9.5x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 65, /* 1111 -> 6.5x */
- 90, /* 0000 -> 9.0x */
- 110, /* 0001 -> 11.0x */
- 120, /* 0010 -> 12.0x */
- 100, /* 0011 -> 10.0x */
- 135, /* 0100 -> 13.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 105, /* 0111 -> 10.5x */
- 130, /* 1000 -> 13.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 140, /* 1011 -> 14.0x */
- 120, /* 1100 -> 12.0x */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- 145 /* 1111 -> 14.5x */
-};
-
-/*
- * Voltage scales. Div/Mod by 1000 to get actual voltage.
- * Which scale to use depends on the VRM type in use.
- */
-
-struct mV_pos {
- unsigned short mV;
- unsigned short pos;
-};
-
-static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
- {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
- {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
- {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
- {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10},
- {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3},
- {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27},
- {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19},
- {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
-};
-
-static const unsigned char __cpuinitdata mV_vrm85[32] = {
- 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
- 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
- 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
- 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
-};
-
-static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
- {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
- {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
- {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
- {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16},
- {975, 15}, {950, 14}, {925, 13}, {900, 12},
- {875, 11}, {850, 10}, {825, 9}, {800, 8},
- {775, 7}, {750, 6}, {725, 5}, {700, 4},
- {675, 3}, {650, 2}, {625, 1}, {600, 0}
-};
-
-static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
- 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
- 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
- 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
- 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
-};
-
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
deleted file mode 100644
index d9f51367666..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/timex.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "longrun", msg)
-
-static struct cpufreq_driver longrun_driver;
-
-/**
- * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz
- * values into per cent values. In TMTA microcode, the following is valid:
- * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
- */
-static unsigned int longrun_low_freq, longrun_high_freq;
-
-
-/**
- * longrun_get_policy - get the current LongRun policy
- * @policy: struct cpufreq_policy where current policy is written into
- *
- * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
- * and MSR_TMTA_LONGRUN_CTRL
- */
-static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy)
-{
- u32 msr_lo, msr_hi;
-
- rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
- dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi);
- if (msr_lo & 0x01)
- policy->policy = CPUFREQ_POLICY_PERFORMANCE;
- else
- policy->policy = CPUFREQ_POLICY_POWERSAVE;
-
- rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
- dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi);
- msr_lo &= 0x0000007F;
- msr_hi &= 0x0000007F;
-
- if (longrun_high_freq <= longrun_low_freq) {
- /* Assume degenerate Longrun table */
- policy->min = policy->max = longrun_high_freq;
- } else {
- policy->min = longrun_low_freq + msr_lo *
- ((longrun_high_freq - longrun_low_freq) / 100);
- policy->max = longrun_low_freq + msr_hi *
- ((longrun_high_freq - longrun_low_freq) / 100);
- }
- policy->cpu = 0;
-}
-
-
-/**
- * longrun_set_policy - sets a new CPUFreq policy
- * @policy: new policy
- *
- * Sets a new CPUFreq policy on LongRun-capable processors. This function
- * has to be called with cpufreq_driver locked.
- */
-static int longrun_set_policy(struct cpufreq_policy *policy)
-{
- u32 msr_lo, msr_hi;
- u32 pctg_lo, pctg_hi;
-
- if (!policy)
- return -EINVAL;
-
- if (longrun_high_freq <= longrun_low_freq) {
- /* Assume degenerate Longrun table */
- pctg_lo = pctg_hi = 100;
- } else {
- pctg_lo = (policy->min - longrun_low_freq) /
- ((longrun_high_freq - longrun_low_freq) / 100);
- pctg_hi = (policy->max - longrun_low_freq) /
- ((longrun_high_freq - longrun_low_freq) / 100);
- }
-
- if (pctg_hi > 100)
- pctg_hi = 100;
- if (pctg_lo > pctg_hi)
- pctg_lo = pctg_hi;
-
- /* performance or economy mode */
- rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
- msr_lo &= 0xFFFFFFFE;
- switch (policy->policy) {
- case CPUFREQ_POLICY_PERFORMANCE:
- msr_lo |= 0x00000001;
- break;
- case CPUFREQ_POLICY_POWERSAVE:
- break;
- }
- wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
-
- /* lower and upper boundary */
- rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
- msr_lo &= 0xFFFFFF80;
- msr_hi &= 0xFFFFFF80;
- msr_lo |= pctg_lo;
- msr_hi |= pctg_hi;
- wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
-
- return 0;
-}
-
-
-/**
- * longrun_verify_poliy - verifies a new CPUFreq policy
- * @policy: the policy to verify
- *
- * Validates a new CPUFreq policy. This function has to be called with
- * cpufreq_driver locked.
- */
-static int longrun_verify_policy(struct cpufreq_policy *policy)
-{
- if (!policy)
- return -EINVAL;
-
- policy->cpu = 0;
- cpufreq_verify_within_limits(policy,
- policy->cpuinfo.min_freq,
- policy->cpuinfo.max_freq);
-
- if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) &&
- (policy->policy != CPUFREQ_POLICY_PERFORMANCE))
- return -EINVAL;
-
- return 0;
-}
-
-static unsigned int longrun_get(unsigned int cpu)
-{
- u32 eax, ebx, ecx, edx;
-
- if (cpu)
- return 0;
-
- cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
- dprintk("cpuid eax is %u\n", eax);
-
- return eax * 1000;
-}
-
-/**
- * longrun_determine_freqs - determines the lowest and highest possible core frequency
- * @low_freq: an int to put the lowest frequency into
- * @high_freq: an int to put the highest frequency into
- *
- * Determines the lowest and highest possible core frequencies on this CPU.
- * This is necessary to calculate the performance percentage according to
- * TMTA rules:
- * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
- */
-static int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
- unsigned int *high_freq)
-{
- u32 msr_lo, msr_hi;
- u32 save_lo, save_hi;
- u32 eax, ebx, ecx, edx;
- u32 try_hi;
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if (!low_freq || !high_freq)
- return -EINVAL;
-
- if (cpu_has(c, X86_FEATURE_LRTI)) {
- /* if the LongRun Table Interface is present, the
- * detection is a bit easier:
- * For minimum frequency, read out the maximum
- * level (msr_hi), write that into "currently
- * selected level", and read out the frequency.
- * For maximum frequency, read out level zero.
- */
- /* minimum */
- rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi);
- wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi);
- rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
- *low_freq = msr_lo * 1000; /* to kHz */
-
- /* maximum */
- wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi);
- rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
- *high_freq = msr_lo * 1000; /* to kHz */
-
- dprintk("longrun table interface told %u - %u kHz\n",
- *low_freq, *high_freq);
-
- if (*low_freq > *high_freq)
- *low_freq = *high_freq;
- return 0;
- }
-
- /* set the upper border to the value determined during TSC init */
- *high_freq = (cpu_khz / 1000);
- *high_freq = *high_freq * 1000;
- dprintk("high frequency is %u kHz\n", *high_freq);
-
- /* get current borders */
- rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
- save_lo = msr_lo & 0x0000007F;
- save_hi = msr_hi & 0x0000007F;
-
- /* if current perf_pctg is larger than 90%, we need to decrease the
- * upper limit to make the calculation more accurate.
- */
- cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
- /* try decreasing in 10% steps, some processors react only
- * on some barrier values */
- for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) {
- /* set to 0 to try_hi perf_pctg */
- msr_lo &= 0xFFFFFF80;
- msr_hi &= 0xFFFFFF80;
- msr_hi |= try_hi;
- wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
-
- /* read out current core MHz and current perf_pctg */
- cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
-
- /* restore values */
- wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi);
- }
- dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax);
-
- /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
- * eqals
- * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
- *
- * high_freq * perf_pctg is stored tempoarily into "ebx".
- */
- ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */
-
- if ((ecx > 95) || (ecx == 0) || (eax < ebx))
- return -EIO;
-
- edx = ((eax - ebx) * 100) / (100 - ecx);
- *low_freq = edx * 1000; /* back to kHz */
-
- dprintk("low frequency is %u kHz\n", *low_freq);
-
- if (*low_freq > *high_freq)
- *low_freq = *high_freq;
-
- return 0;
-}
-
-
-static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
-{
- int result = 0;
-
- /* capability check */
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* detect low and high frequency */
- result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq);
- if (result)
- return result;
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.min_freq = longrun_low_freq;
- policy->cpuinfo.max_freq = longrun_high_freq;
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- longrun_get_policy(policy);
-
- return 0;
-}
-
-
-static struct cpufreq_driver longrun_driver = {
- .flags = CPUFREQ_CONST_LOOPS,
- .verify = longrun_verify_policy,
- .setpolicy = longrun_set_policy,
- .get = longrun_get,
- .init = longrun_cpu_init,
- .name = "longrun",
- .owner = THIS_MODULE,
-};
-
-
-/**
- * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver
- *
- * Initializes the LongRun support.
- */
-static int __init longrun_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if (c->x86_vendor != X86_VENDOR_TRANSMETA ||
- !cpu_has(c, X86_FEATURE_LONGRUN))
- return -ENODEV;
-
- return cpufreq_register_driver(&longrun_driver);
-}
-
-
-/**
- * longrun_exit - unregisters LongRun support
- */
-static void __exit longrun_exit(void)
-{
- cpufreq_unregister_driver(&longrun_driver);
-}
-
-
-MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and "
- "Efficeon processors.");
-MODULE_LICENSE("GPL");
-
-module_init(longrun_init);
-module_exit(longrun_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
deleted file mode 100644
index 911e193018a..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.c
+++ /dev/null
@@ -1,51 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-
-#include "mperf.h"
-
-static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
-
-/* Called via smp_call_function_single(), on the target CPU */
-static void read_measured_perf_ctrs(void *_cur)
-{
- struct aperfmperf *am = _cur;
-
- get_aperfmperf(am);
-}
-
-/*
- * Return the measured active (C0) frequency on this CPU since last call
- * to this function.
- * Input: cpu number
- * Return: Average CPU frequency in terms of max frequency (zero on error)
- *
- * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
- * over a period of time, while CPU is in C0 state.
- * IA32_MPERF counts at the rate of max advertised frequency
- * IA32_APERF counts at the rate of actual CPU frequency
- * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
- * no meaning should be associated with absolute values of these MSRs.
- */
-unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
- unsigned int cpu)
-{
- struct aperfmperf perf;
- unsigned long ratio;
- unsigned int retval;
-
- if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
- return 0;
-
- ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
- per_cpu(acfreq_old_perf, cpu) = perf;
-
- retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
deleted file mode 100644
index 5dbf2950dc2..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * (c) 2010 Advanced Micro Devices, Inc.
- * Your use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- */
-
-unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
- unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
deleted file mode 100644
index 52c93648e49..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Pentium 4/Xeon CPU on demand clock modulation/speed scaling
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
- * (C) 2002 Arjan van de Ven <arjanv@redhat.com>
- * (C) 2002 Tora T. Engstad
- * All Rights Reserved
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * The author(s) of this software shall not be held liable for damages
- * of any nature resulting due to the use of this software. This
- * software is provided AS-IS with no warranties.
- *
- * Date Errata Description
- * 20020525 N44, O17 12.5% or 25% DC causes lockup
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpufreq.h>
-#include <linux/cpumask.h>
-#include <linux/timex.h>
-
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/timer.h>
-
-#include "speedstep-lib.h"
-
-#define PFX "p4-clockmod: "
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "p4-clockmod", msg)
-
-/*
- * Duty Cycle (3bits), note DC_DISABLE is not specified in
- * intel docs i just use it to mean disable
- */
-enum {
- DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT,
- DC_64PT, DC_75PT, DC_88PT, DC_DISABLE
-};
-
-#define DC_ENTRIES 8
-
-
-static int has_N44_O17_errata[NR_CPUS];
-static unsigned int stock_freq;
-static struct cpufreq_driver p4clockmod_driver;
-static unsigned int cpufreq_p4_get(unsigned int cpu);
-
-static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
-{
- u32 l, h;
-
- if (!cpu_online(cpu) ||
- (newstate > DC_DISABLE) || (newstate == DC_RESV))
- return -EINVAL;
-
- rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
-
- if (l & 0x01)
- dprintk("CPU#%d currently thermal throttled\n", cpu);
-
- if (has_N44_O17_errata[cpu] &&
- (newstate == DC_25PT || newstate == DC_DFLT))
- newstate = DC_38PT;
-
- rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
- if (newstate == DC_DISABLE) {
- dprintk("CPU#%d disabling modulation\n", cpu);
- wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
- } else {
- dprintk("CPU#%d setting duty cycle to %d%%\n",
- cpu, ((125 * newstate) / 10));
- /* bits 63 - 5 : reserved
- * bit 4 : enable/disable
- * bits 3-1 : duty cycle
- * bit 0 : reserved
- */
- l = (l & ~14);
- l = l | (1<<4) | ((newstate & 0x7)<<1);
- wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
- }
-
- return 0;
-}
-
-
-static struct cpufreq_frequency_table p4clockmod_table[] = {
- {DC_RESV, CPUFREQ_ENTRY_INVALID},
- {DC_DFLT, 0},
- {DC_25PT, 0},
- {DC_38PT, 0},
- {DC_50PT, 0},
- {DC_64PT, 0},
- {DC_75PT, 0},
- {DC_88PT, 0},
- {DC_DISABLE, 0},
- {DC_RESV, CPUFREQ_TABLE_END},
-};
-
-
-static int cpufreq_p4_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = DC_RESV;
- struct cpufreq_freqs freqs;
- int i;
-
- if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- freqs.old = cpufreq_p4_get(policy->cpu);
- freqs.new = stock_freq * p4clockmod_table[newstate].index / 8;
-
- if (freqs.new == freqs.old)
- return 0;
-
- /* notifiers */
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- /* run on each logical CPU,
- * see section 13.15.3 of IA32 Intel Architecture Software
- * Developer's Manual, Volume 3
- */
- for_each_cpu(i, policy->cpus)
- cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
-
- /* notifiers */
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
-
- return 0;
-}
-
-
-static int cpufreq_p4_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]);
-}
-
-
-static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
-{
- if (c->x86 == 0x06) {
- if (cpu_has(c, X86_FEATURE_EST))
- printk_once(KERN_WARNING PFX "Warning: EST-capable "
- "CPU detected. The acpi-cpufreq module offers "
- "voltage scaling in addition to frequency "
- "scaling. You should use that instead of "
- "p4-clockmod, if possible.\n");
- switch (c->x86_model) {
- case 0x0E: /* Core */
- case 0x0F: /* Core Duo */
- case 0x16: /* Celeron Core */
- case 0x1C: /* Atom */
- p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
- return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
- case 0x0D: /* Pentium M (Dothan) */
- p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
- /* fall through */
- case 0x09: /* Pentium M (Banias) */
- return speedstep_get_frequency(SPEEDSTEP_CPU_PM);
- }
- }
-
- if (c->x86 != 0xF)
- return 0;
-
- /* on P-4s, the TSC runs with constant frequency independent whether
- * throttling is active or not. */
- p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
-
- if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) {
- printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
- "The speedstep-ich or acpi cpufreq modules offer "
- "voltage scaling in addition of frequency scaling. "
- "You should use either one instead of p4-clockmod, "
- "if possible.\n");
- return speedstep_get_frequency(SPEEDSTEP_CPU_P4M);
- }
-
- return speedstep_get_frequency(SPEEDSTEP_CPU_P4D);
-}
-
-
-
-static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
- int cpuid = 0;
- unsigned int i;
-
-#ifdef CONFIG_SMP
- cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
-#endif
-
- /* Errata workaround */
- cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;
- switch (cpuid) {
- case 0x0f07:
- case 0x0f0a:
- case 0x0f11:
- case 0x0f12:
- has_N44_O17_errata[policy->cpu] = 1;
- dprintk("has errata -- disabling low frequencies\n");
- }
-
- if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D &&
- c->x86_model < 2) {
- /* switch to maximum frequency and measure result */
- cpufreq_p4_setdc(policy->cpu, DC_DISABLE);
- recalibrate_cpu_khz();
- }
- /* get max frequency */
- stock_freq = cpufreq_p4_get_frequency(c);
- if (!stock_freq)
- return -EINVAL;
-
- /* table init */
- for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
- if ((i < 2) && (has_N44_O17_errata[policy->cpu]))
- p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
- else
- p4clockmod_table[i].frequency = (stock_freq * i)/8;
- }
- cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
-
- /* cpuinfo and default policy values */
-
- /* the transition latency is set to be 1 higher than the maximum
- * transition latency of the ondemand governor */
- policy->cpuinfo.transition_latency = 10000001;
- policy->cur = stock_freq;
-
- return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
-}
-
-
-static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static unsigned int cpufreq_p4_get(unsigned int cpu)
-{
- u32 l, h;
-
- rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
-
- if (l & 0x10) {
- l = l >> 1;
- l &= 0x7;
- } else
- l = DC_DISABLE;
-
- if (l != DC_DISABLE)
- return stock_freq * l / 8;
-
- return stock_freq;
-}
-
-static struct freq_attr *p4clockmod_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver p4clockmod_driver = {
- .verify = cpufreq_p4_verify,
- .target = cpufreq_p4_target,
- .init = cpufreq_p4_cpu_init,
- .exit = cpufreq_p4_cpu_exit,
- .get = cpufreq_p4_get,
- .name = "p4-clockmod",
- .owner = THIS_MODULE,
- .attr = p4clockmod_attr,
-};
-
-
-static int __init cpufreq_p4_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- int ret;
-
- /*
- * THERM_CONTROL is architectural for IA32 now, so
- * we can rely on the capability checks
- */
- if (c->x86_vendor != X86_VENDOR_INTEL)
- return -ENODEV;
-
- if (!test_cpu_cap(c, X86_FEATURE_ACPI) ||
- !test_cpu_cap(c, X86_FEATURE_ACC))
- return -ENODEV;
-
- ret = cpufreq_register_driver(&p4clockmod_driver);
- if (!ret)
- printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock "
- "Modulation available\n");
-
- return ret;
-}
-
-
-static void __exit cpufreq_p4_exit(void)
-{
- cpufreq_unregister_driver(&p4clockmod_driver);
-}
-
-
-MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>");
-MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
-MODULE_LICENSE("GPL");
-
-late_initcall(cpufreq_p4_init);
-module_exit(cpufreq_p4_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
deleted file mode 100644
index 755a31e0f5b..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ /dev/null
@@ -1,624 +0,0 @@
-/*
- * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
- *
- * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
- * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
- * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
- * INFRINGEMENT. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/cpufreq.h>
-#include <linux/compiler.h>
-#include <linux/slab.h>
-
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/spinlock.h>
-#include <linux/uaccess.h>
-
-#include <acpi/processor.h>
-
-#define PCC_VERSION "1.00.00"
-#define POLL_LOOPS 300
-
-#define CMD_COMPLETE 0x1
-#define CMD_GET_FREQ 0x0
-#define CMD_SET_FREQ 0x1
-
-#define BUF_SZ 4
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "pcc-cpufreq", msg)
-
-struct pcc_register_resource {
- u8 descriptor;
- u16 length;
- u8 space_id;
- u8 bit_width;
- u8 bit_offset;
- u8 access_size;
- u64 address;
-} __attribute__ ((packed));
-
-struct pcc_memory_resource {
- u8 descriptor;
- u16 length;
- u8 space_id;
- u8 resource_usage;
- u8 type_specific;
- u64 granularity;
- u64 minimum;
- u64 maximum;
- u64 translation_offset;
- u64 address_length;
-} __attribute__ ((packed));
-
-static struct cpufreq_driver pcc_cpufreq_driver;
-
-struct pcc_header {
- u32 signature;
- u16 length;
- u8 major;
- u8 minor;
- u32 features;
- u16 command;
- u16 status;
- u32 latency;
- u32 minimum_time;
- u32 maximum_time;
- u32 nominal;
- u32 throttled_frequency;
- u32 minimum_frequency;
-};
-
-static void __iomem *pcch_virt_addr;
-static struct pcc_header __iomem *pcch_hdr;
-
-static DEFINE_SPINLOCK(pcc_lock);
-
-static struct acpi_generic_address doorbell;
-
-static u64 doorbell_preserve;
-static u64 doorbell_write;
-
-static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
- 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
-
-struct pcc_cpu {
- u32 input_offset;
- u32 output_offset;
-};
-
-static struct pcc_cpu __percpu *pcc_cpu_info;
-
-static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
-{
- cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
- policy->cpuinfo.max_freq);
- return 0;
-}
-
-static inline void pcc_cmd(void)
-{
- u64 doorbell_value;
- int i;
-
- acpi_read(&doorbell_value, &doorbell);
- acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
- &doorbell);
-
- for (i = 0; i < POLL_LOOPS; i++) {
- if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
- break;
- }
-}
-
-static inline void pcc_clear_mapping(void)
-{
- if (pcch_virt_addr)
- iounmap(pcch_virt_addr);
- pcch_virt_addr = NULL;
-}
-
-static unsigned int pcc_get_freq(unsigned int cpu)
-{
- struct pcc_cpu *pcc_cpu_data;
- unsigned int curr_freq;
- unsigned int freq_limit;
- u16 status;
- u32 input_buffer;
- u32 output_buffer;
-
- spin_lock(&pcc_lock);
-
- dprintk("get: get_freq for CPU %d\n", cpu);
- pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
-
- input_buffer = 0x1;
- iowrite32(input_buffer,
- (pcch_virt_addr + pcc_cpu_data->input_offset));
- iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
-
- pcc_cmd();
-
- output_buffer =
- ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
-
- /* Clear the input buffer - we are done with the current command */
- memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
-
- status = ioread16(&pcch_hdr->status);
- if (status != CMD_COMPLETE) {
- dprintk("get: FAILED: for CPU %d, status is %d\n",
- cpu, status);
- goto cmd_incomplete;
- }
- iowrite16(0, &pcch_hdr->status);
- curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
- / 100) * 1000);
-
- dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
- "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
- cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
- output_buffer, curr_freq);
-
- freq_limit = (output_buffer >> 8) & 0xff;
- if (freq_limit != 0xff) {
- dprintk("get: frequency for cpu %d is being temporarily"
- " capped at %d\n", cpu, curr_freq);
- }
-
- spin_unlock(&pcc_lock);
- return curr_freq;
-
-cmd_incomplete:
- iowrite16(0, &pcch_hdr->status);
- spin_unlock(&pcc_lock);
- return 0;
-}
-
-static int pcc_cpufreq_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- struct pcc_cpu *pcc_cpu_data;
- struct cpufreq_freqs freqs;
- u16 status;
- u32 input_buffer;
- int cpu;
-
- spin_lock(&pcc_lock);
- cpu = policy->cpu;
- pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
-
- dprintk("target: CPU %d should go to target freq: %d "
- "(virtual) input_offset is 0x%x\n",
- cpu, target_freq,
- (pcch_virt_addr + pcc_cpu_data->input_offset));
-
- freqs.new = target_freq;
- freqs.cpu = cpu;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- input_buffer = 0x1 | (((target_freq * 100)
- / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
- iowrite32(input_buffer,
- (pcch_virt_addr + pcc_cpu_data->input_offset));
- iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
-
- pcc_cmd();
-
- /* Clear the input buffer - we are done with the current command */
- memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
-
- status = ioread16(&pcch_hdr->status);
- if (status != CMD_COMPLETE) {
- dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
- cpu, status);
- goto cmd_incomplete;
- }
- iowrite16(0, &pcch_hdr->status);
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
- spin_unlock(&pcc_lock);
-
- return 0;
-
-cmd_incomplete:
- iowrite16(0, &pcch_hdr->status);
- spin_unlock(&pcc_lock);
- return -EINVAL;
-}
-
-static int pcc_get_offset(int cpu)
-{
- acpi_status status;
- struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
- union acpi_object *pccp, *offset;
- struct pcc_cpu *pcc_cpu_data;
- struct acpi_processor *pr;
- int ret = 0;
-
- pr = per_cpu(processors, cpu);
- pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
-
- status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- pccp = buffer.pointer;
- if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
- ret = -ENODEV;
- goto out_free;
- };
-
- offset = &(pccp->package.elements[0]);
- if (!offset || offset->type != ACPI_TYPE_INTEGER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- pcc_cpu_data->input_offset = offset->integer.value;
-
- offset = &(pccp->package.elements[1]);
- if (!offset || offset->type != ACPI_TYPE_INTEGER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- pcc_cpu_data->output_offset = offset->integer.value;
-
- memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
- memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
-
- dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
- "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
- cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
-out_free:
- kfree(buffer.pointer);
- return ret;
-}
-
-static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
-{
- acpi_status status;
- struct acpi_object_list input;
- struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
- union acpi_object in_params[4];
- union acpi_object *out_obj;
- u32 capabilities[2];
- u32 errors;
- u32 supported;
- int ret = 0;
-
- input.count = 4;
- input.pointer = in_params;
- in_params[0].type = ACPI_TYPE_BUFFER;
- in_params[0].buffer.length = 16;
- in_params[0].buffer.pointer = OSC_UUID;
- in_params[1].type = ACPI_TYPE_INTEGER;
- in_params[1].integer.value = 1;
- in_params[2].type = ACPI_TYPE_INTEGER;
- in_params[2].integer.value = 2;
- in_params[3].type = ACPI_TYPE_BUFFER;
- in_params[3].buffer.length = 8;
- in_params[3].buffer.pointer = (u8 *)&capabilities;
-
- capabilities[0] = OSC_QUERY_ENABLE;
- capabilities[1] = 0x1;
-
- status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- if (!output.length)
- return -ENODEV;
-
- out_obj = output.pointer;
- if (out_obj->type != ACPI_TYPE_BUFFER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
- if (errors) {
- ret = -ENODEV;
- goto out_free;
- }
-
- supported = *((u32 *)(out_obj->buffer.pointer + 4));
- if (!(supported & 0x1)) {
- ret = -ENODEV;
- goto out_free;
- }
-
- kfree(output.pointer);
- capabilities[0] = 0x0;
- capabilities[1] = 0x1;
-
- status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- if (!output.length)
- return -ENODEV;
-
- out_obj = output.pointer;
- if (out_obj->type != ACPI_TYPE_BUFFER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
- if (errors) {
- ret = -ENODEV;
- goto out_free;
- }
-
- supported = *((u32 *)(out_obj->buffer.pointer + 4));
- if (!(supported & 0x1)) {
- ret = -ENODEV;
- goto out_free;
- }
-
-out_free:
- kfree(output.pointer);
- return ret;
-}
-
-static int __init pcc_cpufreq_probe(void)
-{
- acpi_status status;
- struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
- struct pcc_memory_resource *mem_resource;
- struct pcc_register_resource *reg_resource;
- union acpi_object *out_obj, *member;
- acpi_handle handle, osc_handle, pcch_handle;
- int ret = 0;
-
- status = acpi_get_handle(NULL, "\\_SB", &handle);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- status = acpi_get_handle(handle, "PCCH", &pcch_handle);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- status = acpi_get_handle(handle, "_OSC", &osc_handle);
- if (ACPI_SUCCESS(status)) {
- ret = pcc_cpufreq_do_osc(&osc_handle);
- if (ret)
- dprintk("probe: _OSC evaluation did not succeed\n");
- /* Firmware's use of _OSC is optional */
- ret = 0;
- }
-
- status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- out_obj = output.pointer;
- if (out_obj->type != ACPI_TYPE_PACKAGE) {
- ret = -ENODEV;
- goto out_free;
- }
-
- member = &out_obj->package.elements[0];
- if (member->type != ACPI_TYPE_BUFFER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
-
- dprintk("probe: mem_resource descriptor: 0x%x,"
- " length: %d, space_id: %d, resource_usage: %d,"
- " type_specific: %d, granularity: 0x%llx,"
- " minimum: 0x%llx, maximum: 0x%llx,"
- " translation_offset: 0x%llx, address_length: 0x%llx\n",
- mem_resource->descriptor, mem_resource->length,
- mem_resource->space_id, mem_resource->resource_usage,
- mem_resource->type_specific, mem_resource->granularity,
- mem_resource->minimum, mem_resource->maximum,
- mem_resource->translation_offset,
- mem_resource->address_length);
-
- if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
- ret = -ENODEV;
- goto out_free;
- }
-
- pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
- mem_resource->address_length);
- if (pcch_virt_addr == NULL) {
- dprintk("probe: could not map shared mem region\n");
- goto out_free;
- }
- pcch_hdr = pcch_virt_addr;
-
- dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
- dprintk("probe: PCCH header is at physical address: 0x%llx,"
- " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
- " supported features: 0x%x, command field: 0x%x,"
- " status field: 0x%x, nominal latency: %d us\n",
- mem_resource->minimum, ioread32(&pcch_hdr->signature),
- ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
- ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
- ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
- ioread32(&pcch_hdr->latency));
-
- dprintk("probe: min time between commands: %d us,"
- " max time between commands: %d us,"
- " nominal CPU frequency: %d MHz,"
- " minimum CPU frequency: %d MHz,"
- " minimum CPU frequency without throttling: %d MHz\n",
- ioread32(&pcch_hdr->minimum_time),
- ioread32(&pcch_hdr->maximum_time),
- ioread32(&pcch_hdr->nominal),
- ioread32(&pcch_hdr->throttled_frequency),
- ioread32(&pcch_hdr->minimum_frequency));
-
- member = &out_obj->package.elements[1];
- if (member->type != ACPI_TYPE_BUFFER) {
- ret = -ENODEV;
- goto pcch_free;
- }
-
- reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
-
- doorbell.space_id = reg_resource->space_id;
- doorbell.bit_width = reg_resource->bit_width;
- doorbell.bit_offset = reg_resource->bit_offset;
- doorbell.access_width = 64;
- doorbell.address = reg_resource->address;
-
- dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
- "bit_offset is %d, access_width is %d, address is 0x%llx\n",
- doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
- doorbell.access_width, reg_resource->address);
-
- member = &out_obj->package.elements[2];
- if (member->type != ACPI_TYPE_INTEGER) {
- ret = -ENODEV;
- goto pcch_free;
- }
-
- doorbell_preserve = member->integer.value;
-
- member = &out_obj->package.elements[3];
- if (member->type != ACPI_TYPE_INTEGER) {
- ret = -ENODEV;
- goto pcch_free;
- }
-
- doorbell_write = member->integer.value;
-
- dprintk("probe: doorbell_preserve: 0x%llx,"
- " doorbell_write: 0x%llx\n",
- doorbell_preserve, doorbell_write);
-
- pcc_cpu_info = alloc_percpu(struct pcc_cpu);
- if (!pcc_cpu_info) {
- ret = -ENOMEM;
- goto pcch_free;
- }
-
- printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
- " limits: %d MHz, %d MHz\n", PCC_VERSION,
- ioread32(&pcch_hdr->minimum_frequency),
- ioread32(&pcch_hdr->nominal));
- kfree(output.pointer);
- return ret;
-pcch_free:
- pcc_clear_mapping();
-out_free:
- kfree(output.pointer);
- return ret;
-}
-
-static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int cpu = policy->cpu;
- unsigned int result = 0;
-
- if (!pcch_virt_addr) {
- result = -1;
- goto out;
- }
-
- result = pcc_get_offset(cpu);
- if (result) {
- dprintk("init: PCCP evaluation failed\n");
- goto out;
- }
-
- policy->max = policy->cpuinfo.max_freq =
- ioread32(&pcch_hdr->nominal) * 1000;
- policy->min = policy->cpuinfo.min_freq =
- ioread32(&pcch_hdr->minimum_frequency) * 1000;
- policy->cur = pcc_get_freq(cpu);
-
- if (!policy->cur) {
- dprintk("init: Unable to get current CPU frequency\n");
- result = -EINVAL;
- goto out;
- }
-
- dprintk("init: policy->max is %d, policy->min is %d\n",
- policy->max, policy->min);
-out:
- return result;
-}
-
-static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
-{
- return 0;
-}
-
-static struct cpufreq_driver pcc_cpufreq_driver = {
- .flags = CPUFREQ_CONST_LOOPS,
- .get = pcc_get_freq,
- .verify = pcc_cpufreq_verify,
- .target = pcc_cpufreq_target,
- .init = pcc_cpufreq_cpu_init,
- .exit = pcc_cpufreq_cpu_exit,
- .name = "pcc-cpufreq",
- .owner = THIS_MODULE,
-};
-
-static int __init pcc_cpufreq_init(void)
-{
- int ret;
-
- if (acpi_disabled)
- return 0;
-
- ret = pcc_cpufreq_probe();
- if (ret) {
- dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
- return ret;
- }
-
- ret = cpufreq_register_driver(&pcc_cpufreq_driver);
-
- return ret;
-}
-
-static void __exit pcc_cpufreq_exit(void)
-{
- cpufreq_unregister_driver(&pcc_cpufreq_driver);
-
- pcc_clear_mapping();
-
- free_percpu(pcc_cpu_info);
-}
-
-MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
-MODULE_VERSION(PCC_VERSION);
-MODULE_DESCRIPTION("Processor Clocking Control interface driver");
-MODULE_LICENSE("GPL");
-
-late_initcall(pcc_cpufreq_init);
-module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
deleted file mode 100644
index b3379d6a5c5..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
- * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä,
- * Dominik Brodowski.
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/ioport.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/msr.h>
-
-#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
- as it is unused */
-
-#define PFX "powernow-k6: "
-static unsigned int busfreq; /* FSB, in 10 kHz */
-static unsigned int max_multiplier;
-
-
-/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */
-static struct cpufreq_frequency_table clock_ratio[] = {
- {45, /* 000 -> 4.5x */ 0},
- {50, /* 001 -> 5.0x */ 0},
- {40, /* 010 -> 4.0x */ 0},
- {55, /* 011 -> 5.5x */ 0},
- {20, /* 100 -> 2.0x */ 0},
- {30, /* 101 -> 3.0x */ 0},
- {60, /* 110 -> 6.0x */ 0},
- {35, /* 111 -> 3.5x */ 0},
- {0, CPUFREQ_TABLE_END}
-};
-
-
-/**
- * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier
- *
- * Returns the current setting of the frequency multiplier. Core clock
- * speed is frequency of the Front-Side Bus multiplied with this value.
- */
-static int powernow_k6_get_cpu_multiplier(void)
-{
- u64 invalue = 0;
- u32 msrval;
-
- msrval = POWERNOW_IOPORT + 0x1;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
- invalue = inl(POWERNOW_IOPORT + 0x8);
- msrval = POWERNOW_IOPORT + 0x0;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
-
- return clock_ratio[(invalue >> 5)&7].index;
-}
-
-
-/**
- * powernow_k6_set_state - set the PowerNow! multiplier
- * @best_i: clock_ratio[best_i] is the target multiplier
- *
- * Tries to change the PowerNow! multiplier
- */
-static void powernow_k6_set_state(unsigned int best_i)
-{
- unsigned long outvalue = 0, invalue = 0;
- unsigned long msrval;
- struct cpufreq_freqs freqs;
-
- if (clock_ratio[best_i].index > max_multiplier) {
- printk(KERN_ERR PFX "invalid target frequency\n");
- return;
- }
-
- freqs.old = busfreq * powernow_k6_get_cpu_multiplier();
- freqs.new = busfreq * clock_ratio[best_i].index;
- freqs.cpu = 0; /* powernow-k6.c is UP only driver */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* we now need to transform best_i to the BVC format, see AMD#23446 */
-
- outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5);
-
- msrval = POWERNOW_IOPORT + 0x1;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
- invalue = inl(POWERNOW_IOPORT + 0x8);
- invalue = invalue & 0xf;
- outvalue = outvalue | invalue;
- outl(outvalue , (POWERNOW_IOPORT + 0x8));
- msrval = POWERNOW_IOPORT + 0x0;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- return;
-}
-
-
-/**
- * powernow_k6_verify - verifies a new CPUfreq policy
- * @policy: new policy
- *
- * Policy must be within lowest and highest possible CPU Frequency,
- * and at least one possible state must be within min and max.
- */
-static int powernow_k6_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &clock_ratio[0]);
-}
-
-
-/**
- * powernow_k6_setpolicy - sets a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * sets a new CPUFreq policy
- */
-static int powernow_k6_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
-
- if (cpufreq_frequency_table_target(policy, &clock_ratio[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- powernow_k6_set_state(newstate);
-
- return 0;
-}
-
-
-static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int i, f;
- int result;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* get frequencies */
- max_multiplier = powernow_k6_get_cpu_multiplier();
- busfreq = cpu_khz / max_multiplier;
-
- /* table init */
- for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
- f = clock_ratio[i].index;
- if (f > max_multiplier)
- clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
- else
- clock_ratio[i].frequency = busfreq * f;
- }
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = 200000;
- policy->cur = busfreq * max_multiplier;
-
- result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
-
- return 0;
-}
-
-
-static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
-{
- unsigned int i;
- for (i = 0; i < 8; i++) {
- if (i == max_multiplier)
- powernow_k6_set_state(i);
- }
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static unsigned int powernow_k6_get(unsigned int cpu)
-{
- unsigned int ret;
- ret = (busfreq * powernow_k6_get_cpu_multiplier());
- return ret;
-}
-
-static struct freq_attr *powernow_k6_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver powernow_k6_driver = {
- .verify = powernow_k6_verify,
- .target = powernow_k6_target,
- .init = powernow_k6_cpu_init,
- .exit = powernow_k6_cpu_exit,
- .get = powernow_k6_get,
- .name = "powernow-k6",
- .owner = THIS_MODULE,
- .attr = powernow_k6_attr,
-};
-
-
-/**
- * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver
- *
- * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported
- * devices, -EINVAL or -ENOMEM on problems during initiatization, and zero
- * on success.
- */
-static int __init powernow_k6_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) ||
- ((c->x86_model != 12) && (c->x86_model != 13)))
- return -ENODEV;
-
- if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
- printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n");
- return -EIO;
- }
-
- if (cpufreq_register_driver(&powernow_k6_driver)) {
- release_region(POWERNOW_IOPORT, 16);
- return -EINVAL;
- }
-
- return 0;
-}
-
-
-/**
- * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support
- *
- * Unregisters AMD K6-2+ / K6-3+ PowerNow! support.
- */
-static void __exit powernow_k6_exit(void)
-{
- cpufreq_unregister_driver(&powernow_k6_driver);
- release_region(POWERNOW_IOPORT, 16);
-}
-
-
-MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, "
- "Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
-MODULE_LICENSE("GPL");
-
-module_init(powernow_k6_init);
-module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
deleted file mode 100644
index 4a45fd6e41b..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ /dev/null
@@ -1,752 +0,0 @@
-/*
- * AMD K7 Powernow driver.
- * (C) 2003 Dave Jones on behalf of SuSE Labs.
- * (C) 2003-2004 Dave Jones <davej@redhat.com>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon datasheets & sample CPUs kindly provided by AMD.
- *
- * Errata 5:
- * CPU may fail to execute a FID/VID change in presence of interrupt.
- * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
- * Errata 15:
- * CPU with half frequency multipliers may hang upon wakeup from disconnect.
- * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/dmi.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/timer.h> /* Needed for recalibrate_cpu_khz() */
-#include <asm/msr.h>
-#include <asm/system.h>
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-#include <linux/acpi.h>
-#include <acpi/processor.h>
-#endif
-
-#include "powernow-k7.h"
-
-#define PFX "powernow: "
-
-
-struct psb_s {
- u8 signature[10];
- u8 tableversion;
- u8 flags;
- u16 settlingtime;
- u8 reserved1;
- u8 numpst;
-};
-
-struct pst_s {
- u32 cpuid;
- u8 fsbspeed;
- u8 maxfid;
- u8 startvid;
- u8 numpstates;
-};
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-union powernow_acpi_control_t {
- struct {
- unsigned long fid:5,
- vid:5,
- sgtc:20,
- res1:2;
- } bits;
- unsigned long val;
-};
-#endif
-
-#ifdef CONFIG_CPU_FREQ_DEBUG
-/* divide by 1000 to get VCore voltage in V. */
-static const int mobile_vid_table[32] = {
- 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
- 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
- 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
- 1075, 1050, 1025, 1000, 975, 950, 925, 0,
-};
-#endif
-
-/* divide by 10 to get FID. */
-static const int fid_codes[32] = {
- 110, 115, 120, 125, 50, 55, 60, 65,
- 70, 75, 80, 85, 90, 95, 100, 105,
- 30, 190, 40, 200, 130, 135, 140, 210,
- 150, 225, 160, 165, 170, 180, -1, -1,
-};
-
-/* This parameter is used in order to force ACPI instead of legacy method for
- * configuration purpose.
- */
-
-static int acpi_force;
-
-static struct cpufreq_frequency_table *powernow_table;
-
-static unsigned int can_scale_bus;
-static unsigned int can_scale_vid;
-static unsigned int minimum_speed = -1;
-static unsigned int maximum_speed;
-static unsigned int number_scales;
-static unsigned int fsb;
-static unsigned int latency;
-static char have_a0;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "powernow-k7", msg)
-
-static int check_fsb(unsigned int fsbspeed)
-{
- int delta;
- unsigned int f = fsb / 1000;
-
- delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
- return delta < 5;
-}
-
-static int check_powernow(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- unsigned int maxei, eax, ebx, ecx, edx;
-
- if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) {
-#ifdef MODULE
- printk(KERN_INFO PFX "This module only works with "
- "AMD K7 CPUs\n");
-#endif
- return 0;
- }
-
- /* Get maximum capabilities */
- maxei = cpuid_eax(0x80000000);
- if (maxei < 0x80000007) { /* Any powernow info ? */
-#ifdef MODULE
- printk(KERN_INFO PFX "No powernow capabilities detected\n");
-#endif
- return 0;
- }
-
- if ((c->x86_model == 6) && (c->x86_mask == 0)) {
- printk(KERN_INFO PFX "K7 660[A0] core detected, "
- "enabling errata workarounds\n");
- have_a0 = 1;
- }
-
- cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
-
- /* Check we can actually do something before we say anything.*/
- if (!(edx & (1 << 1 | 1 << 2)))
- return 0;
-
- printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
-
- if (edx & 1 << 1) {
- printk("frequency");
- can_scale_bus = 1;
- }
-
- if ((edx & (1 << 1 | 1 << 2)) == 0x6)
- printk(" and ");
-
- if (edx & 1 << 2) {
- printk("voltage");
- can_scale_vid = 1;
- }
-
- printk(".\n");
- return 1;
-}
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-static void invalidate_entry(unsigned int entry)
-{
- powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
-}
-#endif
-
-static int get_ranges(unsigned char *pst)
-{
- unsigned int j;
- unsigned int speed;
- u8 fid, vid;
-
- powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
- (number_scales + 1)), GFP_KERNEL);
- if (!powernow_table)
- return -ENOMEM;
-
- for (j = 0 ; j < number_scales; j++) {
- fid = *pst++;
-
- powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
- powernow_table[j].index = fid; /* lower 8 bits */
-
- speed = powernow_table[j].frequency;
-
- if ((fid_codes[fid] % 10) == 5) {
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
- if (have_a0 == 1)
- invalidate_entry(j);
-#endif
- }
-
- if (speed < minimum_speed)
- minimum_speed = speed;
- if (speed > maximum_speed)
- maximum_speed = speed;
-
- vid = *pst++;
- powernow_table[j].index |= (vid << 8); /* upper 8 bits */
-
- dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
- "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
- fid_codes[fid] % 10, speed/1000, vid,
- mobile_vid_table[vid]/1000,
- mobile_vid_table[vid]%1000);
- }
- powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
- powernow_table[number_scales].index = 0;
-
- return 0;
-}
-
-
-static void change_FID(int fid)
-{
- union msr_fidvidctl fidvidctl;
-
- rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- if (fidvidctl.bits.FID != fid) {
- fidvidctl.bits.SGTC = latency;
- fidvidctl.bits.FID = fid;
- fidvidctl.bits.VIDC = 0;
- fidvidctl.bits.FIDC = 1;
- wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- }
-}
-
-
-static void change_VID(int vid)
-{
- union msr_fidvidctl fidvidctl;
-
- rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- if (fidvidctl.bits.VID != vid) {
- fidvidctl.bits.SGTC = latency;
- fidvidctl.bits.VID = vid;
- fidvidctl.bits.FIDC = 0;
- fidvidctl.bits.VIDC = 1;
- wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- }
-}
-
-
-static void change_speed(unsigned int index)
-{
- u8 fid, vid;
- struct cpufreq_freqs freqs;
- union msr_fidvidstatus fidvidstatus;
- int cfid;
-
- /* fid are the lower 8 bits of the index we stored into
- * the cpufreq frequency table in powernow_decode_bios,
- * vid are the upper 8 bits.
- */
-
- fid = powernow_table[index].index & 0xFF;
- vid = (powernow_table[index].index & 0xFF00) >> 8;
-
- freqs.cpu = 0;
-
- rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
- cfid = fidvidstatus.bits.CFID;
- freqs.old = fsb * fid_codes[cfid] / 10;
-
- freqs.new = powernow_table[index].frequency;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* Now do the magic poking into the MSRs. */
-
- if (have_a0 == 1) /* A0 errata 5 */
- local_irq_disable();
-
- if (freqs.old > freqs.new) {
- /* Going down, so change FID first */
- change_FID(fid);
- change_VID(vid);
- } else {
- /* Going up, so change VID first */
- change_VID(vid);
- change_FID(fid);
- }
-
-
- if (have_a0 == 1)
- local_irq_enable();
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-}
-
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-
-static struct acpi_processor_performance *acpi_processor_perf;
-
-static int powernow_acpi_init(void)
-{
- int i;
- int retval = 0;
- union powernow_acpi_control_t pc;
-
- if (acpi_processor_perf != NULL && powernow_table != NULL) {
- retval = -EINVAL;
- goto err0;
- }
-
- acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance),
- GFP_KERNEL);
- if (!acpi_processor_perf) {
- retval = -ENOMEM;
- goto err0;
- }
-
- if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
- GFP_KERNEL)) {
- retval = -ENOMEM;
- goto err05;
- }
-
- if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
- retval = -EIO;
- goto err1;
- }
-
- if (acpi_processor_perf->control_register.space_id !=
- ACPI_ADR_SPACE_FIXED_HARDWARE) {
- retval = -ENODEV;
- goto err2;
- }
-
- if (acpi_processor_perf->status_register.space_id !=
- ACPI_ADR_SPACE_FIXED_HARDWARE) {
- retval = -ENODEV;
- goto err2;
- }
-
- number_scales = acpi_processor_perf->state_count;
-
- if (number_scales < 2) {
- retval = -ENODEV;
- goto err2;
- }
-
- powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
- (number_scales + 1)), GFP_KERNEL);
- if (!powernow_table) {
- retval = -ENOMEM;
- goto err2;
- }
-
- pc.val = (unsigned long) acpi_processor_perf->states[0].control;
- for (i = 0; i < number_scales; i++) {
- u8 fid, vid;
- struct acpi_processor_px *state =
- &acpi_processor_perf->states[i];
- unsigned int speed, speed_mhz;
-
- pc.val = (unsigned long) state->control;
- dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
- i,
- (u32) state->core_frequency,
- (u32) state->power,
- (u32) state->transition_latency,
- (u32) state->control,
- pc.bits.sgtc);
-
- vid = pc.bits.vid;
- fid = pc.bits.fid;
-
- powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
- powernow_table[i].index = fid; /* lower 8 bits */
- powernow_table[i].index |= (vid << 8); /* upper 8 bits */
-
- speed = powernow_table[i].frequency;
- speed_mhz = speed / 1000;
-
- /* processor_perflib will multiply the MHz value by 1000 to
- * get a KHz value (e.g. 1266000). However, powernow-k7 works
- * with true KHz values (e.g. 1266768). To ensure that all
- * powernow frequencies are available, we must ensure that
- * ACPI doesn't restrict them, so we round up the MHz value
- * to ensure that perflib's computed KHz value is greater than
- * or equal to powernow's KHz value.
- */
- if (speed % 1000 > 0)
- speed_mhz++;
-
- if ((fid_codes[fid] % 10) == 5) {
- if (have_a0 == 1)
- invalidate_entry(i);
- }
-
- dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
- "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
- fid_codes[fid] % 10, speed_mhz, vid,
- mobile_vid_table[vid]/1000,
- mobile_vid_table[vid]%1000);
-
- if (state->core_frequency != speed_mhz) {
- state->core_frequency = speed_mhz;
- dprintk(" Corrected ACPI frequency to %d\n",
- speed_mhz);
- }
-
- if (latency < pc.bits.sgtc)
- latency = pc.bits.sgtc;
-
- if (speed < minimum_speed)
- minimum_speed = speed;
- if (speed > maximum_speed)
- maximum_speed = speed;
- }
-
- powernow_table[i].frequency = CPUFREQ_TABLE_END;
- powernow_table[i].index = 0;
-
- /* notify BIOS that we exist */
- acpi_processor_notify_smm(THIS_MODULE);
-
- return 0;
-
-err2:
- acpi_processor_unregister_performance(acpi_processor_perf, 0);
-err1:
- free_cpumask_var(acpi_processor_perf->shared_cpu_map);
-err05:
- kfree(acpi_processor_perf);
-err0:
- printk(KERN_WARNING PFX "ACPI perflib can not be used on "
- "this platform\n");
- acpi_processor_perf = NULL;
- return retval;
-}
-#else
-static int powernow_acpi_init(void)
-{
- printk(KERN_INFO PFX "no support for ACPI processor found."
- " Please recompile your kernel with ACPI processor\n");
- return -EINVAL;
-}
-#endif
-
-static void print_pst_entry(struct pst_s *pst, unsigned int j)
-{
- dprintk("PST:%d (@%p)\n", j, pst);
- dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
- pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
-}
-
-static int powernow_decode_bios(int maxfid, int startvid)
-{
- struct psb_s *psb;
- struct pst_s *pst;
- unsigned int i, j;
- unsigned char *p;
- unsigned int etuple;
- unsigned int ret;
-
- etuple = cpuid_eax(0x80000001);
-
- for (i = 0xC0000; i < 0xffff0 ; i += 16) {
-
- p = phys_to_virt(i);
-
- if (memcmp(p, "AMDK7PNOW!", 10) == 0) {
- dprintk("Found PSB header at %p\n", p);
- psb = (struct psb_s *) p;
- dprintk("Table version: 0x%x\n", psb->tableversion);
- if (psb->tableversion != 0x12) {
- printk(KERN_INFO PFX "Sorry, only v1.2 tables"
- " supported right now\n");
- return -ENODEV;
- }
-
- dprintk("Flags: 0x%x\n", psb->flags);
- if ((psb->flags & 1) == 0)
- dprintk("Mobile voltage regulator\n");
- else
- dprintk("Desktop voltage regulator\n");
-
- latency = psb->settlingtime;
- if (latency < 100) {
- printk(KERN_INFO PFX "BIOS set settling time "
- "to %d microseconds. "
- "Should be at least 100. "
- "Correcting.\n", latency);
- latency = 100;
- }
- dprintk("Settling Time: %d microseconds.\n",
- psb->settlingtime);
- dprintk("Has %d PST tables. (Only dumping ones "
- "relevant to this CPU).\n",
- psb->numpst);
-
- p += sizeof(struct psb_s);
-
- pst = (struct pst_s *) p;
-
- for (j = 0; j < psb->numpst; j++) {
- pst = (struct pst_s *) p;
- number_scales = pst->numpstates;
-
- if ((etuple == pst->cpuid) &&
- check_fsb(pst->fsbspeed) &&
- (maxfid == pst->maxfid) &&
- (startvid == pst->startvid)) {
- print_pst_entry(pst, j);
- p = (char *)pst + sizeof(struct pst_s);
- ret = get_ranges(p);
- return ret;
- } else {
- unsigned int k;
- p = (char *)pst + sizeof(struct pst_s);
- for (k = 0; k < number_scales; k++)
- p += 2;
- }
- }
- printk(KERN_INFO PFX "No PST tables match this cpuid "
- "(0x%x)\n", etuple);
- printk(KERN_INFO PFX "This is indicative of a broken "
- "BIOS.\n");
-
- return -EINVAL;
- }
- p++;
- }
-
- return -ENODEV;
-}
-
-
-static int powernow_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate;
-
- if (cpufreq_frequency_table_target(policy, powernow_table, target_freq,
- relation, &newstate))
- return -EINVAL;
-
- change_speed(newstate);
-
- return 0;
-}
-
-
-static int powernow_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, powernow_table);
-}
-
-/*
- * We use the fact that the bus frequency is somehow
- * a multiple of 100000/3 khz, then we compute sgtc according
- * to this multiple.
- * That way, we match more how AMD thinks all of that work.
- * We will then get the same kind of behaviour already tested under
- * the "well-known" other OS.
- */
-static int __cpuinit fixup_sgtc(void)
-{
- unsigned int sgtc;
- unsigned int m;
-
- m = fsb / 3333;
- if ((m % 10) >= 5)
- m += 5;
-
- m /= 10;
-
- sgtc = 100 * m * latency;
- sgtc = sgtc / 3;
- if (sgtc > 0xfffff) {
- printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
- sgtc = 0xfffff;
- }
- return sgtc;
-}
-
-static unsigned int powernow_get(unsigned int cpu)
-{
- union msr_fidvidstatus fidvidstatus;
- unsigned int cfid;
-
- if (cpu)
- return 0;
- rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
- cfid = fidvidstatus.bits.CFID;
-
- return fsb * fid_codes[cfid] / 10;
-}
-
-
-static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
-{
- printk(KERN_WARNING PFX
- "%s laptop with broken PST tables in BIOS detected.\n",
- d->ident);
- printk(KERN_WARNING PFX
- "You need to downgrade to 3A21 (09/09/2002), or try a newer "
- "BIOS than 3A71 (01/20/2003)\n");
- printk(KERN_WARNING PFX
- "cpufreq scaling has been disabled as a result of this.\n");
- return 0;
-}
-
-/*
- * Some Athlon laptops have really fucked PST tables.
- * A BIOS update is all that can save them.
- * Mention this, and disable cpufreq.
- */
-static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
- {
- .callback = acer_cpufreq_pst,
- .ident = "Acer Aspire",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"),
- DMI_MATCH(DMI_BIOS_VERSION, "3A71"),
- },
- },
- { }
-};
-
-static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
-{
- union msr_fidvidstatus fidvidstatus;
- int result;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
-
- recalibrate_cpu_khz();
-
- fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
- if (!fsb) {
- printk(KERN_WARNING PFX "can not determine bus frequency\n");
- return -EINVAL;
- }
- dprintk("FSB: %3dMHz\n", fsb/1000);
-
- if (dmi_check_system(powernow_dmi_table) || acpi_force) {
- printk(KERN_INFO PFX "PSB/PST known to be broken. "
- "Trying ACPI instead\n");
- result = powernow_acpi_init();
- } else {
- result = powernow_decode_bios(fidvidstatus.bits.MFID,
- fidvidstatus.bits.SVID);
- if (result) {
- printk(KERN_INFO PFX "Trying ACPI perflib\n");
- maximum_speed = 0;
- minimum_speed = -1;
- latency = 0;
- result = powernow_acpi_init();
- if (result) {
- printk(KERN_INFO PFX
- "ACPI and legacy methods failed\n");
- }
- } else {
- /* SGTC use the bus clock as timer */
- latency = fixup_sgtc();
- printk(KERN_INFO PFX "SGTC: %d\n", latency);
- }
- }
-
- if (result)
- return result;
-
- printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
- minimum_speed/1000, maximum_speed/1000);
-
- policy->cpuinfo.transition_latency =
- cpufreq_scale(2000000UL, fsb, latency);
-
- policy->cur = powernow_get(0);
-
- cpufreq_frequency_table_get_attr(powernow_table, policy->cpu);
-
- return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
-}
-
-static int powernow_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
- if (acpi_processor_perf) {
- acpi_processor_unregister_performance(acpi_processor_perf, 0);
- free_cpumask_var(acpi_processor_perf->shared_cpu_map);
- kfree(acpi_processor_perf);
- }
-#endif
-
- kfree(powernow_table);
- return 0;
-}
-
-static struct freq_attr *powernow_table_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver powernow_driver = {
- .verify = powernow_verify,
- .target = powernow_target,
- .get = powernow_get,
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
- .bios_limit = acpi_processor_get_bios_limit,
-#endif
- .init = powernow_cpu_init,
- .exit = powernow_cpu_exit,
- .name = "powernow-k7",
- .owner = THIS_MODULE,
- .attr = powernow_table_attr,
-};
-
-static int __init powernow_init(void)
-{
- if (check_powernow() == 0)
- return -ENODEV;
- return cpufreq_register_driver(&powernow_driver);
-}
-
-
-static void __exit powernow_exit(void)
-{
- cpufreq_unregister_driver(&powernow_driver);
-}
-
-module_param(acpi_force, int, 0444);
-MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
-MODULE_DESCRIPTION("Powernow driver for AMD K7 processors.");
-MODULE_LICENSE("GPL");
-
-late_initcall(powernow_init);
-module_exit(powernow_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
deleted file mode 100644
index 35fb4eaf6e1..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * (C) 2003 Dave Jones.
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * AMD-specific information
- *
- */
-
-union msr_fidvidctl {
- struct {
- unsigned FID:5, // 4:0
- reserved1:3, // 7:5
- VID:5, // 12:8
- reserved2:3, // 15:13
- FIDC:1, // 16
- VIDC:1, // 17
- reserved3:2, // 19:18
- FIDCHGRATIO:1, // 20
- reserved4:11, // 31-21
- SGTC:20, // 32:51
- reserved5:12; // 63:52
- } bits;
- unsigned long long val;
-};
-
-union msr_fidvidstatus {
- struct {
- unsigned CFID:5, // 4:0
- reserved1:3, // 7:5
- SFID:5, // 12:8
- reserved2:3, // 15:13
- MFID:5, // 20:16
- reserved3:11, // 31:21
- CVID:5, // 36:32
- reserved4:3, // 39:37
- SVID:5, // 44:40
- reserved5:3, // 47:45
- MVID:5, // 52:48
- reserved6:11; // 63:53
- } bits;
- unsigned long long val;
-};
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
deleted file mode 100644
index 2368e38327b..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ /dev/null
@@ -1,1607 +0,0 @@
-/*
- * (c) 2003-2010 Advanced Micro Devices, Inc.
- * Your use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- *
- * Support : mark.langsdorf@amd.com
- *
- * Based on the powernow-k7.c module written by Dave Jones.
- * (C) 2003 Dave Jones on behalf of SuSE Labs
- * (C) 2004 Dominik Brodowski <linux@brodo.de>
- * (C) 2004 Pavel Machek <pavel@ucw.cz>
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon datasheets & sample CPUs kindly provided by AMD.
- *
- * Valuable input gratefully received from Dave Jones, Pavel Machek,
- * Dominik Brodowski, Jacob Shin, and others.
- * Originally developed by Paul Devriendt.
- * Processor information obtained from Chapter 9 (Power and Thermal Management)
- * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
- * Opteron Processors" available for download from www.amd.com
- *
- * Tables for specific CPUs can be inferred from
- * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
- */
-
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/cpumask.h>
-#include <linux/sched.h> /* for current / set_cpus_allowed() */
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/msr.h>
-
-#include <linux/acpi.h>
-#include <linux/mutex.h>
-#include <acpi/processor.h>
-
-#define PFX "powernow-k8: "
-#define VERSION "version 2.20.00"
-#include "powernow-k8.h"
-#include "mperf.h"
-
-/* serialize freq changes */
-static DEFINE_MUTEX(fidvid_mutex);
-
-static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
-
-static int cpu_family = CPU_OPTERON;
-
-/* core performance boost */
-static bool cpb_capable, cpb_enabled;
-static struct msr __percpu *msrs;
-
-static struct cpufreq_driver cpufreq_amd64_driver;
-
-#ifndef CONFIG_SMP
-static inline const struct cpumask *cpu_core_mask(int cpu)
-{
- return cpumask_of(0);
-}
-#endif
-
-/* Return a frequency in MHz, given an input fid */
-static u32 find_freq_from_fid(u32 fid)
-{
- return 800 + (fid * 100);
-}
-
-/* Return a frequency in KHz, given an input fid */
-static u32 find_khz_freq_from_fid(u32 fid)
-{
- return 1000 * find_freq_from_fid(fid);
-}
-
-static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data,
- u32 pstate)
-{
- return data[pstate].frequency;
-}
-
-/* Return the vco fid for an input fid
- *
- * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
- * only from corresponding high fids. This returns "high" fid corresponding to
- * "low" one.
- */
-static u32 convert_fid_to_vco_fid(u32 fid)
-{
- if (fid < HI_FID_TABLE_BOTTOM)
- return 8 + (2 * fid);
- else
- return fid;
-}
-
-/*
- * Return 1 if the pending bit is set. Unless we just instructed the processor
- * to transition to a new state, seeing this bit set is really bad news.
- */
-static int pending_bit_stuck(void)
-{
- u32 lo, hi;
-
- if (cpu_family == CPU_HW_PSTATE)
- return 0;
-
- rdmsr(MSR_FIDVID_STATUS, lo, hi);
- return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
-}
-
-/*
- * Update the global current fid / vid values from the status msr.
- * Returns 1 on error.
- */
-static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
-{
- u32 lo, hi;
- u32 i = 0;
-
- if (cpu_family == CPU_HW_PSTATE) {
- rdmsr(MSR_PSTATE_STATUS, lo, hi);
- i = lo & HW_PSTATE_MASK;
- data->currpstate = i;
-
- /*
- * a workaround for family 11h erratum 311 might cause
- * an "out-of-range Pstate if the core is in Pstate-0
- */
- if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps))
- data->currpstate = HW_PSTATE_0;
-
- return 0;
- }
- do {
- if (i++ > 10000) {
- dprintk("detected change pending stuck\n");
- return 1;
- }
- rdmsr(MSR_FIDVID_STATUS, lo, hi);
- } while (lo & MSR_S_LO_CHANGE_PENDING);
-
- data->currvid = hi & MSR_S_HI_CURRENT_VID;
- data->currfid = lo & MSR_S_LO_CURRENT_FID;
-
- return 0;
-}
-
-/* the isochronous relief time */
-static void count_off_irt(struct powernow_k8_data *data)
-{
- udelay((1 << data->irt) * 10);
- return;
-}
-
-/* the voltage stabilization time */
-static void count_off_vst(struct powernow_k8_data *data)
-{
- udelay(data->vstable * VST_UNITS_20US);
- return;
-}
-
-/* need to init the control msr to a safe value (for each cpu) */
-static void fidvid_msr_init(void)
-{
- u32 lo, hi;
- u8 fid, vid;
-
- rdmsr(MSR_FIDVID_STATUS, lo, hi);
- vid = hi & MSR_S_HI_CURRENT_VID;
- fid = lo & MSR_S_LO_CURRENT_FID;
- lo = fid | (vid << MSR_C_LO_VID_SHIFT);
- hi = MSR_C_HI_STP_GNT_BENIGN;
- dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi);
- wrmsr(MSR_FIDVID_CTL, lo, hi);
-}
-
-/* write the new fid value along with the other control fields to the msr */
-static int write_new_fid(struct powernow_k8_data *data, u32 fid)
-{
- u32 lo;
- u32 savevid = data->currvid;
- u32 i = 0;
-
- if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) {
- printk(KERN_ERR PFX "internal error - overflow on fid write\n");
- return 1;
- }
-
- lo = fid;
- lo |= (data->currvid << MSR_C_LO_VID_SHIFT);
- lo |= MSR_C_LO_INIT_FID_VID;
-
- dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
- fid, lo, data->plllock * PLL_LOCK_CONVERSION);
-
- do {
- wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
- if (i++ > 100) {
- printk(KERN_ERR PFX
- "Hardware error - pending bit very stuck - "
- "no further pstate changes possible\n");
- return 1;
- }
- } while (query_current_values_with_pending_wait(data));
-
- count_off_irt(data);
-
- if (savevid != data->currvid) {
- printk(KERN_ERR PFX
- "vid change on fid trans, old 0x%x, new 0x%x\n",
- savevid, data->currvid);
- return 1;
- }
-
- if (fid != data->currfid) {
- printk(KERN_ERR PFX
- "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
- data->currfid);
- return 1;
- }
-
- return 0;
-}
-
-/* Write a new vid to the hardware */
-static int write_new_vid(struct powernow_k8_data *data, u32 vid)
-{
- u32 lo;
- u32 savefid = data->currfid;
- int i = 0;
-
- if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) {
- printk(KERN_ERR PFX "internal error - overflow on vid write\n");
- return 1;
- }
-
- lo = data->currfid;
- lo |= (vid << MSR_C_LO_VID_SHIFT);
- lo |= MSR_C_LO_INIT_FID_VID;
-
- dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
- vid, lo, STOP_GRANT_5NS);
-
- do {
- wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
- if (i++ > 100) {
- printk(KERN_ERR PFX "internal error - pending bit "
- "very stuck - no further pstate "
- "changes possible\n");
- return 1;
- }
- } while (query_current_values_with_pending_wait(data));
-
- if (savefid != data->currfid) {
- printk(KERN_ERR PFX "fid changed on vid trans, old "
- "0x%x new 0x%x\n",
- savefid, data->currfid);
- return 1;
- }
-
- if (vid != data->currvid) {
- printk(KERN_ERR PFX "vid trans failed, vid 0x%x, "
- "curr 0x%x\n",
- vid, data->currvid);
- return 1;
- }
-
- return 0;
-}
-
-/*
- * Reduce the vid by the max of step or reqvid.
- * Decreasing vid codes represent increasing voltages:
- * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off.
- */
-static int decrease_vid_code_by_step(struct powernow_k8_data *data,
- u32 reqvid, u32 step)
-{
- if ((data->currvid - reqvid) > step)
- reqvid = data->currvid - step;
-
- if (write_new_vid(data, reqvid))
- return 1;
-
- count_off_vst(data);
-
- return 0;
-}
-
-/* Change hardware pstate by single MSR write */
-static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
-{
- wrmsr(MSR_PSTATE_CTRL, pstate, 0);
- data->currpstate = pstate;
- return 0;
-}
-
-/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
-static int transition_fid_vid(struct powernow_k8_data *data,
- u32 reqfid, u32 reqvid)
-{
- if (core_voltage_pre_transition(data, reqvid, reqfid))
- return 1;
-
- if (core_frequency_transition(data, reqfid))
- return 1;
-
- if (core_voltage_post_transition(data, reqvid))
- return 1;
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
- printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, "
- "curr 0x%x 0x%x\n",
- smp_processor_id(),
- reqfid, reqvid, data->currfid, data->currvid);
- return 1;
- }
-
- dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n",
- smp_processor_id(), data->currfid, data->currvid);
-
- return 0;
-}
-
-/* Phase 1 - core voltage transition ... setup voltage */
-static int core_voltage_pre_transition(struct powernow_k8_data *data,
- u32 reqvid, u32 reqfid)
-{
- u32 rvosteps = data->rvo;
- u32 savefid = data->currfid;
- u32 maxvid, lo, rvomult = 1;
-
- dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
- "reqvid 0x%x, rvo 0x%x\n",
- smp_processor_id(),
- data->currfid, data->currvid, reqvid, data->rvo);
-
- if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
- rvomult = 2;
- rvosteps *= rvomult;
- rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
- maxvid = 0x1f & (maxvid >> 16);
- dprintk("ph1 maxvid=0x%x\n", maxvid);
- if (reqvid < maxvid) /* lower numbers are higher voltages */
- reqvid = maxvid;
-
- while (data->currvid > reqvid) {
- dprintk("ph1: curr 0x%x, req vid 0x%x\n",
- data->currvid, reqvid);
- if (decrease_vid_code_by_step(data, reqvid, data->vidmvs))
- return 1;
- }
-
- while ((rvosteps > 0) &&
- ((rvomult * data->rvo + data->currvid) > reqvid)) {
- if (data->currvid == maxvid) {
- rvosteps = 0;
- } else {
- dprintk("ph1: changing vid for rvo, req 0x%x\n",
- data->currvid - 1);
- if (decrease_vid_code_by_step(data, data->currvid-1, 1))
- return 1;
- rvosteps--;
- }
- }
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if (savefid != data->currfid) {
- printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n",
- data->currfid);
- return 1;
- }
-
- dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n",
- data->currfid, data->currvid);
-
- return 0;
-}
-
-/* Phase 2 - core frequency transition */
-static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
-{
- u32 vcoreqfid, vcocurrfid, vcofiddiff;
- u32 fid_interval, savevid = data->currvid;
-
- if (data->currfid == reqfid) {
- printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
- data->currfid);
- return 0;
- }
-
- dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, "
- "reqfid 0x%x\n",
- smp_processor_id(),
- data->currfid, data->currvid, reqfid);
-
- vcoreqfid = convert_fid_to_vco_fid(reqfid);
- vcocurrfid = convert_fid_to_vco_fid(data->currfid);
- vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
- : vcoreqfid - vcocurrfid;
-
- if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
- vcofiddiff = 0;
-
- while (vcofiddiff > 2) {
- (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
-
- if (reqfid > data->currfid) {
- if (data->currfid > LO_FID_TABLE_TOP) {
- if (write_new_fid(data,
- data->currfid + fid_interval))
- return 1;
- } else {
- if (write_new_fid
- (data,
- 2 + convert_fid_to_vco_fid(data->currfid)))
- return 1;
- }
- } else {
- if (write_new_fid(data, data->currfid - fid_interval))
- return 1;
- }
-
- vcocurrfid = convert_fid_to_vco_fid(data->currfid);
- vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
- : vcoreqfid - vcocurrfid;
- }
-
- if (write_new_fid(data, reqfid))
- return 1;
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if (data->currfid != reqfid) {
- printk(KERN_ERR PFX
- "ph2: mismatch, failed fid transition, "
- "curr 0x%x, req 0x%x\n",
- data->currfid, reqfid);
- return 1;
- }
-
- if (savevid != data->currvid) {
- printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n",
- savevid, data->currvid);
- return 1;
- }
-
- dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n",
- data->currfid, data->currvid);
-
- return 0;
-}
-
-/* Phase 3 - core voltage transition flow ... jump to the final vid. */
-static int core_voltage_post_transition(struct powernow_k8_data *data,
- u32 reqvid)
-{
- u32 savefid = data->currfid;
- u32 savereqvid = reqvid;
-
- dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
- smp_processor_id(),
- data->currfid, data->currvid);
-
- if (reqvid != data->currvid) {
- if (write_new_vid(data, reqvid))
- return 1;
-
- if (savefid != data->currfid) {
- printk(KERN_ERR PFX
- "ph3: bad fid change, save 0x%x, curr 0x%x\n",
- savefid, data->currfid);
- return 1;
- }
-
- if (data->currvid != reqvid) {
- printk(KERN_ERR PFX
- "ph3: failed vid transition\n, "
- "req 0x%x, curr 0x%x",
- reqvid, data->currvid);
- return 1;
- }
- }
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if (savereqvid != data->currvid) {
- dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
- return 1;
- }
-
- if (savefid != data->currfid) {
- dprintk("ph3 failed, currfid changed 0x%x\n",
- data->currfid);
- return 1;
- }
-
- dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
- data->currfid, data->currvid);
-
- return 0;
-}
-
-static void check_supported_cpu(void *_rc)
-{
- u32 eax, ebx, ecx, edx;
- int *rc = _rc;
-
- *rc = -ENODEV;
-
- if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD)
- return;
-
- eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
- if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
- ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
- return;
-
- if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
- if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
- ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
- printk(KERN_INFO PFX
- "Processor cpuid %x not supported\n", eax);
- return;
- }
-
- eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
- if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
- printk(KERN_INFO PFX
- "No frequency change capabilities detected\n");
- return;
- }
-
- cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
- if ((edx & P_STATE_TRANSITION_CAPABLE)
- != P_STATE_TRANSITION_CAPABLE) {
- printk(KERN_INFO PFX
- "Power state transitions not supported\n");
- return;
- }
- } else { /* must be a HW Pstate capable processor */
- cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
- if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
- cpu_family = CPU_HW_PSTATE;
- else
- return;
- }
-
- *rc = 0;
-}
-
-static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
- u8 maxvid)
-{
- unsigned int j;
- u8 lastfid = 0xff;
-
- for (j = 0; j < data->numps; j++) {
- if (pst[j].vid > LEAST_VID) {
- printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
- j, pst[j].vid);
- return -EINVAL;
- }
- if (pst[j].vid < data->rvo) {
- /* vid + rvo >= 0 */
- printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
- " %d\n", j);
- return -ENODEV;
- }
- if (pst[j].vid < maxvid + data->rvo) {
- /* vid + rvo >= maxvid */
- printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
- " %d\n", j);
- return -ENODEV;
- }
- if (pst[j].fid > MAX_FID) {
- printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
- " %d\n", j);
- return -ENODEV;
- }
- if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
- /* Only first fid is allowed to be in "low" range */
- printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
- "0x%x\n", j, pst[j].fid);
- return -EINVAL;
- }
- if (pst[j].fid < lastfid)
- lastfid = pst[j].fid;
- }
- if (lastfid & 1) {
- printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
- return -EINVAL;
- }
- if (lastfid > LO_FID_TABLE_TOP)
- printk(KERN_INFO FW_BUG PFX
- "first fid not from lo freq table\n");
-
- return 0;
-}
-
-static void invalidate_entry(struct cpufreq_frequency_table *powernow_table,
- unsigned int entry)
-{
- powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
-}
-
-static void print_basics(struct powernow_k8_data *data)
-{
- int j;
- for (j = 0; j < data->numps; j++) {
- if (data->powernow_table[j].frequency !=
- CPUFREQ_ENTRY_INVALID) {
- if (cpu_family == CPU_HW_PSTATE) {
- printk(KERN_INFO PFX
- " %d : pstate %d (%d MHz)\n", j,
- data->powernow_table[j].index,
- data->powernow_table[j].frequency/1000);
- } else {
- printk(KERN_INFO PFX
- "fid 0x%x (%d MHz), vid 0x%x\n",
- data->powernow_table[j].index & 0xff,
- data->powernow_table[j].frequency/1000,
- data->powernow_table[j].index >> 8);
- }
- }
- }
- if (data->batps)
- printk(KERN_INFO PFX "Only %d pstates on battery\n",
- data->batps);
-}
-
-static u32 freq_from_fid_did(u32 fid, u32 did)
-{
- u32 mhz = 0;
-
- if (boot_cpu_data.x86 == 0x10)
- mhz = (100 * (fid + 0x10)) >> did;
- else if (boot_cpu_data.x86 == 0x11)
- mhz = (100 * (fid + 8)) >> did;
- else
- BUG();
-
- return mhz * 1000;
-}
-
-static int fill_powernow_table(struct powernow_k8_data *data,
- struct pst_s *pst, u8 maxvid)
-{
- struct cpufreq_frequency_table *powernow_table;
- unsigned int j;
-
- if (data->batps) {
- /* use ACPI support to get full speed on mains power */
- printk(KERN_WARNING PFX
- "Only %d pstates usable (use ACPI driver for full "
- "range\n", data->batps);
- data->numps = data->batps;
- }
-
- for (j = 1; j < data->numps; j++) {
- if (pst[j-1].fid >= pst[j].fid) {
- printk(KERN_ERR PFX "PST out of sequence\n");
- return -EINVAL;
- }
- }
-
- if (data->numps < 2) {
- printk(KERN_ERR PFX "no p states to transition\n");
- return -ENODEV;
- }
-
- if (check_pst_table(data, pst, maxvid))
- return -EINVAL;
-
- powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
- * (data->numps + 1)), GFP_KERNEL);
- if (!powernow_table) {
- printk(KERN_ERR PFX "powernow_table memory alloc failure\n");
- return -ENOMEM;
- }
-
- for (j = 0; j < data->numps; j++) {
- int freq;
- powernow_table[j].index = pst[j].fid; /* lower 8 bits */
- powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
- freq = find_khz_freq_from_fid(pst[j].fid);
- powernow_table[j].frequency = freq;
- }
- powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
- powernow_table[data->numps].index = 0;
-
- if (query_current_values_with_pending_wait(data)) {
- kfree(powernow_table);
- return -EIO;
- }
-
- dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
- data->powernow_table = powernow_table;
- if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
- print_basics(data);
-
- for (j = 0; j < data->numps; j++)
- if ((pst[j].fid == data->currfid) &&
- (pst[j].vid == data->currvid))
- return 0;
-
- dprintk("currfid/vid do not match PST, ignoring\n");
- return 0;
-}
-
-/* Find and validate the PSB/PST table in BIOS. */
-static int find_psb_table(struct powernow_k8_data *data)
-{
- struct psb_s *psb;
- unsigned int i;
- u32 mvs;
- u8 maxvid;
- u32 cpst = 0;
- u32 thiscpuid;
-
- for (i = 0xc0000; i < 0xffff0; i += 0x10) {
- /* Scan BIOS looking for the signature. */
- /* It can not be at ffff0 - it is too big. */
-
- psb = phys_to_virt(i);
- if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0)
- continue;
-
- dprintk("found PSB header at 0x%p\n", psb);
-
- dprintk("table vers: 0x%x\n", psb->tableversion);
- if (psb->tableversion != PSB_VERSION_1_4) {
- printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
- return -ENODEV;
- }
-
- dprintk("flags: 0x%x\n", psb->flags1);
- if (psb->flags1) {
- printk(KERN_ERR FW_BUG PFX "unknown flags\n");
- return -ENODEV;
- }
-
- data->vstable = psb->vstable;
- dprintk("voltage stabilization time: %d(*20us)\n",
- data->vstable);
-
- dprintk("flags2: 0x%x\n", psb->flags2);
- data->rvo = psb->flags2 & 3;
- data->irt = ((psb->flags2) >> 2) & 3;
- mvs = ((psb->flags2) >> 4) & 3;
- data->vidmvs = 1 << mvs;
- data->batps = ((psb->flags2) >> 6) & 3;
-
- dprintk("ramp voltage offset: %d\n", data->rvo);
- dprintk("isochronous relief time: %d\n", data->irt);
- dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs);
-
- dprintk("numpst: 0x%x\n", psb->num_tables);
- cpst = psb->num_tables;
- if ((psb->cpuid == 0x00000fc0) ||
- (psb->cpuid == 0x00000fe0)) {
- thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
- if ((thiscpuid == 0x00000fc0) ||
- (thiscpuid == 0x00000fe0))
- cpst = 1;
- }
- if (cpst != 1) {
- printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
- return -ENODEV;
- }
-
- data->plllock = psb->plllocktime;
- dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime);
- dprintk("maxfid: 0x%x\n", psb->maxfid);
- dprintk("maxvid: 0x%x\n", psb->maxvid);
- maxvid = psb->maxvid;
-
- data->numps = psb->numps;
- dprintk("numpstates: 0x%x\n", data->numps);
- return fill_powernow_table(data,
- (struct pst_s *)(psb+1), maxvid);
- }
- /*
- * If you see this message, complain to BIOS manufacturer. If
- * he tells you "we do not support Linux" or some similar
- * nonsense, remember that Windows 2000 uses the same legacy
- * mechanism that the old Linux PSB driver uses. Tell them it
- * is broken with Windows 2000.
- *
- * The reference to the AMD documentation is chapter 9 in the
- * BIOS and Kernel Developer's Guide, which is available on
- * www.amd.com
- */
- printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
- printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
- " and Cool'N'Quiet support is enabled in BIOS setup\n");
- return -ENODEV;
-}
-
-static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
- unsigned int index)
-{
- u64 control;
-
- if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
- return;
-
- control = data->acpi_data.states[index].control;
- data->irt = (control >> IRT_SHIFT) & IRT_MASK;
- data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
- data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
- data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
- data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
- data->vstable = (control >> VST_SHIFT) & VST_MASK;
-}
-
-static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
-{
- struct cpufreq_frequency_table *powernow_table;
- int ret_val = -ENODEV;
- u64 control, status;
-
- if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
- dprintk("register performance failed: bad ACPI data\n");
- return -EIO;
- }
-
- /* verify the data contained in the ACPI structures */
- if (data->acpi_data.state_count <= 1) {
- dprintk("No ACPI P-States\n");
- goto err_out;
- }
-
- control = data->acpi_data.control_register.space_id;
- status = data->acpi_data.status_register.space_id;
-
- if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
- (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
- dprintk("Invalid control/status registers (%x - %x)\n",
- control, status);
- goto err_out;
- }
-
- /* fill in data->powernow_table */
- powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
- * (data->acpi_data.state_count + 1)), GFP_KERNEL);
- if (!powernow_table) {
- dprintk("powernow_table memory alloc failure\n");
- goto err_out;
- }
-
- /* fill in data */
- data->numps = data->acpi_data.state_count;
- powernow_k8_acpi_pst_values(data, 0);
-
- if (cpu_family == CPU_HW_PSTATE)
- ret_val = fill_powernow_table_pstate(data, powernow_table);
- else
- ret_val = fill_powernow_table_fidvid(data, powernow_table);
- if (ret_val)
- goto err_out_mem;
-
- powernow_table[data->acpi_data.state_count].frequency =
- CPUFREQ_TABLE_END;
- powernow_table[data->acpi_data.state_count].index = 0;
- data->powernow_table = powernow_table;
-
- if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
- print_basics(data);
-
- /* notify BIOS that we exist */
- acpi_processor_notify_smm(THIS_MODULE);
-
- if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
- printk(KERN_ERR PFX
- "unable to alloc powernow_k8_data cpumask\n");
- ret_val = -ENOMEM;
- goto err_out_mem;
- }
-
- return 0;
-
-err_out_mem:
- kfree(powernow_table);
-
-err_out:
- acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
-
- /* data->acpi_data.state_count informs us at ->exit()
- * whether ACPI was used */
- data->acpi_data.state_count = 0;
-
- return ret_val;
-}
-
-static int fill_powernow_table_pstate(struct powernow_k8_data *data,
- struct cpufreq_frequency_table *powernow_table)
-{
- int i;
- u32 hi = 0, lo = 0;
- rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
- data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
-
- for (i = 0; i < data->acpi_data.state_count; i++) {
- u32 index;
-
- index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
- if (index > data->max_hw_pstate) {
- printk(KERN_ERR PFX "invalid pstate %d - "
- "bad value %d.\n", i, index);
- printk(KERN_ERR PFX "Please report to BIOS "
- "manufacturer\n");
- invalidate_entry(powernow_table, i);
- continue;
- }
- rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
- if (!(hi & HW_PSTATE_VALID_MASK)) {
- dprintk("invalid pstate %d, ignoring\n", index);
- invalidate_entry(powernow_table, i);
- continue;
- }
-
- powernow_table[i].index = index;
-
- /* Frequency may be rounded for these */
- if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
- || boot_cpu_data.x86 == 0x11) {
- powernow_table[i].frequency =
- freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
- } else
- powernow_table[i].frequency =
- data->acpi_data.states[i].core_frequency * 1000;
- }
- return 0;
-}
-
-static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
- struct cpufreq_frequency_table *powernow_table)
-{
- int i;
-
- for (i = 0; i < data->acpi_data.state_count; i++) {
- u32 fid;
- u32 vid;
- u32 freq, index;
- u64 status, control;
-
- if (data->exttype) {
- status = data->acpi_data.states[i].status;
- fid = status & EXT_FID_MASK;
- vid = (status >> VID_SHIFT) & EXT_VID_MASK;
- } else {
- control = data->acpi_data.states[i].control;
- fid = control & FID_MASK;
- vid = (control >> VID_SHIFT) & VID_MASK;
- }
-
- dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
-
- index = fid | (vid<<8);
- powernow_table[i].index = index;
-
- freq = find_khz_freq_from_fid(fid);
- powernow_table[i].frequency = freq;
-
- /* verify frequency is OK */
- if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
- dprintk("invalid freq %u kHz, ignoring\n", freq);
- invalidate_entry(powernow_table, i);
- continue;
- }
-
- /* verify voltage is OK -
- * BIOSs are using "off" to indicate invalid */
- if (vid == VID_OFF) {
- dprintk("invalid vid %u, ignoring\n", vid);
- invalidate_entry(powernow_table, i);
- continue;
- }
-
- if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
- printk(KERN_INFO PFX "invalid freq entries "
- "%u kHz vs. %u kHz\n", freq,
- (unsigned int)
- (data->acpi_data.states[i].core_frequency
- * 1000));
- invalidate_entry(powernow_table, i);
- continue;
- }
- }
- return 0;
-}
-
-static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
-{
- if (data->acpi_data.state_count)
- acpi_processor_unregister_performance(&data->acpi_data,
- data->cpu);
- free_cpumask_var(data->acpi_data.shared_cpu_map);
-}
-
-static int get_transition_latency(struct powernow_k8_data *data)
-{
- int max_latency = 0;
- int i;
- for (i = 0; i < data->acpi_data.state_count; i++) {
- int cur_latency = data->acpi_data.states[i].transition_latency
- + data->acpi_data.states[i].bus_master_latency;
- if (cur_latency > max_latency)
- max_latency = cur_latency;
- }
- if (max_latency == 0) {
- /*
- * Fam 11h and later may return 0 as transition latency. This
- * is intended and means "very fast". While cpufreq core and
- * governors currently can handle that gracefully, better set it
- * to 1 to avoid problems in the future.
- */
- if (boot_cpu_data.x86 < 0x11)
- printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
- "latency\n");
- max_latency = 1;
- }
- /* value in usecs, needs to be in nanoseconds */
- return 1000 * max_latency;
-}
-
-/* Take a frequency, and issue the fid/vid transition command */
-static int transition_frequency_fidvid(struct powernow_k8_data *data,
- unsigned int index)
-{
- u32 fid = 0;
- u32 vid = 0;
- int res, i;
- struct cpufreq_freqs freqs;
-
- dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
-
- /* fid/vid correctness check for k8 */
- /* fid are the lower 8 bits of the index we stored into
- * the cpufreq frequency table in find_psb_table, vid
- * are the upper 8 bits.
- */
- fid = data->powernow_table[index].index & 0xFF;
- vid = (data->powernow_table[index].index & 0xFF00) >> 8;
-
- dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if ((data->currvid == vid) && (data->currfid == fid)) {
- dprintk("target matches current values (fid 0x%x, vid 0x%x)\n",
- fid, vid);
- return 0;
- }
-
- dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
- smp_processor_id(), fid, vid);
- freqs.old = find_khz_freq_from_fid(data->currfid);
- freqs.new = find_khz_freq_from_fid(fid);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- res = transition_fid_vid(data, fid, vid);
- freqs.new = find_khz_freq_from_fid(data->currfid);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- return res;
-}
-
-/* Take a frequency, and issue the hardware pstate transition command */
-static int transition_frequency_pstate(struct powernow_k8_data *data,
- unsigned int index)
-{
- u32 pstate = 0;
- int res, i;
- struct cpufreq_freqs freqs;
-
- dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
-
- /* get MSR index for hardware pstate transition */
- pstate = index & HW_PSTATE_MASK;
- if (pstate > data->max_hw_pstate)
- return 0;
- freqs.old = find_khz_freq_from_pstate(data->powernow_table,
- data->currpstate);
- freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- res = transition_pstate(data, pstate);
- freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- return res;
-}
-
-/* Driver entry point to switch to the target frequency */
-static int powernowk8_target(struct cpufreq_policy *pol,
- unsigned targfreq, unsigned relation)
-{
- cpumask_var_t oldmask;
- struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
- u32 checkfid;
- u32 checkvid;
- unsigned int newstate;
- int ret = -EIO;
-
- if (!data)
- return -EINVAL;
-
- checkfid = data->currfid;
- checkvid = data->currvid;
-
- /* only run on specific CPU from here on. */
- /* This is poor form: use a workqueue or smp_call_function_single */
- if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
- return -ENOMEM;
-
- cpumask_copy(oldmask, tsk_cpus_allowed(current));
- set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
-
- if (smp_processor_id() != pol->cpu) {
- printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
- goto err_out;
- }
-
- if (pending_bit_stuck()) {
- printk(KERN_ERR PFX "failing targ, change pending bit set\n");
- goto err_out;
- }
-
- dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
- pol->cpu, targfreq, pol->min, pol->max, relation);
-
- if (query_current_values_with_pending_wait(data))
- goto err_out;
-
- if (cpu_family != CPU_HW_PSTATE) {
- dprintk("targ: curr fid 0x%x, vid 0x%x\n",
- data->currfid, data->currvid);
-
- if ((checkvid != data->currvid) ||
- (checkfid != data->currfid)) {
- printk(KERN_INFO PFX
- "error - out of sync, fix 0x%x 0x%x, "
- "vid 0x%x 0x%x\n",
- checkfid, data->currfid,
- checkvid, data->currvid);
- }
- }
-
- if (cpufreq_frequency_table_target(pol, data->powernow_table,
- targfreq, relation, &newstate))
- goto err_out;
-
- mutex_lock(&fidvid_mutex);
-
- powernow_k8_acpi_pst_values(data, newstate);
-
- if (cpu_family == CPU_HW_PSTATE)
- ret = transition_frequency_pstate(data, newstate);
- else
- ret = transition_frequency_fidvid(data, newstate);
- if (ret) {
- printk(KERN_ERR PFX "transition frequency failed\n");
- ret = 1;
- mutex_unlock(&fidvid_mutex);
- goto err_out;
- }
- mutex_unlock(&fidvid_mutex);
-
- if (cpu_family == CPU_HW_PSTATE)
- pol->cur = find_khz_freq_from_pstate(data->powernow_table,
- newstate);
- else
- pol->cur = find_khz_freq_from_fid(data->currfid);
- ret = 0;
-
-err_out:
- set_cpus_allowed_ptr(current, oldmask);
- free_cpumask_var(oldmask);
- return ret;
-}
-
-/* Driver entry point to verify the policy and range of frequencies */
-static int powernowk8_verify(struct cpufreq_policy *pol)
-{
- struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
-
- if (!data)
- return -EINVAL;
-
- return cpufreq_frequency_table_verify(pol, data->powernow_table);
-}
-
-struct init_on_cpu {
- struct powernow_k8_data *data;
- int rc;
-};
-
-static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
-{
- struct init_on_cpu *init_on_cpu = _init_on_cpu;
-
- if (pending_bit_stuck()) {
- printk(KERN_ERR PFX "failing init, change pending bit set\n");
- init_on_cpu->rc = -ENODEV;
- return;
- }
-
- if (query_current_values_with_pending_wait(init_on_cpu->data)) {
- init_on_cpu->rc = -ENODEV;
- return;
- }
-
- if (cpu_family == CPU_OPTERON)
- fidvid_msr_init();
-
- init_on_cpu->rc = 0;
-}
-
-/* per CPU init entry point to the driver */
-static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
-{
- static const char ACPI_PSS_BIOS_BUG_MSG[] =
- KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
- FW_BUG PFX "Try again with latest BIOS.\n";
- struct powernow_k8_data *data;
- struct init_on_cpu init_on_cpu;
- int rc;
- struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
-
- if (!cpu_online(pol->cpu))
- return -ENODEV;
-
- smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
- if (rc)
- return -ENODEV;
-
- data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
- if (!data) {
- printk(KERN_ERR PFX "unable to alloc powernow_k8_data");
- return -ENOMEM;
- }
-
- data->cpu = pol->cpu;
- data->currpstate = HW_PSTATE_INVALID;
-
- if (powernow_k8_cpu_init_acpi(data)) {
- /*
- * Use the PSB BIOS structure. This is only available on
- * an UP version, and is deprecated by AMD.
- */
- if (num_online_cpus() != 1) {
- printk_once(ACPI_PSS_BIOS_BUG_MSG);
- goto err_out;
- }
- if (pol->cpu != 0) {
- printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
- "CPU other than CPU0. Complain to your BIOS "
- "vendor.\n");
- goto err_out;
- }
- rc = find_psb_table(data);
- if (rc)
- goto err_out;
-
- /* Take a crude guess here.
- * That guess was in microseconds, so multiply with 1000 */
- pol->cpuinfo.transition_latency = (
- ((data->rvo + 8) * data->vstable * VST_UNITS_20US) +
- ((1 << data->irt) * 30)) * 1000;
- } else /* ACPI _PSS objects available */
- pol->cpuinfo.transition_latency = get_transition_latency(data);
-
- /* only run on specific CPU from here on */
- init_on_cpu.data = data;
- smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu,
- &init_on_cpu, 1);
- rc = init_on_cpu.rc;
- if (rc != 0)
- goto err_out_exit_acpi;
-
- if (cpu_family == CPU_HW_PSTATE)
- cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
- else
- cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
- data->available_cores = pol->cpus;
-
- if (cpu_family == CPU_HW_PSTATE)
- pol->cur = find_khz_freq_from_pstate(data->powernow_table,
- data->currpstate);
- else
- pol->cur = find_khz_freq_from_fid(data->currfid);
- dprintk("policy current frequency %d kHz\n", pol->cur);
-
- /* min/max the cpu is capable of */
- if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
- printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
- powernow_k8_cpu_exit_acpi(data);
- kfree(data->powernow_table);
- kfree(data);
- return -EINVAL;
- }
-
- /* Check for APERF/MPERF support in hardware */
- if (cpu_has(c, X86_FEATURE_APERFMPERF))
- cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
-
- cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
-
- if (cpu_family == CPU_HW_PSTATE)
- dprintk("cpu_init done, current pstate 0x%x\n",
- data->currpstate);
- else
- dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
- data->currfid, data->currvid);
-
- per_cpu(powernow_data, pol->cpu) = data;
-
- return 0;
-
-err_out_exit_acpi:
- powernow_k8_cpu_exit_acpi(data);
-
-err_out:
- kfree(data);
- return -ENODEV;
-}
-
-static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
-{
- struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
-
- if (!data)
- return -EINVAL;
-
- powernow_k8_cpu_exit_acpi(data);
-
- cpufreq_frequency_table_put_attr(pol->cpu);
-
- kfree(data->powernow_table);
- kfree(data);
- per_cpu(powernow_data, pol->cpu) = NULL;
-
- return 0;
-}
-
-static void query_values_on_cpu(void *_err)
-{
- int *err = _err;
- struct powernow_k8_data *data = __this_cpu_read(powernow_data);
-
- *err = query_current_values_with_pending_wait(data);
-}
-
-static unsigned int powernowk8_get(unsigned int cpu)
-{
- struct powernow_k8_data *data = per_cpu(powernow_data, cpu);
- unsigned int khz = 0;
- int err;
-
- if (!data)
- return 0;
-
- smp_call_function_single(cpu, query_values_on_cpu, &err, true);
- if (err)
- goto out;
-
- if (cpu_family == CPU_HW_PSTATE)
- khz = find_khz_freq_from_pstate(data->powernow_table,
- data->currpstate);
- else
- khz = find_khz_freq_from_fid(data->currfid);
-
-
-out:
- return khz;
-}
-
-static void _cpb_toggle_msrs(bool t)
-{
- int cpu;
-
- get_online_cpus();
-
- rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
-
- for_each_cpu(cpu, cpu_online_mask) {
- struct msr *reg = per_cpu_ptr(msrs, cpu);
- if (t)
- reg->l &= ~BIT(25);
- else
- reg->l |= BIT(25);
- }
- wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
-
- put_online_cpus();
-}
-
-/*
- * Switch on/off core performance boosting.
- *
- * 0=disable
- * 1=enable.
- */
-static void cpb_toggle(bool t)
-{
- if (!cpb_capable)
- return;
-
- if (t && !cpb_enabled) {
- cpb_enabled = true;
- _cpb_toggle_msrs(t);
- printk(KERN_INFO PFX "Core Boosting enabled.\n");
- } else if (!t && cpb_enabled) {
- cpb_enabled = false;
- _cpb_toggle_msrs(t);
- printk(KERN_INFO PFX "Core Boosting disabled.\n");
- }
-}
-
-static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
- size_t count)
-{
- int ret = -EINVAL;
- unsigned long val = 0;
-
- ret = strict_strtoul(buf, 10, &val);
- if (!ret && (val == 0 || val == 1) && cpb_capable)
- cpb_toggle(val);
- else
- return -EINVAL;
-
- return count;
-}
-
-static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
-{
- return sprintf(buf, "%u\n", cpb_enabled);
-}
-
-#define define_one_rw(_name) \
-static struct freq_attr _name = \
-__ATTR(_name, 0644, show_##_name, store_##_name)
-
-define_one_rw(cpb);
-
-static struct freq_attr *powernow_k8_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- &cpb,
- NULL,
-};
-
-static struct cpufreq_driver cpufreq_amd64_driver = {
- .verify = powernowk8_verify,
- .target = powernowk8_target,
- .bios_limit = acpi_processor_get_bios_limit,
- .init = powernowk8_cpu_init,
- .exit = __devexit_p(powernowk8_cpu_exit),
- .get = powernowk8_get,
- .name = "powernow-k8",
- .owner = THIS_MODULE,
- .attr = powernow_k8_attr,
-};
-
-/*
- * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
- * cannot block the remaining ones from boosting. On the CPU_UP path we
- * simply keep the boost-disable flag in sync with the current global
- * state.
- */
-static int cpb_notify(struct notifier_block *nb, unsigned long action,
- void *hcpu)
-{
- unsigned cpu = (long)hcpu;
- u32 lo, hi;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
-
- if (!cpb_enabled) {
- rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
- lo |= BIT(25);
- wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
- }
- break;
-
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
- lo &= ~BIT(25);
- wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
- break;
-
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block cpb_nb = {
- .notifier_call = cpb_notify,
-};
-
-/* driver entry point for init */
-static int __cpuinit powernowk8_init(void)
-{
- unsigned int i, supported_cpus = 0, cpu;
- int rv;
-
- for_each_online_cpu(i) {
- int rc;
- smp_call_function_single(i, check_supported_cpu, &rc, 1);
- if (rc == 0)
- supported_cpus++;
- }
-
- if (supported_cpus != num_online_cpus())
- return -ENODEV;
-
- printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
- num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
-
- if (boot_cpu_has(X86_FEATURE_CPB)) {
-
- cpb_capable = true;
-
- msrs = msrs_alloc();
- if (!msrs) {
- printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
- return -ENOMEM;
- }
-
- register_cpu_notifier(&cpb_nb);
-
- rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
-
- for_each_cpu(cpu, cpu_online_mask) {
- struct msr *reg = per_cpu_ptr(msrs, cpu);
- cpb_enabled |= !(!!(reg->l & BIT(25)));
- }
-
- printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
- (cpb_enabled ? "on" : "off"));
- }
-
- rv = cpufreq_register_driver(&cpufreq_amd64_driver);
- if (rv < 0 && boot_cpu_has(X86_FEATURE_CPB)) {
- unregister_cpu_notifier(&cpb_nb);
- msrs_free(msrs);
- msrs = NULL;
- }
- return rv;
-}
-
-/* driver entry point for term */
-static void __exit powernowk8_exit(void)
-{
- dprintk("exit\n");
-
- if (boot_cpu_has(X86_FEATURE_CPB)) {
- msrs_free(msrs);
- msrs = NULL;
-
- unregister_cpu_notifier(&cpb_nb);
- }
-
- cpufreq_unregister_driver(&cpufreq_amd64_driver);
-}
-
-MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and "
- "Mark Langsdorf <mark.langsdorf@amd.com>");
-MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
-MODULE_LICENSE("GPL");
-
-late_initcall(powernowk8_init);
-module_exit(powernowk8_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
deleted file mode 100644
index df3529b1c02..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * (c) 2003-2006 Advanced Micro Devices, Inc.
- * Your use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- */
-
-enum pstate {
- HW_PSTATE_INVALID = 0xff,
- HW_PSTATE_0 = 0,
- HW_PSTATE_1 = 1,
- HW_PSTATE_2 = 2,
- HW_PSTATE_3 = 3,
- HW_PSTATE_4 = 4,
- HW_PSTATE_5 = 5,
- HW_PSTATE_6 = 6,
- HW_PSTATE_7 = 7,
-};
-
-struct powernow_k8_data {
- unsigned int cpu;
-
- u32 numps; /* number of p-states */
- u32 batps; /* number of p-states supported on battery */
- u32 max_hw_pstate; /* maximum legal hardware pstate */
-
- /* these values are constant when the PSB is used to determine
- * vid/fid pairings, but are modified during the ->target() call
- * when ACPI is used */
- u32 rvo; /* ramp voltage offset */
- u32 irt; /* isochronous relief time */
- u32 vidmvs; /* usable value calculated from mvs */
- u32 vstable; /* voltage stabilization time, units 20 us */
- u32 plllock; /* pll lock time, units 1 us */
- u32 exttype; /* extended interface = 1 */
-
- /* keep track of the current fid / vid or pstate */
- u32 currvid;
- u32 currfid;
- enum pstate currpstate;
-
- /* the powernow_table includes all frequency and vid/fid pairings:
- * fid are the lower 8 bits of the index, vid are the upper 8 bits.
- * frequency is in kHz */
- struct cpufreq_frequency_table *powernow_table;
-
- /* the acpi table needs to be kept. it's only available if ACPI was
- * used to determine valid frequency/vid/fid states */
- struct acpi_processor_performance acpi_data;
-
- /* we need to keep track of associated cores, but let cpufreq
- * handle hotplug events - so just point at cpufreq pol->cpus
- * structure */
- struct cpumask *available_cores;
-};
-
-/* processor's cpuid instruction support */
-#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
-#define CPUID_XFAM 0x0ff00000 /* extended family */
-#define CPUID_XFAM_K8 0
-#define CPUID_XMOD 0x000f0000 /* extended model */
-#define CPUID_XMOD_REV_MASK 0x000c0000
-#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */
-#define CPUID_USE_XFAM_XMOD 0x00000f00
-#define CPUID_GET_MAX_CAPABILITIES 0x80000000
-#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007
-#define P_STATE_TRANSITION_CAPABLE 6
-
-/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */
-/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */
-/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */
-/* the register number is placed in ecx, and the data is returned in edx:eax. */
-
-#define MSR_FIDVID_CTL 0xc0010041
-#define MSR_FIDVID_STATUS 0xc0010042
-
-/* Field definitions within the FID VID Low Control MSR : */
-#define MSR_C_LO_INIT_FID_VID 0x00010000
-#define MSR_C_LO_NEW_VID 0x00003f00
-#define MSR_C_LO_NEW_FID 0x0000003f
-#define MSR_C_LO_VID_SHIFT 8
-
-/* Field definitions within the FID VID High Control MSR : */
-#define MSR_C_HI_STP_GNT_TO 0x000fffff
-
-/* Field definitions within the FID VID Low Status MSR : */
-#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */
-#define MSR_S_LO_MAX_RAMP_VID 0x3f000000
-#define MSR_S_LO_MAX_FID 0x003f0000
-#define MSR_S_LO_START_FID 0x00003f00
-#define MSR_S_LO_CURRENT_FID 0x0000003f
-
-/* Field definitions within the FID VID High Status MSR : */
-#define MSR_S_HI_MIN_WORKING_VID 0x3f000000
-#define MSR_S_HI_MAX_WORKING_VID 0x003f0000
-#define MSR_S_HI_START_VID 0x00003f00
-#define MSR_S_HI_CURRENT_VID 0x0000003f
-#define MSR_C_HI_STP_GNT_BENIGN 0x00000001
-
-
-/* Hardware Pstate _PSS and MSR definitions */
-#define USE_HW_PSTATE 0x00000080
-#define HW_PSTATE_MASK 0x00000007
-#define HW_PSTATE_VALID_MASK 0x80000000
-#define HW_PSTATE_MAX_MASK 0x000000f0
-#define HW_PSTATE_MAX_SHIFT 4
-#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */
-#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */
-#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */
-#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */
-
-/* define the two driver architectures */
-#define CPU_OPTERON 0
-#define CPU_HW_PSTATE 1
-
-
-/*
- * There are restrictions frequencies have to follow:
- * - only 1 entry in the low fid table ( <=1.4GHz )
- * - lowest entry in the high fid table must be >= 2 * the entry in the
- * low fid table
- * - lowest entry in the high fid table must be a <= 200MHz + 2 * the entry
- * in the low fid table
- * - the parts can only step at <= 200 MHz intervals, odd fid values are
- * supported in revision G and later revisions.
- * - lowest frequency must be >= interprocessor hypertransport link speed
- * (only applies to MP systems obviously)
- */
-
-/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */
-#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */
-#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */
-
-#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */
-#define HI_VCOFREQ_TABLE_BOTTOM 1600
-
-#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */
-
-#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */
-#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */
-
-#define MIN_FREQ 800 /* Min and max freqs, per spec */
-#define MAX_FREQ 5000
-
-#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */
-#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */
-
-#define VID_OFF 0x3f
-
-#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */
-
-#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */
-
-#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */
-#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */
-
-/*
- * Most values of interest are encoded in a single field of the _PSS
- * entries: the "control" value.
- */
-
-#define IRT_SHIFT 30
-#define RVO_SHIFT 28
-#define EXT_TYPE_SHIFT 27
-#define PLL_L_SHIFT 20
-#define MVS_SHIFT 18
-#define VST_SHIFT 11
-#define VID_SHIFT 6
-#define IRT_MASK 3
-#define RVO_MASK 3
-#define EXT_TYPE_MASK 1
-#define PLL_L_MASK 0x7f
-#define MVS_MASK 3
-#define VST_MASK 0x7f
-#define VID_MASK 0x1f
-#define FID_MASK 0x1f
-#define EXT_VID_MASK 0x3f
-#define EXT_FID_MASK 0x3f
-
-
-/*
- * Version 1.4 of the PSB table. This table is constructed by BIOS and is
- * to tell the OS's power management driver which VIDs and FIDs are
- * supported by this particular processor.
- * If the data in the PSB / PST is wrong, then this driver will program the
- * wrong values into hardware, which is very likely to lead to a crash.
- */
-
-#define PSB_ID_STRING "AMDK7PNOW!"
-#define PSB_ID_STRING_LEN 10
-
-#define PSB_VERSION_1_4 0x14
-
-struct psb_s {
- u8 signature[10];
- u8 tableversion;
- u8 flags1;
- u16 vstable;
- u8 flags2;
- u8 num_tables;
- u32 cpuid;
- u8 plllocktime;
- u8 maxfid;
- u8 maxvid;
- u8 numps;
-};
-
-/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
-struct pst_s {
- u8 fid;
- u8 vid;
-};
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
-
-static int core_voltage_pre_transition(struct powernow_k8_data *data,
- u32 reqvid, u32 regfid);
-static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
-static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
-
-static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
-
-static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
-static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
deleted file mode 100644
index 435a996a613..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * sc520_freq.c: cpufreq driver for the AMD Elan sc520
- *
- * Copyright (C) 2005 Sean Young <sean@mess.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Based on elanfreq.c
- *
- * 2005-03-30: - initial revision
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-
-#include <linux/delay.h>
-#include <linux/cpufreq.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/msr.h>
-
-#define MMCR_BASE 0xfffef000 /* The default base address */
-#define OFFS_CPUCTL 0x2 /* CPU Control Register */
-
-static __u8 __iomem *cpuctl;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "sc520_freq", msg)
-#define PFX "sc520_freq: "
-
-static struct cpufreq_frequency_table sc520_freq_table[] = {
- {0x01, 100000},
- {0x02, 133000},
- {0, CPUFREQ_TABLE_END},
-};
-
-static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
-{
- u8 clockspeed_reg = *cpuctl;
-
- switch (clockspeed_reg & 0x03) {
- default:
- printk(KERN_ERR PFX "error: cpuctl register has unexpected "
- "value %02x\n", clockspeed_reg);
- case 0x01:
- return 100000;
- case 0x02:
- return 133000;
- }
-}
-
-static void sc520_freq_set_cpu_state(unsigned int state)
-{
-
- struct cpufreq_freqs freqs;
- u8 clockspeed_reg;
-
- freqs.old = sc520_freq_get_cpu_frequency(0);
- freqs.new = sc520_freq_table[state].frequency;
- freqs.cpu = 0; /* AMD Elan is UP */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- dprintk("attempting to set frequency to %i kHz\n",
- sc520_freq_table[state].frequency);
-
- local_irq_disable();
-
- clockspeed_reg = *cpuctl & ~0x03;
- *cpuctl = clockspeed_reg | sc520_freq_table[state].index;
-
- local_irq_enable();
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-};
-
-static int sc520_freq_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
-}
-
-static int sc520_freq_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
-
- if (cpufreq_frequency_table_target(policy, sc520_freq_table,
- target_freq, relation, &newstate))
- return -EINVAL;
-
- sc520_freq_set_cpu_state(newstate);
-
- return 0;
-}
-
-
-/*
- * Module init and exit code
- */
-
-static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- int result;
-
- /* capability check */
- if (c->x86_vendor != X86_VENDOR_AMD ||
- c->x86 != 4 || c->x86_model != 9)
- return -ENODEV;
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = 1000000; /* 1ms */
- policy->cur = sc520_freq_get_cpu_frequency(0);
-
- result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
-
- return 0;
-}
-
-
-static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-
-static struct freq_attr *sc520_freq_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-
-static struct cpufreq_driver sc520_freq_driver = {
- .get = sc520_freq_get_cpu_frequency,
- .verify = sc520_freq_verify,
- .target = sc520_freq_target,
- .init = sc520_freq_cpu_init,
- .exit = sc520_freq_cpu_exit,
- .name = "sc520_freq",
- .owner = THIS_MODULE,
- .attr = sc520_freq_attr,
-};
-
-
-static int __init sc520_freq_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- int err;
-
- /* Test if we have the right hardware */
- if (c->x86_vendor != X86_VENDOR_AMD ||
- c->x86 != 4 || c->x86_model != 9) {
- dprintk("no Elan SC520 processor found!\n");
- return -ENODEV;
- }
- cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
- if (!cpuctl) {
- printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
- return -ENOMEM;
- }
-
- err = cpufreq_register_driver(&sc520_freq_driver);
- if (err)
- iounmap(cpuctl);
-
- return err;
-}
-
-
-static void __exit sc520_freq_exit(void)
-{
- cpufreq_unregister_driver(&sc520_freq_driver);
- iounmap(cpuctl);
-}
-
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Sean Young <sean@mess.org>");
-MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU");
-
-module_init(sc520_freq_init);
-module_exit(sc520_freq_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
deleted file mode 100644
index 9b1ff37de46..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ /dev/null
@@ -1,636 +0,0 @@
-/*
- * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
- * M (part of the Centrino chipset).
- *
- * Since the original Pentium M, most new Intel CPUs support Enhanced
- * SpeedStep.
- *
- * Despite the "SpeedStep" in the name, this is almost entirely unlike
- * traditional SpeedStep.
- *
- * Modelled on speedstep.c
- *
- * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/sched.h> /* current */
-#include <linux/delay.h>
-#include <linux/compiler.h>
-#include <linux/gfp.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-
-#define PFX "speedstep-centrino: "
-#define MAINTAINER "cpufreq@vger.kernel.org"
-
-#define dprintk(msg...) \
- cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
-
-#define INTEL_MSR_RANGE (0xffff)
-
-struct cpu_id
-{
- __u8 x86; /* CPU family */
- __u8 x86_model; /* model */
- __u8 x86_mask; /* stepping */
-};
-
-enum {
- CPU_BANIAS,
- CPU_DOTHAN_A1,
- CPU_DOTHAN_A2,
- CPU_DOTHAN_B0,
- CPU_MP4HT_D0,
- CPU_MP4HT_E0,
-};
-
-static const struct cpu_id cpu_ids[] = {
- [CPU_BANIAS] = { 6, 9, 5 },
- [CPU_DOTHAN_A1] = { 6, 13, 1 },
- [CPU_DOTHAN_A2] = { 6, 13, 2 },
- [CPU_DOTHAN_B0] = { 6, 13, 6 },
- [CPU_MP4HT_D0] = {15, 3, 4 },
- [CPU_MP4HT_E0] = {15, 4, 1 },
-};
-#define N_IDS ARRAY_SIZE(cpu_ids)
-
-struct cpu_model
-{
- const struct cpu_id *cpu_id;
- const char *model_name;
- unsigned max_freq; /* max clock in kHz */
-
- struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
-};
-static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
- const struct cpu_id *x);
-
-/* Operating points for current CPU */
-static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
-static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
-
-static struct cpufreq_driver centrino_driver;
-
-#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
-
-/* Computes the correct form for IA32_PERF_CTL MSR for a particular
- frequency/voltage operating point; frequency in MHz, volts in mV.
- This is stored as "index" in the structure. */
-#define OP(mhz, mv) \
- { \
- .frequency = (mhz) * 1000, \
- .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \
- }
-
-/*
- * These voltage tables were derived from the Intel Pentium M
- * datasheet, document 25261202.pdf, Table 5. I have verified they
- * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
- * M.
- */
-
-/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */
-static struct cpufreq_frequency_table banias_900[] =
-{
- OP(600, 844),
- OP(800, 988),
- OP(900, 1004),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */
-static struct cpufreq_frequency_table banias_1000[] =
-{
- OP(600, 844),
- OP(800, 972),
- OP(900, 988),
- OP(1000, 1004),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */
-static struct cpufreq_frequency_table banias_1100[] =
-{
- OP( 600, 956),
- OP( 800, 1020),
- OP( 900, 1100),
- OP(1000, 1164),
- OP(1100, 1180),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-
-/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */
-static struct cpufreq_frequency_table banias_1200[] =
-{
- OP( 600, 956),
- OP( 800, 1004),
- OP( 900, 1020),
- OP(1000, 1100),
- OP(1100, 1164),
- OP(1200, 1180),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.30GHz (Banias) */
-static struct cpufreq_frequency_table banias_1300[] =
-{
- OP( 600, 956),
- OP( 800, 1260),
- OP(1000, 1292),
- OP(1200, 1356),
- OP(1300, 1388),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.40GHz (Banias) */
-static struct cpufreq_frequency_table banias_1400[] =
-{
- OP( 600, 956),
- OP( 800, 1180),
- OP(1000, 1308),
- OP(1200, 1436),
- OP(1400, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.50GHz (Banias) */
-static struct cpufreq_frequency_table banias_1500[] =
-{
- OP( 600, 956),
- OP( 800, 1116),
- OP(1000, 1228),
- OP(1200, 1356),
- OP(1400, 1452),
- OP(1500, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.60GHz (Banias) */
-static struct cpufreq_frequency_table banias_1600[] =
-{
- OP( 600, 956),
- OP( 800, 1036),
- OP(1000, 1164),
- OP(1200, 1276),
- OP(1400, 1420),
- OP(1600, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.70GHz (Banias) */
-static struct cpufreq_frequency_table banias_1700[] =
-{
- OP( 600, 956),
- OP( 800, 1004),
- OP(1000, 1116),
- OP(1200, 1228),
- OP(1400, 1308),
- OP(1700, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-#undef OP
-
-#define _BANIAS(cpuid, max, name) \
-{ .cpu_id = cpuid, \
- .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \
- .max_freq = (max)*1000, \
- .op_points = banias_##max, \
-}
-#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max)
-
-/* CPU models, their operating frequency range, and freq/voltage
- operating points */
-static struct cpu_model models[] =
-{
- _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"),
- BANIAS(1000),
- BANIAS(1100),
- BANIAS(1200),
- BANIAS(1300),
- BANIAS(1400),
- BANIAS(1500),
- BANIAS(1600),
- BANIAS(1700),
-
- /* NULL model_name is a wildcard */
- { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL },
- { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL },
- { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL },
- { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL },
- { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL },
-
- { NULL, }
-};
-#undef _BANIAS
-#undef BANIAS
-
-static int centrino_cpu_init_table(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
- struct cpu_model *model;
-
- for(model = models; model->cpu_id != NULL; model++)
- if (centrino_verify_cpu_id(cpu, model->cpu_id) &&
- (model->model_name == NULL ||
- strcmp(cpu->x86_model_id, model->model_name) == 0))
- break;
-
- if (model->cpu_id == NULL) {
- /* No match at all */
- dprintk("no support for CPU model \"%s\": "
- "send /proc/cpuinfo to " MAINTAINER "\n",
- cpu->x86_model_id);
- return -ENOENT;
- }
-
- if (model->op_points == NULL) {
- /* Matched a non-match */
- dprintk("no table support for CPU model \"%s\"\n",
- cpu->x86_model_id);
- dprintk("try using the acpi-cpufreq driver\n");
- return -ENOENT;
- }
-
- per_cpu(centrino_model, policy->cpu) = model;
-
- dprintk("found \"%s\": max frequency: %dkHz\n",
- model->model_name, model->max_freq);
-
- return 0;
-}
-
-#else
-static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
-{
- return -ENODEV;
-}
-#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
-
-static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
- const struct cpu_id *x)
-{
- if ((c->x86 == x->x86) &&
- (c->x86_model == x->x86_model) &&
- (c->x86_mask == x->x86_mask))
- return 1;
- return 0;
-}
-
-/* To be called only after centrino_model is initialized */
-static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
-{
- int i;
-
- /*
- * Extract clock in kHz from PERF_CTL value
- * for centrino, as some DSDTs are buggy.
- * Ideally, this can be done using the acpi_data structure.
- */
- if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
- (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
- (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
- msr = (msr >> 8) & 0xff;
- return msr * 100000;
- }
-
- if ((!per_cpu(centrino_model, cpu)) ||
- (!per_cpu(centrino_model, cpu)->op_points))
- return 0;
-
- msr &= 0xffff;
- for (i = 0;
- per_cpu(centrino_model, cpu)->op_points[i].frequency
- != CPUFREQ_TABLE_END;
- i++) {
- if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
- return per_cpu(centrino_model, cpu)->
- op_points[i].frequency;
- }
- if (failsafe)
- return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
- else
- return 0;
-}
-
-/* Return the current CPU frequency in kHz */
-static unsigned int get_cur_freq(unsigned int cpu)
-{
- unsigned l, h;
- unsigned clock_freq;
-
- rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
- clock_freq = extract_clock(l, cpu, 0);
-
- if (unlikely(clock_freq == 0)) {
- /*
- * On some CPUs, we can see transient MSR values (which are
- * not present in _PSS), while CPU is doing some automatic
- * P-state transition (like TM2). Get the last freq set
- * in PERF_CTL.
- */
- rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);
- clock_freq = extract_clock(l, cpu, 1);
- }
- return clock_freq;
-}
-
-
-static int centrino_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
- unsigned freq;
- unsigned l, h;
- int ret;
- int i;
-
- /* Only Intel makes Enhanced Speedstep-capable CPUs */
- if (cpu->x86_vendor != X86_VENDOR_INTEL ||
- !cpu_has(cpu, X86_FEATURE_EST))
- return -ENODEV;
-
- if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
- centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- for (i = 0; i < N_IDS; i++)
- if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
- break;
-
- if (i != N_IDS)
- per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
-
- if (!per_cpu(centrino_cpu, policy->cpu)) {
- dprintk("found unsupported CPU with "
- "Enhanced SpeedStep: send /proc/cpuinfo to "
- MAINTAINER "\n");
- return -ENODEV;
- }
-
- if (centrino_cpu_init_table(policy)) {
- return -ENODEV;
- }
-
- /* Check to see if Enhanced SpeedStep is enabled, and try to
- enable it if not. */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-
- if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
- dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
- wrmsr(MSR_IA32_MISC_ENABLE, l, h);
-
- /* check to see if it stuck */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- printk(KERN_INFO PFX
- "couldn't enable Enhanced SpeedStep\n");
- return -ENODEV;
- }
- }
-
- freq = get_cur_freq(policy->cpu);
- policy->cpuinfo.transition_latency = 10000;
- /* 10uS transition latency */
- policy->cur = freq;
-
- dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
-
- ret = cpufreq_frequency_table_cpuinfo(policy,
- per_cpu(centrino_model, policy->cpu)->op_points);
- if (ret)
- return (ret);
-
- cpufreq_frequency_table_get_attr(
- per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
-
- return 0;
-}
-
-static int centrino_cpu_exit(struct cpufreq_policy *policy)
-{
- unsigned int cpu = policy->cpu;
-
- if (!per_cpu(centrino_model, cpu))
- return -ENODEV;
-
- cpufreq_frequency_table_put_attr(cpu);
-
- per_cpu(centrino_model, cpu) = NULL;
-
- return 0;
-}
-
-/**
- * centrino_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limit must be within this model's frequency range at least one
- * border included.
- */
-static int centrino_verify (struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy,
- per_cpu(centrino_model, policy->cpu)->op_points);
-}
-
-/**
- * centrino_setpolicy - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int centrino_target (struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
- unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
- struct cpufreq_freqs freqs;
- int retval = 0;
- unsigned int j, k, first_cpu, tmp;
- cpumask_var_t covered_cpus;
-
- if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
- return -ENOMEM;
-
- if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
- retval = -ENODEV;
- goto out;
- }
-
- if (unlikely(cpufreq_frequency_table_target(policy,
- per_cpu(centrino_model, cpu)->op_points,
- target_freq,
- relation,
- &newstate))) {
- retval = -EINVAL;
- goto out;
- }
-
- first_cpu = 1;
- for_each_cpu(j, policy->cpus) {
- int good_cpu;
-
- /* cpufreq holds the hotplug lock, so we are safe here */
- if (!cpu_online(j))
- continue;
-
- /*
- * Support for SMP systems.
- * Make sure we are running on CPU that wants to change freq
- */
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
- good_cpu = cpumask_any_and(policy->cpus,
- cpu_online_mask);
- else
- good_cpu = j;
-
- if (good_cpu >= nr_cpu_ids) {
- dprintk("couldn't limit to CPUs in this domain\n");
- retval = -EAGAIN;
- if (first_cpu) {
- /* We haven't started the transition yet. */
- goto out;
- }
- break;
- }
-
- msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
-
- if (first_cpu) {
- rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
- if (msr == (oldmsr & 0xffff)) {
- dprintk("no change needed - msr was and needs "
- "to be %x\n", oldmsr);
- retval = 0;
- goto out;
- }
-
- freqs.old = extract_clock(oldmsr, cpu, 0);
- freqs.new = extract_clock(msr, cpu, 0);
-
- dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
- target_freq, freqs.old, freqs.new, msr);
-
- for_each_cpu(k, policy->cpus) {
- if (!cpu_online(k))
- continue;
- freqs.cpu = k;
- cpufreq_notify_transition(&freqs,
- CPUFREQ_PRECHANGE);
- }
-
- first_cpu = 0;
- /* all but 16 LSB are reserved, treat them with care */
- oldmsr &= ~0xffff;
- msr &= 0xffff;
- oldmsr |= msr;
- }
-
- wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
- break;
-
- cpumask_set_cpu(j, covered_cpus);
- }
-
- for_each_cpu(k, policy->cpus) {
- if (!cpu_online(k))
- continue;
- freqs.cpu = k;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
-
- if (unlikely(retval)) {
- /*
- * We have failed halfway through the frequency change.
- * We have sent callbacks to policy->cpus and
- * MSRs have already been written on coverd_cpus.
- * Best effort undo..
- */
-
- for_each_cpu(j, covered_cpus)
- wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
-
- tmp = freqs.new;
- freqs.new = freqs.old;
- freqs.old = tmp;
- for_each_cpu(j, policy->cpus) {
- if (!cpu_online(j))
- continue;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- }
- retval = 0;
-
-out:
- free_cpumask_var(covered_cpus);
- return retval;
-}
-
-static struct freq_attr* centrino_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver centrino_driver = {
- .name = "centrino", /* should be speedstep-centrino,
- but there's a 16 char limit */
- .init = centrino_cpu_init,
- .exit = centrino_cpu_exit,
- .verify = centrino_verify,
- .target = centrino_target,
- .get = get_cur_freq,
- .attr = centrino_attr,
- .owner = THIS_MODULE,
-};
-
-
-/**
- * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
- *
- * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
- * unsupported devices, -ENOENT if there's no voltage table for this
- * particular CPU model, -EINVAL on problems during initiatization,
- * and zero on success.
- *
- * This is quite picky. Not only does the CPU have to advertise the
- * "est" flag in the cpuid capability flags, we look for a specific
- * CPU model and stepping, and we need to have the exact model name in
- * our voltage tables. That is, be paranoid about not releasing
- * someone's valuable magic smoke.
- */
-static int __init centrino_init(void)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(0);
-
- if (!cpu_has(cpu, X86_FEATURE_EST))
- return -ENODEV;
-
- return cpufreq_register_driver(&centrino_driver);
-}
-
-static void __exit centrino_exit(void)
-{
- cpufreq_unregister_driver(&centrino_driver);
-}
-
-MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
-MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
-MODULE_LICENSE ("GPL");
-
-late_initcall(centrino_init);
-module_exit(centrino_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
deleted file mode 100644
index 561758e9518..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ /dev/null
@@ -1,452 +0,0 @@
-/*
- * (C) 2001 Dave Jones, Arjan van de ven.
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon reverse engineered information, and on Intel documentation
- * for chipsets ICH2-M and ICH3-M.
- *
- * Many thanks to Ducrot Bruno for finding and fixing the last
- * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
- * for extensive testing.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-
-/*********************************************************************
- * SPEEDSTEP - DEFINITIONS *
- *********************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/sched.h>
-
-#include "speedstep-lib.h"
-
-
-/* speedstep_chipset:
- * It is necessary to know which chipset is used. As accesses to
- * this device occur at various places in this module, we need a
- * static struct pci_dev * pointing to that device.
- */
-static struct pci_dev *speedstep_chipset_dev;
-
-
-/* speedstep_processor
- */
-static enum speedstep_processor speedstep_processor;
-
-static u32 pmbase;
-
-/*
- * There are only two frequency states for each processor. Values
- * are in kHz for the time being.
- */
-static struct cpufreq_frequency_table speedstep_freqs[] = {
- {SPEEDSTEP_HIGH, 0},
- {SPEEDSTEP_LOW, 0},
- {0, CPUFREQ_TABLE_END},
-};
-
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "speedstep-ich", msg)
-
-
-/**
- * speedstep_find_register - read the PMBASE address
- *
- * Returns: -ENODEV if no register could be found
- */
-static int speedstep_find_register(void)
-{
- if (!speedstep_chipset_dev)
- return -ENODEV;
-
- /* get PMBASE */
- pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
- if (!(pmbase & 0x01)) {
- printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
- return -ENODEV;
- }
-
- pmbase &= 0xFFFFFFFE;
- if (!pmbase) {
- printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
- return -ENODEV;
- }
-
- dprintk("pmbase is 0x%x\n", pmbase);
- return 0;
-}
-
-/**
- * speedstep_set_state - set the SpeedStep state
- * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
- *
- * Tries to change the SpeedStep state. Can be called from
- * smp_call_function_single.
- */
-static void speedstep_set_state(unsigned int state)
-{
- u8 pm2_blk;
- u8 value;
- unsigned long flags;
-
- if (state > 0x1)
- return;
-
- /* Disable IRQs */
- local_irq_save(flags);
-
- /* read state */
- value = inb(pmbase + 0x50);
-
- dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
-
- /* write new state */
- value &= 0xFE;
- value |= state;
-
- dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);
-
- /* Disable bus master arbitration */
- pm2_blk = inb(pmbase + 0x20);
- pm2_blk |= 0x01;
- outb(pm2_blk, (pmbase + 0x20));
-
- /* Actual transition */
- outb(value, (pmbase + 0x50));
-
- /* Restore bus master arbitration */
- pm2_blk &= 0xfe;
- outb(pm2_blk, (pmbase + 0x20));
-
- /* check if transition was successful */
- value = inb(pmbase + 0x50);
-
- /* Enable IRQs */
- local_irq_restore(flags);
-
- dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
-
- if (state == (value & 0x1))
- dprintk("change to %u MHz succeeded\n",
- speedstep_get_frequency(speedstep_processor) / 1000);
- else
- printk(KERN_ERR "cpufreq: change failed - I/O error\n");
-
- return;
-}
-
-/* Wrapper for smp_call_function_single. */
-static void _speedstep_set_state(void *_state)
-{
- speedstep_set_state(*(unsigned int *)_state);
-}
-
-/**
- * speedstep_activate - activate SpeedStep control in the chipset
- *
- * Tries to activate the SpeedStep status and control registers.
- * Returns -EINVAL on an unsupported chipset, and zero on success.
- */
-static int speedstep_activate(void)
-{
- u16 value = 0;
-
- if (!speedstep_chipset_dev)
- return -EINVAL;
-
- pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
- if (!(value & 0x08)) {
- value |= 0x08;
- dprintk("activating SpeedStep (TM) registers\n");
- pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
- }
-
- return 0;
-}
-
-
-/**
- * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
- *
- * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
- * the LPC bridge / PM module which contains all power-management
- * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
- * chipset, or zero on failure.
- */
-static unsigned int speedstep_detect_chipset(void)
-{
- speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82801DB_12,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
- if (speedstep_chipset_dev)
- return 4; /* 4-M */
-
- speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82801CA_12,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
- if (speedstep_chipset_dev)
- return 3; /* 3-M */
-
-
- speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82801BA_10,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
- if (speedstep_chipset_dev) {
- /* speedstep.c causes lockups on Dell Inspirons 8000 and
- * 8100 which use a pretty old revision of the 82815
- * host brige. Abort on these systems.
- */
- static struct pci_dev *hostbridge;
-
- hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82815_MC,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
-
- if (!hostbridge)
- return 2; /* 2-M */
-
- if (hostbridge->revision < 5) {
- dprintk("hostbridge does not support speedstep\n");
- speedstep_chipset_dev = NULL;
- pci_dev_put(hostbridge);
- return 0;
- }
-
- pci_dev_put(hostbridge);
- return 2; /* 2-M */
- }
-
- return 0;
-}
-
-static void get_freq_data(void *_speed)
-{
- unsigned int *speed = _speed;
-
- *speed = speedstep_get_frequency(speedstep_processor);
-}
-
-static unsigned int speedstep_get(unsigned int cpu)
-{
- unsigned int speed;
-
- /* You're supposed to ensure CPU is online. */
- if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
- BUG();
-
- dprintk("detected %u kHz as current frequency\n", speed);
- return speed;
-}
-
-/**
- * speedstep_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int speedstep_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0, policy_cpu;
- struct cpufreq_freqs freqs;
- int i;
-
- if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
- freqs.old = speedstep_get(policy_cpu);
- freqs.new = speedstep_freqs[newstate].frequency;
- freqs.cpu = policy->cpu;
-
- dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new);
-
- /* no transition necessary */
- if (freqs.old == freqs.new)
- return 0;
-
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
- true);
-
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
-
- return 0;
-}
-
-
-/**
- * speedstep_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limit must be within speedstep_low_freq and speedstep_high_freq, with
- * at least one border included.
- */
-static int speedstep_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
-}
-
-struct get_freqs {
- struct cpufreq_policy *policy;
- int ret;
-};
-
-static void get_freqs_on_cpu(void *_get_freqs)
-{
- struct get_freqs *get_freqs = _get_freqs;
-
- get_freqs->ret =
- speedstep_get_freqs(speedstep_processor,
- &speedstep_freqs[SPEEDSTEP_LOW].frequency,
- &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
- &get_freqs->policy->cpuinfo.transition_latency,
- &speedstep_set_state);
-}
-
-static int speedstep_cpu_init(struct cpufreq_policy *policy)
-{
- int result;
- unsigned int policy_cpu, speed;
- struct get_freqs gf;
-
- /* only run on CPU to be set, or on its sibling */
-#ifdef CONFIG_SMP
- cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
-#endif
- policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
-
- /* detect low and high frequency and transition latency */
- gf.policy = policy;
- smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
- if (gf.ret)
- return gf.ret;
-
- /* get current speed setting */
- speed = speedstep_get(policy_cpu);
- if (!speed)
- return -EIO;
-
- dprintk("currently at %s speed setting - %i MHz\n",
- (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
- ? "low" : "high",
- (speed / 1000));
-
- /* cpuinfo and default policy values */
- policy->cur = speed;
-
- result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
-
- return 0;
-}
-
-
-static int speedstep_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static struct freq_attr *speedstep_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-
-static struct cpufreq_driver speedstep_driver = {
- .name = "speedstep-ich",
- .verify = speedstep_verify,
- .target = speedstep_target,
- .init = speedstep_cpu_init,
- .exit = speedstep_cpu_exit,
- .get = speedstep_get,
- .owner = THIS_MODULE,
- .attr = speedstep_attr,
-};
-
-
-/**
- * speedstep_init - initializes the SpeedStep CPUFreq driver
- *
- * Initializes the SpeedStep support. Returns -ENODEV on unsupported
- * devices, -EINVAL on problems during initiatization, and zero on
- * success.
- */
-static int __init speedstep_init(void)
-{
- /* detect processor */
- speedstep_processor = speedstep_detect_processor();
- if (!speedstep_processor) {
- dprintk("Intel(R) SpeedStep(TM) capable processor "
- "not found\n");
- return -ENODEV;
- }
-
- /* detect chipset */
- if (!speedstep_detect_chipset()) {
- dprintk("Intel(R) SpeedStep(TM) for this chipset not "
- "(yet) available.\n");
- return -ENODEV;
- }
-
- /* activate speedstep support */
- if (speedstep_activate()) {
- pci_dev_put(speedstep_chipset_dev);
- return -EINVAL;
- }
-
- if (speedstep_find_register())
- return -ENODEV;
-
- return cpufreq_register_driver(&speedstep_driver);
-}
-
-
-/**
- * speedstep_exit - unregisters SpeedStep support
- *
- * Unregisters SpeedStep support.
- */
-static void __exit speedstep_exit(void)
-{
- pci_dev_put(speedstep_chipset_dev);
- cpufreq_unregister_driver(&speedstep_driver);
-}
-
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
- "Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
- "with ICH-M southbridges.");
-MODULE_LICENSE("GPL");
-
-module_init(speedstep_init);
-module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
deleted file mode 100644
index a94ec6be69f..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * Library for common functions for Intel SpeedStep v.1 and v.2 support
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-
-#include <asm/msr.h>
-#include <asm/tsc.h>
-#include "speedstep-lib.h"
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "speedstep-lib", msg)
-
-#define PFX "speedstep-lib: "
-
-#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
-static int relaxed_check;
-#else
-#define relaxed_check 0
-#endif
-
-/*********************************************************************
- * GET PROCESSOR CORE SPEED IN KHZ *
- *********************************************************************/
-
-static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
-{
- /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
- struct {
- unsigned int ratio; /* Frequency Multiplier (x10) */
- u8 bitmap; /* power on configuration bits
- [27, 25:22] (in MSR 0x2a) */
- } msr_decode_mult[] = {
- { 30, 0x01 },
- { 35, 0x05 },
- { 40, 0x02 },
- { 45, 0x06 },
- { 50, 0x00 },
- { 55, 0x04 },
- { 60, 0x0b },
- { 65, 0x0f },
- { 70, 0x09 },
- { 75, 0x0d },
- { 80, 0x0a },
- { 85, 0x26 },
- { 90, 0x20 },
- { 100, 0x2b },
- { 0, 0xff } /* error or unknown value */
- };
-
- /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
- struct {
- unsigned int value; /* Front Side Bus speed in MHz */
- u8 bitmap; /* power on configuration bits [18: 19]
- (in MSR 0x2a) */
- } msr_decode_fsb[] = {
- { 66, 0x0 },
- { 100, 0x2 },
- { 133, 0x1 },
- { 0, 0xff}
- };
-
- u32 msr_lo, msr_tmp;
- int i = 0, j = 0;
-
- /* read MSR 0x2a - we only need the low 32 bits */
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
- dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
- msr_tmp = msr_lo;
-
- /* decode the FSB */
- msr_tmp &= 0x00c0000;
- msr_tmp >>= 18;
- while (msr_tmp != msr_decode_fsb[i].bitmap) {
- if (msr_decode_fsb[i].bitmap == 0xff)
- return 0;
- i++;
- }
-
- /* decode the multiplier */
- if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) {
- dprintk("workaround for early PIIIs\n");
- msr_lo &= 0x03c00000;
- } else
- msr_lo &= 0x0bc00000;
- msr_lo >>= 22;
- while (msr_lo != msr_decode_mult[j].bitmap) {
- if (msr_decode_mult[j].bitmap == 0xff)
- return 0;
- j++;
- }
-
- dprintk("speed is %u\n",
- (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
-
- return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100;
-}
-
-
-static unsigned int pentiumM_get_frequency(void)
-{
- u32 msr_lo, msr_tmp;
-
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
- dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
-
- /* see table B-2 of 24547212.pdf */
- if (msr_lo & 0x00040000) {
- printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n",
- msr_lo, msr_tmp);
- return 0;
- }
-
- msr_tmp = (msr_lo >> 22) & 0x1f;
- dprintk("bits 22-26 are 0x%x, speed is %u\n",
- msr_tmp, (msr_tmp * 100 * 1000));
-
- return msr_tmp * 100 * 1000;
-}
-
-static unsigned int pentium_core_get_frequency(void)
-{
- u32 fsb = 0;
- u32 msr_lo, msr_tmp;
- int ret;
-
- rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
- /* see table B-2 of 25366920.pdf */
- switch (msr_lo & 0x07) {
- case 5:
- fsb = 100000;
- break;
- case 1:
- fsb = 133333;
- break;
- case 3:
- fsb = 166667;
- break;
- case 2:
- fsb = 200000;
- break;
- case 0:
- fsb = 266667;
- break;
- case 4:
- fsb = 333333;
- break;
- default:
- printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value");
- }
-
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
- dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n",
- msr_lo, msr_tmp);
-
- msr_tmp = (msr_lo >> 22) & 0x1f;
- dprintk("bits 22-26 are 0x%x, speed is %u\n",
- msr_tmp, (msr_tmp * fsb));
-
- ret = (msr_tmp * fsb);
- return ret;
-}
-
-
-static unsigned int pentium4_get_frequency(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
- u32 msr_lo, msr_hi, mult;
- unsigned int fsb = 0;
- unsigned int ret;
- u8 fsb_code;
-
- /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency
- * to System Bus Frequency Ratio Field in the Processor Frequency
- * Configuration Register of the MSR. Therefore the current
- * frequency cannot be calculated and has to be measured.
- */
- if (c->x86_model < 2)
- return cpu_khz;
-
- rdmsr(0x2c, msr_lo, msr_hi);
-
- dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi);
-
- /* decode the FSB: see IA-32 Intel (C) Architecture Software
- * Developer's Manual, Volume 3: System Prgramming Guide,
- * revision #12 in Table B-1: MSRs in the Pentium 4 and
- * Intel Xeon Processors, on page B-4 and B-5.
- */
- fsb_code = (msr_lo >> 16) & 0x7;
- switch (fsb_code) {
- case 0:
- fsb = 100 * 1000;
- break;
- case 1:
- fsb = 13333 * 10;
- break;
- case 2:
- fsb = 200 * 1000;
- break;
- }
-
- if (!fsb)
- printk(KERN_DEBUG PFX "couldn't detect FSB speed. "
- "Please send an e-mail to <linux@brodo.de>\n");
-
- /* Multiplier. */
- mult = msr_lo >> 24;
-
- dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n",
- fsb, mult, (fsb * mult));
-
- ret = (fsb * mult);
- return ret;
-}
-
-
-/* Warning: may get called from smp_call_function_single. */
-unsigned int speedstep_get_frequency(enum speedstep_processor processor)
-{
- switch (processor) {
- case SPEEDSTEP_CPU_PCORE:
- return pentium_core_get_frequency();
- case SPEEDSTEP_CPU_PM:
- return pentiumM_get_frequency();
- case SPEEDSTEP_CPU_P4D:
- case SPEEDSTEP_CPU_P4M:
- return pentium4_get_frequency();
- case SPEEDSTEP_CPU_PIII_T:
- case SPEEDSTEP_CPU_PIII_C:
- case SPEEDSTEP_CPU_PIII_C_EARLY:
- return pentium3_get_frequency(processor);
- default:
- return 0;
- };
- return 0;
-}
-EXPORT_SYMBOL_GPL(speedstep_get_frequency);
-
-
-/*********************************************************************
- * DETECT SPEEDSTEP-CAPABLE PROCESSOR *
- *********************************************************************/
-
-unsigned int speedstep_detect_processor(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- u32 ebx, msr_lo, msr_hi;
-
- dprintk("x86: %x, model: %x\n", c->x86, c->x86_model);
-
- if ((c->x86_vendor != X86_VENDOR_INTEL) ||
- ((c->x86 != 6) && (c->x86 != 0xF)))
- return 0;
-
- if (c->x86 == 0xF) {
- /* Intel Mobile Pentium 4-M
- * or Intel Mobile Pentium 4 with 533 MHz FSB */
- if (c->x86_model != 2)
- return 0;
-
- ebx = cpuid_ebx(0x00000001);
- ebx &= 0x000000FF;
-
- dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
-
- switch (c->x86_mask) {
- case 4:
- /*
- * B-stepping [M-P4-M]
- * sample has ebx = 0x0f, production has 0x0e.
- */
- if ((ebx == 0x0e) || (ebx == 0x0f))
- return SPEEDSTEP_CPU_P4M;
- break;
- case 7:
- /*
- * C-stepping [M-P4-M]
- * needs to have ebx=0x0e, else it's a celeron:
- * cf. 25130917.pdf / page 7, footnote 5 even
- * though 25072120.pdf / page 7 doesn't say
- * samples are only of B-stepping...
- */
- if (ebx == 0x0e)
- return SPEEDSTEP_CPU_P4M;
- break;
- case 9:
- /*
- * D-stepping [M-P4-M or M-P4/533]
- *
- * this is totally strange: CPUID 0x0F29 is
- * used by M-P4-M, M-P4/533 and(!) Celeron CPUs.
- * The latter need to be sorted out as they don't
- * support speedstep.
- * Celerons with CPUID 0x0F29 may have either
- * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything
- * specific.
- * M-P4-Ms may have either ebx=0xe or 0xf [see above]
- * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
- * also, M-P4M HTs have ebx=0x8, too
- * For now, they are distinguished by the model_id
- * string
- */
- if ((ebx == 0x0e) ||
- (strstr(c->x86_model_id,
- "Mobile Intel(R) Pentium(R) 4") != NULL))
- return SPEEDSTEP_CPU_P4M;
- break;
- default:
- break;
- }
- return 0;
- }
-
- switch (c->x86_model) {
- case 0x0B: /* Intel PIII [Tualatin] */
- /* cpuid_ebx(1) is 0x04 for desktop PIII,
- * 0x06 for mobile PIII-M */
- ebx = cpuid_ebx(0x00000001);
- dprintk("ebx is %x\n", ebx);
-
- ebx &= 0x000000FF;
-
- if (ebx != 0x06)
- return 0;
-
- /* So far all PIII-M processors support SpeedStep. See
- * Intel's 24540640.pdf of June 2003
- */
- return SPEEDSTEP_CPU_PIII_T;
-
- case 0x08: /* Intel PIII [Coppermine] */
-
- /* all mobile PIII Coppermines have FSB 100 MHz
- * ==> sort out a few desktop PIIIs. */
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
- dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n",
- msr_lo, msr_hi);
- msr_lo &= 0x00c0000;
- if (msr_lo != 0x0080000)
- return 0;
-
- /*
- * If the processor is a mobile version,
- * platform ID has bit 50 set
- * it has SpeedStep technology if either
- * bit 56 or 57 is set
- */
- rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
- dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n",
- msr_lo, msr_hi);
- if ((msr_hi & (1<<18)) &&
- (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
- if (c->x86_mask == 0x01) {
- dprintk("early PIII version\n");
- return SPEEDSTEP_CPU_PIII_C_EARLY;
- } else
- return SPEEDSTEP_CPU_PIII_C;
- }
-
- default:
- return 0;
- }
-}
-EXPORT_SYMBOL_GPL(speedstep_detect_processor);
-
-
-/*********************************************************************
- * DETECT SPEEDSTEP SPEEDS *
- *********************************************************************/
-
-unsigned int speedstep_get_freqs(enum speedstep_processor processor,
- unsigned int *low_speed,
- unsigned int *high_speed,
- unsigned int *transition_latency,
- void (*set_state) (unsigned int state))
-{
- unsigned int prev_speed;
- unsigned int ret = 0;
- unsigned long flags;
- struct timeval tv1, tv2;
-
- if ((!processor) || (!low_speed) || (!high_speed) || (!set_state))
- return -EINVAL;
-
- dprintk("trying to determine both speeds\n");
-
- /* get current speed */
- prev_speed = speedstep_get_frequency(processor);
- if (!prev_speed)
- return -EIO;
-
- dprintk("previous speed is %u\n", prev_speed);
-
- local_irq_save(flags);
-
- /* switch to low state */
- set_state(SPEEDSTEP_LOW);
- *low_speed = speedstep_get_frequency(processor);
- if (!*low_speed) {
- ret = -EIO;
- goto out;
- }
-
- dprintk("low speed is %u\n", *low_speed);
-
- /* start latency measurement */
- if (transition_latency)
- do_gettimeofday(&tv1);
-
- /* switch to high state */
- set_state(SPEEDSTEP_HIGH);
-
- /* end latency measurement */
- if (transition_latency)
- do_gettimeofday(&tv2);
-
- *high_speed = speedstep_get_frequency(processor);
- if (!*high_speed) {
- ret = -EIO;
- goto out;
- }
-
- dprintk("high speed is %u\n", *high_speed);
-
- if (*low_speed == *high_speed) {
- ret = -ENODEV;
- goto out;
- }
-
- /* switch to previous state, if necessary */
- if (*high_speed != prev_speed)
- set_state(SPEEDSTEP_LOW);
-
- if (transition_latency) {
- *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC +
- tv2.tv_usec - tv1.tv_usec;
- dprintk("transition latency is %u uSec\n", *transition_latency);
-
- /* convert uSec to nSec and add 20% for safety reasons */
- *transition_latency *= 1200;
-
- /* check if the latency measurement is too high or too low
- * and set it to a safe value (500uSec) in that case
- */
- if (*transition_latency > 10000000 ||
- *transition_latency < 50000) {
- printk(KERN_WARNING PFX "frequency transition "
- "measured seems out of range (%u "
- "nSec), falling back to a safe one of"
- "%u nSec.\n",
- *transition_latency, 500000);
- *transition_latency = 500000;
- }
- }
-
-out:
- local_irq_restore(flags);
- return ret;
-}
-EXPORT_SYMBOL_GPL(speedstep_get_freqs);
-
-#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
-module_param(relaxed_check, int, 0444);
-MODULE_PARM_DESC(relaxed_check,
- "Don't do all checks for speedstep capability.");
-#endif
-
-MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
deleted file mode 100644
index 70d9cea1219..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * Library for common functions for Intel SpeedStep v.1 and v.2 support
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-
-
-/* processors */
-enum speedstep_processor {
- SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
- SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
- SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
- SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
-/* the following processors are not speedstep-capable and are not auto-detected
- * in speedstep_detect_processor(). However, their speed can be detected using
- * the speedstep_get_frequency() call. */
- SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
- SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
- SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
-};
-
-/* speedstep states -- only two of them */
-
-#define SPEEDSTEP_HIGH 0x00000000
-#define SPEEDSTEP_LOW 0x00000001
-
-
-/* detect a speedstep-capable processor */
-extern enum speedstep_processor speedstep_detect_processor(void);
-
-/* detect the current speed (in khz) of the processor */
-extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
-
-
-/* detect the low and high speeds of the processor. The callback
- * set_state"'s first argument is either SPEEDSTEP_HIGH or
- * SPEEDSTEP_LOW; the second argument is zero so that no
- * cpufreq_notify_transition calls are initiated.
- */
-extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
- unsigned int *low_speed,
- unsigned int *high_speed,
- unsigned int *transition_latency,
- void (*set_state) (unsigned int state));
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
deleted file mode 100644
index 91bc25b67bc..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- * Intel SpeedStep SMI driver.
- *
- * (C) 2003 Hiroshi Miura <miura@da-cha.org>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- */
-
-
-/*********************************************************************
- * SPEEDSTEP - DEFINITIONS *
- *********************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/delay.h>
-#include <linux/io.h>
-#include <asm/ist.h>
-
-#include "speedstep-lib.h"
-
-/* speedstep system management interface port/command.
- *
- * These parameters are got from IST-SMI BIOS call.
- * If user gives it, these are used.
- *
- */
-static int smi_port;
-static int smi_cmd;
-static unsigned int smi_sig;
-
-/* info about the processor */
-static enum speedstep_processor speedstep_processor;
-
-/*
- * There are only two frequency states for each processor. Values
- * are in kHz for the time being.
- */
-static struct cpufreq_frequency_table speedstep_freqs[] = {
- {SPEEDSTEP_HIGH, 0},
- {SPEEDSTEP_LOW, 0},
- {0, CPUFREQ_TABLE_END},
-};
-
-#define GET_SPEEDSTEP_OWNER 0
-#define GET_SPEEDSTEP_STATE 1
-#define SET_SPEEDSTEP_STATE 2
-#define GET_SPEEDSTEP_FREQS 4
-
-/* how often shall the SMI call be tried if it failed, e.g. because
- * of DMA activity going on? */
-#define SMI_TRIES 5
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "speedstep-smi", msg)
-
-/**
- * speedstep_smi_ownership
- */
-static int speedstep_smi_ownership(void)
-{
- u32 command, result, magic, dummy;
- u32 function = GET_SPEEDSTEP_OWNER;
- unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
- magic = virt_to_phys(magic_data);
-
- dprintk("trying to obtain ownership with command %x at port %x\n",
- command, smi_port);
-
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp\n"
- : "=D" (result),
- "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
- "=S" (dummy)
- : "a" (command), "b" (function), "c" (0), "d" (smi_port),
- "D" (0), "S" (magic)
- : "memory"
- );
-
- dprintk("result is %x\n", result);
-
- return result;
-}
-
-/**
- * speedstep_smi_get_freqs - get SpeedStep preferred & current freq.
- * @low: the low frequency value is placed here
- * @high: the high frequency value is placed here
- *
- * Only available on later SpeedStep-enabled systems, returns false results or
- * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
- * shows that the latter occurs if !(ist_info.event & 0xFFFF).
- */
-static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high)
-{
- u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
- u32 state = 0;
- u32 function = GET_SPEEDSTEP_FREQS;
-
- if (!(ist_info.event & 0xFFFF)) {
- dprintk("bug #1422 -- can't read freqs from BIOS\n");
- return -ENODEV;
- }
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
- dprintk("trying to determine frequencies with command %x at port %x\n",
- command, smi_port);
-
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp"
- : "=a" (result),
- "=b" (high_mhz),
- "=c" (low_mhz),
- "=d" (state), "=D" (edi), "=S" (dummy)
- : "a" (command),
- "b" (function),
- "c" (state),
- "d" (smi_port), "S" (0), "D" (0)
- );
-
- dprintk("result %x, low_freq %u, high_freq %u\n",
- result, low_mhz, high_mhz);
-
- /* abort if results are obviously incorrect... */
- if ((high_mhz + low_mhz) < 600)
- return -EINVAL;
-
- *high = high_mhz * 1000;
- *low = low_mhz * 1000;
-
- return result;
-}
-
-/**
- * speedstep_get_state - set the SpeedStep state
- * @state: processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
- *
- */
-static int speedstep_get_state(void)
-{
- u32 function = GET_SPEEDSTEP_STATE;
- u32 result, state, edi, command, dummy;
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
- dprintk("trying to determine current setting with command %x "
- "at port %x\n", command, smi_port);
-
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp\n"
- : "=a" (result),
- "=b" (state), "=D" (edi),
- "=c" (dummy), "=d" (dummy), "=S" (dummy)
- : "a" (command), "b" (function), "c" (0),
- "d" (smi_port), "S" (0), "D" (0)
- );
-
- dprintk("state is %x, result is %x\n", state, result);
-
- return state & 1;
-}
-
-
-/**
- * speedstep_set_state - set the SpeedStep state
- * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
- *
- */
-static void speedstep_set_state(unsigned int state)
-{
- unsigned int result = 0, command, new_state, dummy;
- unsigned long flags;
- unsigned int function = SET_SPEEDSTEP_STATE;
- unsigned int retry = 0;
-
- if (state > 0x1)
- return;
-
- /* Disable IRQs */
- local_irq_save(flags);
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
- dprintk("trying to set frequency to state %u "
- "with command %x at port %x\n",
- state, command, smi_port);
-
- do {
- if (retry) {
- dprintk("retry %u, previous result %u, waiting...\n",
- retry, result);
- mdelay(retry * 50);
- }
- retry++;
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp"
- : "=b" (new_state), "=D" (result),
- "=c" (dummy), "=a" (dummy),
- "=d" (dummy), "=S" (dummy)
- : "a" (command), "b" (function), "c" (state),
- "d" (smi_port), "S" (0), "D" (0)
- );
- } while ((new_state != state) && (retry <= SMI_TRIES));
-
- /* enable IRQs */
- local_irq_restore(flags);
-
- if (new_state == state)
- dprintk("change to %u MHz succeeded after %u tries "
- "with result %u\n",
- (speedstep_freqs[new_state].frequency / 1000),
- retry, result);
- else
- printk(KERN_ERR "cpufreq: change to state %u "
- "failed with new_state %u and result %u\n",
- state, new_state, result);
-
- return;
-}
-
-
-/**
- * speedstep_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: new freq
- * @relation:
- *
- * Sets a new CPUFreq policy/freq.
- */
-static int speedstep_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
- unsigned int newstate = 0;
- struct cpufreq_freqs freqs;
-
- if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
- freqs.new = speedstep_freqs[newstate].frequency;
- freqs.cpu = 0; /* speedstep.c is UP only driver */
-
- if (freqs.old == freqs.new)
- return 0;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- speedstep_set_state(newstate);
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- return 0;
-}
-
-
-/**
- * speedstep_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limit must be within speedstep_low_freq and speedstep_high_freq, with
- * at least one border included.
- */
-static int speedstep_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
-}
-
-
-static int speedstep_cpu_init(struct cpufreq_policy *policy)
-{
- int result;
- unsigned int speed, state;
- unsigned int *low, *high;
-
- /* capability check */
- if (policy->cpu != 0)
- return -ENODEV;
-
- result = speedstep_smi_ownership();
- if (result) {
- dprintk("fails in acquiring ownership of a SMI interface.\n");
- return -EINVAL;
- }
-
- /* detect low and high frequency */
- low = &speedstep_freqs[SPEEDSTEP_LOW].frequency;
- high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency;
-
- result = speedstep_smi_get_freqs(low, high);
- if (result) {
- /* fall back to speedstep_lib.c dection mechanism:
- * try both states out */
- dprintk("could not detect low and high frequencies "
- "by SMI call.\n");
- result = speedstep_get_freqs(speedstep_processor,
- low, high,
- NULL,
- &speedstep_set_state);
-
- if (result) {
- dprintk("could not detect two different speeds"
- " -- aborting.\n");
- return result;
- } else
- dprintk("workaround worked.\n");
- }
-
- /* get current speed setting */
- state = speedstep_get_state();
- speed = speedstep_freqs[state].frequency;
-
- dprintk("currently at %s speed setting - %i MHz\n",
- (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
- ? "low" : "high",
- (speed / 1000));
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- policy->cur = speed;
-
- result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
-
- return 0;
-}
-
-static int speedstep_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static unsigned int speedstep_get(unsigned int cpu)
-{
- if (cpu)
- return -ENODEV;
- return speedstep_get_frequency(speedstep_processor);
-}
-
-
-static int speedstep_resume(struct cpufreq_policy *policy)
-{
- int result = speedstep_smi_ownership();
-
- if (result)
- dprintk("fails in re-acquiring ownership of a SMI interface.\n");
-
- return result;
-}
-
-static struct freq_attr *speedstep_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver speedstep_driver = {
- .name = "speedstep-smi",
- .verify = speedstep_verify,
- .target = speedstep_target,
- .init = speedstep_cpu_init,
- .exit = speedstep_cpu_exit,
- .get = speedstep_get,
- .resume = speedstep_resume,
- .owner = THIS_MODULE,
- .attr = speedstep_attr,
-};
-
-/**
- * speedstep_init - initializes the SpeedStep CPUFreq driver
- *
- * Initializes the SpeedStep support. Returns -ENODEV on unsupported
- * BIOS, -EINVAL on problems during initiatization, and zero on
- * success.
- */
-static int __init speedstep_init(void)
-{
- speedstep_processor = speedstep_detect_processor();
-
- switch (speedstep_processor) {
- case SPEEDSTEP_CPU_PIII_T:
- case SPEEDSTEP_CPU_PIII_C:
- case SPEEDSTEP_CPU_PIII_C_EARLY:
- break;
- default:
- speedstep_processor = 0;
- }
-
- if (!speedstep_processor) {
- dprintk("No supported Intel CPU detected.\n");
- return -ENODEV;
- }
-
- dprintk("signature:0x%.8lx, command:0x%.8lx, "
- "event:0x%.8lx, perf_level:0x%.8lx.\n",
- ist_info.signature, ist_info.command,
- ist_info.event, ist_info.perf_level);
-
- /* Error if no IST-SMI BIOS or no PARM
- sig= 'ISGE' aka 'Intel Speedstep Gate E' */
- if ((ist_info.signature != 0x47534943) && (
- (smi_port == 0) || (smi_cmd == 0)))
- return -ENODEV;
-
- if (smi_sig == 1)
- smi_sig = 0x47534943;
- else
- smi_sig = ist_info.signature;
-
- /* setup smi_port from MODLULE_PARM or BIOS */
- if ((smi_port > 0xff) || (smi_port < 0))
- return -EINVAL;
- else if (smi_port == 0)
- smi_port = ist_info.command & 0xff;
-
- if ((smi_cmd > 0xff) || (smi_cmd < 0))
- return -EINVAL;
- else if (smi_cmd == 0)
- smi_cmd = (ist_info.command >> 16) & 0xff;
-
- return cpufreq_register_driver(&speedstep_driver);
-}
-
-
-/**
- * speedstep_exit - unregisters SpeedStep support
- *
- * Unregisters SpeedStep support.
- */
-static void __exit speedstep_exit(void)
-{
- cpufreq_unregister_driver(&speedstep_driver);
-}
-
-module_param(smi_port, int, 0444);
-module_param(smi_cmd, int, 0444);
-module_param(smi_sig, uint, 0444);
-
-MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value "
- "-- Intel's default setting is 0xb2");
-MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value "
- "-- Intel's default setting is 0x82");
-MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the "
- "SMI interface.");
-
-MODULE_AUTHOR("Hiroshi Miura");
-MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface.");
-MODULE_LICENSE("GPL");
-
-module_init(speedstep_init);
-module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index df86bc8c859..ed6086eedf1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -29,10 +29,10 @@
static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
{
+ u64 misc_enable;
+
/* Unmask CPUID levels if masked: */
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- u64 misc_enable;
-
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
@@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
* (model 2) with the same problem.
*/
if (c->x86 == 15) {
- u64 misc_enable;
-
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
@@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
}
}
#endif
+
+ /*
+ * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
+ * clear the fast string and enhanced fast string CPU capabilities.
+ */
+ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+ rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+ printk(KERN_INFO "Disabled fast string operations\n");
+ setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
+ setup_clear_cpu_cap(X86_FEATURE_ERMS);
+ }
+ }
}
#ifdef CONFIG_X86_32
@@ -400,12 +411,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
switch (c->x86_model) {
case 5:
- if (c->x86_mask == 0) {
- if (l2 == 0)
- p = "Celeron (Covington)";
- else if (l2 == 256)
- p = "Mobile Pentium II (Dixon)";
- }
+ if (l2 == 0)
+ p = "Celeron (Covington)";
+ else if (l2 == 256)
+ p = "Mobile Pentium II (Dixon)";
break;
case 6:
@@ -447,6 +456,24 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_VMX))
detect_vmx_virtcap(c);
+
+ /*
+ * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
+ * x86_energy_perf_policy(8) is available to change it at run-time
+ */
+ if (cpu_has(c, X86_FEATURE_EPB)) {
+ u64 epb;
+
+ rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
+ printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
+ " Set to 'normal', was 'performance'\n"
+ "ENERGY_PERF_BIAS: View and update with"
+ " x86_energy_perf_policy(8)\n");
+ epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
+ wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ }
+ }
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 1ce1af2899d..c105c533ed9 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -327,7 +327,6 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
- l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
}
@@ -454,27 +453,16 @@ int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
{
int ret = 0;
-#define SUBCACHE_MASK (3UL << 20)
-#define SUBCACHE_INDEX 0xfff
-
- /*
- * check whether this slot is already used or
- * the index is already disabled
- */
+ /* check if @slot is already used or the index is already disabled */
ret = amd_get_l3_disable_slot(l3, slot);
if (ret >= 0)
return -EINVAL;
- /*
- * check whether the other slot has disabled the
- * same index already
- */
- if (index == amd_get_l3_disable_slot(l3, !slot))
+ if (index > l3->indices)
return -EINVAL;
- /* do not allow writes outside of allowed bits */
- if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
- ((index & SUBCACHE_INDEX) > l3->indices))
+ /* check whether the other slot has disabled the same index already */
+ if (index == amd_get_l3_disable_slot(l3, !slot))
return -EINVAL;
amd_l3_disable_index(l3, cpu, slot, index);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 1e8d66c1336..7395d5f4272 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -43,61 +43,105 @@ static struct severity {
unsigned char covered;
char *msg;
} severities[] = {
-#define KERNEL .context = IN_KERNEL
-#define USER .context = IN_USER
-#define SER .ser = SER_REQUIRED
-#define NOSER .ser = NO_SER
-#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
-#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
-#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
-#define MCGMASK(x, res, s, m, r...) \
- { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
-#define MASK(x, y, s, m, r...) \
- { .mask = x, .result = y, SEV(s), .msg = m, ## r }
+#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
+#define KERNEL .context = IN_KERNEL
+#define USER .context = IN_USER
+#define SER .ser = SER_REQUIRED
+#define NOSER .ser = NO_SER
+#define BITCLR(x) .mask = x, .result = 0
+#define BITSET(x) .mask = x, .result = x
+#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
+#define MASK(x, y) .mask = x, .result = y
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
#define MCACOD 0xffff
- BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
- BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
- BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
+ MCESEV(
+ NO, "Invalid",
+ BITCLR(MCI_STATUS_VAL)
+ ),
+ MCESEV(
+ NO, "Not enabled",
+ BITCLR(MCI_STATUS_EN)
+ ),
+ MCESEV(
+ PANIC, "Processor context corrupt",
+ BITSET(MCI_STATUS_PCC)
+ ),
/* When MCIP is not set something is very confused */
- MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
+ MCESEV(
+ PANIC, "MCIP not set in MCA handler",
+ MCGMASK(MCG_STATUS_MCIP, 0)
+ ),
/* Neither return not error IP -- no chance to recover -> PANIC */
- MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
- "Neither restart nor error IP"),
- MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
- KERNEL),
- BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
- MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
- "Spurious not enabled", SER),
+ MCESEV(
+ PANIC, "Neither restart nor error IP",
+ MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
+ ),
+ MCESEV(
+ PANIC, "In kernel and no restart IP",
+ KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
+ ),
+ MCESEV(
+ KEEP, "Corrected error",
+ NOSER, BITCLR(MCI_STATUS_UC)
+ ),
/* ignore OVER for UCNA */
- MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
- "Uncorrected no action required", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
- "Illegal combination (UCNA with AR=1)", SER),
- MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
+ MCESEV(
+ KEEP, "Uncorrected no action required",
+ SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
+ ),
+ MCESEV(
+ PANIC, "Illegal combination (UCNA with AR=1)",
+ SER,
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
+ ),
+ MCESEV(
+ KEEP, "Non signalled machine check",
+ SER, BITCLR(MCI_STATUS_S)
+ ),
/* AR add known MCACODs here */
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
- "Action required with lost events", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
- "Action required; unknown MCACOD", SER),
+ MCESEV(
+ PANIC, "Action required with lost events",
+ SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
+ ),
+ MCESEV(
+ PANIC, "Action required: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
+ ),
/* known AO MCACODs: */
- MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
- "Action optional: memory scrubbing error", SER),
- MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
- "Action optional: last level cache writeback error", SER),
-
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
- "Action optional unknown MCACOD", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
- "Action optional with lost events", SER),
- BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
- BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
- BITSET(0, SOME, "No match") /* always matches. keep at end */
+ MCESEV(
+ AO, "Action optional: memory scrubbing error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
+ ),
+ MCESEV(
+ AO, "Action optional: last level cache writeback error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
+ ),
+ MCESEV(
+ SOME, "Action optional: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
+ ),
+ MCESEV(
+ SOME, "Action optional with lost events",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
+ ),
+
+ MCESEV(
+ PANIC, "Overflowed uncorrected",
+ BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
+ ),
+ MCESEV(
+ UC, "Uncorrected",
+ BITSET(MCI_STATUS_UC)
+ ),
+ MCESEV(
+ SOME, "No match",
+ BITSET(0)
+ ) /* always matches. keep at end */
};
/*
@@ -112,15 +156,15 @@ static int error_context(struct mce *m)
return IN_KERNEL;
}
-int mce_severity(struct mce *a, int tolerant, char **msg)
+int mce_severity(struct mce *m, int tolerant, char **msg)
{
- enum context ctx = error_context(a);
+ enum context ctx = error_context(m);
struct severity *s;
for (s = severities;; s++) {
- if ((a->status & s->mask) != s->result)
+ if ((m->status & s->mask) != s->result)
continue;
- if ((a->mcgstatus & s->mcgmask) != s->mcgres)
+ if ((m->mcgstatus & s->mcgmask) != s->mcgres)
continue;
if (s->ser == SER_REQUIRED && !mce_ser)
continue;
@@ -197,15 +241,15 @@ static const struct file_operations severities_coverage_fops = {
static int __init severities_debugfs_init(void)
{
- struct dentry *dmce = NULL, *fseverities_coverage = NULL;
+ struct dentry *dmce, *fsev;
dmce = mce_get_debugfs_dir();
- if (dmce == NULL)
+ if (!dmce)
goto err_out;
- fseverities_coverage = debugfs_create_file("severities-coverage",
- 0444, dmce, NULL,
- &severities_coverage_fops);
- if (fseverities_coverage == NULL)
+
+ fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
+ &severities_coverage_fops);
+ if (!fsev)
goto err_out;
return 0;
@@ -214,4 +258,4 @@ err_out:
return -ENOMEM;
}
late_initcall(severities_debugfs_init);
-#endif
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3385ea26f68..08363b04212 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -10,7 +10,6 @@
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
-#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
@@ -38,23 +37,20 @@
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/edac_mce.h>
+#include <linux/irq_work.h>
#include <asm/processor.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include "mce-internal.h"
-static DEFINE_MUTEX(mce_read_mutex);
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
#define rcu_dereference_check_mce(p) \
rcu_dereference_index_check((p), \
rcu_read_lock_sched_held() || \
- lockdep_is_held(&mce_read_mutex))
+ lockdep_is_held(&mce_chrdev_read_mutex))
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
@@ -94,7 +90,8 @@ static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };
-static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
@@ -105,20 +102,6 @@ static int cpu_missing;
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
-static int default_decode_mce(struct notifier_block *nb, unsigned long val,
- void *data)
-{
- pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
- pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
-
- return NOTIFY_STOP;
-}
-
-static struct notifier_block mce_dec_nb = {
- .notifier_call = default_decode_mce,
- .priority = -1,
-};
-
/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -212,6 +195,8 @@ void mce_log(struct mce *mce)
static void print_mce(struct mce *m)
{
+ int ret = 0;
+
pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
m->extcpu, m->mcgstatus, m->bank, m->status);
@@ -239,7 +224,11 @@ static void print_mce(struct mce *m)
* Print out human-readable details about the MCE error,
* (if the CPU has an implementation for that)
*/
- atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
+ ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
+ if (ret == NOTIFY_STOP)
+ return;
+
+ pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}
#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -381,6 +370,31 @@ static void mce_wrmsrl(u32 msr, u64 v)
}
/*
+ * Collect all global (w.r.t. this processor) status about this machine
+ * check into our "mce" struct so that we can use it later to assess
+ * the severity of the problem as we read per-bank specific details.
+ */
+static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
+{
+ mce_setup(m);
+
+ m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ if (regs) {
+ /*
+ * Get the address of the instruction at the time of
+ * the machine check error.
+ */
+ if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+ }
+ /* Use accurate RIP reporting if available. */
+ if (rip_msr)
+ m->ip = mce_rdmsrl(rip_msr);
+ }
+}
+
+/*
* Simple lockless ring to communicate PFNs from the exception handler with the
* process context work function. This is vastly simplified because there's
* only a single reader and a single writer.
@@ -451,40 +465,13 @@ static void mce_schedule_work(void)
}
}
-/*
- * Get the address of the instruction at the time of the machine check
- * error.
- */
-static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
-{
+DEFINE_PER_CPU(struct irq_work, mce_irq_work);
- if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
- m->ip = regs->ip;
- m->cs = regs->cs;
- } else {
- m->ip = 0;
- m->cs = 0;
- }
- if (rip_msr)
- m->ip = mce_rdmsrl(rip_msr);
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Called after interrupts have been reenabled again
- * when a MCE happened during an interrupts off region
- * in the kernel.
- */
-asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
+static void mce_irq_work_cb(struct irq_work *entry)
{
- ack_APIC_irq();
- exit_idle();
- irq_enter();
mce_notify_irq();
mce_schedule_work();
- irq_exit();
}
-#endif
static void mce_report_event(struct pt_regs *regs)
{
@@ -500,29 +487,7 @@ static void mce_report_event(struct pt_regs *regs)
return;
}
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Without APIC do not notify. The event will be picked
- * up eventually.
- */
- if (!cpu_has_apic)
- return;
-
- /*
- * When interrupts are disabled we cannot use
- * kernel services safely. Trigger an self interrupt
- * through the APIC to instead do the notification
- * after interrupts are reenabled again.
- */
- apic->send_IPI_self(MCE_SELF_VECTOR);
-
- /*
- * Wait for idle afterwards again so that we don't leave the
- * APIC in a non idle state because the normal APIC writes
- * cannot exclude us.
- */
- apic_wait_icr_idle();
-#endif
+ irq_work_queue(&__get_cpu_var(mce_irq_work));
}
DEFINE_PER_CPU(unsigned, mce_poll_count);
@@ -549,9 +514,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
percpu_inc(mce_poll_count);
- mce_setup(&m);
+ mce_gather_info(&m, NULL);
- m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
for (i = 0; i < banks; i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
@@ -590,7 +554,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
mce_log(&m);
atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
- add_taint(TAINT_MACHINE_CHECK);
}
/*
@@ -888,9 +851,9 @@ static int mce_usable_address(struct mce *m)
{
if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
return 0;
- if ((m->misc & 0x3f) > PAGE_SHIFT)
+ if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
return 0;
- if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
+ if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
return 0;
return 1;
}
@@ -951,9 +914,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (!banks)
goto out;
- mce_setup(&m);
+ mce_gather_info(&m, regs);
- m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
final = &__get_cpu_var(mces_seen);
*final = m;
@@ -1037,7 +999,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
mce_ring_add(m.addr >> PAGE_SHIFT);
- mce_get_rip(&m, regs);
mce_log(&m);
if (severity > worst) {
@@ -1199,7 +1160,8 @@ int mce_notify_irq(void)
clear_thread_flag(TIF_MCE_NOTIFY);
if (test_and_clear_bit(0, &mce_need_notify)) {
- wake_up_interruptible(&mce_wait);
+ /* wake processes polling /dev/mcelog */
+ wake_up_interruptible(&mce_chrdev_wait);
/*
* There is no risk of missing notifications because
@@ -1372,18 +1334,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
return 0;
}
-static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
- return;
+ return 0;
+
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
intel_p5_mcheck_init(c);
+ return 1;
break;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
+ return 1;
break;
}
+
+ return 0;
}
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
@@ -1437,7 +1404,8 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
if (mce_disabled)
return;
- __mcheck_cpu_ancient_init(c);
+ if (__mcheck_cpu_ancient_init(c))
+ return;
if (!mce_available(c))
return;
@@ -1453,44 +1421,45 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_timer();
INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
-
+ init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
}
/*
- * Character device to read and clear the MCE log.
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
*/
-static DEFINE_SPINLOCK(mce_state_lock);
-static int open_count; /* #times opened */
-static int open_exclu; /* already open exclusive? */
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count; /* #times opened */
+static int mce_chrdev_open_exclu; /* already open exclusive? */
-static int mce_open(struct inode *inode, struct file *file)
+static int mce_chrdev_open(struct inode *inode, struct file *file)
{
- spin_lock(&mce_state_lock);
+ spin_lock(&mce_chrdev_state_lock);
- if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
- spin_unlock(&mce_state_lock);
+ if (mce_chrdev_open_exclu ||
+ (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_chrdev_state_lock);
return -EBUSY;
}
if (file->f_flags & O_EXCL)
- open_exclu = 1;
- open_count++;
+ mce_chrdev_open_exclu = 1;
+ mce_chrdev_open_count++;
- spin_unlock(&mce_state_lock);
+ spin_unlock(&mce_chrdev_state_lock);
return nonseekable_open(inode, file);
}
-static int mce_release(struct inode *inode, struct file *file)
+static int mce_chrdev_release(struct inode *inode, struct file *file)
{
- spin_lock(&mce_state_lock);
+ spin_lock(&mce_chrdev_state_lock);
- open_count--;
- open_exclu = 0;
+ mce_chrdev_open_count--;
+ mce_chrdev_open_exclu = 0;
- spin_unlock(&mce_state_lock);
+ spin_unlock(&mce_chrdev_state_lock);
return 0;
}
@@ -1539,8 +1508,8 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
return 0;
}
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
- loff_t *off)
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+ size_t usize, loff_t *off)
{
char __user *buf = ubuf;
unsigned long *cpu_tsc;
@@ -1551,7 +1520,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
if (!cpu_tsc)
return -ENOMEM;
- mutex_lock(&mce_read_mutex);
+ mutex_lock(&mce_chrdev_read_mutex);
if (!mce_apei_read_done) {
err = __mce_read_apei(&buf, usize);
@@ -1571,19 +1540,18 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
do {
for (i = prev; i < next; i++) {
unsigned long start = jiffies;
+ struct mce *m = &mcelog.entry[i];
- while (!mcelog.entry[i].finished) {
+ while (!m->finished) {
if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i, 0,
- sizeof(struct mce));
+ memset(m, 0, sizeof(*m));
goto timeout;
}
cpu_relax();
}
smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i,
- sizeof(struct mce));
- buf += sizeof(struct mce);
+ err |= copy_to_user(buf, m, sizeof(*m));
+ buf += sizeof(*m);
timeout:
;
}
@@ -1603,13 +1571,13 @@ timeout:
on_each_cpu(collect_tscs, cpu_tsc, 1);
for (i = next; i < MCE_LOG_LEN; i++) {
- if (mcelog.entry[i].finished &&
- mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
- err |= copy_to_user(buf, mcelog.entry+i,
- sizeof(struct mce));
+ struct mce *m = &mcelog.entry[i];
+
+ if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
+ err |= copy_to_user(buf, m, sizeof(*m));
smp_rmb();
- buf += sizeof(struct mce);
- memset(&mcelog.entry[i], 0, sizeof(struct mce));
+ buf += sizeof(*m);
+ memset(m, 0, sizeof(*m));
}
}
@@ -1617,15 +1585,15 @@ timeout:
err = -EFAULT;
out:
- mutex_unlock(&mce_read_mutex);
+ mutex_unlock(&mce_chrdev_read_mutex);
kfree(cpu_tsc);
return err ? err : buf - ubuf;
}
-static unsigned int mce_poll(struct file *file, poll_table *wait)
+static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
- poll_wait(file, &mce_wait, wait);
+ poll_wait(file, &mce_chrdev_wait, wait);
if (rcu_access_index(mcelog.next))
return POLLIN | POLLRDNORM;
if (!mce_apei_read_done && apei_check_mce())
@@ -1633,7 +1601,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
return 0;
}
-static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
{
int __user *p = (int __user *)arg;
@@ -1661,16 +1630,16 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
- .open = mce_open,
- .release = mce_release,
- .read = mce_read,
- .poll = mce_poll,
- .unlocked_ioctl = mce_ioctl,
- .llseek = no_llseek,
+ .open = mce_chrdev_open,
+ .release = mce_chrdev_release,
+ .read = mce_chrdev_read,
+ .poll = mce_chrdev_poll,
+ .unlocked_ioctl = mce_chrdev_ioctl,
+ .llseek = no_llseek,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);
-static struct miscdevice mce_log_device = {
+static struct miscdevice mce_chrdev_device = {
MISC_MCELOG_MINOR,
"mcelog",
&mce_chrdev_ops,
@@ -1722,15 +1691,13 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
- atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
-
mcheck_intel_therm_init();
return 0;
}
/*
- * Sysfs support
+ * mce_syscore: PM support
*/
/*
@@ -1750,12 +1717,12 @@ static int mce_disable_error_reporting(void)
return 0;
}
-static int mce_suspend(void)
+static int mce_syscore_suspend(void)
{
return mce_disable_error_reporting();
}
-static void mce_shutdown(void)
+static void mce_syscore_shutdown(void)
{
mce_disable_error_reporting();
}
@@ -1765,18 +1732,22 @@ static void mce_shutdown(void)
* Only one CPU is active at this time, the others get re-added later using
* CPU hotplug:
*/
-static void mce_resume(void)
+static void mce_syscore_resume(void)
{
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
}
static struct syscore_ops mce_syscore_ops = {
- .suspend = mce_suspend,
- .shutdown = mce_shutdown,
- .resume = mce_resume,
+ .suspend = mce_syscore_suspend,
+ .shutdown = mce_syscore_shutdown,
+ .resume = mce_syscore_resume,
};
+/*
+ * mce_sysdev: Sysfs support
+ */
+
static void mce_cpu_restart(void *data)
{
del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1812,11 +1783,11 @@ static void mce_enable_ce(void *all)
__mcheck_cpu_init_timer();
}
-static struct sysdev_class mce_sysclass = {
+static struct sysdev_class mce_sysdev_class = {
.name = "machinecheck",
};
-DEFINE_PER_CPU(struct sys_device, mce_dev);
+DEFINE_PER_CPU(struct sys_device, mce_sysdev);
__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -1945,7 +1916,7 @@ static struct sysdev_ext_attribute attr_cmci_disabled = {
&mce_cmci_disabled
};
-static struct sysdev_attribute *mce_attrs[] = {
+static struct sysdev_attribute *mce_sysdev_attrs[] = {
&attr_tolerant.attr,
&attr_check_interval.attr,
&attr_trigger,
@@ -1956,66 +1927,67 @@ static struct sysdev_attribute *mce_attrs[] = {
NULL
};
-static cpumask_var_t mce_dev_initialized;
+static cpumask_var_t mce_sysdev_initialized;
/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_create_device(unsigned int cpu)
+static __cpuinit int mce_sysdev_create(unsigned int cpu)
{
+ struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
int err;
int i, j;
if (!mce_available(&boot_cpu_data))
return -EIO;
- memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
- per_cpu(mce_dev, cpu).id = cpu;
- per_cpu(mce_dev, cpu).cls = &mce_sysclass;
+ memset(&sysdev->kobj, 0, sizeof(struct kobject));
+ sysdev->id = cpu;
+ sysdev->cls = &mce_sysdev_class;
- err = sysdev_register(&per_cpu(mce_dev, cpu));
+ err = sysdev_register(sysdev);
if (err)
return err;
- for (i = 0; mce_attrs[i]; i++) {
- err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ for (i = 0; mce_sysdev_attrs[i]; i++) {
+ err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
if (err)
goto error;
}
for (j = 0; j < banks; j++) {
- err = sysdev_create_file(&per_cpu(mce_dev, cpu),
- &mce_banks[j].attr);
+ err = sysdev_create_file(sysdev, &mce_banks[j].attr);
if (err)
goto error2;
}
- cpumask_set_cpu(cpu, mce_dev_initialized);
+ cpumask_set_cpu(cpu, mce_sysdev_initialized);
return 0;
error2:
while (--j >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
+ sysdev_remove_file(sysdev, &mce_banks[j].attr);
error:
while (--i >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
- sysdev_unregister(&per_cpu(mce_dev, cpu));
+ sysdev_unregister(sysdev);
return err;
}
-static __cpuinit void mce_remove_device(unsigned int cpu)
+static __cpuinit void mce_sysdev_remove(unsigned int cpu)
{
+ struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
int i;
- if (!cpumask_test_cpu(cpu, mce_dev_initialized))
+ if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
return;
- for (i = 0; mce_attrs[i]; i++)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ for (i = 0; mce_sysdev_attrs[i]; i++)
+ sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
for (i = 0; i < banks; i++)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
+ sysdev_remove_file(sysdev, &mce_banks[i].attr);
- sysdev_unregister(&per_cpu(mce_dev, cpu));
- cpumask_clear_cpu(cpu, mce_dev_initialized);
+ sysdev_unregister(sysdev);
+ cpumask_clear_cpu(cpu, mce_sysdev_initialized);
}
/* Make sure there are no machine checks on offlined CPUs. */
@@ -2065,7 +2037,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- mce_create_device(cpu);
+ mce_sysdev_create(cpu);
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
break;
@@ -2073,7 +2045,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DEAD_FROZEN:
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
- mce_remove_device(cpu);
+ mce_sysdev_remove(cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
@@ -2127,27 +2099,28 @@ static __init int mcheck_init_device(void)
if (!mce_available(&boot_cpu_data))
return -EIO;
- zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
+ zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
mce_init_banks();
- err = sysdev_class_register(&mce_sysclass);
+ err = sysdev_class_register(&mce_sysdev_class);
if (err)
return err;
for_each_online_cpu(i) {
- err = mce_create_device(i);
+ err = mce_sysdev_create(i);
if (err)
return err;
}
register_syscore_ops(&mce_syscore_ops);
register_hotcpu_notifier(&mce_cpu_notifier);
- misc_register(&mce_log_device);
+
+ /* register character device /dev/mcelog */
+ misc_register(&mce_chrdev_device);
return err;
}
-
device_initcall(mcheck_init_device);
/*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index bb0adad3514..f5474218cff 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -548,7 +548,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (!b)
goto out;
- err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
+ err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
b->kobj, name);
if (err)
goto out;
@@ -571,7 +571,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
}
- b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
+ b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
if (!b->kobj)
goto out_free;
@@ -591,7 +591,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (i == cpu)
continue;
- err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
+ err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
b->kobj, name);
if (err)
goto out;
@@ -669,7 +669,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
#ifdef CONFIG_SMP
/* sibling symlink */
if (shared_bank[bank] && b->blocks->cpu != cpu) {
- sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
+ sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
per_cpu(threshold_banks, cpu)[bank] = NULL;
return;
@@ -681,7 +681,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
if (i == cpu)
continue;
- sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
+ sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
per_cpu(threshold_banks, i)[bank] = NULL;
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 0f034460260..27c625178bf 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -187,8 +187,6 @@ static int therm_throt_process(bool new_event, int event, int level)
this_cpu,
level == CORE_LEVEL ? "Core" : "Package",
state->count);
-
- add_taint(TAINT_MACHINE_CHECK);
return 1;
}
if (old_event) {
@@ -355,7 +353,6 @@ static void notify_thresholds(__u64 msr_val)
static void intel_thermal_interrupt(void)
{
__u64 msr_val;
- struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
@@ -367,19 +364,19 @@ static void intel_thermal_interrupt(void)
CORE_LEVEL) != 0)
mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
- if (cpu_has(c, X86_FEATURE_PLN))
+ if (this_cpu_has(X86_FEATURE_PLN))
if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
CORE_LEVEL) != 0)
mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
- if (cpu_has(c, X86_FEATURE_PTS)) {
+ if (this_cpu_has(X86_FEATURE_PTS)) {
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
PACKAGE_LEVEL) != 0)
mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
- if (cpu_has(c, X86_FEATURE_PLN))
+ if (this_cpu_has(X86_FEATURE_PLN))
if (therm_throt_process(msr_val &
PACKAGE_THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
@@ -393,7 +390,6 @@ static void unexpected_thermal_interrupt(void)
{
printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
smp_processor_id());
- add_taint(TAINT_MACHINE_CHECK);
}
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 929739a653d..08119a37e53 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -79,7 +79,6 @@ void set_mtrr_ops(const struct mtrr_ops *ops)
static int have_wrcomb(void)
{
struct pci_dev *dev;
- u8 rev;
dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
if (dev != NULL) {
@@ -89,13 +88,11 @@ static int have_wrcomb(void)
* chipsets to be tagged
*/
if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
- dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
- pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
- if (rev <= 5) {
- pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
- pci_dev_put(dev);
- return 0;
- }
+ dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
+ dev->revision <= 5) {
+ pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
+ pci_dev_put(dev);
+ return 0;
}
/*
* Intel 450NX errata # 23. Non ascending cacheline evictions to
@@ -137,55 +134,43 @@ static void __init init_table(void)
}
struct set_mtrr_data {
- atomic_t count;
- atomic_t gate;
unsigned long smp_base;
unsigned long smp_size;
unsigned int smp_reg;
mtrr_type smp_type;
};
-static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
-
/**
- * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
+ * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
+ * by all the CPUs.
* @info: pointer to mtrr configuration data
*
* Returns nothing.
*/
-static int mtrr_work_handler(void *info)
+static int mtrr_rendezvous_handler(void *info)
{
#ifdef CONFIG_SMP
struct set_mtrr_data *data = info;
- unsigned long flags;
-
- atomic_dec(&data->count);
- while (!atomic_read(&data->gate))
- cpu_relax();
-
- local_irq_save(flags);
-
- atomic_dec(&data->count);
- while (atomic_read(&data->gate))
- cpu_relax();
- /* The master has cleared me to execute */
+ /*
+ * We use this same function to initialize the mtrrs during boot,
+ * resume, runtime cpu online and on an explicit request to set a
+ * specific MTRR.
+ *
+ * During boot or suspend, the state of the boot cpu's mtrrs has been
+ * saved, and we want to replicate that across all the cpus that come
+ * online (either at the end of boot or resume or during a runtime cpu
+ * online). If we're doing that, @reg is set to something special and on
+ * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
+ * started the boot/resume sequence, this might be a duplicate
+ * set_all()).
+ */
if (data->smp_reg != ~0U) {
mtrr_if->set(data->smp_reg, data->smp_base,
data->smp_size, data->smp_type);
- } else if (mtrr_aps_delayed_init) {
- /*
- * Initialize the MTRRs inaddition to the synchronisation.
- */
+ } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
mtrr_if->set_all();
}
-
- atomic_dec(&data->count);
- while (!atomic_read(&data->gate))
- cpu_relax();
-
- atomic_dec(&data->count);
- local_irq_restore(flags);
#endif
return 0;
}
@@ -223,20 +208,11 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
* 14. Wait for buddies to catch up
* 15. Enable interrupts.
*
- * What does that mean for us? Well, first we set data.count to the number
- * of CPUs. As each CPU announces that it started the rendezvous handler by
- * decrementing the count, We reset data.count and set the data.gate flag
- * allowing all the cpu's to proceed with the work. As each cpu disables
- * interrupts, it'll decrement data.count once. We wait until it hits 0 and
- * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
- * are waiting for that flag to be cleared. Once it's cleared, each
- * CPU goes through the transition of updating MTRRs.
- * The CPU vendors may each do it differently,
- * so we call mtrr_if->set() callback and let them take care of it.
- * When they're done, they again decrement data->count and wait for data.gate
- * to be set.
- * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
- * Everyone then enables interrupts and we all continue on.
+ * What does that mean for us? Well, stop_machine() will ensure that
+ * the rendezvous handler is started on each CPU. And in lockstep they
+ * do the state transition of disabling interrupts, updating MTRR's
+ * (the CPU vendors may each do it differently, so we call mtrr_if->set()
+ * callback and let them take care of it.) and enabling interrupts.
*
* Note that the mechanism is the same for UP systems, too; all the SMP stuff
* becomes nops.
@@ -244,92 +220,26 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
static void
set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
{
- struct set_mtrr_data data;
- unsigned long flags;
- int cpu;
-
- preempt_disable();
-
- data.smp_reg = reg;
- data.smp_base = base;
- data.smp_size = size;
- data.smp_type = type;
- atomic_set(&data.count, num_booting_cpus() - 1);
-
- /* Make sure data.count is visible before unleashing other CPUs */
- smp_wmb();
- atomic_set(&data.gate, 0);
-
- /* Start the ball rolling on other CPUs */
- for_each_online_cpu(cpu) {
- struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
-
- if (cpu == smp_processor_id())
- continue;
-
- stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
- }
-
-
- while (atomic_read(&data.count))
- cpu_relax();
-
- /* Ok, reset count and toggle gate */
- atomic_set(&data.count, num_booting_cpus() - 1);
- smp_wmb();
- atomic_set(&data.gate, 1);
-
- local_irq_save(flags);
-
- while (atomic_read(&data.count))
- cpu_relax();
-
- /* Ok, reset count and toggle gate */
- atomic_set(&data.count, num_booting_cpus() - 1);
- smp_wmb();
- atomic_set(&data.gate, 0);
-
- /* Do our MTRR business */
-
- /*
- * HACK!
- *
- * We use this same function to initialize the mtrrs during boot,
- * resume, runtime cpu online and on an explicit request to set a
- * specific MTRR.
- *
- * During boot or suspend, the state of the boot cpu's mtrrs has been
- * saved, and we want to replicate that across all the cpus that come
- * online (either at the end of boot or resume or during a runtime cpu
- * online). If we're doing that, @reg is set to something special and on
- * this cpu we still do mtrr_if->set_all(). During boot/resume, this
- * is unnecessary if at this point we are still on the cpu that started
- * the boot/resume sequence. But there is no guarantee that we are still
- * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
- * sure that we are in sync with everyone else.
- */
- if (reg != ~0U)
- mtrr_if->set(reg, base, size, type);
- else
- mtrr_if->set_all();
+ struct set_mtrr_data data = { .smp_reg = reg,
+ .smp_base = base,
+ .smp_size = size,
+ .smp_type = type
+ };
- /* Wait for the others */
- while (atomic_read(&data.count))
- cpu_relax();
-
- atomic_set(&data.count, num_booting_cpus() - 1);
- smp_wmb();
- atomic_set(&data.gate, 1);
-
- /*
- * Wait here for everyone to have seen the gate change
- * So we're the last ones to touch 'data'
- */
- while (atomic_read(&data.count))
- cpu_relax();
+ stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
+}
- local_irq_restore(flags);
- preempt_enable();
+static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
+{
+ struct set_mtrr_data data = { .smp_reg = reg,
+ .smp_base = base,
+ .smp_size = size,
+ .smp_type = type
+ };
+
+ stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
+ cpu_callout_mask);
}
/**
@@ -783,7 +693,7 @@ void mtrr_ap_init(void)
* 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
* lock to prevent mtrr entry changes
*/
- set_mtrr(~0U, 0, 0, 0);
+ set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
}
/**
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e638689279d..4ee3abf20ed 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -22,7 +22,6 @@
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
-#include <linux/highmem.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
@@ -31,6 +30,7 @@
#include <asm/nmi.h>
#include <asm/compat.h>
#include <asm/smp.h>
+#include <asm/alternative.h>
#if 0
#undef wrmsrl
@@ -44,38 +44,27 @@ do { \
#endif
/*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ * | NHM/WSM | SNB |
+ * register -------------------------------
+ * | HT | no HT | HT | no HT |
+ *-----------------------------------------
+ * offcore | core | core | cpu | core |
+ * lbr_sel | core | core | cpu | core |
+ * ld_lat | cpu | core | cpu | core |
+ *-----------------------------------------
+ *
+ * Given that there is a small number of shared regs,
+ * we can pre-allocate their slot in the per-cpu
+ * per-core reg tables.
*/
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
- unsigned long offset, addr = (unsigned long)from;
- unsigned long size, len = 0;
- struct page *page;
- void *map;
- int ret;
-
- do {
- ret = __get_user_pages_fast(addr, 1, 0, &page);
- if (!ret)
- break;
-
- offset = addr & (PAGE_SIZE - 1);
- size = min(PAGE_SIZE - offset, n - len);
-
- map = kmap_atomic(page);
- memcpy(to, map+offset, size);
- kunmap_atomic(map);
- put_page(page);
-
- len += size;
- to += size;
- addr += size;
+enum extra_reg_type {
+ EXTRA_REG_NONE = -1, /* not used */
- } while (len < n);
+ EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
+ EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
- return len;
-}
+ EXTRA_REG_MAX /* number of entries needed */
+};
struct event_constraint {
union {
@@ -131,11 +120,10 @@ struct cpu_hw_events {
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
/*
- * Intel percore register state.
- * Coordinate shared resources between HT threads.
+ * manage shared (per-core, per-cpu) registers
+ * used on Intel NHM/WSM/SNB
*/
- int percore_used; /* Used by this CPU? */
- struct intel_percore *per_core;
+ struct intel_shared_regs *shared_regs;
/*
* AMD specific bits
@@ -186,26 +174,45 @@ struct cpu_hw_events {
for ((e) = (c); (e)->weight; (e)++)
/*
+ * Per register state.
+ */
+struct er_account {
+ raw_spinlock_t lock; /* per-core: protect structure */
+ u64 config; /* extra MSR config */
+ u64 reg; /* extra MSR number */
+ atomic_t ref; /* reference count */
+};
+
+/*
* Extra registers for specific events.
+ *
* Some events need large masks and require external MSRs.
- * Define a mapping to these extra registers.
+ * Those extra MSRs end up being shared for all events on
+ * a PMU and sometimes between PMU of sibling HT threads.
+ * In either case, the kernel needs to handle conflicting
+ * accesses to those extra, shared, regs. The data structure
+ * to manage those registers is stored in cpu_hw_event.
*/
struct extra_reg {
unsigned int event;
unsigned int msr;
u64 config_mask;
u64 valid_mask;
+ int idx; /* per_xxx->regs[] reg index */
};
-#define EVENT_EXTRA_REG(e, ms, m, vm) { \
+#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
.event = (e), \
.msr = (ms), \
.config_mask = (m), \
.valid_mask = (vm), \
+ .idx = EXTRA_REG_##i \
}
-#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
- EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
-#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
+
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
+ EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
+
+#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
union perf_capabilities {
struct {
@@ -251,7 +258,6 @@ struct x86_pmu {
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
struct event_constraint *event_constraints;
- struct event_constraint *percore_constraints;
void (*quirks)(void);
int perfctr_second_write;
@@ -285,8 +291,12 @@ struct x86_pmu {
* Extra registers for events
*/
struct extra_reg *extra_regs;
+ unsigned int er_flags;
};
+#define ERF_NO_HT_SHARING 1
+#define ERF_HAS_RSP_1 2
+
static struct x86_pmu x86_pmu __read_mostly;
static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -363,12 +373,18 @@ again:
return new_raw_count;
}
-/* using X86_FEATURE_PERFCTR_CORE to later implement ALTERNATIVE() here */
static inline int x86_pmu_addr_offset(int index)
{
- if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
- return index << 1;
- return index;
+ int offset;
+
+ /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
+ alternative_io(ASM_NOP2,
+ "shll $1, %%eax",
+ X86_FEATURE_PERFCTR_CORE,
+ "=a" (offset),
+ "a" (index));
+
+ return offset;
}
static inline unsigned int x86_pmu_config_addr(int index)
@@ -386,10 +402,10 @@ static inline unsigned int x86_pmu_event_addr(int index)
*/
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
+ struct hw_perf_event_extra *reg;
struct extra_reg *er;
- event->hw.extra_reg = 0;
- event->hw.extra_config = 0;
+ reg = &event->hw.extra_reg;
if (!x86_pmu.extra_regs)
return 0;
@@ -399,8 +415,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
continue;
if (event->attr.config1 & ~er->valid_mask)
return -EINVAL;
- event->hw.extra_reg = er->msr;
- event->hw.extra_config = event->attr.config1;
+
+ reg->idx = er->idx;
+ reg->config = event->attr.config1;
+ reg->reg = er->msr;
break;
}
return 0;
@@ -699,6 +717,9 @@ static int __x86_pmu_event_init(struct perf_event *event)
event->hw.last_cpu = -1;
event->hw.last_tag = ~0ULL;
+ /* mark unused */
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+
return x86_pmu.hw_config(event);
}
@@ -740,8 +761,8 @@ static void x86_pmu_disable(struct pmu *pmu)
static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
u64 enable_mask)
{
- if (hwc->extra_reg)
- wrmsrl(hwc->extra_reg, hwc->extra_config);
+ if (hwc->extra_reg.reg)
+ wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
wrmsrl(hwc->config_base, hwc->config | enable_mask);
}
@@ -1325,7 +1346,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
if (!x86_perf_event_set_period(event))
continue;
- if (perf_event_overflow(event, 1, &data, regs))
+ if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
@@ -1630,6 +1651,40 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
perf_pmu_enable(pmu);
return 0;
}
+/*
+ * a fake_cpuc is used to validate event groups. Due to
+ * the extra reg logic, we need to also allocate a fake
+ * per_core and per_cpu structure. Otherwise, group events
+ * using extra reg may conflict without the kernel being
+ * able to catch this when the last event gets added to
+ * the group.
+ */
+static void free_fake_cpuc(struct cpu_hw_events *cpuc)
+{
+ kfree(cpuc->shared_regs);
+ kfree(cpuc);
+}
+
+static struct cpu_hw_events *allocate_fake_cpuc(void)
+{
+ struct cpu_hw_events *cpuc;
+ int cpu = raw_smp_processor_id();
+
+ cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
+ if (!cpuc)
+ return ERR_PTR(-ENOMEM);
+
+ /* only needed, if we have extra_regs */
+ if (x86_pmu.extra_regs) {
+ cpuc->shared_regs = allocate_shared_regs(cpu);
+ if (!cpuc->shared_regs)
+ goto error;
+ }
+ return cpuc;
+error:
+ free_fake_cpuc(cpuc);
+ return ERR_PTR(-ENOMEM);
+}
/*
* validate that we can schedule this event
@@ -1640,9 +1695,9 @@ static int validate_event(struct perf_event *event)
struct event_constraint *c;
int ret = 0;
- fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
- if (!fake_cpuc)
- return -ENOMEM;
+ fake_cpuc = allocate_fake_cpuc();
+ if (IS_ERR(fake_cpuc))
+ return PTR_ERR(fake_cpuc);
c = x86_pmu.get_event_constraints(fake_cpuc, event);
@@ -1652,7 +1707,7 @@ static int validate_event(struct perf_event *event)
if (x86_pmu.put_event_constraints)
x86_pmu.put_event_constraints(fake_cpuc, event);
- kfree(fake_cpuc);
+ free_fake_cpuc(fake_cpuc);
return ret;
}
@@ -1672,36 +1727,32 @@ static int validate_group(struct perf_event *event)
{
struct perf_event *leader = event->group_leader;
struct cpu_hw_events *fake_cpuc;
- int ret, n;
-
- ret = -ENOMEM;
- fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
- if (!fake_cpuc)
- goto out;
+ int ret = -ENOSPC, n;
+ fake_cpuc = allocate_fake_cpuc();
+ if (IS_ERR(fake_cpuc))
+ return PTR_ERR(fake_cpuc);
/*
* the event is not yet connected with its
* siblings therefore we must first collect
* existing siblings, then add the new event
* before we can simulate the scheduling
*/
- ret = -ENOSPC;
n = collect_events(fake_cpuc, leader, true);
if (n < 0)
- goto out_free;
+ goto out;
fake_cpuc->n_events = n;
n = collect_events(fake_cpuc, event, false);
if (n < 0)
- goto out_free;
+ goto out;
fake_cpuc->n_events = n;
ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
-out_free:
- kfree(fake_cpuc);
out:
+ free_fake_cpuc(fake_cpuc);
return ret;
}
@@ -1766,17 +1817,6 @@ static struct pmu pmu = {
* callchain support
*/
-static void
-backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- /* Ignore warnings */
-}
-
-static void backtrace_warning(void *data, char *msg)
-{
- /* Ignore warnings */
-}
-
static int backtrace_stack(void *data, char *name)
{
return 0;
@@ -1790,8 +1830,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
}
static const struct stacktrace_ops backtrace_ops = {
- .warning = backtrace_warning,
- .warning_symbol = backtrace_warning_symbol,
.stack = backtrace_stack,
.address = backtrace_address,
.walk_stack = print_context_stack_bp,
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index cf4e369cea6..941caa2e449 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -89,6 +89,20 @@ static __initconst const u64 amd_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
+ [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
};
/*
@@ -96,12 +110,14 @@ static __initconst const u64 amd_hw_cache_event_ids
*/
static const u64 amd_perfmon_event_map[] =
{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
+ [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
};
static u64 amd_pmu_event_map(int hw_event)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 447a28de6f0..45fbb8f7f54 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,25 +1,15 @@
#ifdef CONFIG_CPU_SUP_INTEL
-#define MAX_EXTRA_REGS 2
-
-/*
- * Per register state.
- */
-struct er_account {
- int ref; /* reference count */
- unsigned int extra_reg; /* extra MSR number */
- u64 extra_config; /* extra MSR config */
-};
-
/*
- * Per core state
- * This used to coordinate shared registers for HT threads.
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
*/
-struct intel_percore {
- raw_spinlock_t lock; /* protect structure */
- struct er_account regs[MAX_EXTRA_REGS];
- int refcnt; /* number of threads */
- unsigned core_id;
+struct intel_shared_regs {
+ struct er_account regs[EXTRA_REG_MAX];
+ int refcnt; /* per-core: #HT threads */
+ unsigned core_id; /* per-core: core id */
};
/*
@@ -36,7 +26,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
[PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
};
-static struct event_constraint intel_core_event_constraints[] =
+static struct event_constraint intel_core_event_constraints[] __read_mostly =
{
INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -47,7 +37,7 @@ static struct event_constraint intel_core_event_constraints[] =
EVENT_CONSTRAINT_END
};
-static struct event_constraint intel_core2_event_constraints[] =
+static struct event_constraint intel_core2_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -70,7 +60,7 @@ static struct event_constraint intel_core2_event_constraints[] =
EVENT_CONSTRAINT_END
};
-static struct event_constraint intel_nehalem_event_constraints[] =
+static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -86,19 +76,13 @@ static struct event_constraint intel_nehalem_event_constraints[] =
EVENT_CONSTRAINT_END
};
-static struct extra_reg intel_nehalem_extra_regs[] =
+static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
{
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
+ INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
EVENT_EXTRA_END
};
-static struct event_constraint intel_nehalem_percore_constraints[] =
-{
- INTEL_EVENT_CONSTRAINT(0xb7, 0),
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_westmere_event_constraints[] =
+static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -110,34 +94,30 @@ static struct event_constraint intel_westmere_event_constraints[] =
EVENT_CONSTRAINT_END
};
-static struct event_constraint intel_snb_event_constraints[] =
+static struct event_constraint intel_snb_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
- INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
- INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
EVENT_CONSTRAINT_END
};
-static struct extra_reg intel_westmere_extra_regs[] =
+static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
{
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
- INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
+ INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+ INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
EVENT_EXTRA_END
};
-static struct event_constraint intel_westmere_percore_constraints[] =
+static struct event_constraint intel_v1_event_constraints[] __read_mostly =
{
- INTEL_EVENT_CONSTRAINT(0xb7, 0),
- INTEL_EVENT_CONSTRAINT(0xbb, 0),
EVENT_CONSTRAINT_END
};
-static struct event_constraint intel_gen_event_constraints[] =
+static struct event_constraint intel_gen_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -145,6 +125,12 @@ static struct event_constraint intel_gen_event_constraints[] =
EVENT_CONSTRAINT_END
};
+static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
+ INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+ INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+ EVENT_EXTRA_END
+};
+
static u64 intel_pmu_event_map(int hw_event)
{
return intel_perfmon_event_map[hw_event];
@@ -245,6 +231,21 @@ static __initconst const u64 snb_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+
};
static __initconst const u64 westmere_hw_cache_event_ids
@@ -346,6 +347,20 @@ static __initconst const u64 westmere_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
};
/*
@@ -398,7 +413,21 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
[ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
},
- }
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
+ [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
+ [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
+ [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
+ },
+ },
};
static __initconst const u64 nehalem_hw_cache_event_ids
@@ -500,6 +529,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
};
static __initconst const u64 core2_hw_cache_event_ids
@@ -1003,7 +1046,7 @@ again:
data.period = event->hw.last_period;
- if (perf_event_overflow(event, 1, &data, regs))
+ if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
@@ -1037,65 +1080,121 @@ intel_bts_constraints(struct perf_event *event)
return NULL;
}
+static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
+{
+ if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
+ return false;
+
+ if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
+ event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+ event->hw.config |= 0x01bb;
+ event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
+ event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
+ } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
+ event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+ event->hw.config |= 0x01b7;
+ event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
+ event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+ }
+
+ if (event->hw.extra_reg.idx == orig_idx)
+ return false;
+
+ return true;
+}
+
+/*
+ * manage allocation of shared extra msr for certain events
+ *
+ * sharing can be:
+ * per-cpu: to be shared between the various events on a single PMU
+ * per-core: per-cpu + shared by HT threads
+ */
static struct event_constraint *
-intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
{
- struct hw_perf_event *hwc = &event->hw;
- unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
- struct event_constraint *c;
- struct intel_percore *pc;
+ struct event_constraint *c = &emptyconstraint;
+ struct hw_perf_event_extra *reg = &event->hw.extra_reg;
struct er_account *era;
- int i;
- int free_slot;
- int found;
+ unsigned long flags;
+ int orig_idx = reg->idx;
- if (!x86_pmu.percore_constraints || hwc->extra_alloc)
- return NULL;
+ /* already allocated shared msr */
+ if (reg->alloc)
+ return &unconstrained;
- for (c = x86_pmu.percore_constraints; c->cmask; c++) {
- if (e != c->code)
- continue;
+again:
+ era = &cpuc->shared_regs->regs[reg->idx];
+ /*
+ * we use spin_lock_irqsave() to avoid lockdep issues when
+ * passing a fake cpuc
+ */
+ raw_spin_lock_irqsave(&era->lock, flags);
+
+ if (!atomic_read(&era->ref) || era->config == reg->config) {
+
+ /* lock in msr value */
+ era->config = reg->config;
+ era->reg = reg->reg;
+
+ /* one more user */
+ atomic_inc(&era->ref);
+
+ /* no need to reallocate during incremental event scheduling */
+ reg->alloc = 1;
/*
- * Allocate resource per core.
+ * All events using extra_reg are unconstrained.
+ * Avoids calling x86_get_event_constraints()
+ *
+ * Must revisit if extra_reg controlling events
+ * ever have constraints. Worst case we go through
+ * the regular event constraint table.
*/
- pc = cpuc->per_core;
- if (!pc)
- break;
- c = &emptyconstraint;
- raw_spin_lock(&pc->lock);
- free_slot = -1;
- found = 0;
- for (i = 0; i < MAX_EXTRA_REGS; i++) {
- era = &pc->regs[i];
- if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
- /* Allow sharing same config */
- if (hwc->extra_config == era->extra_config) {
- era->ref++;
- cpuc->percore_used = 1;
- hwc->extra_alloc = 1;
- c = NULL;
- }
- /* else conflict */
- found = 1;
- break;
- } else if (era->ref == 0 && free_slot == -1)
- free_slot = i;
- }
- if (!found && free_slot != -1) {
- era = &pc->regs[free_slot];
- era->ref = 1;
- era->extra_reg = hwc->extra_reg;
- era->extra_config = hwc->extra_config;
- cpuc->percore_used = 1;
- hwc->extra_alloc = 1;
- c = NULL;
- }
- raw_spin_unlock(&pc->lock);
- return c;
+ c = &unconstrained;
+ } else if (intel_try_alt_er(event, orig_idx)) {
+ raw_spin_unlock(&era->lock);
+ goto again;
}
+ raw_spin_unlock_irqrestore(&era->lock, flags);
- return NULL;
+ return c;
+}
+
+static void
+__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
+ struct hw_perf_event_extra *reg)
+{
+ struct er_account *era;
+
+ /*
+ * only put constraint if extra reg was actually
+ * allocated. Also takes care of event which do
+ * not use an extra shared reg
+ */
+ if (!reg->alloc)
+ return;
+
+ era = &cpuc->shared_regs->regs[reg->idx];
+
+ /* one fewer user */
+ atomic_dec(&era->ref);
+
+ /* allocate again next time */
+ reg->alloc = 0;
+}
+
+static struct event_constraint *
+intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct event_constraint *c = NULL;
+
+ if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
+ c = __intel_shared_reg_get_constraints(cpuc, event);
+
+ return c;
}
static struct event_constraint *
@@ -1111,49 +1210,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
if (c)
return c;
- c = intel_percore_constraints(cpuc, event);
+ c = intel_shared_regs_constraints(cpuc, event);
if (c)
return c;
return x86_get_event_constraints(cpuc, event);
}
-static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+static void
+intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
- struct extra_reg *er;
- struct intel_percore *pc;
- struct er_account *era;
- struct hw_perf_event *hwc = &event->hw;
- int i, allref;
-
- if (!cpuc->percore_used)
- return;
+ struct hw_perf_event_extra *reg;
- for (er = x86_pmu.extra_regs; er->msr; er++) {
- if (er->event != (hwc->config & er->config_mask))
- continue;
+ reg = &event->hw.extra_reg;
+ if (reg->idx != EXTRA_REG_NONE)
+ __intel_shared_reg_put_constraints(cpuc, reg);
+}
- pc = cpuc->per_core;
- raw_spin_lock(&pc->lock);
- for (i = 0; i < MAX_EXTRA_REGS; i++) {
- era = &pc->regs[i];
- if (era->ref > 0 &&
- era->extra_config == hwc->extra_config &&
- era->extra_reg == er->msr) {
- era->ref--;
- hwc->extra_alloc = 0;
- break;
- }
- }
- allref = 0;
- for (i = 0; i < MAX_EXTRA_REGS; i++)
- allref += pc->regs[i].ref;
- if (allref == 0)
- cpuc->percore_used = 0;
- raw_spin_unlock(&pc->lock);
- break;
- }
+static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ intel_put_shared_regs_event_constraints(cpuc, event);
}
static int intel_pmu_hw_config(struct perf_event *event)
@@ -1231,20 +1309,36 @@ static __initconst const struct x86_pmu core_pmu = {
.event_constraints = intel_core_event_constraints,
};
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+ struct intel_shared_regs *regs;
+ int i;
+
+ regs = kzalloc_node(sizeof(struct intel_shared_regs),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (regs) {
+ /*
+ * initialize the locks to keep lockdep happy
+ */
+ for (i = 0; i < EXTRA_REG_MAX; i++)
+ raw_spin_lock_init(&regs->regs[i].lock);
+
+ regs->core_id = -1;
+ }
+ return regs;
+}
+
static int intel_pmu_cpu_prepare(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- if (!cpu_has_ht_siblings())
+ if (!x86_pmu.extra_regs)
return NOTIFY_OK;
- cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
- GFP_KERNEL, cpu_to_node(cpu));
- if (!cpuc->per_core)
+ cpuc->shared_regs = allocate_shared_regs(cpu);
+ if (!cpuc->shared_regs)
return NOTIFY_BAD;
- raw_spin_lock_init(&cpuc->per_core->lock);
- cpuc->per_core->core_id = -1;
return NOTIFY_OK;
}
@@ -1260,32 +1354,34 @@ static void intel_pmu_cpu_starting(int cpu)
*/
intel_pmu_lbr_reset();
- if (!cpu_has_ht_siblings())
+ if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
return;
for_each_cpu(i, topology_thread_cpumask(cpu)) {
- struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
+ struct intel_shared_regs *pc;
+ pc = per_cpu(cpu_hw_events, i).shared_regs;
if (pc && pc->core_id == core_id) {
- kfree(cpuc->per_core);
- cpuc->per_core = pc;
+ kfree(cpuc->shared_regs);
+ cpuc->shared_regs = pc;
break;
}
}
- cpuc->per_core->core_id = core_id;
- cpuc->per_core->refcnt++;
+ cpuc->shared_regs->core_id = core_id;
+ cpuc->shared_regs->refcnt++;
}
static void intel_pmu_cpu_dying(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- struct intel_percore *pc = cpuc->per_core;
+ struct intel_shared_regs *pc;
+ pc = cpuc->shared_regs;
if (pc) {
if (pc->core_id == -1 || --pc->refcnt == 0)
kfree(pc);
- cpuc->per_core = NULL;
+ cpuc->shared_regs = NULL;
}
fini_debug_store_on_cpu(cpu);
@@ -1436,10 +1532,14 @@ static __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_nehalem_event_constraints;
x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
- x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
x86_pmu.extra_regs = intel_nehalem_extra_regs;
+ /* UOPS_ISSUED.STALLED_CYCLES */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+ /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+
if (ebx & 0x40) {
/*
* Erratum AAJ80 detected, we work it around by using
@@ -1476,10 +1576,16 @@ static __init int intel_pmu_init(void)
intel_pmu_lbr_init_nhm();
x86_pmu.event_constraints = intel_westmere_event_constraints;
- x86_pmu.percore_constraints = intel_westmere_percore_constraints;
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
x86_pmu.extra_regs = intel_westmere_extra_regs;
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+
+ /* UOPS_ISSUED.STALLED_CYCLES */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+ /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+
pr_cont("Westmere events, ");
break;
@@ -1491,15 +1597,33 @@ static __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_snb_event_constraints;
x86_pmu.pebs_constraints = intel_snb_pebs_events;
+ x86_pmu.extra_regs = intel_snb_extra_regs;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+ /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+ /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
+
pr_cont("SandyBridge events, ");
break;
default:
- /*
- * default constraints for v2 and up
- */
- x86_pmu.event_constraints = intel_gen_event_constraints;
- pr_cont("generic architected perfmon, ");
+ switch (x86_pmu.version) {
+ case 1:
+ x86_pmu.event_constraints = intel_v1_event_constraints;
+ pr_cont("generic architected perfmon v1, ");
+ break;
+ default:
+ /*
+ * default constraints for v2 and up
+ */
+ x86_pmu.event_constraints = intel_gen_event_constraints;
+ pr_cont("generic architected perfmon, ");
+ break;
+ }
}
return 0;
}
@@ -1511,4 +1635,8 @@ static int intel_pmu_init(void)
return 0;
}
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+ return NULL;
+}
#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index bab491b8ee2..1b1ef3addcf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void)
*/
perf_prepare_sample(&header, &data, event, &regs);
- if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
+ if (perf_output_begin(&handle, event, header.size * (top - at)))
return 1;
for (; at < top; at++) {
@@ -616,7 +616,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
else
regs.flags &= ~PERF_EFLAGS_EXACT;
- if (perf_event_overflow(event, 1, &data, &regs))
+ if (perf_event_overflow(event, &data, &regs))
x86_pmu_stop(event, 0);
}
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index e93fcd55fae..7809d2bcb20 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -468,7 +468,7 @@ static struct p4_event_bind p4_event_bind_map[] = {
.opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
.escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
+ P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_X87_ASSIST] = {
@@ -554,13 +554,102 @@ static __initconst const u64 p4_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
};
+/*
+ * Because of Netburst being quite restricted in how many
+ * identical events may run simultaneously, we introduce event aliases,
+ * ie the different events which have the same functionality but
+ * utilize non-intersected resources (ESCR/CCCR/counter registers).
+ *
+ * This allow us to relax restrictions a bit and run two or more
+ * identical events together.
+ *
+ * Never set any custom internal bits such as P4_CONFIG_HT,
+ * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
+ * either up to date automatically or not applicable at all.
+ */
+struct p4_event_alias {
+ u64 original;
+ u64 alternative;
+} p4_event_aliases[] = {
+ {
+ /*
+ * Non-halted cycles can be substituted with non-sleeping cycles (see
+ * Intel SDM Vol3b for details). We need this alias to be able
+ * to run nmi-watchdog and 'perf top' (or any other user space tool
+ * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
+ * simultaneously.
+ */
+ .original =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+ .alternative =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
+ p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
+ P4_CCCR_COMPARE),
+ },
+};
+
+static u64 p4_get_alias_event(u64 config)
+{
+ u64 config_match;
+ int i;
+
+ /*
+ * Only event with special mark is allowed,
+ * we're to be sure it didn't come as malformed
+ * RAW event.
+ */
+ if (!(config & P4_CONFIG_ALIASABLE))
+ return 0;
+
+ config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
+
+ for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
+ if (config_match == p4_event_aliases[i].original) {
+ config_match = p4_event_aliases[i].alternative;
+ break;
+ } else if (config_match == p4_event_aliases[i].alternative) {
+ config_match = p4_event_aliases[i].original;
+ break;
+ }
+ }
+
+ if (i >= ARRAY_SIZE(p4_event_aliases))
+ return 0;
+
+ return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
+}
+
static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
/* non-halted CPU clocks */
[PERF_COUNT_HW_CPU_CYCLES] =
p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) |
+ P4_CONFIG_ALIASABLE,
/*
* retired instructions
@@ -912,8 +1001,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
- data.addr = 0;
- data.raw = NULL;
+ perf_sample_data_init(&data, 0);
cpuc = &__get_cpu_var(cpu_hw_events);
@@ -946,7 +1034,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
if (!x86_perf_event_set_period(event))
continue;
- if (perf_event_overflow(event, 1, &data, regs))
+ if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
@@ -1121,6 +1209,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
struct p4_event_bind *bind;
unsigned int i, thread, num;
int cntr_idx, escr_idx;
+ u64 config_alias;
+ int pass;
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
@@ -1129,6 +1219,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
hwc = &cpuc->event_list[i]->hw;
thread = p4_ht_thread(cpu);
+ pass = 0;
+
+again:
+ /*
+ * It's possible to hit a circular lock
+ * between original and alternative events
+ * if both are scheduled already.
+ */
+ if (pass > 2)
+ goto done;
+
bind = p4_config_get_bind(hwc->config);
escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
if (unlikely(escr_idx == -1))
@@ -1142,8 +1243,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
}
cntr_idx = p4_next_cntr(thread, used_mask, bind);
- if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
- goto done;
+ if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
+ /*
+ * Check whether an event alias is still available.
+ */
+ config_alias = p4_get_alias_event(hwc->config);
+ if (!config_alias)
+ goto done;
+ hwc->config = config_alias;
+ pass++;
+ goto again;
+ }
p4_pmu_swap_config_ts(hwc, cpu);
if (assign)
@@ -1197,7 +1307,7 @@ static __init int p4_pmu_init(void)
{
unsigned int low, high;
- /* If we get stripped -- indexig fails */
+ /* If we get stripped -- indexing fails */
BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
rdmsr(MSR_IA32_MISC_ENABLE, low, high);
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index e90f08458e6..a621f342768 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/pci.h>
#include <linux/of_pci.h>
+#include <linux/initrd.h>
#include <asm/hpet.h>
#include <asm/irq_controller.h>
@@ -98,6 +99,16 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
}
+#ifdef CONFIG_BLK_DEV_INITRD
+void __init early_init_dt_setup_initrd_arch(unsigned long start,
+ unsigned long end)
+{
+ initrd_start = (unsigned long)__va(start);
+ initrd_end = (unsigned long)__va(end);
+ initrd_below_start_ok = 1;
+}
+#endif
+
void __init add_dtb(u64 data)
{
initial_dtb = data + offsetof(struct setup_data, data);
@@ -123,6 +134,24 @@ static int __init add_bus_probe(void)
module_init(add_bus_probe);
#ifdef CONFIG_PCI
+struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+ struct device_node *np;
+
+ for_each_node_by_type(np, "pci") {
+ const void *prop;
+ unsigned int bus_min;
+
+ prop = of_get_property(np, "bus-range", NULL);
+ if (!prop)
+ continue;
+ bus_min = be32_to_cpup(prop);
+ if (bus->number == bus_min)
+ return np;
+ }
+ return NULL;
+}
+
static int x86_of_pci_irq_enable(struct pci_dev *dev)
{
struct of_irq oirq;
@@ -154,50 +183,8 @@ static void x86_of_pci_irq_disable(struct pci_dev *dev)
void __cpuinit x86_of_pci_init(void)
{
- struct device_node *np;
-
pcibios_enable_irq = x86_of_pci_irq_enable;
pcibios_disable_irq = x86_of_pci_irq_disable;
-
- for_each_node_by_type(np, "pci") {
- const void *prop;
- struct pci_bus *bus;
- unsigned int bus_min;
- struct device_node *child;
-
- prop = of_get_property(np, "bus-range", NULL);
- if (!prop)
- continue;
- bus_min = be32_to_cpup(prop);
-
- bus = pci_find_bus(0, bus_min);
- if (!bus) {
- printk(KERN_ERR "Can't find a node for bus %s.\n",
- np->full_name);
- continue;
- }
-
- if (bus->self)
- bus->self->dev.of_node = np;
- else
- bus->dev.of_node = np;
-
- for_each_child_of_node(np, child) {
- struct pci_dev *dev;
- u32 devfn;
-
- prop = of_get_property(child, "reg", NULL);
- if (!prop)
- continue;
-
- devfn = (be32_to_cpup(prop) >> 8) & 0xff;
- dev = pci_get_slot(bus, devfn);
- if (!dev)
- continue;
- dev->dev.of_node = child;
- pci_dev_put(dev);
- }
- }
}
#endif
@@ -369,6 +356,7 @@ static struct of_ioapic_type of_ioapic_type[] =
static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
u32 *out_hwirq, u32 *out_type)
{
+ struct mp_ioapic_gsi *gsi_cfg;
struct io_apic_irq_attr attr;
struct of_ioapic_type *it;
u32 line, idx, type;
@@ -378,7 +366,8 @@ static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
line = *intspec;
idx = (u32) id->priv;
- *out_hwirq = line + mp_gsi_routing[idx].gsi_base;
+ gsi_cfg = mp_ioapic_gsi_routing(idx);
+ *out_hwirq = line + gsi_cfg->gsi_base;
intspec++;
type = *intspec;
@@ -407,7 +396,7 @@ static void __init ioapic_add_ofnode(struct device_node *np)
}
for (i = 0; i < nr_ioapics; i++) {
- if (r.start == mp_ioapics[i].apicaddr) {
+ if (r.start == mpc_ioapic_addr(i)) {
struct irq_domain *id;
id = kzalloc(sizeof(*id), GFP_KERNEL);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index e2a3f0606da..1aae78f775f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -135,20 +135,6 @@ print_context_stack_bp(struct thread_info *tinfo,
}
EXPORT_SYMBOL_GPL(print_context_stack_bp);
-
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- printk(data);
- print_symbol(msg, symbol);
- printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
- printk("%s%s\n", (char *)data, msg);
-}
-
static int print_trace_stack(void *data, char *name)
{
printk("%s <%s> ", (char *)data, name);
@@ -166,8 +152,6 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
}
static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
.stack = print_trace_stack,
.address = print_trace_address,
.walk_stack = print_context_stack,
@@ -279,7 +263,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
printk("DEBUG_PAGEALLOC");
#endif
printk("\n");
- sysfs_printk_last_file();
if (notify_die(DIE_OOPS, str, regs, err,
current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
return 1;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index e71c98d3c0d..19853ad8afc 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -105,34 +105,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
}
/*
- * We are returning from the irq stack and go to the previous one.
- * If the previous stack is also in the irq stack, then bp in the first
- * frame of the irq stack points to the previous, interrupted one.
- * Otherwise we have another level of indirection: We first save
- * the bp of the previous stack, then we switch the stack to the irq one
- * and save a new bp that links to the previous one.
- * (See save_args())
- */
-static inline unsigned long
-fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
- unsigned long *irq_stack, unsigned long *irq_stack_end)
-{
-#ifdef CONFIG_FRAME_POINTER
- struct stack_frame *frame = (struct stack_frame *)bp;
- unsigned long next;
-
- if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
- if (!probe_kernel_address(&frame->next_frame, next))
- return next;
- else
- WARN_ONCE(1, "Perf: bad frame pointer = %p in "
- "callchain\n", &frame->next_frame);
- }
-#endif
- return bp;
-}
-
-/*
* x86-64 can have up to three kernel stacks:
* process stack
* interrupt stack
@@ -155,9 +127,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
task = current;
if (!stack) {
- stack = &dummy;
- if (task && task != current)
+ if (regs)
+ stack = (unsigned long *)regs->sp;
+ else if (task && task != current)
stack = (unsigned long *)task->thread.sp;
+ else
+ stack = &dummy;
}
if (!bp)
@@ -205,8 +180,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
* pointer (index -1 to end) in the IRQ stack:
*/
stack = (unsigned long *) (irq_stack_end[-1]);
- bp = fixup_bp_irq_link(bp, stack, irq_stack,
- irq_stack_end);
irq_stack_end = NULL;
ops->stack(data, "EOI");
continue;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8a445a0c989..e13329d800c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -9,6 +9,8 @@
/*
* entry.S contains the system-call and fault low-level handling routines.
*
+ * Some of this is documented in Documentation/x86/entry_64.txt
+ *
* NOTE: This code handles signal-recognition, which happens every time
* after an interrupt and after each system call.
*
@@ -297,27 +299,26 @@ ENDPROC(native_usergs_sysret64)
.endm
/* save partial stack frame */
- .pushsection .kprobes.text, "ax"
-ENTRY(save_args)
- XCPT_FRAME
+ .macro SAVE_ARGS_IRQ
cld
- /*
- * start from rbp in pt_regs and jump over
- * return address.
- */
- movq_cfi rdi, RDI+8-RBP
- movq_cfi rsi, RSI+8-RBP
- movq_cfi rdx, RDX+8-RBP
- movq_cfi rcx, RCX+8-RBP
- movq_cfi rax, RAX+8-RBP
- movq_cfi r8, R8+8-RBP
- movq_cfi r9, R9+8-RBP
- movq_cfi r10, R10+8-RBP
- movq_cfi r11, R11+8-RBP
-
- leaq -RBP+8(%rsp),%rdi /* arg1 for handler */
- movq_cfi rbp, 8 /* push %rbp */
- leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
+ /* start from rbp in pt_regs and jump over */
+ movq_cfi rdi, RDI-RBP
+ movq_cfi rsi, RSI-RBP
+ movq_cfi rdx, RDX-RBP
+ movq_cfi rcx, RCX-RBP
+ movq_cfi rax, RAX-RBP
+ movq_cfi r8, R8-RBP
+ movq_cfi r9, R9-RBP
+ movq_cfi r10, R10-RBP
+ movq_cfi r11, R11-RBP
+
+ /* Save rbp so that we can unwind from get_irq_regs() */
+ movq_cfi rbp, 0
+
+ /* Save previous stack value */
+ movq %rsp, %rsi
+
+ leaq -RBP(%rsp),%rdi /* arg1 for handler */
testl $3, CS(%rdi)
je 1f
SWAPGS
@@ -329,19 +330,14 @@ ENTRY(save_args)
*/
1: incl PER_CPU_VAR(irq_count)
jne 2f
- popq_cfi %rax /* move return address... */
mov PER_CPU_VAR(irq_stack_ptr),%rsp
EMPTY_FRAME 0
- pushq_cfi %rbp /* backlink for unwinder */
- pushq_cfi %rax /* ... to the new stack */
- /*
- * We entered an interrupt context - irqs are off:
- */
-2: TRACE_IRQS_OFF
- ret
- CFI_ENDPROC
-END(save_args)
- .popsection
+
+2: /* Store previous stack value */
+ pushq %rsi
+ /* We entered an interrupt context - irqs are off: */
+ TRACE_IRQS_OFF
+ .endm
ENTRY(save_rest)
PARTIAL_FRAME 1 REST_SKIP+8
@@ -473,7 +469,7 @@ ENTRY(system_call_after_swapgs)
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,1
+ SAVE_ARGS 8,0
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
@@ -508,7 +504,7 @@ sysret_check:
TRACE_IRQS_ON
movq RIP-ARGOFFSET(%rsp),%rcx
CFI_REGISTER rip,rcx
- RESTORE_ARGS 0,-ARG_SKIP,1
+ RESTORE_ARGS 1,-ARG_SKIP,0
/*CFI_REGISTER rflags,r11*/
movq PER_CPU_VAR(old_rsp), %rsp
USERGS_SYSRET64
@@ -791,7 +787,7 @@ END(interrupt)
/* reserve pt_regs for scratch regs and rbp */
subq $ORIG_RAX-RBP, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
- call save_args
+ SAVE_ARGS_IRQ
PARTIAL_FRAME 0
call \func
.endm
@@ -814,15 +810,14 @@ ret_from_intr:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
decl PER_CPU_VAR(irq_count)
- leaveq
- CFI_RESTORE rbp
+ /* Restore saved previous stack */
+ popq %rsi
+ leaq 16(%rsi), %rsp
+
CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET -8
+ CFI_ADJUST_CFA_OFFSET -16
- /* we did not save rbx, restore only from ARGOFFSET */
- addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
exit_intr:
GET_THREAD_INFO(%rcx)
testl $3,CS-ARGOFFSET(%rsp)
@@ -858,7 +853,7 @@ retint_restore_args: /* return to kernel space */
*/
TRACE_IRQS_IRETQ
restore_args:
- RESTORE_ARGS 0,8,0
+ RESTORE_ARGS 1,8,1
irq_return:
INTERRUPT_RETURN
@@ -991,11 +986,6 @@ apicinterrupt THRESHOLD_APIC_VECTOR \
apicinterrupt THERMAL_APIC_VECTOR \
thermal_interrupt smp_thermal_interrupt
-#ifdef CONFIG_X86_MCE
-apicinterrupt MCE_SELF_VECTOR \
- mce_self_interrupt smp_mce_self_interrupt
-#endif
-
#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
call_function_single_interrupt smp_call_function_single_interrupt
@@ -1121,6 +1111,8 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
zeroentry coprocessor_error do_coprocessor_error
errorentry alignment_check do_alignment_check
zeroentry simd_coprocessor_error do_simd_coprocessor_error
+zeroentry emulate_vsyscall do_emulate_vsyscall
+
/* Reload gs selector with exception handling */
/* edi: new selector */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index a93742a5746..c9a281f272f 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -123,7 +123,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
static atomic_t nmi_running = ATOMIC_INIT(0);
static int mod_code_status; /* holds return value of text write */
static void *mod_code_ip; /* holds the IP to write to */
-static void *mod_code_newcode; /* holds the text to write to the IP */
+static const void *mod_code_newcode; /* holds the text to write to the IP */
static unsigned nmi_wait_count;
static atomic_t nmi_update_count = ATOMIC_INIT(0);
@@ -225,7 +225,7 @@ within(unsigned long addr, unsigned long start, unsigned long end)
}
static int
-do_ftrace_mod_code(unsigned long ip, void *new_code)
+do_ftrace_mod_code(unsigned long ip, const void *new_code)
{
/*
* On x86_64, kernel text mappings are mapped read-only with
@@ -260,14 +260,14 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
return mod_code_status;
}
-static unsigned char *ftrace_nop_replace(void)
+static const unsigned char *ftrace_nop_replace(void)
{
- return ideal_nop5;
+ return ideal_nops[NOP_ATOMIC5];
}
static int
-ftrace_modify_code(unsigned long ip, unsigned char *old_code,
- unsigned char *new_code)
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code)
{
unsigned char replaced[MCOUNT_INSN_SIZE];
@@ -301,7 +301,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
int ftrace_make_nop(struct module *mod,
struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned char *new, *old;
+ unsigned const char *new, *old;
unsigned long ip = rec->ip;
old = ftrace_call_replace(ip, addr);
@@ -312,7 +312,7 @@ int ftrace_make_nop(struct module *mod,
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned char *new, *old;
+ unsigned const char *new, *old;
unsigned long ip = rec->ip;
old = ftrace_nop_replace();
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6d6bb36193..3bb08509a7a 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -23,7 +23,6 @@
static void __init i386_default_early_setup(void)
{
/* Initialize 32bit specific setup functions */
- x86_init.resources.probe_roms = probe_roms;
x86_init.resources.reserve_resources = i386_reserve_resources;
x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index bfe8f729e08..4aecc54236a 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
+#include <linux/i8253.h>
#include <linux/slab.h>
#include <linux/hpet.h>
#include <linux/init.h>
@@ -12,8 +13,8 @@
#include <linux/io.h>
#include <asm/fixmap.h>
-#include <asm/i8253.h>
#include <asm/hpet.h>
+#include <asm/time.h>
#define HPET_MASK CLOCKSOURCE_MASK(32)
@@ -71,7 +72,7 @@ static inline void hpet_set_mapping(void)
{
hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
#ifdef CONFIG_X86_64
- __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
+ __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
#endif
}
@@ -217,7 +218,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
/*
* Common hpet info
*/
-static unsigned long hpet_period;
+static unsigned long hpet_freq;
static void hpet_legacy_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt);
@@ -232,7 +233,6 @@ static struct clock_event_device hpet_clockevent = {
.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
.set_mode = hpet_legacy_set_mode,
.set_next_event = hpet_legacy_next_event,
- .shift = 32,
.irq = 0,
.rating = 50,
};
@@ -290,28 +290,12 @@ static void hpet_legacy_clockevent_register(void)
hpet_enable_legacy_int();
/*
- * The mult factor is defined as (include/linux/clockchips.h)
- * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
- * hpet_period is in units of femtoseconds (per cycle), so
- * mult/2^shift = cyc/ns = 10^6/hpet_period
- * mult = (10^6 * 2^shift)/hpet_period
- * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
- */
- hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
- hpet_period, hpet_clockevent.shift);
- /* Calculate the min / max delta */
- hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
- &hpet_clockevent);
- /* Setup minimum reprogramming delta. */
- hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA,
- &hpet_clockevent);
-
- /*
* Start hpet with the boot cpu mask and make it
* global after the IO_APIC has been initialized.
*/
hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
- clockevents_register_device(&hpet_clockevent);
+ clockevents_config_and_register(&hpet_clockevent, hpet_freq,
+ HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
global_clock_event = &hpet_clockevent;
printk(KERN_DEBUG "hpet clockevent registered\n");
}
@@ -549,7 +533,6 @@ static int hpet_setup_irq(struct hpet_dev *dev)
static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
{
struct clock_event_device *evt = &hdev->evt;
- uint64_t hpet_freq;
WARN_ON(cpu != smp_processor_id());
if (!(hdev->flags & HPET_DEV_VALID))
@@ -571,24 +554,10 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
evt->set_mode = hpet_msi_set_mode;
evt->set_next_event = hpet_msi_next_event;
- evt->shift = 32;
-
- /*
- * The period is a femto seconds value. We need to calculate the
- * scaled math multiplication factor for nanosecond to hpet tick
- * conversion.
- */
- hpet_freq = FSEC_PER_SEC;
- do_div(hpet_freq, hpet_period);
- evt->mult = div_sc((unsigned long) hpet_freq,
- NSEC_PER_SEC, evt->shift);
- /* Calculate the max delta */
- evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
- /* 5 usec minimum reprogramming delta. */
- evt->min_delta_ns = 5000;
-
evt->cpumask = cpumask_of(hdev->cpu);
- clockevents_register_device(evt);
+
+ clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA,
+ 0x7FFFFFFF);
}
#ifdef CONFIG_HPET
@@ -770,13 +739,6 @@ static cycle_t read_hpet(struct clocksource *cs)
return (cycle_t)hpet_readl(HPET_COUNTER);
}
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_hpet(void)
-{
- return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
-}
-#endif
-
static struct clocksource clocksource_hpet = {
.name = "hpet",
.rating = 250,
@@ -785,14 +747,13 @@ static struct clocksource clocksource_hpet = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = hpet_resume_counter,
#ifdef CONFIG_X86_64
- .vread = vread_hpet,
+ .archdata = { .vclock_mode = VCLOCK_HPET },
#endif
};
static int hpet_clocksource_register(void)
{
u64 start, now;
- u64 hpet_freq;
cycle_t t1;
/* Start the counter */
@@ -819,24 +780,7 @@ static int hpet_clocksource_register(void)
return -ENODEV;
}
- /*
- * The definition of mult is (include/linux/clocksource.h)
- * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
- * so we first need to convert hpet_period to ns/cyc units:
- * mult/2^shift = ns/cyc = hpet_period/10^6
- * mult = (hpet_period * 2^shift)/10^6
- * mult = (hpet_period << shift)/FSEC_PER_NSEC
- */
-
- /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
- *
- * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
- * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
- */
- hpet_freq = FSEC_PER_SEC;
- do_div(hpet_freq, hpet_period);
clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
-
return 0;
}
@@ -845,7 +789,9 @@ static int hpet_clocksource_register(void)
*/
int __init hpet_enable(void)
{
+ unsigned long hpet_period;
unsigned int id;
+ u64 freq;
int i;
if (!is_hpet_capable())
@@ -884,6 +830,14 @@ int __init hpet_enable(void)
goto out_nohpet;
/*
+ * The period is a femto seconds value. Convert it to a
+ * frequency.
+ */
+ freq = FSEC_PER_SEC;
+ do_div(freq, hpet_period);
+ hpet_freq = freq;
+
+ /*
* Read the HPET ID register to retrieve the IRQ routing
* information and the number of channels
*/
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 2dfd3159744..f2b96de3c7c 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,195 +3,27 @@
*
*/
#include <linux/clockchips.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/timex.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/io.h>
+#include <linux/i8253.h>
-#include <asm/i8253.h>
#include <asm/hpet.h>
+#include <asm/time.h>
#include <asm/smp.h>
-DEFINE_RAW_SPINLOCK(i8253_lock);
-EXPORT_SYMBOL(i8253_lock);
-
/*
* HPET replaces the PIT, when enabled. So we need to know, which of
* the two timers is used
*/
struct clock_event_device *global_clock_event;
-/*
- * Initialize the PIT timer.
- *
- * This is also called after resume to bring the PIT into operation again.
- */
-static void init_pit_timer(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- raw_spin_lock(&i8253_lock);
-
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- /* binary, mode 2, LSB/MSB, ch 0 */
- outb_pit(0x34, PIT_MODE);
- outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */
- outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */
- break;
-
- case CLOCK_EVT_MODE_SHUTDOWN:
- case CLOCK_EVT_MODE_UNUSED:
- if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
- evt->mode == CLOCK_EVT_MODE_ONESHOT) {
- outb_pit(0x30, PIT_MODE);
- outb_pit(0, PIT_CH0);
- outb_pit(0, PIT_CH0);
- }
- break;
-
- case CLOCK_EVT_MODE_ONESHOT:
- /* One shot setup */
- outb_pit(0x38, PIT_MODE);
- break;
-
- case CLOCK_EVT_MODE_RESUME:
- /* Nothing to do here */
- break;
- }
- raw_spin_unlock(&i8253_lock);
-}
-
-/*
- * Program the next event in oneshot mode
- *
- * Delta is given in PIT ticks
- */
-static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
-{
- raw_spin_lock(&i8253_lock);
- outb_pit(delta & 0xff , PIT_CH0); /* LSB */
- outb_pit(delta >> 8 , PIT_CH0); /* MSB */
- raw_spin_unlock(&i8253_lock);
-
- return 0;
-}
-
-/*
- * On UP the PIT can serve all of the possible timer functions. On SMP systems
- * it can be solely used for the global tick.
- *
- * The profiling and update capabilities are switched off once the local apic is
- * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
- * !using_apic_timer decisions in do_timer_interrupt_hook()
- */
-static struct clock_event_device pit_ce = {
- .name = "pit",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = init_pit_timer,
- .set_next_event = pit_next_event,
- .shift = 32,
- .irq = 0,
-};
-
-/*
- * Initialize the conversion factor and the min/max deltas of the clock event
- * structure and register the clock event source with the framework.
- */
void __init setup_pit_timer(void)
{
- /*
- * Start pit with the boot cpu mask and make it global after the
- * IO_APIC has been initialized.
- */
- pit_ce.cpumask = cpumask_of(smp_processor_id());
- pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
- pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
- pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
-
- clockevents_register_device(&pit_ce);
- global_clock_event = &pit_ce;
+ clockevent_i8253_init(true);
+ global_clock_event = &i8253_clockevent;
}
#ifndef CONFIG_X86_64
-/*
- * Since the PIT overflows every tick, its not very useful
- * to just read by itself. So use jiffies to emulate a free
- * running counter:
- */
-static cycle_t pit_read(struct clocksource *cs)
-{
- static int old_count;
- static u32 old_jifs;
- unsigned long flags;
- int count;
- u32 jifs;
-
- raw_spin_lock_irqsave(&i8253_lock, flags);
- /*
- * Although our caller may have the read side of xtime_lock,
- * this is now a seqlock, and we are cheating in this routine
- * by having side effects on state that we cannot undo if
- * there is a collision on the seqlock and our caller has to
- * retry. (Namely, old_jifs and old_count.) So we must treat
- * jiffies as volatile despite the lock. We read jiffies
- * before latching the timer count to guarantee that although
- * the jiffies value might be older than the count (that is,
- * the counter may underflow between the last point where
- * jiffies was incremented and the point where we latch the
- * count), it cannot be newer.
- */
- jifs = jiffies;
- outb_pit(0x00, PIT_MODE); /* latch the count ASAP */
- count = inb_pit(PIT_CH0); /* read the latched count */
- count |= inb_pit(PIT_CH0) << 8;
-
- /* VIA686a test code... reset the latch if count > max + 1 */
- if (count > LATCH) {
- outb_pit(0x34, PIT_MODE);
- outb_pit(LATCH & 0xff, PIT_CH0);
- outb_pit(LATCH >> 8, PIT_CH0);
- count = LATCH - 1;
- }
-
- /*
- * It's possible for count to appear to go the wrong way for a
- * couple of reasons:
- *
- * 1. The timer counter underflows, but we haven't handled the
- * resulting interrupt and incremented jiffies yet.
- * 2. Hardware problem with the timer, not giving us continuous time,
- * the counter does small "jumps" upwards on some Pentium systems,
- * (see c't 95/10 page 335 for Neptun bug.)
- *
- * Previous attempts to handle these cases intelligently were
- * buggy, so we just do the simple thing now.
- */
- if (count > old_count && jifs == old_jifs)
- count = old_count;
-
- old_count = count;
- old_jifs = jifs;
-
- raw_spin_unlock_irqrestore(&i8253_lock, flags);
-
- count = (LATCH - 1) - count;
-
- return (cycle_t)(jifs * LATCH) + count;
-}
-
-static struct clocksource pit_cs = {
- .name = "pit",
- .rating = 110,
- .read = pit_read,
- .mask = CLOCKSOURCE_MASK(32),
- .mult = 0,
- .shift = 20,
-};
-
static int __init init_pit_clocksource(void)
{
/*
@@ -202,13 +34,10 @@ static int __init init_pit_clocksource(void)
* - when local APIC timer is active (PIT is switched off)
*/
if (num_possible_cpus() > 1 || is_hpet_enabled() ||
- pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
+ i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
return 0;
- pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift);
-
- return clocksource_register(&pit_cs);
+ return clocksource_i8253_init();
}
arch_initcall(init_pit_clocksource);
-
#endif /* !CONFIG_X86_64 */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 1cb0b9fc78d..6c0802eb2f7 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -249,7 +249,7 @@ void fixup_irqs(void)
data = irq_desc_get_irq_data(desc);
affinity = data->affinity;
- if (!irq_has_action(irq) ||
+ if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
cpumask_subset(affinity, cpu_online_mask)) {
raw_spin_unlock(&desc->lock);
continue;
@@ -276,7 +276,8 @@ void fixup_irqs(void)
else if (!(warned++))
set_affinity = 0;
- if (!irqd_can_move_in_process_context(data) && chip->irq_unmask)
+ if (!irqd_can_move_in_process_context(data) &&
+ !irqd_irq_disabled(data) && chip->irq_unmask)
chip->irq_unmask(data);
raw_spin_unlock(&desc->lock);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f470e4ef993..f09d4bbe2d2 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -272,9 +272,6 @@ static void __init apic_intr_init(void)
#ifdef CONFIG_X86_MCE_THRESHOLD
alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
#endif
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
- alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
-#endif
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
/* self generated IPI for local APIC timer */
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 961b6b30ba9..3fee346ef54 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -34,7 +34,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
code.offset = entry->target -
(entry->code + JUMP_LABEL_NOP_SIZE);
} else
- memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+ memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
get_online_cpus();
mutex_lock(&text_mutex);
text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
@@ -44,7 +44,8 @@ void arch_jump_label_transform(struct jump_entry *entry,
void arch_jump_label_text_poke_early(jump_label_t addr)
{
- text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+ text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5],
+ JUMP_LABEL_NOP_SIZE);
}
#endif
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 5f9ecff328b..00354d4919a 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -608,7 +608,7 @@ int kgdb_arch_init(void)
return register_die_notifier(&kgdb_notifier);
}
-static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
+static void kgdb_hw_overflow_handler(struct perf_event *event,
struct perf_sample_data *data, struct pt_regs *regs)
{
struct task_struct *tsk = current;
@@ -638,7 +638,7 @@ void kgdb_arch_late(void)
for (i = 0; i < HBP_NUM; i++) {
if (breakinfo[i].pev)
continue;
- breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
+ breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
if (IS_ERR((void * __force)breakinfo[i].pev)) {
printk(KERN_ERR "kgdb: Could not allocate hw"
"breakpoints\nDisabling the kernel debugger\n");
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f98d3eafe07..6389a6bca11 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -26,8 +26,6 @@
#include <asm/x86_init.h>
#include <asm/reboot.h>
-#define KVM_SCALE 22
-
static int kvmclock = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
@@ -120,8 +118,6 @@ static struct clocksource kvm_clock = {
.read = kvm_clock_get_cycles,
.rating = 400,
.mask = CLOCKSOURCE_MASK(64),
- .mult = 1 << KVM_SCALE,
- .shift = KVM_SCALE,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -203,7 +199,7 @@ void __init kvmclock_init(void)
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
kvm_get_preset_lpj();
- clocksource_register(&kvm_clock);
+ clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.paravirt_enabled = 1;
pv_info.name = "KVM";
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c5610384ab1..591be0ee193 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,8 +66,8 @@ struct microcode_amd {
unsigned int mpb[0];
};
-#define UCODE_CONTAINER_SECTION_HDR 8
-#define UCODE_CONTAINER_HEADER_SIZE 12
+#define SECTION_HDR_SIZE 8
+#define CONTAINER_HDR_SZ 12
static struct equiv_cpu_entry *equiv_cpu_table;
@@ -157,7 +157,7 @@ static int apply_microcode_amd(int cpu)
static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
- unsigned int max_size, actual_size;
+ u32 max_size, actual_size;
#define F1XH_MPB_MAX_SIZE 2048
#define F14H_MPB_MAX_SIZE 1824
@@ -175,9 +175,9 @@ static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
break;
}
- actual_size = buf[4] + (buf[5] << 8);
+ actual_size = *(u32 *)(buf + 4);
- if (actual_size > size || actual_size > max_size) {
+ if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) {
pr_err("section size mismatch\n");
return 0;
}
@@ -191,7 +191,7 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
struct microcode_header_amd *mc = NULL;
unsigned int actual_size = 0;
- if (buf[0] != UCODE_UCODE_TYPE) {
+ if (*(u32 *)buf != UCODE_UCODE_TYPE) {
pr_err("invalid type field in container file section header\n");
goto out;
}
@@ -204,8 +204,8 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
if (!mc)
goto out;
- get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size);
- *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR;
+ get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size);
+ *mc_size = actual_size + SECTION_HDR_SIZE;
out:
return mc;
@@ -229,9 +229,10 @@ static int install_equiv_cpu_table(const u8 *buf)
return -ENOMEM;
}
- get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size);
+ get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
- return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
+ /* add header length */
+ return size + CONTAINER_HDR_SZ;
}
static void free_equiv_cpu_table(void)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index ab23f1ad4bf..52f256f2cc8 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/gfp.h>
+#include <linux/jump_label.h>
#include <asm/system.h>
#include <asm/page.h>
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5a532ce646b..9103b89c145 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -285,7 +285,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
intsrc.type = MP_INTSRC;
intsrc.irqflag = 0; /* conforming */
intsrc.srcbus = 0;
- intsrc.dstapic = mp_ioapics[0].apicid;
+ intsrc.dstapic = mpc_ioapic_id(0);
intsrc.irqtype = mp_INT;
@@ -715,17 +715,15 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
}
}
-static int
+static int __init
check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
{
- int ret = 0;
-
if (!mpc_new_phys || count <= mpc_new_length) {
WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
return -1;
}
- return ret;
+ return 0;
}
#else /* CONFIG_X86_IO_APIC */
static
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 9ea999a4dcc..b49d00da2ae 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -68,74 +68,10 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_mask);
-#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
-static __initdata void *dma32_bootmem_ptr;
-static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
-
-static int __init parse_dma32_size_opt(char *p)
-{
- if (!p)
- return -EINVAL;
- dma32_bootmem_size = memparse(p, &p);
- return 0;
-}
-early_param("dma32_size", parse_dma32_size_opt);
-
-void __init dma32_reserve_bootmem(void)
-{
- unsigned long size, align;
- if (max_pfn <= MAX_DMA32_PFN)
- return;
-
- /*
- * check aperture_64.c allocate_aperture() for reason about
- * using 512M as goal
- */
- align = 64ULL<<20;
- size = roundup(dma32_bootmem_size, align);
- dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
- 512ULL<<20);
- /*
- * Kmemleak should not scan this block as it may not be mapped via the
- * kernel direct mapping.
- */
- kmemleak_ignore(dma32_bootmem_ptr);
- if (dma32_bootmem_ptr)
- dma32_bootmem_size = size;
- else
- dma32_bootmem_size = 0;
-}
-static void __init dma32_free_bootmem(void)
-{
-
- if (max_pfn <= MAX_DMA32_PFN)
- return;
-
- if (!dma32_bootmem_ptr)
- return;
-
- free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
-
- dma32_bootmem_ptr = NULL;
- dma32_bootmem_size = 0;
-}
-#else
-void __init dma32_reserve_bootmem(void)
-{
-}
-static void __init dma32_free_bootmem(void)
-{
-}
-
-#endif
-
void __init pci_iommu_alloc(void)
{
struct iommu_table_entry *p;
- /* free the range so iommu could get some range less than 4G */
- dma32_free_bootmem();
-
sort_iommu_table(__iommu_table, __iommu_table_end);
check_iommu_entries(__iommu_table, __iommu_table_end);
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
index 55d745ec118..35ccf75696e 100644
--- a/arch/x86/kernel/pci-iommu_table.c
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -50,20 +50,14 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
struct iommu_table_entry *finish)
{
struct iommu_table_entry *p, *q, *x;
- char sym_p[KSYM_SYMBOL_LEN];
- char sym_q[KSYM_SYMBOL_LEN];
/* Simple cyclic dependency checker. */
for (p = start; p < finish; p++) {
q = find_dependents_of(start, finish, p);
x = find_dependents_of(start, finish, q);
if (p == x) {
- sprint_symbol(sym_p, (unsigned long)p->detect);
- sprint_symbol(sym_q, (unsigned long)q->detect);
-
- printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \
- " on %s and vice-versa. BREAKING IT.\n",
- sym_p, sym_q);
+ printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n",
+ p->detect, q->detect);
/* Heavy handed way..*/
x->depend = 0;
}
@@ -72,12 +66,8 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
for (p = start; p < finish; p++) {
q = find_dependents_of(p, finish, p);
if (q && q > p) {
- sprint_symbol(sym_p, (unsigned long)p->detect);
- sprint_symbol(sym_q, (unsigned long)q->detect);
-
- printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\
- "should be called before %s!\n",
- sym_p, sym_q);
+ printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n",
+ p->detect, q->detect);
}
}
}
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c
index 071e7fea42e..ba0a4cce53b 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -73,6 +73,107 @@ static struct resource video_rom_resource = {
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
};
+/* does this oprom support the given pci device, or any of the devices
+ * that the driver supports?
+ */
+static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
+{
+ struct pci_driver *drv = pdev->driver;
+ const struct pci_device_id *id;
+
+ if (pdev->vendor == vendor && pdev->device == device)
+ return true;
+
+ for (id = drv ? drv->id_table : NULL; id && id->vendor; id++)
+ if (id->vendor == vendor && id->device == device)
+ break;
+
+ return id && id->vendor;
+}
+
+static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
+ const unsigned char *rom_list)
+{
+ unsigned short device;
+
+ do {
+ if (probe_kernel_address(rom_list, device) != 0)
+ device = 0;
+
+ if (device && match_id(pdev, vendor, device))
+ break;
+
+ rom_list += 2;
+ } while (device);
+
+ return !!device;
+}
+
+static struct resource *find_oprom(struct pci_dev *pdev)
+{
+ struct resource *oprom = NULL;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
+ struct resource *res = &adapter_rom_resources[i];
+ unsigned short offset, vendor, device, list, rev;
+ const unsigned char *rom;
+
+ if (res->end == 0)
+ break;
+
+ rom = isa_bus_to_virt(res->start);
+ if (probe_kernel_address(rom + 0x18, offset) != 0)
+ continue;
+
+ if (probe_kernel_address(rom + offset + 0x4, vendor) != 0)
+ continue;
+
+ if (probe_kernel_address(rom + offset + 0x6, device) != 0)
+ continue;
+
+ if (match_id(pdev, vendor, device)) {
+ oprom = res;
+ break;
+ }
+
+ if (probe_kernel_address(rom + offset + 0x8, list) == 0 &&
+ probe_kernel_address(rom + offset + 0xc, rev) == 0 &&
+ rev >= 3 && list &&
+ probe_list(pdev, vendor, rom + offset + list)) {
+ oprom = res;
+ break;
+ }
+ }
+
+ return oprom;
+}
+
+void *pci_map_biosrom(struct pci_dev *pdev)
+{
+ struct resource *oprom = find_oprom(pdev);
+
+ if (!oprom)
+ return NULL;
+
+ return ioremap(oprom->start, resource_size(oprom));
+}
+EXPORT_SYMBOL(pci_map_biosrom);
+
+void pci_unmap_biosrom(void __iomem *image)
+{
+ iounmap(image);
+}
+EXPORT_SYMBOL(pci_unmap_biosrom);
+
+size_t pci_biosrom_size(struct pci_dev *pdev)
+{
+ struct resource *oprom = find_oprom(pdev);
+
+ return oprom ? resource_size(oprom) : 0;
+}
+EXPORT_SYMBOL(pci_biosrom_size);
+
#define ROMSIGNATURE 0xaa55
static int __init romsignature(const unsigned char *rom)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d46cbe46b7a..e1ba8cb24e4 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -337,7 +337,9 @@ EXPORT_SYMBOL(boot_option_idle_override);
* Powermanagement idle function, if any..
*/
void (*pm_idle)(void);
+#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(pm_idle);
+#endif
#ifdef CONFIG_X86_32
/*
@@ -449,7 +451,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
if (!need_resched()) {
- if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
+ if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
__monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -465,7 +467,7 @@ static void mwait_idle(void)
if (!need_resched()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
trace_cpu_idle(1, smp_processor_id());
- if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
+ if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
__monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -535,45 +537,45 @@ int mwait_usable(const struct cpuinfo_x86 *c)
return (edx & MWAIT_EDX_C1);
}
-bool c1e_detected;
-EXPORT_SYMBOL(c1e_detected);
+bool amd_e400_c1e_detected;
+EXPORT_SYMBOL(amd_e400_c1e_detected);
-static cpumask_var_t c1e_mask;
+static cpumask_var_t amd_e400_c1e_mask;
-void c1e_remove_cpu(int cpu)
+void amd_e400_remove_cpu(int cpu)
{
- if (c1e_mask != NULL)
- cpumask_clear_cpu(cpu, c1e_mask);
+ if (amd_e400_c1e_mask != NULL)
+ cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}
/*
- * C1E aware idle routine. We check for C1E active in the interrupt
+ * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
* pending message MSR. If we detect C1E, then we handle it the same
* way as C3 power states (local apic timer and TSC stop)
*/
-static void c1e_idle(void)
+static void amd_e400_idle(void)
{
if (need_resched())
return;
- if (!c1e_detected) {
+ if (!amd_e400_c1e_detected) {
u32 lo, hi;
rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
if (lo & K8_INTP_C1E_ACTIVE_MASK) {
- c1e_detected = true;
+ amd_e400_c1e_detected = true;
if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halt in AMD C1E");
printk(KERN_INFO "System has AMD C1E enabled\n");
}
}
- if (c1e_detected) {
+ if (amd_e400_c1e_detected) {
int cpu = smp_processor_id();
- if (!cpumask_test_cpu(cpu, c1e_mask)) {
- cpumask_set_cpu(cpu, c1e_mask);
+ if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
+ cpumask_set_cpu(cpu, amd_e400_c1e_mask);
/*
* Force broadcast so ACPI can not interfere.
*/
@@ -616,17 +618,17 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
pm_idle = mwait_idle;
} else if (cpu_has_amd_erratum(amd_erratum_400)) {
/* E400: APIC timer interrupt does not wake up CPU from C1e */
- printk(KERN_INFO "using C1E aware idle routine\n");
- pm_idle = c1e_idle;
+ printk(KERN_INFO "using AMD E400 aware idle routine\n");
+ pm_idle = amd_e400_idle;
} else
pm_idle = default_idle;
}
-void __init init_c1e_mask(void)
+void __init init_amd_e400_c1e_mask(void)
{
- /* If we're using c1e_idle, we need to allocate c1e_mask. */
- if (pm_idle == c1e_idle)
- zalloc_cpumask_var(&c1e_mask, GFP_KERNEL);
+ /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
+ if (pm_idle == amd_e400_idle)
+ zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}
static int __init idle_setup(char *str)
@@ -640,6 +642,7 @@ static int __init idle_setup(char *str)
boot_option_idle_override = IDLE_POLL;
} else if (!strcmp(str, "mwait")) {
boot_option_idle_override = IDLE_FORCE_MWAIT;
+ WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n");
} else if (!strcmp(str, "halt")) {
/*
* When the boot option of idle=halt is added, halt is
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d128783af4..a3d0dc59067 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -245,7 +245,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
set_user_gs(regs, 0);
regs->fs = 0;
- set_fs(USER_DS);
regs->ds = __USER_DS;
regs->es = __USER_DS;
regs->ss = __USER_DS;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6c9dd922ac0..ca6f7ab8df3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -338,7 +338,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
regs->cs = _cs;
regs->ss = _ss;
regs->flags = X86_EFLAGS_IF;
- set_fs(USER_DS);
/*
* Free the old FP and other extended state
*/
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index f65e5b521db..82528799c5d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -528,7 +528,7 @@ static int genregs_set(struct task_struct *target,
return ret;
}
-static void ptrace_triggered(struct perf_event *bp, int nmi,
+static void ptrace_triggered(struct perf_event *bp,
struct perf_sample_data *data,
struct pt_regs *regs)
{
@@ -715,7 +715,8 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
attr.bp_type = HW_BREAKPOINT_W;
attr.disabled = 1;
- bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
+ bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
+ NULL, tsk);
/*
* CHECKME: the previous code returned -EIO if the addr wasn't
@@ -1363,7 +1364,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
* We must return the syscall number to actually look up in the table.
* This can be -1L to skip running any syscall at all.
*/
-asmregparm long syscall_trace_enter(struct pt_regs *regs)
+long syscall_trace_enter(struct pt_regs *regs)
{
long ret = 0;
@@ -1408,7 +1409,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
return ret ?: regs->orig_ax;
}
-asmregparm void syscall_trace_leave(struct pt_regs *regs)
+void syscall_trace_leave(struct pt_regs *regs)
{
bool step;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 08c44b08bf5..9242436e993 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off);
static const struct desc_ptr no_idt = {};
static int reboot_mode;
-enum reboot_type reboot_type = BOOT_KBD;
+enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
@@ -294,6 +294,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"),
},
},
+ { /* Handle reboot issue on Acer Aspire one */
+ .callback = set_bios_reboot,
+ .ident = "Acer Aspire One A110",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
+ },
+ },
{ }
};
@@ -411,6 +419,30 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
},
},
+ { /* Handle problems with rebooting on the Latitude E6320. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6320",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E5420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E5420",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E6420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6420",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
+ },
+ },
{ }
};
@@ -478,9 +510,24 @@ void __attribute__((weak)) mach_reboot_fixups(void)
{
}
+/*
+ * Windows compatible x86 hardware expects the following on reboot:
+ *
+ * 1) If the FADT has the ACPI reboot register flag set, try it
+ * 2) If still alive, write to the keyboard controller
+ * 3) If still alive, write to the ACPI reboot register again
+ * 4) If still alive, write to the keyboard controller again
+ *
+ * If the machine is still alive at this stage, it gives up. We default to
+ * following the same pattern, except that if we're still alive after (4) we'll
+ * try to force a triple fault and then cycle between hitting the keyboard
+ * controller and doing that
+ */
static void native_machine_emergency_restart(void)
{
int i;
+ int attempt = 0;
+ int orig_reboot_type = reboot_type;
if (reboot_emergency)
emergency_vmx_disable_all();
@@ -502,6 +549,13 @@ static void native_machine_emergency_restart(void)
outb(0xfe, 0x64); /* pulse reset low */
udelay(50);
}
+ if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
+ attempt = 1;
+ reboot_type = BOOT_ACPI;
+ } else {
+ reboot_type = BOOT_TRIPLE;
+ }
+ break;
case BOOT_TRIPLE:
load_idt(&no_idt);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4be9b398470..afaf38447ef 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -691,8 +691,6 @@ early_param("reservelow", parse_reservelow);
void __init setup_arch(char **cmdline_p)
{
- unsigned long flags;
-
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
visws_early_detect();
@@ -912,6 +910,13 @@ void __init setup_arch(char **cmdline_p)
memblock.current_limit = get_max_mapped();
memblock_x86_fill();
+ /*
+ * The EFI specification says that boot service code won't be called
+ * after ExitBootServices(). This is, in fact, a lie.
+ */
+ if (efi_enabled)
+ efi_reserve_boot_services();
+
/* preallocate 4k for mptable mpc */
early_reserve_e820_mpc_new();
@@ -948,6 +953,8 @@ void __init setup_arch(char **cmdline_p)
if (init_ohci1394_dma_early)
init_ohci1394_dma_on_all_controllers();
#endif
+ /* Allocate bigger log buffer */
+ setup_log_buf(1);
reserve_initrd();
@@ -966,7 +973,6 @@ void __init setup_arch(char **cmdline_p)
initmem_init();
memblock_find_dma_reserve();
- dma32_reserve_bootmem();
#ifdef CONFIG_KVM_CLOCK
kvmclock_init();
@@ -1041,9 +1047,7 @@ void __init setup_arch(char **cmdline_p)
mcheck_init();
- local_irq_save(flags);
- arch_init_ideal_nop5();
- local_irq_restore(flags);
+ arch_init_ideal_nops();
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4fd173cd8e5..54ddaeb221c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -485,17 +485,18 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
asmlinkage int
sys_sigsuspend(int history0, int history1, old_sigset_t mask)
{
- mask &= _BLOCKABLE;
- spin_lock_irq(&current->sighand->siglock);
+ sigset_t blocked;
+
current->saved_sigmask = current->blocked;
- siginitset(&current->blocked, mask);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+
+ mask &= _BLOCKABLE;
+ siginitset(&blocked, mask);
+ set_current_blocked(&blocked);
current->state = TASK_INTERRUPTIBLE;
schedule();
- set_restore_sigmask();
+ set_restore_sigmask();
return -ERESTARTNOHAND;
}
@@ -572,10 +573,7 @@ unsigned long sys_sigreturn(struct pt_regs *regs)
goto badframe;
sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->sc, &ax))
goto badframe;
@@ -601,10 +599,7 @@ long sys_rt_sigreturn(struct pt_regs *regs)
goto badframe;
sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
goto badframe;
@@ -656,11 +651,15 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
static int
setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs)
+ struct pt_regs *regs)
{
int usig = signr_convert(sig);
+ sigset_t *set = &current->blocked;
int ret;
+ if (current_thread_info()->status & TS_RESTORE_SIGMASK)
+ set = &current->saved_sigmask;
+
/* Set up the stack frame */
if (is_ia32) {
if (ka->sa.sa_flags & SA_SIGINFO)
@@ -675,13 +674,15 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
return -EFAULT;
}
+ current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
return ret;
}
static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
- sigset_t *oldset, struct pt_regs *regs)
+ struct pt_regs *regs)
{
+ sigset_t blocked;
int ret;
/* Are we from a system call? */
@@ -714,20 +715,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
regs->flags &= ~X86_EFLAGS_TF;
- ret = setup_rt_frame(sig, ka, info, oldset, regs);
+ ret = setup_rt_frame(sig, ka, info, regs);
if (ret)
return ret;
-#ifdef CONFIG_X86_64
- /*
- * This has nothing to do with segment registers,
- * despite the name. This magic affects uaccess.h
- * macros' behavior. Reset it to the normal setting.
- */
- set_fs(USER_DS);
-#endif
-
/*
* Clear the direction flag as per the ABI for function entry.
*/
@@ -741,12 +733,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
*/
regs->flags &= ~X86_EFLAGS_TF;
- spin_lock_irq(&current->sighand->siglock);
- sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
+ sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&current->blocked, sig);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ sigaddset(&blocked, sig);
+ set_current_blocked(&blocked);
tracehook_signal_handler(sig, info, ka, regs,
test_thread_flag(TIF_SINGLESTEP));
@@ -771,7 +761,6 @@ static void do_signal(struct pt_regs *regs)
struct k_sigaction ka;
siginfo_t info;
int signr;
- sigset_t *oldset;
/*
* We want the common case to go fast, which is why we may in certain
@@ -783,23 +772,10 @@ static void do_signal(struct pt_regs *regs)
if (!user_mode(regs))
return;
- if (current_thread_info()->status & TS_RESTORE_SIGMASK)
- oldset = &current->saved_sigmask;
- else
- oldset = &current->blocked;
-
signr = get_signal_to_deliver(&info, &ka, regs, NULL);
if (signr > 0) {
/* Whee! Actually deliver the signal. */
- if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
- /*
- * A signal was successfully delivered; the saved
- * sigmask will have been stored in the signal frame,
- * and will be restored by sigreturn, so we can simply
- * clear the TS_RESTORE_SIGMASK flag.
- */
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- }
+ handle_signal(signr, &info, &ka, regs);
return;
}
@@ -827,7 +803,7 @@ static void do_signal(struct pt_regs *regs)
*/
if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
+ set_current_blocked(&current->saved_sigmask);
}
}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 513deac7228..013e7eba83b 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait)
}
/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back.
*/
void smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
inc_irq_stat(irq_resched_count);
+ scheduler_ipi();
/*
* KVM uses this interrupt to force a cpu out of guest mode
*/
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c2871d3c71b..9f548cb4a95 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -285,6 +285,19 @@ notrace static void __cpuinit start_secondary(void *unused)
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
x86_platform.nmi_init();
+ /*
+ * Wait until the cpu which brought this one up marked it
+ * online before enabling interrupts. If we don't do that then
+ * we can end up waking up the softirq thread before this cpu
+ * reached the active state, which makes the scheduler unhappy
+ * and schedule the softirq thread on the wrong cpu. This is
+ * only observable with forced threaded interrupts, but in
+ * theory it could also happen w/o them. It's just way harder
+ * to achieve.
+ */
+ while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
+ cpu_relax();
+
/* enable local interrupts */
local_irq_enable();
@@ -425,7 +438,7 @@ static void impress_friends(void)
void __inquire_remote_apic(int apicid)
{
unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
- char *names[] = { "ID", "VERSION", "SPIV" };
+ const char * const names[] = { "ID", "VERSION", "SPIV" };
int timeout;
u32 status;
@@ -1307,7 +1320,7 @@ void play_dead_common(void)
{
idle_task_exit();
reset_lazy_tlbstate();
- c1e_remove_cpu(raw_smp_processor_id());
+ amd_e400_remove_cpu(raw_smp_processor_id());
mb();
/* Ack it */
@@ -1332,9 +1345,9 @@ static inline void mwait_play_dead(void)
void *mwait_ptr;
struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
- if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)))
+ if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
return;
- if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH))
+ if (!this_cpu_has(X86_FEATURE_CLFLSH))
return;
if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
return;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 6515733a289..fdd0c6430e5 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -9,15 +9,6 @@
#include <linux/uaccess.h>
#include <asm/stacktrace.h>
-static void save_stack_warning(void *data, char *msg)
-{
-}
-
-static void
-save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-}
-
static int save_stack_stack(void *data, char *name)
{
return 0;
@@ -53,16 +44,12 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
}
static const struct stacktrace_ops save_stack_ops = {
- .warning = save_stack_warning,
- .warning_symbol = save_stack_warning_symbol,
.stack = save_stack_stack,
.address = save_stack_address,
.walk_stack = print_context_stack,
};
static const struct stacktrace_ops save_stack_ops_nosched = {
- .warning = save_stack_warning,
- .warning_symbol = save_stack_warning_symbol,
.stack = save_stack_stack,
.address = save_stack_address_nosched,
.walk_stack = print_context_stack,
@@ -79,7 +66,7 @@ void save_stack_trace(struct stack_trace *trace)
}
EXPORT_SYMBOL_GPL(save_stack_trace);
-void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
+void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
{
dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
if (trace->nr_entries < trace->max_entries)
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index abce34d5c79..fbb0a045a1a 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -344,3 +344,5 @@ ENTRY(sys_call_table)
.long sys_open_by_handle_at
.long sys_clock_adjtime
.long sys_syncfs
+ .long sys_sendmmsg /* 345 */
+ .long sys_setns
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 998e972f3b1..e07a2fc876b 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -36,6 +36,7 @@
#include <asm/bootparam.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
+#include <asm/swiotlb.h>
#include <asm/fixmap.h>
#include <asm/proto.h>
#include <asm/setup.h>
@@ -110,7 +111,6 @@ static struct mm_struct tboot_mm = {
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
- .cpu_vm_mask = CPU_MASK_ALL,
};
static inline void switch_to_tboot_pt(void)
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
index 787a5e499dd..3f92ce07e52 100644
--- a/arch/x86/kernel/test_nx.c
+++ b/arch/x86/kernel/test_nx.c
@@ -161,7 +161,7 @@ static int test_NX(void)
}
#endif
- return 0;
+ return ret;
}
static void test_exit(void)
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25a28a24593..5a64d057be5 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -11,19 +11,19 @@
#include <linux/clockchips.h>
#include <linux/interrupt.h>
+#include <linux/i8253.h>
#include <linux/time.h>
#include <linux/mca.h>
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
#include <asm/i8259.h>
-#include <asm/i8253.h>
#include <asm/timer.h>
#include <asm/hpet.h>
#include <asm/time.h>
#ifdef CONFIG_X86_64
-volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
#endif
unsigned long profile_pc(struct pt_regs *regs)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b9b67166f9d..fbc097a085c 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -872,6 +872,12 @@ void __init trap_init(void)
set_bit(SYSCALL_VECTOR, used_vectors);
#endif
+#ifdef CONFIG_X86_64
+ BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors));
+ set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall);
+ set_bit(VSYSCALL_EMU_VECTOR, used_vectors);
+#endif
+
/*
* Should be a barrier for any external CPU state:
*/
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9335bf7dd2e..56c633a5db7 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -763,25 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs)
ret : clocksource_tsc.cycle_last;
}
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_tsc(void)
-{
- cycle_t ret;
-
- /*
- * Surround the RDTSC by barriers, to make sure it's not
- * speculated to outside the seqlock critical section and
- * does not cause time warps:
- */
- rdtsc_barrier();
- ret = (cycle_t)vget_cycles();
- rdtsc_barrier();
-
- return ret >= __vsyscall_gtod_data.clock.cycle_last ?
- ret : __vsyscall_gtod_data.clock.cycle_last;
-}
-#endif
-
static void resume_tsc(struct clocksource *cs)
{
clocksource_tsc.cycle_last = 0;
@@ -796,7 +777,7 @@ static struct clocksource clocksource_tsc = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
#ifdef CONFIG_X86_64
- .vread = vread_tsc,
+ .archdata = { .vclock_mode = VCLOCK_TSC },
#endif
};
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 624a2016198..4aa9c54a9b7 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -166,50 +166,18 @@ SECTIONS
__vsyscall_0 = .;
. = VSYSCALL_ADDR;
- .vsyscall_0 : AT(VLOAD(.vsyscall_0)) {
+ .vsyscall : AT(VLOAD(.vsyscall)) {
*(.vsyscall_0)
- } :user
- . = ALIGN(L1_CACHE_BYTES);
- .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
- *(.vsyscall_fn)
- }
-
- . = ALIGN(L1_CACHE_BYTES);
- .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
- *(.vsyscall_gtod_data)
- }
-
- vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
- .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
- *(.vsyscall_clock)
- }
- vsyscall_clock = VVIRT(.vsyscall_clock);
-
-
- .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
+ . = 1024;
*(.vsyscall_1)
- }
- .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
- *(.vsyscall_2)
- }
- .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
- *(.vgetcpu_mode)
- }
- vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
- . = ALIGN(L1_CACHE_BYTES);
- .jiffies : AT(VLOAD(.jiffies)) {
- *(.jiffies)
- }
- jiffies = VVIRT(.jiffies);
-
- .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
- *(.vsyscall_3)
- }
+ . = 2048;
+ *(.vsyscall_2)
- . = __vsyscall_0 + PAGE_SIZE;
+ . = 4096; /* Pad the whole page. */
+ } :user =0xcc
+ . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE);
#undef VSYSCALL_ADDR
#undef VLOAD_OFFSET
@@ -217,6 +185,23 @@ SECTIONS
#undef VVIRT_OFFSET
#undef VVIRT
+ __vvar_page = .;
+
+ .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
+
+ /* Place all vvars at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) \
+ . = offset; \
+ *(.vvar_ ## name)
+#define __VVAR_KERNEL_LDS
+#include <asm/vvar.h>
+#undef __VVAR_KERNEL_LDS
+#undef EMIT_VVAR
+
+ } :data
+
+ . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
+
#endif /* CONFIG_X86_64 */
/* Init code and data - will be freed after init */
@@ -306,6 +291,13 @@ SECTIONS
}
. = ALIGN(8);
+ .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) {
+ __apicdrivers = .;
+ *(.apicdrivers);
+ __apicdrivers_end = .;
+ }
+
+ . = ALIGN(8);
/*
* .exit.text is discard at runtime, not link time, to deal with
* references from .altinstructions and .eh_frame
@@ -319,7 +311,7 @@ SECTIONS
}
#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
- PERCPU(INTERNODE_CACHE_BYTES, PAGE_SIZE)
+ PERCPU_SECTION(INTERNODE_CACHE_BYTES)
#endif
. = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c4b69..dda7dff9cef 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -2,6 +2,8 @@
* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright 2003 Andi Kleen, SuSE Labs.
*
+ * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
+ *
* Thanks to hpa@transmeta.com for some useful hint.
* Special thanks to Ingo Molnar for his early experience with
* a different vsyscall implementation for Linux/IA32 and for the name.
@@ -11,10 +13,9 @@
* vsyscalls. One vsyscall can reserve more than 1 slot to avoid
* jumping out of line if necessary. We cannot add more with this
* mechanism because older kernels won't return -ENOSYS.
- * If we want more than four we need a vDSO.
*
- * Note: the concept clashes with user mode linux. If you use UML and
- * want per guest time just set the kernel.vsyscall64 sysctl to 0.
+ * Note: the concept clashes with user mode linux. UML users should
+ * use the vDSO.
*/
/* Disable profiling for userspace code: */
@@ -32,9 +33,12 @@
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
+#include <linux/syscalls.h>
+#include <linux/ratelimit.h>
#include <asm/vsyscall.h>
#include <asm/pgtable.h>
+#include <asm/compat.h>
#include <asm/page.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
@@ -44,23 +48,12 @@
#include <asm/desc.h>
#include <asm/topology.h>
#include <asm/vgtod.h>
+#include <asm/traps.h>
-#define __vsyscall(nr) \
- __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
-#define __syscall_clobber "r11","cx","memory"
-
-/*
- * vsyscall_gtod_data contains data that is :
- * - readonly from vsyscalls
- * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
- * Try to keep this structure as small as possible to avoid cache line ping pongs
- */
-int __vgetcpu_mode __section_vgetcpu_mode;
-
-struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
+DEFINE_VVAR(int, vgetcpu_mode);
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
{
- .lock = SEQLOCK_UNLOCKED,
- .sysctl_enabled = 1,
+ .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
};
void update_vsyscall_tz(void)
@@ -79,178 +72,149 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
unsigned long flags;
write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+
/* copy vsyscall data */
- vsyscall_gtod_data.clock.vread = clock->vread;
- vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
- vsyscall_gtod_data.clock.mask = clock->mask;
- vsyscall_gtod_data.clock.mult = mult;
- vsyscall_gtod_data.clock.shift = clock->shift;
- vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
- vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
- vsyscall_gtod_data.wall_to_monotonic = *wtm;
- vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+ vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
+ vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
+ vsyscall_gtod_data.clock.mask = clock->mask;
+ vsyscall_gtod_data.clock.mult = mult;
+ vsyscall_gtod_data.clock.shift = clock->shift;
+ vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
+ vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
+ vsyscall_gtod_data.wall_to_monotonic = *wtm;
+ vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
-/* RED-PEN may want to readd seq locking, but then the variable should be
- * write-once.
- */
-static __always_inline void do_get_tz(struct timezone * tz)
+static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
+ const char *message)
{
- *tz = __vsyscall_gtod_data.sys_tz;
-}
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+ struct task_struct *tsk;
-static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
-{
- int ret;
- asm volatile("syscall"
- : "=a" (ret)
- : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
- : __syscall_clobber );
- return ret;
-}
+ if (!show_unhandled_signals || !__ratelimit(&rs))
+ return;
-static __always_inline long time_syscall(long *t)
-{
- long secs;
- asm volatile("syscall"
- : "=a" (secs)
- : "0" (__NR_time),"D" (t) : __syscall_clobber);
- return secs;
-}
+ tsk = current;
-static __always_inline void do_vgettimeofday(struct timeval * tv)
-{
- cycle_t now, base, mask, cycle_delta;
- unsigned seq;
- unsigned long mult, shift, nsec;
- cycle_t (*vread)(void);
- do {
- seq = read_seqbegin(&__vsyscall_gtod_data.lock);
-
- vread = __vsyscall_gtod_data.clock.vread;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
- gettimeofday(tv,NULL);
- return;
- }
-
- now = vread();
- base = __vsyscall_gtod_data.clock.cycle_last;
- mask = __vsyscall_gtod_data.clock.mask;
- mult = __vsyscall_gtod_data.clock.mult;
- shift = __vsyscall_gtod_data.clock.shift;
-
- tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
- nsec = __vsyscall_gtod_data.wall_time_nsec;
- } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
-
- /* calculate interval: */
- cycle_delta = (now - base) & mask;
- /* convert to nsecs: */
- nsec += (cycle_delta * mult) >> shift;
-
- while (nsec >= NSEC_PER_SEC) {
- tv->tv_sec += 1;
- nsec -= NSEC_PER_SEC;
- }
- tv->tv_usec = nsec / NSEC_PER_USEC;
+ printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+ level, tsk->comm, task_pid_nr(tsk),
+ message, regs->ip - 2, regs->cs,
+ regs->sp, regs->ax, regs->si, regs->di);
}
-int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
+static int addr_to_vsyscall_nr(unsigned long addr)
{
- if (tv)
- do_vgettimeofday(tv);
- if (tz)
- do_get_tz(tz);
- return 0;
-}
+ int nr;
-/* This will break when the xtime seconds get inaccurate, but that is
- * unlikely */
-time_t __vsyscall(1) vtime(time_t *t)
-{
- unsigned seq;
- time_t result;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
- return time_syscall(t);
+ if ((addr & ~0xC00UL) != VSYSCALL_START)
+ return -EINVAL;
- do {
- seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+ nr = (addr & 0xC00UL) >> 10;
+ if (nr >= 3)
+ return -EINVAL;
- result = __vsyscall_gtod_data.wall_time_sec;
+ return nr;
+}
- } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
+{
+ struct task_struct *tsk;
+ unsigned long caller;
+ int vsyscall_nr;
+ long ret;
+
+ local_irq_enable();
+
+ /*
+ * Real 64-bit user mode code has cs == __USER_CS. Anything else
+ * is bogus.
+ */
+ if (regs->cs != __USER_CS) {
+ /*
+ * If we trapped from kernel mode, we might as well OOPS now
+ * instead of returning to some random address and OOPSing
+ * then.
+ */
+ BUG_ON(!user_mode(regs));
+
+ /* Compat mode and non-compat 32-bit CS should both segfault. */
+ warn_bad_vsyscall(KERN_WARNING, regs,
+ "illegal int 0xcc from 32-bit mode");
+ goto sigsegv;
+ }
- if (t)
- *t = result;
- return result;
-}
+ /*
+ * x86-ism here: regs->ip points to the instruction after the int 0xcc,
+ * and int 0xcc is two bytes long.
+ */
+ vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2);
+ if (vsyscall_nr < 0) {
+ warn_bad_vsyscall(KERN_WARNING, regs,
+ "illegal int 0xcc (exploit attempt?)");
+ goto sigsegv;
+ }
-/* Fast way to get current CPU and node.
- This helps to do per node and per CPU caches in user space.
- The result is not guaranteed without CPU affinity, but usually
- works out because the scheduler tries to keep a thread on the same
- CPU.
+ if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
+ warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)");
+ goto sigsegv;
+ }
- tcache must point to a two element sized long array.
- All arguments can be NULL. */
-long __vsyscall(2)
-vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
-{
- unsigned int p;
- unsigned long j = 0;
-
- /* Fast cache - only recompute value once per jiffies and avoid
- relatively costly rdtscp/cpuid otherwise.
- This works because the scheduler usually keeps the process
- on the same CPU and this syscall doesn't guarantee its
- results anyways.
- We do this here because otherwise user space would do it on
- its own in a likely inferior way (no access to jiffies).
- If you don't like it pass NULL. */
- if (tcache && tcache->blob[0] == (j = __jiffies)) {
- p = tcache->blob[1];
- } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
- /* Load per CPU data from RDTSCP */
- native_read_tscp(&p);
- } else {
- /* Load per CPU data from GDT */
- asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+ tsk = current;
+ if (seccomp_mode(&tsk->seccomp))
+ do_exit(SIGKILL);
+
+ switch (vsyscall_nr) {
+ case 0:
+ ret = sys_gettimeofday(
+ (struct timeval __user *)regs->di,
+ (struct timezone __user *)regs->si);
+ break;
+
+ case 1:
+ ret = sys_time((time_t __user *)regs->di);
+ break;
+
+ case 2:
+ ret = sys_getcpu((unsigned __user *)regs->di,
+ (unsigned __user *)regs->si,
+ 0);
+ break;
}
- if (tcache) {
- tcache->blob[0] = j;
- tcache->blob[1] = p;
+
+ if (ret == -EFAULT) {
+ /*
+ * Bad news -- userspace fed a bad pointer to a vsyscall.
+ *
+ * With a real vsyscall, that would have caused SIGSEGV.
+ * To make writing reliable exploits using the emulated
+ * vsyscalls harder, generate SIGSEGV here as well.
+ */
+ warn_bad_vsyscall(KERN_INFO, regs,
+ "vsyscall fault (exploit attempt?)");
+ goto sigsegv;
}
- if (cpu)
- *cpu = p & 0xfff;
- if (node)
- *node = p >> 12;
- return 0;
-}
-static long __vsyscall(3) venosys_1(void)
-{
- return -ENOSYS;
-}
+ regs->ax = ret;
-#ifdef CONFIG_SYSCTL
-static ctl_table kernel_table2[] = {
- { .procname = "vsyscall64",
- .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec },
- {}
-};
+ /* Emulate a ret instruction. */
+ regs->ip = caller;
+ regs->sp += 8;
-static ctl_table kernel_root_table2[] = {
- { .procname = "kernel", .mode = 0555,
- .child = kernel_table2 },
- {}
-};
-#endif
+ local_irq_disable();
+ return;
-/* Assume __initcall executes before all user space. Hopefully kmod
- doesn't violate that. We'll find out if it does. */
+sigsegv:
+ regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */
+ force_sig(SIGSEGV, current);
+ local_irq_disable();
+}
+
+/*
+ * Assume __initcall executes before all user space. Hopefully kmod
+ * doesn't violate that. We'll find out if it does.
+ */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
unsigned long d;
@@ -261,13 +225,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
write_rdtscp_aux((node << 12) | cpu);
- /* Store cpu number in limit so that it can be loaded quickly
- in user space in vgetcpu.
- 12 bits for the CPU and 8 bits for the node. */
+ /*
+ * Store cpu number in limit so that it can be loaded quickly
+ * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
+ */
d = 0x0f40000000000ULL;
d |= cpu;
d |= (node & 0xf) << 12;
d |= (node >> 4) << 48;
+
write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
@@ -281,8 +247,10 @@ static int __cpuinit
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
long cpu = (long)arg;
+
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
+
return NOTIFY_DONE;
}
@@ -290,25 +258,23 @@ void __init map_vsyscall(void)
{
extern char __vsyscall_0;
unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
+ extern char __vvar_page;
+ unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+ __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS);
}
static int __init vsyscall_init(void)
{
- BUG_ON(((unsigned long) &vgettimeofday !=
- VSYSCALL_ADDR(__NR_vgettimeofday)));
- BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
- BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
- BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
-#ifdef CONFIG_SYSCTL
- register_sysctl_table(kernel_root_table2);
-#endif
+ BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
+
on_each_cpu(cpu_vsyscall_init, NULL, 1);
/* notifier priority > KVM */
hotcpu_notifier(cpu_vsyscall_notifier, 30);
+
return 0;
}
-
__initcall(vsyscall_init);
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
new file mode 100644
index 00000000000..ffa845eae5c
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -0,0 +1,27 @@
+/*
+ * vsyscall_emu_64.S: Vsyscall emulation page
+ *
+ * Copyright (c) 2011 Andy Lutomirski
+ *
+ * Subject to the GNU General Public License, version 2
+ */
+
+#include <linux/linkage.h>
+#include <asm/irq_vectors.h>
+
+/* The unused parts of the page are filled with 0xcc by the linker script. */
+
+.section .vsyscall_0, "a"
+ENTRY(vsyscall_0)
+ int $VSYSCALL_EMU_VECTOR
+END(vsyscall_0)
+
+.section .vsyscall_1, "a"
+ENTRY(vsyscall_1)
+ int $VSYSCALL_EMU_VECTOR
+END(vsyscall_1)
+
+.section .vsyscall_2, "a"
+ENTRY(vsyscall_2)
+ int $VSYSCALL_EMU_VECTOR
+END(vsyscall_2)
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 75ef4b18e9b..6f164bd5e14 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -35,7 +35,7 @@ void iommu_shutdown_noop(void) { }
struct x86_init_ops x86_init __initdata = {
.resources = {
- .probe_roms = x86_init_noop,
+ .probe_roms = probe_roms,
.reserve_resources = reserve_standard_io_resources,
.memory_setup = default_machine_specific_memory_setup,
},
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 50f63648ce1..65cf8233d25 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -76,6 +76,5 @@ config KVM_MMU_AUDIT
# the virtualization menu.
source drivers/vhost/Kconfig
source drivers/lguest/Kconfig
-source drivers/virtio/Kconfig
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0ad47b819a8..adc98675cda 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -47,35 +47,42 @@
#define DstDI (5<<1) /* Destination is in ES:(E)DI */
#define DstMem64 (6<<1) /* 64bit memory operand */
#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */
-#define DstMask (7<<1)
+#define DstDX (8<<1) /* Destination is in DX register */
+#define DstMask (0xf<<1)
/* Source operand type. */
-#define SrcNone (0<<4) /* No source operand. */
-#define SrcReg (1<<4) /* Register operand. */
-#define SrcMem (2<<4) /* Memory operand. */
-#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
-#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
-#define SrcImm (5<<4) /* Immediate operand. */
-#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
-#define SrcOne (7<<4) /* Implied '1' */
-#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
-#define SrcImmU (9<<4) /* Immediate operand, unsigned */
-#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
-#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
-#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
-#define SrcAcc (0xd<<4) /* Source Accumulator */
-#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */
-#define SrcMask (0xf<<4)
+#define SrcNone (0<<5) /* No source operand. */
+#define SrcReg (1<<5) /* Register operand. */
+#define SrcMem (2<<5) /* Memory operand. */
+#define SrcMem16 (3<<5) /* Memory operand (16-bit). */
+#define SrcMem32 (4<<5) /* Memory operand (32-bit). */
+#define SrcImm (5<<5) /* Immediate operand. */
+#define SrcImmByte (6<<5) /* 8-bit sign-extended immediate operand. */
+#define SrcOne (7<<5) /* Implied '1' */
+#define SrcImmUByte (8<<5) /* 8-bit unsigned immediate operand. */
+#define SrcImmU (9<<5) /* Immediate operand, unsigned */
+#define SrcSI (0xa<<5) /* Source is in the DS:RSI */
+#define SrcImmFAddr (0xb<<5) /* Source is immediate far address */
+#define SrcMemFAddr (0xc<<5) /* Source is far address in memory */
+#define SrcAcc (0xd<<5) /* Source Accumulator */
+#define SrcImmU16 (0xe<<5) /* Immediate operand, unsigned, 16 bits */
+#define SrcDX (0xf<<5) /* Source is in DX register */
+#define SrcMask (0xf<<5)
/* Generic ModRM decode. */
-#define ModRM (1<<8)
+#define ModRM (1<<9)
/* Destination is only written; never read. */
-#define Mov (1<<9)
-#define BitOp (1<<10)
-#define MemAbs (1<<11) /* Memory operand is absolute displacement */
-#define String (1<<12) /* String instruction (rep capable) */
-#define Stack (1<<13) /* Stack instruction (push/pop) */
-#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
-#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
+#define Mov (1<<10)
+#define BitOp (1<<11)
+#define MemAbs (1<<12) /* Memory operand is absolute displacement */
+#define String (1<<13) /* String instruction (rep capable) */
+#define Stack (1<<14) /* Stack instruction (push/pop) */
+#define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */
+#define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
+#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
+#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
+#define Sse (1<<18) /* SSE Vector instruction */
/* Misc flags */
+#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
#define VendorSpecific (1<<22) /* Vendor specific instruction */
#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
@@ -102,11 +109,14 @@
struct opcode {
u32 flags;
+ u8 intercept;
union {
int (*execute)(struct x86_emulate_ctxt *ctxt);
struct opcode *group;
struct group_dual *gdual;
+ struct gprefix *gprefix;
} u;
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
};
struct group_dual {
@@ -114,6 +124,13 @@ struct group_dual {
struct opcode mod3[8];
};
+struct gprefix {
+ struct opcode pfx_no;
+ struct opcode pfx_66;
+ struct opcode pfx_f2;
+ struct opcode pfx_f3;
+};
+
/* EFLAGS bit definitions. */
#define EFLG_ID (1<<21)
#define EFLG_VIP (1<<20)
@@ -248,42 +265,42 @@ struct group_dual {
"w", "r", _LO32, "r", "", "r")
/* Instruction has three operands and one operand is stored in ECX register */
-#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
- do { \
- unsigned long _tmp; \
- _type _clv = (_cl).val; \
- _type _srcv = (_src).val; \
- _type _dstv = (_dst).val; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "5", "2") \
- _op _suffix " %4,%1 \n" \
- _POST_EFLAGS("0", "5", "2") \
- : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
- : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
- ); \
- \
- (_cl).val = (unsigned long) _clv; \
- (_src).val = (unsigned long) _srcv; \
- (_dst).val = (unsigned long) _dstv; \
+#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
+ do { \
+ unsigned long _tmp; \
+ _type _clv = (_cl).val; \
+ _type _srcv = (_src).val; \
+ _type _dstv = (_dst).val; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "5", "2") \
+ _op _suffix " %4,%1 \n" \
+ _POST_EFLAGS("0", "5", "2") \
+ : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
+ : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
+ ); \
+ \
+ (_cl).val = (unsigned long) _clv; \
+ (_src).val = (unsigned long) _srcv; \
+ (_dst).val = (unsigned long) _dstv; \
} while (0)
-#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
- do { \
- switch ((_dst).bytes) { \
- case 2: \
- __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "w", unsigned short); \
- break; \
- case 4: \
- __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "l", unsigned int); \
- break; \
- case 8: \
- ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "q", unsigned long)); \
- break; \
- } \
+#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
+ do { \
+ switch ((_dst).bytes) { \
+ case 2: \
+ __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "w", unsigned short); \
+ break; \
+ case 4: \
+ __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "l", unsigned int); \
+ break; \
+ case 8: \
+ ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "q", unsigned long)); \
+ break; \
+ } \
} while (0)
#define __emulate_1op(_op, _dst, _eflags, _suffix) \
@@ -346,13 +363,25 @@ struct group_dual {
} while (0)
/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
-#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
- do { \
- switch((_src).bytes) { \
- case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \
- case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \
- case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \
- case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \
+#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
+ do { \
+ switch((_src).bytes) { \
+ case 1: \
+ __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
+ _eflags, "b"); \
+ break; \
+ case 2: \
+ __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
+ _eflags, "w"); \
+ break; \
+ case 4: \
+ __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
+ _eflags, "l"); \
+ break; \
+ case 8: \
+ ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
+ _eflags, "q")); \
+ break; \
} \
} while (0)
@@ -388,13 +417,33 @@ struct group_dual {
(_type)_x; \
})
-#define insn_fetch_arr(_arr, _size, _eip) \
+#define insn_fetch_arr(_arr, _size, _eip) \
({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
if (rc != X86EMUL_CONTINUE) \
goto done; \
(_eip) += (_size); \
})
+static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
+ enum x86_intercept intercept,
+ enum x86_intercept_stage stage)
+{
+ struct x86_instruction_info info = {
+ .intercept = intercept,
+ .rep_prefix = ctxt->decode.rep_prefix,
+ .modrm_mod = ctxt->decode.modrm_mod,
+ .modrm_reg = ctxt->decode.modrm_reg,
+ .modrm_rm = ctxt->decode.modrm_rm,
+ .src_val = ctxt->decode.src.val64,
+ .src_bytes = ctxt->decode.src.bytes,
+ .dst_bytes = ctxt->decode.dst.bytes,
+ .ad_bytes = ctxt->decode.ad_bytes,
+ .next_rip = ctxt->eip,
+ };
+
+ return ctxt->ops->intercept(ctxt, &info, stage);
+}
+
static inline unsigned long ad_mask(struct decode_cache *c)
{
return (1UL << (c->ad_bytes << 3)) - 1;
@@ -430,6 +479,13 @@ static inline void jmp_rel(struct decode_cache *c, int rel)
register_address_increment(c, &c->eip, rel);
}
+static u32 desc_limit_scaled(struct desc_struct *desc)
+{
+ u32 limit = get_desc_limit(desc);
+
+ return desc->g ? (limit << 12) | 0xfff : limit;
+}
+
static void set_seg_override(struct decode_cache *c, int seg)
{
c->has_seg_override = true;
@@ -442,11 +498,10 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
return 0;
- return ops->get_cached_segment_base(seg, ctxt->vcpu);
+ return ops->get_cached_segment_base(ctxt, seg);
}
static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
struct decode_cache *c)
{
if (!c->has_seg_override)
@@ -455,18 +510,6 @@ static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
return c->seg_override;
}
-static ulong linear(struct x86_emulate_ctxt *ctxt,
- struct segmented_address addr)
-{
- struct decode_cache *c = &ctxt->decode;
- ulong la;
-
- la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
- if (c->ad_bytes != 8)
- la &= (u32)-1;
- return la;
-}
-
static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
u32 error, bool valid)
{
@@ -476,11 +519,21 @@ static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
return X86EMUL_PROPAGATE_FAULT;
}
+static int emulate_db(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, DB_VECTOR, 0, false);
+}
+
static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
{
return emulate_exception(ctxt, GP_VECTOR, err, true);
}
+static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, SS_VECTOR, err, true);
+}
+
static int emulate_ud(struct x86_emulate_ctxt *ctxt)
{
return emulate_exception(ctxt, UD_VECTOR, 0, false);
@@ -496,6 +549,128 @@ static int emulate_de(struct x86_emulate_ctxt *ctxt)
return emulate_exception(ctxt, DE_VECTOR, 0, false);
}
+static int emulate_nm(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, NM_VECTOR, 0, false);
+}
+
+static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
+{
+ u16 selector;
+ struct desc_struct desc;
+
+ ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg);
+ return selector;
+}
+
+static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
+ unsigned seg)
+{
+ u16 dummy;
+ u32 base3;
+ struct desc_struct desc;
+
+ ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
+}
+
+static int __linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned size, bool write, bool fetch,
+ ulong *linear)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct desc_struct desc;
+ bool usable;
+ ulong la;
+ u32 lim;
+ u16 sel;
+ unsigned cpl, rpl;
+
+ la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
+ switch (ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ break;
+ case X86EMUL_MODE_PROT64:
+ if (((signed long)la << 16) >> 16 != la)
+ return emulate_gp(ctxt, 0);
+ break;
+ default:
+ usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
+ addr.seg);
+ if (!usable)
+ goto bad;
+ /* code segment or read-only data segment */
+ if (((desc.type & 8) || !(desc.type & 2)) && write)
+ goto bad;
+ /* unreadable code segment */
+ if (!fetch && (desc.type & 8) && !(desc.type & 2))
+ goto bad;
+ lim = desc_limit_scaled(&desc);
+ if ((desc.type & 8) || !(desc.type & 4)) {
+ /* expand-up segment */
+ if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
+ goto bad;
+ } else {
+ /* exapand-down segment */
+ if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
+ goto bad;
+ lim = desc.d ? 0xffffffff : 0xffff;
+ if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
+ goto bad;
+ }
+ cpl = ctxt->ops->cpl(ctxt);
+ rpl = sel & 3;
+ cpl = max(cpl, rpl);
+ if (!(desc.type & 8)) {
+ /* data segment */
+ if (cpl > desc.dpl)
+ goto bad;
+ } else if ((desc.type & 8) && !(desc.type & 4)) {
+ /* nonconforming code segment */
+ if (cpl != desc.dpl)
+ goto bad;
+ } else if ((desc.type & 8) && (desc.type & 4)) {
+ /* conforming code segment */
+ if (cpl < desc.dpl)
+ goto bad;
+ }
+ break;
+ }
+ if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8)
+ la &= (u32)-1;
+ *linear = la;
+ return X86EMUL_CONTINUE;
+bad:
+ if (addr.seg == VCPU_SREG_SS)
+ return emulate_ss(ctxt, addr.seg);
+ else
+ return emulate_gp(ctxt, addr.seg);
+}
+
+static int linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned size, bool write,
+ ulong *linear)
+{
+ return __linearize(ctxt, addr, size, write, false, linear);
+}
+
+
+static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, false, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
+}
+
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
unsigned long eip, u8 *dest)
@@ -505,10 +680,15 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
int size, cur_size;
if (eip == fc->end) {
+ unsigned long linear;
+ struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip};
cur_size = fc->end - fc->start;
size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
- rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
- size, ctxt->vcpu, &ctxt->exception);
+ rc = __linearize(ctxt, addr, size, false, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = ops->fetch(ctxt, linear, fc->data + cur_size,
+ size, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
fc->end += size;
@@ -551,7 +731,6 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
}
static int read_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
struct segmented_address addr,
u16 *size, unsigned long *address, int op_bytes)
{
@@ -560,13 +739,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
if (op_bytes == 2)
op_bytes = 3;
*address = 0;
- rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
- ctxt->vcpu, &ctxt->exception);
+ rc = segmented_read_std(ctxt, addr, size, 2);
if (rc != X86EMUL_CONTINUE)
return rc;
addr.ea += 2;
- rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
- ctxt->vcpu, &ctxt->exception);
+ rc = segmented_read_std(ctxt, addr, address, op_bytes);
return rc;
}
@@ -623,7 +800,63 @@ static void fetch_register_operand(struct operand *op)
}
}
-static void decode_register_operand(struct operand *op,
+static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
+{
+ ctxt->ops->get_fpu(ctxt);
+ switch (reg) {
+ case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break;
+ case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break;
+ case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break;
+ case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break;
+ case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break;
+ case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break;
+ case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break;
+ case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break;
+ case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break;
+ case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break;
+ case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break;
+ case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break;
+ case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break;
+ case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break;
+ case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break;
+#endif
+ default: BUG();
+ }
+ ctxt->ops->put_fpu(ctxt);
+}
+
+static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
+ int reg)
+{
+ ctxt->ops->get_fpu(ctxt);
+ switch (reg) {
+ case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break;
+ case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break;
+ case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break;
+ case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break;
+ case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break;
+ case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break;
+ case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break;
+ case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break;
+ case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break;
+ case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break;
+ case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break;
+ case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break;
+ case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break;
+ case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break;
+ case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break;
+#endif
+ default: BUG();
+ }
+ ctxt->ops->put_fpu(ctxt);
+}
+
+static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op,
struct decode_cache *c,
int inhibit_bytereg)
{
@@ -632,6 +865,15 @@ static void decode_register_operand(struct operand *op,
if (!(c->d & ModRM))
reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+
+ if (c->d & Sse) {
+ op->type = OP_XMM;
+ op->bytes = 16;
+ op->addr.xmm = reg;
+ read_sse_reg(ctxt, &op->vec_val, reg);
+ return;
+ }
+
op->type = OP_REG;
if ((c->d & ByteOp) && !inhibit_bytereg) {
op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
@@ -671,6 +913,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
op->addr.reg = decode_register(c->modrm_rm,
c->regs, c->d & ByteOp);
+ if (c->d & Sse) {
+ op->type = OP_XMM;
+ op->bytes = 16;
+ op->addr.xmm = c->modrm_rm;
+ read_sse_reg(ctxt, &op->vec_val, c->modrm_rm);
+ return rc;
+ }
fetch_register_operand(op);
return rc;
}
@@ -819,8 +1068,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
if (mc->pos < mc->end)
goto read_cached;
- rc = ops->read_emulated(addr, mc->data + mc->end, n,
- &ctxt->exception, ctxt->vcpu);
+ rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
+ &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
mc->end += n;
@@ -834,6 +1083,50 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
+static int segmented_read(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, false, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return read_emulated(ctxt, ctxt->ops, linear, data, size);
+}
+
+static int segmented_write(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ const void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->write_emulated(ctxt, linear, data, size,
+ &ctxt->exception);
+}
+
+static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ const void *orig_data, const void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data,
+ size, &ctxt->exception);
+}
+
static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
unsigned int size, unsigned short port,
@@ -854,7 +1147,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
if (n == 0)
n = 1;
rc->pos = rc->end = 0;
- if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
+ if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n))
return 0;
rc->end = n * size;
}
@@ -864,28 +1157,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
return 1;
}
-static u32 desc_limit_scaled(struct desc_struct *desc)
-{
- u32 limit = get_desc_limit(desc);
-
- return desc->g ? (limit << 12) | 0xfff : limit;
-}
-
static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
u16 selector, struct desc_ptr *dt)
{
if (selector & 1 << 2) {
struct desc_struct desc;
+ u16 sel;
+
memset (dt, 0, sizeof *dt);
- if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR,
- ctxt->vcpu))
+ if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR))
return;
dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
dt->address = get_desc_base(&desc);
} else
- ops->get_gdt(dt, ctxt->vcpu);
+ ops->get_gdt(ctxt, dt);
}
/* allowed just for 8 bytes segments */
@@ -903,8 +1190,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (dt.size < index * 8 + 7)
return emulate_gp(ctxt, selector & 0xfffc);
addr = dt.address + index * 8;
- ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
- &ctxt->exception);
+ ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
return ret;
}
@@ -925,8 +1211,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
return emulate_gp(ctxt, selector & 0xfffc);
addr = dt.address + index * 8;
- ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
- &ctxt->exception);
+ ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
return ret;
}
@@ -986,7 +1271,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
rpl = selector & 3;
dpl = seg_desc.dpl;
- cpl = ops->cpl(ctxt->vcpu);
+ cpl = ops->cpl(ctxt);
switch (seg) {
case VCPU_SREG_SS:
@@ -1042,8 +1327,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
return ret;
}
load:
- ops->set_segment_selector(selector, seg, ctxt->vcpu);
- ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu);
+ ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
return X86EMUL_CONTINUE;
exception:
emulate_exception(ctxt, err_vec, err_code, true);
@@ -1069,8 +1353,7 @@ static void write_register_operand(struct operand *op)
}
}
-static inline int writeback(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int writeback(struct x86_emulate_ctxt *ctxt)
{
int rc;
struct decode_cache *c = &ctxt->decode;
@@ -1081,23 +1364,22 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
break;
case OP_MEM:
if (c->lock_prefix)
- rc = ops->cmpxchg_emulated(
- linear(ctxt, c->dst.addr.mem),
- &c->dst.orig_val,
- &c->dst.val,
- c->dst.bytes,
- &ctxt->exception,
- ctxt->vcpu);
+ rc = segmented_cmpxchg(ctxt,
+ c->dst.addr.mem,
+ &c->dst.orig_val,
+ &c->dst.val,
+ c->dst.bytes);
else
- rc = ops->write_emulated(
- linear(ctxt, c->dst.addr.mem),
- &c->dst.val,
- c->dst.bytes,
- &ctxt->exception,
- ctxt->vcpu);
+ rc = segmented_write(ctxt,
+ c->dst.addr.mem,
+ &c->dst.val,
+ c->dst.bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
break;
+ case OP_XMM:
+ write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm);
+ break;
case OP_NONE:
/* no writeback */
break;
@@ -1107,21 +1389,21 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
-static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_push(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
+ struct segmented_address addr;
- c->dst.type = OP_MEM;
- c->dst.bytes = c->op_bytes;
- c->dst.val = c->src.val;
register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
- c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
- c->dst.addr.mem.seg = VCPU_SREG_SS;
+ addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+ addr.seg = VCPU_SREG_SS;
+
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ return segmented_write(ctxt, addr, &c->src.val, c->op_bytes);
}
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
void *dest, int len)
{
struct decode_cache *c = &ctxt->decode;
@@ -1130,7 +1412,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
addr.seg = VCPU_SREG_SS;
- rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
+ rc = segmented_read(ctxt, addr, dest, len);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1138,6 +1420,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
return rc;
}
+static int em_pop(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ return emulate_pop(ctxt, &c->dst.val, c->op_bytes);
+}
+
static int emulate_popf(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
void *dest, int len)
@@ -1145,9 +1434,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
int rc;
unsigned long val, change_mask;
int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
- int cpl = ops->cpl(ctxt->vcpu);
+ int cpl = ops->cpl(ctxt);
- rc = emulate_pop(ctxt, ops, &val, len);
+ rc = emulate_pop(ctxt, &val, len);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1179,14 +1468,24 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int seg)
+static int em_popf(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->dst.type = OP_REG;
+ c->dst.addr.reg = &ctxt->eflags;
+ c->dst.bytes = c->op_bytes;
+ return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
+}
+
+static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
{
struct decode_cache *c = &ctxt->decode;
- c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
+ c->src.val = get_segment_selector(ctxt, seg);
- emulate_push(ctxt, ops);
+ return em_push(ctxt);
}
static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1196,7 +1495,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
unsigned long selector;
int rc;
- rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
+ rc = emulate_pop(ctxt, &selector, c->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1204,8 +1503,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_pusha(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
unsigned long old_esp = c->regs[VCPU_REGS_RSP];
@@ -1216,23 +1514,25 @@ static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
(reg == VCPU_REGS_RSP) ?
(c->src.val = old_esp) : (c->src.val = c->regs[reg]);
- emulate_push(ctxt, ops);
-
- rc = writeback(ctxt, ops);
+ rc = em_push(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
++reg;
}
- /* Disable writeback. */
- c->dst.type = OP_NONE;
-
return rc;
}
-static int emulate_popa(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_pushf(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->src.val = (unsigned long)ctxt->eflags;
+ return em_push(ctxt);
+}
+
+static int em_popa(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
int rc = X86EMUL_CONTINUE;
@@ -1245,7 +1545,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
--reg;
}
- rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
+ rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes);
if (rc != X86EMUL_CONTINUE)
break;
--reg;
@@ -1265,37 +1565,32 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
/* TODO: Add limit checks */
c->src.val = ctxt->eflags;
- emulate_push(ctxt, ops);
- rc = writeback(ctxt, ops);
+ rc = em_push(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
- c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
- emulate_push(ctxt, ops);
- rc = writeback(ctxt, ops);
+ c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
+ rc = em_push(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
c->src.val = c->eip;
- emulate_push(ctxt, ops);
- rc = writeback(ctxt, ops);
+ rc = em_push(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
- c->dst.type = OP_NONE;
-
- ops->get_idt(&dt, ctxt->vcpu);
+ ops->get_idt(ctxt, &dt);
eip_addr = dt.address + (irq << 2);
cs_addr = dt.address + (irq << 2) + 2;
- rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
+ rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
+ rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1339,7 +1634,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
/* TODO: Add stack limit check */
- rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes);
+ rc = emulate_pop(ctxt, &temp_eip, c->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1347,12 +1642,12 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
if (temp_eip & ~0xffff)
return emulate_gp(ctxt, 0);
- rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
+ rc = emulate_pop(ctxt, &cs, c->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes);
+ rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1394,15 +1689,31 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
}
}
-static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
+ int rc;
+ unsigned short sel;
- return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
+ memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+
+ rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ c->eip = 0;
+ memcpy(&c->eip, c->src.valptr, c->op_bytes);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_grp1a(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ return emulate_pop(ctxt, &c->dst.val, c->dst.bytes);
}
-static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
+static int em_grp2(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
switch (c->modrm_reg) {
@@ -1429,10 +1740,10 @@ static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
break;
}
+ return X86EMUL_CONTINUE;
}
-static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_grp3(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
unsigned long *rax = &c->regs[VCPU_REGS_RAX];
@@ -1471,10 +1782,10 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
-static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_grp45(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
+ int rc = X86EMUL_CONTINUE;
switch (c->modrm_reg) {
case 0: /* inc */
@@ -1488,21 +1799,23 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
old_eip = c->eip;
c->eip = c->src.val;
c->src.val = old_eip;
- emulate_push(ctxt, ops);
+ rc = em_push(ctxt);
break;
}
case 4: /* jmp abs */
c->eip = c->src.val;
break;
+ case 5: /* jmp far */
+ rc = em_jmp_far(ctxt);
+ break;
case 6: /* push */
- emulate_push(ctxt, ops);
+ rc = em_push(ctxt);
break;
}
- return X86EMUL_CONTINUE;
+ return rc;
}
-static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_grp9(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
u64 old = c->dst.orig_val64;
@@ -1528,12 +1841,12 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
int rc;
unsigned long cs;
- rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
+ rc = emulate_pop(ctxt, &c->eip, c->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
if (c->op_bytes == 4)
c->eip = (u32)c->eip;
- rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
+ rc = emulate_pop(ctxt, &cs, c->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
@@ -1562,8 +1875,10 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops, struct desc_struct *cs,
struct desc_struct *ss)
{
+ u16 selector;
+
memset(cs, 0, sizeof(struct desc_struct));
- ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu);
+ ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
memset(ss, 0, sizeof(struct desc_struct));
cs->l = 0; /* will be adjusted later */
@@ -1593,44 +1908,44 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
struct desc_struct cs, ss;
u64 msr_data;
u16 cs_sel, ss_sel;
+ u64 efer = 0;
/* syscall is not available in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
ctxt->mode == X86EMUL_MODE_VM86)
return emulate_ud(ctxt);
+ ops->get_msr(ctxt, MSR_EFER, &efer);
setup_syscalls_segments(ctxt, ops, &cs, &ss);
- ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ ops->get_msr(ctxt, MSR_STAR, &msr_data);
msr_data >>= 32;
cs_sel = (u16)(msr_data & 0xfffc);
ss_sel = (u16)(msr_data + 8);
- if (is_long_mode(ctxt->vcpu)) {
+ if (efer & EFER_LMA) {
cs.d = 0;
cs.l = 1;
}
- ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
c->regs[VCPU_REGS_RCX] = c->eip;
- if (is_long_mode(ctxt->vcpu)) {
+ if (efer & EFER_LMA) {
#ifdef CONFIG_X86_64
c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
- ops->get_msr(ctxt->vcpu,
+ ops->get_msr(ctxt,
ctxt->mode == X86EMUL_MODE_PROT64 ?
MSR_LSTAR : MSR_CSTAR, &msr_data);
c->eip = msr_data;
- ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+ ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
ctxt->eflags &= ~(msr_data | EFLG_RF);
#endif
} else {
/* legacy mode */
- ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ ops->get_msr(ctxt, MSR_STAR, &msr_data);
c->eip = (u32)msr_data;
ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
@@ -1646,7 +1961,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
struct desc_struct cs, ss;
u64 msr_data;
u16 cs_sel, ss_sel;
+ u64 efer = 0;
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
/* inject #GP if in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL)
return emulate_gp(ctxt, 0);
@@ -1659,7 +1976,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
setup_syscalls_segments(ctxt, ops, &cs, &ss);
- ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
switch (ctxt->mode) {
case X86EMUL_MODE_PROT32:
if ((msr_data & 0xfffc) == 0x0)
@@ -1676,21 +1993,18 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
cs_sel &= ~SELECTOR_RPL_MASK;
ss_sel = cs_sel + 8;
ss_sel &= ~SELECTOR_RPL_MASK;
- if (ctxt->mode == X86EMUL_MODE_PROT64
- || is_long_mode(ctxt->vcpu)) {
+ if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) {
cs.d = 0;
cs.l = 1;
}
- ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
- ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
c->eip = msr_data;
- ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
c->regs[VCPU_REGS_RSP] = msr_data;
return X86EMUL_CONTINUE;
@@ -1719,7 +2033,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
cs.dpl = 3;
ss.dpl = 3;
- ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
switch (usermode) {
case X86EMUL_MODE_PROT32:
cs_sel = (u16)(msr_data + 16);
@@ -1739,10 +2053,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
cs_sel |= SELECTOR_RPL_MASK;
ss_sel |= SELECTOR_RPL_MASK;
- ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
c->eip = c->regs[VCPU_REGS_RDX];
c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
@@ -1759,7 +2071,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
if (ctxt->mode == X86EMUL_MODE_VM86)
return true;
iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
- return ops->cpl(ctxt->vcpu) > iopl;
+ return ops->cpl(ctxt) > iopl;
}
static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -1769,11 +2081,11 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
struct desc_struct tr_seg;
u32 base3;
int r;
- u16 io_bitmap_ptr, perm, bit_idx = port & 0x7;
+ u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7;
unsigned mask = (1 << len) - 1;
unsigned long base;
- ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu);
+ ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
if (!tr_seg.p)
return false;
if (desc_limit_scaled(&tr_seg) < 103)
@@ -1782,13 +2094,12 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
#ifdef CONFIG_X86_64
base |= ((u64)base3) << 32;
#endif
- r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL);
+ r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL);
if (r != X86EMUL_CONTINUE)
return false;
if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
return false;
- r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu,
- NULL);
+ r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL);
if (r != X86EMUL_CONTINUE)
return false;
if ((perm >> bit_idx) & mask)
@@ -1829,11 +2140,11 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
tss->si = c->regs[VCPU_REGS_RSI];
tss->di = c->regs[VCPU_REGS_RDI];
- tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
- tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
- tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
- tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
- tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+ tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
+ tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR);
}
static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
@@ -1858,11 +2169,11 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
* SDM says that segment selectors are loaded before segment
* descriptors
*/
- ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
- ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
- ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+ set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR);
+ set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
+ set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
+ set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
+ set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
/*
* Now load segment descriptors. If fault happenes at this stage
@@ -1896,7 +2207,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
int ret;
u32 new_tss_base = get_desc_base(new_desc);
- ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
@@ -1904,13 +2215,13 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
save_state_to_tss16(ctxt, ops, &tss_seg);
- ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
return ret;
- ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
@@ -1919,10 +2230,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
- ret = ops->write_std(new_tss_base,
+ ret = ops->write_std(ctxt, new_tss_base,
&tss_seg.prev_task_link,
sizeof tss_seg.prev_task_link,
- ctxt->vcpu, &ctxt->exception);
+ &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
return ret;
@@ -1937,7 +2248,7 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
{
struct decode_cache *c = &ctxt->decode;
- tss->cr3 = ops->get_cr(3, ctxt->vcpu);
+ tss->cr3 = ops->get_cr(ctxt, 3);
tss->eip = c->eip;
tss->eflags = ctxt->eflags;
tss->eax = c->regs[VCPU_REGS_RAX];
@@ -1949,13 +2260,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
tss->esi = c->regs[VCPU_REGS_RSI];
tss->edi = c->regs[VCPU_REGS_RDI];
- tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
- tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
- tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
- tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
- tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
- tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
- tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+ tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
+ tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
+ tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
+ tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR);
}
static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
@@ -1965,7 +2276,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int ret;
- if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
+ if (ops->set_cr(ctxt, 3, tss->cr3))
return emulate_gp(ctxt, 0);
c->eip = tss->eip;
ctxt->eflags = tss->eflags | 2;
@@ -1982,13 +2293,13 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
* SDM says that segment selectors are loaded before segment
* descriptors
*/
- ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
- ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
- ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
- ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
- ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
+ set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
+ set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
+ set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
+ set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
+ set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
+ set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
+ set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
/*
* Now load segment descriptors. If fault happenes at this stage
@@ -2028,7 +2339,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
int ret;
u32 new_tss_base = get_desc_base(new_desc);
- ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
@@ -2036,13 +2347,13 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
save_state_to_tss32(ctxt, ops, &tss_seg);
- ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
return ret;
- ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
@@ -2051,10 +2362,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
- ret = ops->write_std(new_tss_base,
+ ret = ops->write_std(ctxt, new_tss_base,
&tss_seg.prev_task_link,
sizeof tss_seg.prev_task_link,
- ctxt->vcpu, &ctxt->exception);
+ &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
return ret;
@@ -2070,9 +2381,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
{
struct desc_struct curr_tss_desc, next_tss_desc;
int ret;
- u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
+ u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
ulong old_tss_base =
- ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
+ ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
u32 desc_limit;
/* FIXME: old_tss_base == ~0 ? */
@@ -2088,7 +2399,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (reason != TASK_SWITCH_IRET) {
if ((tss_selector & 3) > next_tss_desc.dpl ||
- ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
+ ops->cpl(ctxt) > next_tss_desc.dpl)
return emulate_gp(ctxt, 0);
}
@@ -2132,9 +2443,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
&next_tss_desc);
}
- ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
- ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu);
- ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
+ ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
+ ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
if (has_error_code) {
struct decode_cache *c = &ctxt->decode;
@@ -2142,7 +2452,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
c->lock_prefix = 0;
c->src.val = (unsigned long) error_code;
- emulate_push(ctxt, ops);
+ ret = em_push(ctxt);
}
return ret;
@@ -2162,13 +2472,10 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
has_error_code, error_code);
- if (rc == X86EMUL_CONTINUE) {
- rc = writeback(ctxt, ops);
- if (rc == X86EMUL_CONTINUE)
- ctxt->eip = c->eip;
- }
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->eip = c->eip;
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
}
static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
@@ -2182,12 +2489,6 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
op->addr.mem.seg = seg;
}
-static int em_push(struct x86_emulate_ctxt *ctxt)
-{
- emulate_push(ctxt, ctxt->ops);
- return X86EMUL_CONTINUE;
-}
-
static int em_das(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
@@ -2234,7 +2535,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
ulong old_eip;
int rc;
- old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+ old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
old_eip = c->eip;
memcpy(&sel, c->src.valptr + c->op_bytes, 2);
@@ -2245,20 +2546,12 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
memcpy(&c->eip, c->src.valptr, c->op_bytes);
c->src.val = old_cs;
- emulate_push(ctxt, ctxt->ops);
- rc = writeback(ctxt, ctxt->ops);
+ rc = em_push(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
c->src.val = old_eip;
- emulate_push(ctxt, ctxt->ops);
- rc = writeback(ctxt, ctxt->ops);
- if (rc != X86EMUL_CONTINUE)
- return rc;
-
- c->dst.type = OP_NONE;
-
- return X86EMUL_CONTINUE;
+ return em_push(ctxt);
}
static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
@@ -2269,13 +2562,79 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
c->dst.type = OP_REG;
c->dst.addr.reg = &c->eip;
c->dst.bytes = c->op_bytes;
- rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
+ rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
return X86EMUL_CONTINUE;
}
+static int em_add(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_or(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_adc(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sbb(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_and(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sub(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_xor(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cmp(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
static int em_imul(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
@@ -2306,13 +2665,10 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt)
static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
{
- unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
struct decode_cache *c = &ctxt->decode;
u64 tsc = 0;
- if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
- return emulate_gp(ctxt, 0);
- ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
+ ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
c->regs[VCPU_REGS_RAX] = (u32)tsc;
c->regs[VCPU_REGS_RDX] = tsc >> 32;
return X86EMUL_CONTINUE;
@@ -2325,22 +2681,375 @@ static int em_mov(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_movdqu(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_invlpg(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->ops->invlpg(ctxt, linear);
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clts(struct x86_emulate_ctxt *ctxt)
+{
+ ulong cr0;
+
+ cr0 = ctxt->ops->get_cr(ctxt, 0);
+ cr0 &= ~X86_CR0_TS;
+ ctxt->ops->set_cr(ctxt, 0, cr0);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_vmcall(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+
+ if (c->modrm_mod != 3 || c->modrm_rm != 1)
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = ctxt->ops->fix_hypercall(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ /* Let the processor re-execute the fixed hypercall */
+ c->eip = ctxt->eip;
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lgdt(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct desc_ptr desc_ptr;
+ int rc;
+
+ rc = read_descriptor(ctxt, c->src.addr.mem,
+ &desc_ptr.size, &desc_ptr.address,
+ c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ ctxt->ops->set_gdt(ctxt, &desc_ptr);
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+
+ rc = ctxt->ops->fix_hypercall(ctxt);
+
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ return rc;
+}
+
+static int em_lidt(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ struct desc_ptr desc_ptr;
+ int rc;
+
+ rc = read_descriptor(ctxt, c->src.addr.mem,
+ &desc_ptr.size, &desc_ptr.address,
+ c->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ ctxt->ops->set_idt(ctxt, &desc_ptr);
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_smsw(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->dst.bytes = 2;
+ c->dst.val = ctxt->ops->get_cr(ctxt, 0);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lmsw(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
+ | (c->src.val & 0x0f));
+ c->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static bool valid_cr(int nr)
+{
+ switch (nr) {
+ case 0:
+ case 2 ... 4:
+ case 8:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int check_cr_read(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ if (!valid_cr(c->modrm_reg))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_cr_write(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ u64 new_val = c->src.val64;
+ int cr = c->modrm_reg;
+ u64 efer = 0;
+
+ static u64 cr_reserved_bits[] = {
+ 0xffffffff00000000ULL,
+ 0, 0, 0, /* CR3 checked later */
+ CR4_RESERVED_BITS,
+ 0, 0, 0,
+ CR8_RESERVED_BITS,
+ };
+
+ if (!valid_cr(cr))
+ return emulate_ud(ctxt);
+
+ if (new_val & cr_reserved_bits[cr])
+ return emulate_gp(ctxt, 0);
+
+ switch (cr) {
+ case 0: {
+ u64 cr4;
+ if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) ||
+ ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD)))
+ return emulate_gp(ctxt, 0);
+
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if ((new_val & X86_CR0_PG) && (efer & EFER_LME) &&
+ !(cr4 & X86_CR4_PAE))
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ case 3: {
+ u64 rsvd = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA)
+ rsvd = CR3_L_MODE_RESERVED_BITS;
+ else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE)
+ rsvd = CR3_PAE_RESERVED_BITS;
+ else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG)
+ rsvd = CR3_NONPAE_RESERVED_BITS;
+
+ if (new_val & rsvd)
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ case 4: {
+ u64 cr4;
+
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long dr7;
+
+ ctxt->ops->get_dr(ctxt, 7, &dr7);
+
+ /* Check if DR7.Global_Enable is set */
+ return dr7 & (1 << 13);
+}
+
+static int check_dr_read(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int dr = c->modrm_reg;
+ u64 cr4;
+
+ if (dr > 7)
+ return emulate_ud(ctxt);
+
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
+ return emulate_ud(ctxt);
+
+ if (check_dr7_gd(ctxt))
+ return emulate_db(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+ u64 new_val = c->src.val64;
+ int dr = c->modrm_reg;
+
+ if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
+ return emulate_gp(ctxt, 0);
+
+ return check_dr_read(ctxt);
+}
+
+static int check_svme(struct x86_emulate_ctxt *ctxt)
+{
+ u64 efer;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if (!(efer & EFER_SVME))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
+{
+ u64 rax = ctxt->decode.regs[VCPU_REGS_RAX];
+
+ /* Valid physical address? */
+ if (rax & 0xffff000000000000ULL)
+ return emulate_gp(ctxt, 0);
+
+ return check_svme(ctxt);
+}
+
+static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
+
+ if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
+ u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX];
+
+ if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
+ (rcx > 3))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_in(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->dst.bytes = min(c->dst.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_out(struct x86_emulate_ctxt *ctxt)
+{
+ struct decode_cache *c = &ctxt->decode;
+
+ c->src.bytes = min(c->src.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
#define D(_y) { .flags = (_y) }
+#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
+#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
+ .check_perm = (_p) }
#define N D(0)
+#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
-#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) }
#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+#define II(_f, _e, _i) \
+ { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
+#define IIP(_f, _e, _i, _p) \
+ { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \
+ .check_perm = (_p) }
+#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
#define D2bv(_f) D((_f) | ByteOp), D(_f)
+#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
-#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \
- D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \
- D2bv(((_f) & ~Lock) | DstAcc | SrcImm)
+#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
+ I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
+ I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
+static struct opcode group7_rm1[] = {
+ DI(SrcNone | ModRM | Priv, monitor),
+ DI(SrcNone | ModRM | Priv, mwait),
+ N, N, N, N, N, N,
+};
+
+static struct opcode group7_rm3[] = {
+ DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa),
+ II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall),
+ DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa),
+ DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa),
+ DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme),
+ DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme),
+ DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme),
+ DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme),
+};
+
+static struct opcode group7_rm7[] = {
+ N,
+ DIP(SrcNone | ModRM, rdtscp, check_rdtsc),
+ N, N, N, N, N, N,
+};
static struct opcode group1[] = {
- X7(D(Lock)), N
+ I(Lock, em_add),
+ I(Lock, em_or),
+ I(Lock, em_adc),
+ I(Lock, em_sbb),
+ I(Lock, em_and),
+ I(Lock, em_sub),
+ I(Lock, em_xor),
+ I(0, em_cmp),
};
static struct opcode group1A[] = {
@@ -2366,16 +3075,28 @@ static struct opcode group5[] = {
D(SrcMem | ModRM | Stack), N,
};
+static struct opcode group6[] = {
+ DI(ModRM | Prot, sldt),
+ DI(ModRM | Prot, str),
+ DI(ModRM | Prot | Priv, lldt),
+ DI(ModRM | Prot | Priv, ltr),
+ N, N, N, N,
+};
+
static struct group_dual group7 = { {
- N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
- D(SrcNone | ModRM | DstMem | Mov), N,
- D(SrcMem16 | ModRM | Mov | Priv),
- D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
+ DI(ModRM | Mov | DstMem | Priv, sgdt),
+ DI(ModRM | Mov | DstMem | Priv, sidt),
+ II(ModRM | SrcMem | Priv, em_lgdt, lgdt),
+ II(ModRM | SrcMem | Priv, em_lidt, lidt),
+ II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw),
+ II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
}, {
- D(SrcNone | ModRM | Priv | VendorSpecific), N,
- N, D(SrcNone | ModRM | Priv | VendorSpecific),
- D(SrcNone | ModRM | DstMem | Mov), N,
- D(SrcMem16 | ModRM | Mov | Priv), N,
+ I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall),
+ EXT(0, group7_rm1),
+ N, EXT(0, group7_rm3),
+ II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7),
} };
static struct opcode group8[] = {
@@ -2394,35 +3115,40 @@ static struct opcode group11[] = {
I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
};
+static struct gprefix pfx_0f_6f_0f_7f = {
+ N, N, N, I(Sse, em_movdqu),
+};
+
static struct opcode opcode_table[256] = {
/* 0x00 - 0x07 */
- D6ALU(Lock),
+ I6ALU(Lock, em_add),
D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
/* 0x08 - 0x0F */
- D6ALU(Lock),
+ I6ALU(Lock, em_or),
D(ImplicitOps | Stack | No64), N,
/* 0x10 - 0x17 */
- D6ALU(Lock),
+ I6ALU(Lock, em_adc),
D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
/* 0x18 - 0x1F */
- D6ALU(Lock),
+ I6ALU(Lock, em_sbb),
D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
/* 0x20 - 0x27 */
- D6ALU(Lock), N, N,
+ I6ALU(Lock, em_and), N, N,
/* 0x28 - 0x2F */
- D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das),
+ I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
/* 0x30 - 0x37 */
- D6ALU(Lock), N, N,
+ I6ALU(Lock, em_xor), N, N,
/* 0x38 - 0x3F */
- D6ALU(0), N, N,
+ I6ALU(0, em_cmp), N, N,
/* 0x40 - 0x4F */
X16(D(DstReg)),
/* 0x50 - 0x57 */
X8(I(SrcReg | Stack, em_push)),
/* 0x58 - 0x5F */
- X8(D(DstReg | Stack)),
+ X8(I(DstReg | Stack, em_pop)),
/* 0x60 - 0x67 */
- D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+ I(ImplicitOps | Stack | No64, em_pusha),
+ I(ImplicitOps | Stack | No64, em_popa),
N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
N, N, N, N,
/* 0x68 - 0x6F */
@@ -2430,8 +3156,8 @@ static struct opcode opcode_table[256] = {
I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
I(SrcImmByte | Mov | Stack, em_push),
I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
- D2bv(DstDI | Mov | String), /* insb, insw/insd */
- D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
+ D2bvIP(DstDI | SrcDX | Mov | String, ins, check_perm_in), /* insb, insw/insd */
+ D2bvIP(SrcSI | DstDX | String, outs, check_perm_out), /* outsb, outsw/outsd */
/* 0x70 - 0x7F */
X16(D(SrcImmByte)),
/* 0x80 - 0x87 */
@@ -2446,21 +3172,22 @@ static struct opcode opcode_table[256] = {
D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
/* 0x90 - 0x97 */
- X8(D(SrcAcc | DstReg)),
+ DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
/* 0x98 - 0x9F */
D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
I(SrcImmFAddr | No64, em_call_far), N,
- D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
+ II(ImplicitOps | Stack, em_pushf, pushf),
+ II(ImplicitOps | Stack, em_popf, popf), N, N,
/* 0xA0 - 0xA7 */
I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
I2bv(SrcSI | DstDI | Mov | String, em_mov),
- D2bv(SrcSI | DstDI | String),
+ I2bv(SrcSI | DstDI | String, em_cmp),
/* 0xA8 - 0xAF */
D2bv(DstAcc | SrcImm),
I2bv(SrcAcc | DstDI | Mov | String, em_mov),
I2bv(SrcSI | DstAcc | Mov | String, em_mov),
- D2bv(SrcAcc | DstDI | String),
+ I2bv(SrcAcc | DstDI | String, em_cmp),
/* 0xB0 - 0xB7 */
X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
/* 0xB8 - 0xBF */
@@ -2473,7 +3200,8 @@ static struct opcode opcode_table[256] = {
G(ByteOp, group11), G(0, group11),
/* 0xC8 - 0xCF */
N, N, N, D(ImplicitOps | Stack),
- D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
+ D(ImplicitOps), DI(SrcImmByte, intn),
+ D(ImplicitOps | No64), DI(ImplicitOps, iret),
/* 0xD0 - 0xD7 */
D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
N, N, N, N,
@@ -2481,14 +3209,17 @@ static struct opcode opcode_table[256] = {
N, N, N, N, N, N, N, N,
/* 0xE0 - 0xE7 */
X4(D(SrcImmByte)),
- D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte),
+ D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in),
+ D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
/* 0xE8 - 0xEF */
D(SrcImm | Stack), D(SrcImm | ImplicitOps),
D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
- D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps),
+ D2bvIP(SrcDX | DstAcc, in, check_perm_in),
+ D2bvIP(SrcAcc | DstDX, out, check_perm_out),
/* 0xF0 - 0xF7 */
- N, N, N, N,
- D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
+ N, DI(ImplicitOps, icebp), N, N,
+ DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
+ G(ByteOp, group3), G(0, group3),
/* 0xF8 - 0xFF */
D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
@@ -2496,20 +3227,24 @@ static struct opcode opcode_table[256] = {
static struct opcode twobyte_table[256] = {
/* 0x00 - 0x0F */
- N, GD(0, &group7), N, N,
- N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N,
- D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
+ G(0, group6), GD(0, &group7), N, N,
+ N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N,
+ DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
N, D(ImplicitOps | ModRM), N, N,
/* 0x10 - 0x1F */
N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
/* 0x20 - 0x2F */
- D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264),
- D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264),
+ DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
+ DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
+ DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write),
+ DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write),
N, N, N, N,
N, N, N, N, N, N, N, N,
/* 0x30 - 0x3F */
- D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
- D(ImplicitOps | Priv), N,
+ DI(ImplicitOps | Priv, wrmsr),
+ IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
+ DI(ImplicitOps | Priv, rdmsr),
+ DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
N, N,
N, N, N, N, N, N, N, N,
@@ -2518,21 +3253,27 @@ static struct opcode twobyte_table[256] = {
/* 0x50 - 0x5F */
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
/* 0x60 - 0x6F */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
/* 0x70 - 0x7F */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
/* 0x80 - 0x8F */
X16(D(SrcImm)),
/* 0x90 - 0x9F */
X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
/* 0xA0 - 0xA7 */
D(ImplicitOps | Stack), D(ImplicitOps | Stack),
- N, D(DstMem | SrcReg | ModRM | BitOp),
+ DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
D(DstMem | SrcReg | Src2ImmByte | ModRM),
D(DstMem | SrcReg | Src2CL | ModRM), N, N,
/* 0xA8 - 0xAF */
D(ImplicitOps | Stack), D(ImplicitOps | Stack),
- N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
+ DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
D(DstMem | SrcReg | Src2ImmByte | ModRM),
D(DstMem | SrcReg | Src2CL | ModRM),
D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
@@ -2564,10 +3305,13 @@ static struct opcode twobyte_table[256] = {
#undef G
#undef GD
#undef I
+#undef GP
+#undef EXT
#undef D2bv
+#undef D2bvIP
#undef I2bv
-#undef D6ALU
+#undef I6ALU
static unsigned imm_size(struct decode_cache *c)
{
@@ -2625,16 +3369,16 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
struct decode_cache *c = &ctxt->decode;
int rc = X86EMUL_CONTINUE;
int mode = ctxt->mode;
- int def_op_bytes, def_ad_bytes, dual, goffset;
- struct opcode opcode, *g_mod012, *g_mod3;
- struct operand memop = { .type = OP_NONE };
+ int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
+ bool op_prefix = false;
+ struct opcode opcode;
+ struct operand memop = { .type = OP_NONE }, *memopp = NULL;
c->eip = ctxt->eip;
c->fetch.start = c->eip;
c->fetch.end = c->fetch.start + insn_len;
if (insn_len > 0)
memcpy(c->fetch.data, insn, insn_len);
- ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
switch (mode) {
case X86EMUL_MODE_REAL:
@@ -2662,6 +3406,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
for (;;) {
switch (c->b = insn_fetch(u8, 1, c->eip)) {
case 0x66: /* operand-size override */
+ op_prefix = true;
/* switch between 2/4 bytes */
c->op_bytes = def_op_bytes ^ 6;
break;
@@ -2692,10 +3437,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
c->lock_prefix = 1;
break;
case 0xf2: /* REPNE/REPNZ */
- c->rep_prefix = REPNE_PREFIX;
- break;
case 0xf3: /* REP/REPE/REPZ */
- c->rep_prefix = REPE_PREFIX;
+ c->rep_prefix = c->b;
break;
default:
goto done_prefixes;
@@ -2722,29 +3465,49 @@ done_prefixes:
}
c->d = opcode.flags;
- if (c->d & Group) {
- dual = c->d & GroupDual;
- c->modrm = insn_fetch(u8, 1, c->eip);
- --c->eip;
-
- if (c->d & GroupDual) {
- g_mod012 = opcode.u.gdual->mod012;
- g_mod3 = opcode.u.gdual->mod3;
- } else
- g_mod012 = g_mod3 = opcode.u.group;
-
- c->d &= ~(Group | GroupDual);
-
- goffset = (c->modrm >> 3) & 7;
+ while (c->d & GroupMask) {
+ switch (c->d & GroupMask) {
+ case Group:
+ c->modrm = insn_fetch(u8, 1, c->eip);
+ --c->eip;
+ goffset = (c->modrm >> 3) & 7;
+ opcode = opcode.u.group[goffset];
+ break;
+ case GroupDual:
+ c->modrm = insn_fetch(u8, 1, c->eip);
+ --c->eip;
+ goffset = (c->modrm >> 3) & 7;
+ if ((c->modrm >> 6) == 3)
+ opcode = opcode.u.gdual->mod3[goffset];
+ else
+ opcode = opcode.u.gdual->mod012[goffset];
+ break;
+ case RMExt:
+ goffset = c->modrm & 7;
+ opcode = opcode.u.group[goffset];
+ break;
+ case Prefix:
+ if (c->rep_prefix && op_prefix)
+ return X86EMUL_UNHANDLEABLE;
+ simd_prefix = op_prefix ? 0x66 : c->rep_prefix;
+ switch (simd_prefix) {
+ case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
+ case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
+ case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break;
+ case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
+ }
+ break;
+ default:
+ return X86EMUL_UNHANDLEABLE;
+ }
- if ((c->modrm >> 6) == 3)
- opcode = g_mod3[goffset];
- else
- opcode = g_mod012[goffset];
+ c->d &= ~GroupMask;
c->d |= opcode.flags;
}
c->execute = opcode.u.execute;
+ c->check_perm = opcode.check_perm;
+ c->intercept = opcode.intercept;
/* Unrecognised? */
if (c->d == 0 || (c->d & Undefined))
@@ -2763,6 +3526,9 @@ done_prefixes:
c->op_bytes = 4;
}
+ if (c->d & Sse)
+ c->op_bytes = 16;
+
/* ModRM and SIB bytes. */
if (c->d & ModRM) {
rc = decode_modrm(ctxt, ops, &memop);
@@ -2776,14 +3542,11 @@ done_prefixes:
if (!c->has_seg_override)
set_seg_override(c, VCPU_SREG_DS);
- memop.addr.mem.seg = seg_override(ctxt, ops, c);
+ memop.addr.mem.seg = seg_override(ctxt, c);
if (memop.type == OP_MEM && c->ad_bytes != 8)
memop.addr.mem.ea = (u32)memop.addr.mem.ea;
- if (memop.type == OP_MEM && c->rip_relative)
- memop.addr.mem.ea += c->eip;
-
/*
* Decode and fetch the source operand: register, memory
* or immediate.
@@ -2792,7 +3555,7 @@ done_prefixes:
case SrcNone:
break;
case SrcReg:
- decode_register_operand(&c->src, c, 0);
+ decode_register_operand(ctxt, &c->src, c, 0);
break;
case SrcMem16:
memop.bytes = 2;
@@ -2805,6 +3568,7 @@ done_prefixes:
c->op_bytes;
srcmem_common:
c->src = memop;
+ memopp = &c->src;
break;
case SrcImmU16:
rc = decode_imm(ctxt, &c->src, 2, false);
@@ -2836,7 +3600,7 @@ done_prefixes:
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->src.addr.mem.ea =
register_address(c, c->regs[VCPU_REGS_RSI]);
- c->src.addr.mem.seg = seg_override(ctxt, ops, c),
+ c->src.addr.mem.seg = seg_override(ctxt, c);
c->src.val = 0;
break;
case SrcImmFAddr:
@@ -2849,6 +3613,12 @@ done_prefixes:
memop.bytes = c->op_bytes + 2;
goto srcmem_common;
break;
+ case SrcDX:
+ c->src.type = OP_REG;
+ c->src.bytes = 2;
+ c->src.addr.reg = &c->regs[VCPU_REGS_RDX];
+ fetch_register_operand(&c->src);
+ break;
}
if (rc != X86EMUL_CONTINUE)
@@ -2883,7 +3653,7 @@ done_prefixes:
/* Decode and fetch the destination operand: register or memory. */
switch (c->d & DstMask) {
case DstReg:
- decode_register_operand(&c->dst, c,
+ decode_register_operand(ctxt, &c->dst, c,
c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
break;
case DstImmUByte:
@@ -2895,6 +3665,7 @@ done_prefixes:
case DstMem:
case DstMem64:
c->dst = memop;
+ memopp = &c->dst;
if ((c->d & DstMask) == DstMem64)
c->dst.bytes = 8;
else
@@ -2918,15 +3689,24 @@ done_prefixes:
c->dst.addr.mem.seg = VCPU_SREG_ES;
c->dst.val = 0;
break;
+ case DstDX:
+ c->dst.type = OP_REG;
+ c->dst.bytes = 2;
+ c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
+ fetch_register_operand(&c->dst);
+ break;
case ImplicitOps:
/* Special instructions do their own operand decoding. */
default:
c->dst.type = OP_NONE; /* Disable writeback. */
- return 0;
+ break;
}
done:
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+ if (memopp && memopp->type == OP_MEM && c->rip_relative)
+ memopp->addr.mem.ea += c->eip;
+
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
}
static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
@@ -2979,12 +3759,51 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
+ if ((c->d & Sse)
+ && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
+ || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+ rc = emulate_nm(ctxt);
+ goto done;
+ }
+
+ if (unlikely(ctxt->guest_mode) && c->intercept) {
+ rc = emulator_check_intercept(ctxt, c->intercept,
+ X86_ICPT_PRE_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
/* Privileged instruction can be executed only in CPL=0 */
- if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
+ if ((c->d & Priv) && ops->cpl(ctxt)) {
rc = emulate_gp(ctxt, 0);
goto done;
}
+ /* Instruction can only be executed in protected mode */
+ if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ /* Do instruction specific permission checks */
+ if (c->check_perm) {
+ rc = c->check_perm(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (unlikely(ctxt->guest_mode) && c->intercept) {
+ rc = emulator_check_intercept(ctxt, c->intercept,
+ X86_ICPT_POST_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
if (c->rep_prefix && (c->d & String)) {
/* All REP prefixes have the same first termination condition */
if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
@@ -2994,16 +3813,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
}
if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
- rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
- c->src.valptr, c->src.bytes);
+ rc = segmented_read(ctxt, c->src.addr.mem,
+ c->src.valptr, c->src.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
c->src.orig_val64 = c->src.val64;
}
if (c->src2.type == OP_MEM) {
- rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
- &c->src2.val, c->src2.bytes);
+ rc = segmented_read(ctxt, c->src2.addr.mem,
+ &c->src2.val, c->src2.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
}
@@ -3014,7 +3833,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
/* optimisation - avoid slow emulated read if Mov */
- rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
+ rc = segmented_read(ctxt, c->dst.addr.mem,
&c->dst.val, c->dst.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -3023,6 +3842,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
special_insn:
+ if (unlikely(ctxt->guest_mode) && c->intercept) {
+ rc = emulator_check_intercept(ctxt, c->intercept,
+ X86_ICPT_POST_MEMACCESS);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
if (c->execute) {
rc = c->execute(ctxt);
if (rc != X86EMUL_CONTINUE)
@@ -3034,75 +3860,33 @@ special_insn:
goto twobyte_insn;
switch (c->b) {
- case 0x00 ... 0x05:
- add: /* add */
- emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
- break;
case 0x06: /* push es */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
+ rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
break;
case 0x07: /* pop es */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
break;
- case 0x08 ... 0x0d:
- or: /* or */
- emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
- break;
case 0x0e: /* push cs */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
- break;
- case 0x10 ... 0x15:
- adc: /* adc */
- emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
+ rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
break;
case 0x16: /* push ss */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
+ rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
break;
case 0x17: /* pop ss */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
break;
- case 0x18 ... 0x1d:
- sbb: /* sbb */
- emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
- break;
case 0x1e: /* push ds */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
+ rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
break;
case 0x1f: /* pop ds */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
break;
- case 0x20 ... 0x25:
- and: /* and */
- emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
- break;
- case 0x28 ... 0x2d:
- sub: /* sub */
- emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
- break;
- case 0x30 ... 0x35:
- xor: /* xor */
- emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
- break;
- case 0x38 ... 0x3d:
- cmp: /* cmp */
- emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
- break;
case 0x40 ... 0x47: /* inc r16/r32 */
emulate_1op("inc", c->dst, ctxt->eflags);
break;
case 0x48 ... 0x4f: /* dec r16/r32 */
emulate_1op("dec", c->dst, ctxt->eflags);
break;
- case 0x58 ... 0x5f: /* pop reg */
- pop_instruction:
- rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
- break;
- case 0x60: /* pusha */
- rc = emulate_pusha(ctxt, ops);
- break;
- case 0x61: /* popa */
- rc = emulate_popa(ctxt, ops);
- break;
case 0x63: /* movsxd */
if (ctxt->mode != X86EMUL_MODE_PROT64)
goto cannot_emulate;
@@ -3121,26 +3905,6 @@ special_insn:
if (test_cc(c->b, ctxt->eflags))
jmp_rel(c, c->src.val);
break;
- case 0x80 ... 0x83: /* Grp1 */
- switch (c->modrm_reg) {
- case 0:
- goto add;
- case 1:
- goto or;
- case 2:
- goto adc;
- case 3:
- goto sbb;
- case 4:
- goto and;
- case 5:
- goto sub;
- case 6:
- goto xor;
- case 7:
- goto cmp;
- }
- break;
case 0x84 ... 0x85:
test:
emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
@@ -3162,7 +3926,7 @@ special_insn:
rc = emulate_ud(ctxt);
goto done;
}
- c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
+ c->dst.val = get_segment_selector(ctxt, c->modrm_reg);
break;
case 0x8d: /* lea r16/r32, m */
c->dst.val = c->src.addr.mem.ea;
@@ -3187,7 +3951,7 @@ special_insn:
break;
}
case 0x8f: /* pop (sole member of Grp1a) */
- rc = emulate_grp1a(ctxt, ops);
+ rc = em_grp1a(ctxt);
break;
case 0x90 ... 0x97: /* nop / xchg reg, rax */
if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
@@ -3200,31 +3964,17 @@ special_insn:
case 8: c->dst.val = (s32)c->dst.val; break;
}
break;
- case 0x9c: /* pushf */
- c->src.val = (unsigned long) ctxt->eflags;
- emulate_push(ctxt, ops);
- break;
- case 0x9d: /* popf */
- c->dst.type = OP_REG;
- c->dst.addr.reg = &ctxt->eflags;
- c->dst.bytes = c->op_bytes;
- rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
- break;
- case 0xa6 ... 0xa7: /* cmps */
- c->dst.type = OP_NONE; /* Disable writeback. */
- goto cmp;
case 0xa8 ... 0xa9: /* test ax, imm */
goto test;
- case 0xae ... 0xaf: /* scas */
- goto cmp;
case 0xc0 ... 0xc1:
- emulate_grp2(ctxt);
+ rc = em_grp2(ctxt);
break;
case 0xc3: /* ret */
c->dst.type = OP_REG;
c->dst.addr.reg = &c->eip;
c->dst.bytes = c->op_bytes;
- goto pop_instruction;
+ rc = em_pop(ctxt);
+ break;
case 0xc4: /* les */
rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
break;
@@ -3252,11 +4002,11 @@ special_insn:
rc = emulate_iret(ctxt, ops);
break;
case 0xd0 ... 0xd1: /* Grp2 */
- emulate_grp2(ctxt);
+ rc = em_grp2(ctxt);
break;
case 0xd2 ... 0xd3: /* Grp2 */
c->src.val = c->regs[VCPU_REGS_RCX];
- emulate_grp2(ctxt);
+ rc = em_grp2(ctxt);
break;
case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
@@ -3278,23 +4028,14 @@ special_insn:
long int rel = c->src.val;
c->src.val = (unsigned long) c->eip;
jmp_rel(c, rel);
- emulate_push(ctxt, ops);
+ rc = em_push(ctxt);
break;
}
case 0xe9: /* jmp rel */
goto jmp;
- case 0xea: { /* jmp far */
- unsigned short sel;
- jump_far:
- memcpy(&sel, c->src.valptr + c->op_bytes, 2);
-
- if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
- goto done;
-
- c->eip = 0;
- memcpy(&c->eip, c->src.valptr, c->op_bytes);
+ case 0xea: /* jmp far */
+ rc = em_jmp_far(ctxt);
break;
- }
case 0xeb:
jmp: /* jmp rel short */
jmp_rel(c, c->src.val);
@@ -3302,40 +4043,27 @@ special_insn:
break;
case 0xec: /* in al,dx */
case 0xed: /* in (e/r)ax,dx */
- c->src.val = c->regs[VCPU_REGS_RDX];
do_io_in:
- c->dst.bytes = min(c->dst.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
- }
if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
&c->dst.val))
goto done; /* IO is needed */
break;
case 0xee: /* out dx,al */
case 0xef: /* out dx,(e/r)ax */
- c->dst.val = c->regs[VCPU_REGS_RDX];
do_io_out:
- c->src.bytes = min(c->src.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->dst.val,
- c->src.bytes)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
- }
- ops->pio_out_emulated(c->src.bytes, c->dst.val,
- &c->src.val, 1, ctxt->vcpu);
+ ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val,
+ &c->src.val, 1);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf4: /* hlt */
- ctxt->vcpu->arch.halt_request = 1;
+ ctxt->ops->halt(ctxt);
break;
case 0xf5: /* cmc */
/* complement carry flag from eflags reg */
ctxt->eflags ^= EFLG_CF;
break;
case 0xf6 ... 0xf7: /* Grp3 */
- rc = emulate_grp3(ctxt, ops);
+ rc = em_grp3(ctxt);
break;
case 0xf8: /* clc */
ctxt->eflags &= ~EFLG_CF;
@@ -3366,13 +4094,11 @@ special_insn:
ctxt->eflags |= EFLG_DF;
break;
case 0xfe: /* Grp4 */
- grp45:
- rc = emulate_grp45(ctxt, ops);
+ rc = em_grp45(ctxt);
break;
case 0xff: /* Grp5 */
- if (c->modrm_reg == 5)
- goto jump_far;
- goto grp45;
+ rc = em_grp45(ctxt);
+ break;
default:
goto cannot_emulate;
}
@@ -3381,7 +4107,7 @@ special_insn:
goto done;
writeback:
- rc = writeback(ctxt, ops);
+ rc = writeback(ctxt);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -3392,7 +4118,7 @@ writeback:
c->dst.type = saved_dst_type;
if ((c->d & SrcMask) == SrcSI)
- string_addr_inc(ctxt, seg_override(ctxt, ops, c),
+ string_addr_inc(ctxt, seg_override(ctxt, c),
VCPU_REGS_RSI, &c->src);
if ((c->d & DstMask) == DstDI)
@@ -3427,115 +4153,34 @@ writeback:
done:
if (rc == X86EMUL_PROPAGATE_FAULT)
ctxt->have_exception = true;
+ if (rc == X86EMUL_INTERCEPTED)
+ return EMULATION_INTERCEPTED;
+
return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
twobyte_insn:
switch (c->b) {
- case 0x01: /* lgdt, lidt, lmsw */
- switch (c->modrm_reg) {
- u16 size;
- unsigned long address;
-
- case 0: /* vmcall */
- if (c->modrm_mod != 3 || c->modrm_rm != 1)
- goto cannot_emulate;
-
- rc = kvm_fix_hypercall(ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
- /* Let the processor re-execute the fixed hypercall */
- c->eip = ctxt->eip;
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- case 2: /* lgdt */
- rc = read_descriptor(ctxt, ops, c->src.addr.mem,
- &size, &address, c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- realmode_lgdt(ctxt->vcpu, size, address);
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- case 3: /* lidt/vmmcall */
- if (c->modrm_mod == 3) {
- switch (c->modrm_rm) {
- case 1:
- rc = kvm_fix_hypercall(ctxt->vcpu);
- break;
- default:
- goto cannot_emulate;
- }
- } else {
- rc = read_descriptor(ctxt, ops, c->src.addr.mem,
- &size, &address,
- c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- realmode_lidt(ctxt->vcpu, size, address);
- }
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- case 4: /* smsw */
- c->dst.bytes = 2;
- c->dst.val = ops->get_cr(0, ctxt->vcpu);
- break;
- case 6: /* lmsw */
- ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) |
- (c->src.val & 0x0f), ctxt->vcpu);
- c->dst.type = OP_NONE;
- break;
- case 5: /* not defined */
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- case 7: /* invlpg*/
- emulate_invlpg(ctxt->vcpu,
- linear(ctxt, c->src.addr.mem));
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- default:
- goto cannot_emulate;
- }
- break;
case 0x05: /* syscall */
rc = emulate_syscall(ctxt, ops);
break;
case 0x06:
- emulate_clts(ctxt->vcpu);
+ rc = em_clts(ctxt);
break;
case 0x09: /* wbinvd */
- kvm_emulate_wbinvd(ctxt->vcpu);
+ (ctxt->ops->wbinvd)(ctxt);
break;
case 0x08: /* invd */
case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */
break;
case 0x20: /* mov cr, reg */
- switch (c->modrm_reg) {
- case 1:
- case 5 ... 7:
- case 9 ... 15:
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
- c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
+ c->dst.val = ops->get_cr(ctxt, c->modrm_reg);
break;
case 0x21: /* mov from dr to reg */
- if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
- (c->modrm_reg == 4 || c->modrm_reg == 5)) {
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
- ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
+ ops->get_dr(ctxt, c->modrm_reg, &c->dst.val);
break;
case 0x22: /* mov reg, cr */
- if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
+ if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) {
emulate_gp(ctxt, 0);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
@@ -3543,16 +4188,9 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 0x23: /* mov from reg to dr */
- if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
- (c->modrm_reg == 4 || c->modrm_reg == 5)) {
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
-
- if (ops->set_dr(c->modrm_reg, c->src.val &
+ if (ops->set_dr(ctxt, c->modrm_reg, c->src.val &
((ctxt->mode == X86EMUL_MODE_PROT64) ?
- ~0ULL : ~0U), ctxt->vcpu) < 0) {
+ ~0ULL : ~0U)) < 0) {
/* #UD condition is already handled by the code above */
emulate_gp(ctxt, 0);
rc = X86EMUL_PROPAGATE_FAULT;
@@ -3565,7 +4203,7 @@ twobyte_insn:
/* wrmsr */
msr_data = (u32)c->regs[VCPU_REGS_RAX]
| ((u64)c->regs[VCPU_REGS_RDX] << 32);
- if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
+ if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) {
emulate_gp(ctxt, 0);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
@@ -3574,7 +4212,7 @@ twobyte_insn:
break;
case 0x32:
/* rdmsr */
- if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
+ if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) {
emulate_gp(ctxt, 0);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
@@ -3603,7 +4241,7 @@ twobyte_insn:
c->dst.val = test_cc(c->b, ctxt->eflags);
break;
case 0xa0: /* push fs */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
+ rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
break;
case 0xa1: /* pop fs */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
@@ -3620,7 +4258,7 @@ twobyte_insn:
emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
break;
case 0xa8: /* push gs */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
+ rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
break;
case 0xa9: /* pop gs */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
@@ -3727,7 +4365,7 @@ twobyte_insn:
(u64) c->src.val;
break;
case 0xc7: /* Grp9 (cmpxchg8b) */
- rc = emulate_grp9(ctxt, ops);
+ rc = em_grp9(ctxt);
break;
default:
goto cannot_emulate;
@@ -3739,5 +4377,5 @@ twobyte_insn:
goto writeback;
cannot_emulate:
- return -1;
+ return EMULATION_FAILED;
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 46d08ca0b48..51a97426e79 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -33,7 +33,6 @@ struct kvm_kpit_state {
};
struct kvm_pit {
- unsigned long base_addresss;
struct kvm_io_device dev;
struct kvm_io_device speaker_dev;
struct kvm *kvm;
@@ -51,7 +50,6 @@ struct kvm_pit {
#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
#define KVM_PIT_CHANNEL_MASK 0x3
-void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index ba910d14941..53e2d084bff 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -75,7 +75,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm);
void kvm_destroy_pic(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
-void kvm_pic_clear_isr_ack(struct kvm *kvm);
static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
{
@@ -100,7 +99,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
-int pit_has_pending_timer(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 22fae7593ee..aee38623b76 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -565,7 +565,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
- return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
+ return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
}
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -1206,7 +1206,7 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
- const void *pte, unsigned long mmu_seq)
+ const void *pte)
{
WARN_ON(1);
}
@@ -3163,9 +3163,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
}
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp,
- u64 *spte,
- const void *new, unsigned long mmu_seq)
+ struct kvm_mmu_page *sp, u64 *spte,
+ const void *new)
{
if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
++vcpu->kvm->stat.mmu_pde_zapped;
@@ -3173,7 +3172,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
}
++vcpu->kvm->stat.mmu_pte_updated;
- vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq);
+ vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
}
static bool need_remote_flush(u64 old, u64 new)
@@ -3229,7 +3228,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
struct kvm_mmu_page *sp;
struct hlist_node *node;
LIST_HEAD(invalid_list);
- unsigned long mmu_seq;
u64 entry, gentry, *spte;
unsigned pte_size, page_offset, misaligned, quadrant, offset;
int level, npte, invlpg_counter, r, flooded = 0;
@@ -3271,9 +3269,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
break;
}
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
-
spin_lock(&vcpu->kvm->mmu_lock);
if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
gentry = 0;
@@ -3345,8 +3340,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (gentry &&
!((sp->role.word ^ vcpu->arch.mmu.base_role.word)
& mask.word))
- mmu_pte_write_new_pte(vcpu, sp, spte, &gentry,
- mmu_seq);
+ mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
if (!remote_flush && need_remote_flush(entry, *spte))
remote_flush = true;
++spte;
@@ -3551,10 +3545,11 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
}
-static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
struct kvm *kvm;
struct kvm *kvm_freed = NULL;
+ int nr_to_scan = sc->nr_to_scan;
if (nr_to_scan == 0)
goto out;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index c6397795d86..9d03ad4dd5e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -78,15 +78,19 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}
-static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
- gfn_t table_gfn, unsigned index,
- pt_element_t orig_pte, pt_element_t new_pte)
+static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ pt_element_t __user *ptep_user, unsigned index,
+ pt_element_t orig_pte, pt_element_t new_pte)
{
+ int npages;
pt_element_t ret;
pt_element_t *table;
struct page *page;
- page = gfn_to_page(kvm, table_gfn);
+ npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
+ /* Check if the user is doing something meaningless. */
+ if (unlikely(npages != 1))
+ return -EFAULT;
table = kmap_atomic(page, KM_USER0);
ret = CMPXCHG(&table[index], orig_pte, new_pte);
@@ -117,6 +121,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
gva_t addr, u32 access)
{
pt_element_t pte;
+ pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
unsigned index, pt_access, uninitialized_var(pte_access);
gpa_t pte_gpa;
@@ -152,6 +157,9 @@ walk:
pt_access = ACC_ALL;
for (;;) {
+ gfn_t real_gfn;
+ unsigned long host_addr;
+
index = PT_INDEX(addr, walker->level);
table_gfn = gpte_to_gfn(pte);
@@ -160,43 +168,64 @@ walk:
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte,
- offset, sizeof(pte),
- PFERR_USER_MASK|PFERR_WRITE_MASK)) {
+ real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
+ PFERR_USER_MASK|PFERR_WRITE_MASK);
+ if (unlikely(real_gfn == UNMAPPED_GVA)) {
+ present = false;
+ break;
+ }
+ real_gfn = gpa_to_gfn(real_gfn);
+
+ host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
+ if (unlikely(kvm_is_error_hva(host_addr))) {
+ present = false;
+ break;
+ }
+
+ ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
+ if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) {
present = false;
break;
}
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte)) {
+ if (unlikely(!is_present_gpte(pte))) {
present = false;
break;
}
- if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) {
+ if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
+ walker->level))) {
rsvd_fault = true;
break;
}
- if (write_fault && !is_writable_pte(pte))
- if (user_fault || is_write_protection(vcpu))
- eperm = true;
+ if (unlikely(write_fault && !is_writable_pte(pte)
+ && (user_fault || is_write_protection(vcpu))))
+ eperm = true;
- if (user_fault && !(pte & PT_USER_MASK))
+ if (unlikely(user_fault && !(pte & PT_USER_MASK)))
eperm = true;
#if PTTYPE == 64
- if (fetch_fault && (pte & PT64_NX_MASK))
+ if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
eperm = true;
#endif
- if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
+ if (!eperm && !rsvd_fault
+ && unlikely(!(pte & PT_ACCESSED_MASK))) {
+ int ret;
trace_kvm_mmu_set_accessed_bit(table_gfn, index,
sizeof(pte));
- if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
- index, pte, pte|PT_ACCESSED_MASK))
+ ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
+ pte, pte|PT_ACCESSED_MASK);
+ if (unlikely(ret < 0)) {
+ present = false;
+ break;
+ } else if (ret)
goto walk;
+
mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_ACCESSED_MASK;
}
@@ -241,17 +270,21 @@ walk:
--walker->level;
}
- if (!present || eperm || rsvd_fault)
+ if (unlikely(!present || eperm || rsvd_fault))
goto error;
- if (write_fault && !is_dirty_gpte(pte)) {
- bool ret;
+ if (write_fault && unlikely(!is_dirty_gpte(pte))) {
+ int ret;
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
- ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
- pte|PT_DIRTY_MASK);
- if (ret)
+ ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
+ pte, pte|PT_DIRTY_MASK);
+ if (unlikely(ret < 0)) {
+ present = false;
+ goto error;
+ } else if (ret)
goto walk;
+
mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_DIRTY_MASK;
walker->ptes[walker->level - 1] = pte;
@@ -325,7 +358,7 @@ no_present:
}
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte, unsigned long mmu_seq)
+ u64 *spte, const void *pte)
{
pt_element_t gpte;
unsigned pte_access;
@@ -342,8 +375,6 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
kvm_release_pfn_clean(pfn);
return;
}
- if (mmu_notifier_retry(vcpu, mmu_seq))
- return;
/*
* we call mmu_set_spte() with host_writable = true because that
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6bb15d583e4..506e4fe23ad 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -63,6 +63,10 @@ MODULE_LICENSE("GPL");
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+#define TSC_RATIO_RSVD 0xffffff0000000000ULL
+#define TSC_RATIO_MIN 0x0000000000000001ULL
+#define TSC_RATIO_MAX 0x000000ffffffffffULL
+
static bool erratum_383_found __read_mostly;
static const u32 host_save_user_msrs[] = {
@@ -93,14 +97,6 @@ struct nested_state {
/* A VMEXIT is required but not yet emulated */
bool exit_required;
- /*
- * If we vmexit during an instruction emulation we need this to restore
- * the l1 guest rip after the emulation
- */
- unsigned long vmexit_rip;
- unsigned long vmexit_rsp;
- unsigned long vmexit_rax;
-
/* cache for intercepts of the guest */
u32 intercept_cr;
u32 intercept_dr;
@@ -144,8 +140,13 @@ struct vcpu_svm {
unsigned int3_injected;
unsigned long int3_rip;
u32 apf_reason;
+
+ u64 tsc_ratio;
};
+static DEFINE_PER_CPU(u64, current_tsc_ratio);
+#define TSC_RATIO_DEFAULT 0x0100000000ULL
+
#define MSR_INVALID 0xffffffffU
static struct svm_direct_access_msrs {
@@ -190,6 +191,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
+static u64 __scale_tsc(u64 ratio, u64 tsc);
enum {
VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
@@ -376,7 +378,6 @@ struct svm_cpu_data {
};
static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
-static uint32_t svm_features;
struct svm_init_data {
int cpu;
@@ -569,6 +570,10 @@ static int has_svm(void)
static void svm_hardware_disable(void *garbage)
{
+ /* Make sure we clean up behind us */
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
+ wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
+
cpu_svm_disable();
}
@@ -610,6 +615,11 @@ static int svm_hardware_enable(void *garbage)
wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
+ __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
+ }
+
svm_init_erratum_383();
return 0;
@@ -791,6 +801,23 @@ static __init int svm_hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
kvm_enable_efer_bits(EFER_FFXSR);
+ if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ u64 max;
+
+ kvm_has_tsc_control = true;
+
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated needed because it will always
+ * be 1 on all machines and a value of 0 is used to disable
+ * tsc-scaling for the vcpu.
+ */
+ max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
+
+ kvm_max_guest_tsc_khz = max;
+ }
+
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
@@ -802,8 +829,6 @@ static __init int svm_hardware_setup(void)
goto err;
}
- svm_features = cpuid_edx(SVM_CPUID_FUNC);
-
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
@@ -854,6 +879,64 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
+static u64 __scale_tsc(u64 ratio, u64 tsc)
+{
+ u64 mult, frac, _tsc;
+
+ mult = ratio >> 32;
+ frac = ratio & ((1ULL << 32) - 1);
+
+ _tsc = tsc;
+ _tsc *= mult;
+ _tsc += (tsc >> 32) * frac;
+ _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
+
+ return _tsc;
+}
+
+static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 _tsc = tsc;
+
+ if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
+ _tsc = __scale_tsc(svm->tsc_ratio, tsc);
+
+ return _tsc;
+}
+
+static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 ratio;
+ u64 khz;
+
+ /* TSC scaling supported? */
+ if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
+ return;
+
+ /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
+ if (user_tsc_khz == 0) {
+ vcpu->arch.virtual_tsc_khz = 0;
+ svm->tsc_ratio = TSC_RATIO_DEFAULT;
+ return;
+ }
+
+ khz = user_tsc_khz;
+
+ /* TSC scaling required - calculate ratio */
+ ratio = khz << 32;
+ do_div(ratio, tsc_khz);
+
+ if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
+ WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
+ return;
+ }
+ vcpu->arch.virtual_tsc_khz = user_tsc_khz;
+ svm->tsc_ratio = ratio;
+}
+
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -880,6 +963,15 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
+static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+{
+ u64 tsc;
+
+ tsc = svm_scale_tsc(vcpu, native_read_tsc());
+
+ return target_tsc - tsc;
+}
+
static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
@@ -975,7 +1067,7 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
save->dr7 = 0x400;
- save->rflags = 2;
+ kvm_set_rflags(&svm->vcpu, 2);
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
@@ -1048,6 +1140,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
goto out;
}
+ svm->tsc_ratio = TSC_RATIO_DEFAULT;
+
err = kvm_vcpu_init(&svm->vcpu, kvm, id);
if (err)
goto free_svm;
@@ -1141,6 +1235,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
+ svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) {
+ __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio;
+ wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
+ }
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1365,31 +1465,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_guest_mode(vcpu)) {
- /*
- * We are here because we run in nested mode, the host kvm
- * intercepts cr0 writes but the l1 hypervisor does not.
- * But the L1 hypervisor may intercept selective cr0 writes.
- * This needs to be checked here.
- */
- unsigned long old, new;
-
- /* Remove bits that would trigger a real cr0 write intercept */
- old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
- new = cr0 & SVM_CR0_SELECTIVE_MASK;
-
- if (old == new) {
- /* cr0 write with ts and mp unchanged */
- svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
- if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) {
- svm->nested.vmexit_rip = kvm_rip_read(vcpu);
- svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- return;
- }
- }
- }
-
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
@@ -2127,7 +2202,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
nested_vmcb->save.cr2 = vmcb->save.cr2;
nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
- nested_vmcb->save.rflags = vmcb->save.rflags;
+ nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
nested_vmcb->save.rip = vmcb->save.rip;
nested_vmcb->save.rsp = vmcb->save.rsp;
nested_vmcb->save.rax = vmcb->save.rax;
@@ -2184,7 +2259,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.ds = hsave->save.ds;
svm->vmcb->save.gdtr = hsave->save.gdtr;
svm->vmcb->save.idtr = hsave->save.idtr;
- svm->vmcb->save.rflags = hsave->save.rflags;
+ kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
svm_set_efer(&svm->vcpu, hsave->save.efer);
svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
svm_set_cr4(&svm->vcpu, hsave->save.cr4);
@@ -2312,7 +2387,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
hsave->save.efer = svm->vcpu.arch.efer;
hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = vmcb->save.rflags;
+ hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
hsave->save.rip = kvm_rip_read(&svm->vcpu);
hsave->save.rsp = vmcb->save.rsp;
hsave->save.rax = vmcb->save.rax;
@@ -2323,7 +2398,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
copy_vmcb_control_area(hsave, vmcb);
- if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
+ if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
else
svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
@@ -2341,7 +2416,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->vmcb->save.ds = nested_vmcb->save.ds;
svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
svm->vmcb->save.idtr = nested_vmcb->save.idtr;
- svm->vmcb->save.rflags = nested_vmcb->save.rflags;
+ kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
@@ -2443,13 +2518,13 @@ static int vmload_interception(struct vcpu_svm *svm)
if (nested_svm_check_permissions(svm))
return 1;
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
if (!nested_vmcb)
return 1;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+
nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
nested_svm_unmap(page);
@@ -2464,13 +2539,13 @@ static int vmsave_interception(struct vcpu_svm *svm)
if (nested_svm_check_permissions(svm))
return 1;
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
if (!nested_vmcb)
return 1;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+
nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
nested_svm_unmap(page);
@@ -2676,6 +2751,29 @@ static int emulate_on_interception(struct vcpu_svm *svm)
return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
}
+bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
+{
+ unsigned long cr0 = svm->vcpu.arch.cr0;
+ bool ret = false;
+ u64 intercept;
+
+ intercept = svm->nested.intercept;
+
+ if (!is_guest_mode(&svm->vcpu) ||
+ (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
+ return false;
+
+ cr0 &= ~SVM_CR0_SELECTIVE_MASK;
+ val &= ~SVM_CR0_SELECTIVE_MASK;
+
+ if (cr0 ^ val) {
+ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
+ }
+
+ return ret;
+}
+
#define CR_VALID (1ULL << 63)
static int cr_interception(struct vcpu_svm *svm)
@@ -2699,7 +2797,11 @@ static int cr_interception(struct vcpu_svm *svm)
val = kvm_register_read(&svm->vcpu, reg);
switch (cr) {
case 0:
- err = kvm_set_cr0(&svm->vcpu, val);
+ if (!check_selective_cr0_intercepted(svm, val))
+ err = kvm_set_cr0(&svm->vcpu, val);
+ else
+ return 1;
+
break;
case 3:
err = kvm_set_cr3(&svm->vcpu, val);
@@ -2744,23 +2846,6 @@ static int cr_interception(struct vcpu_svm *svm)
return 1;
}
-static int cr0_write_interception(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- int r;
-
- r = cr_interception(svm);
-
- if (svm->nested.vmexit_rip) {
- kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
- kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp);
- kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax);
- svm->nested.vmexit_rip = 0;
- }
-
- return r;
-}
-
static int dr_interception(struct vcpu_svm *svm)
{
int reg, dr;
@@ -2813,7 +2898,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
case MSR_IA32_TSC: {
struct vmcb *vmcb = get_host_vmcb(svm);
- *data = vmcb->control.tsc_offset + native_read_tsc();
+ *data = vmcb->control.tsc_offset +
+ svm_scale_tsc(vcpu, native_read_tsc());
+
break;
}
case MSR_STAR:
@@ -3048,7 +3135,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR4] = cr_interception,
[SVM_EXIT_READ_CR8] = cr_interception,
[SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR0] = cr0_write_interception,
+ [SVM_EXIT_WRITE_CR0] = cr_interception,
[SVM_EXIT_WRITE_CR3] = cr_interception,
[SVM_EXIT_WRITE_CR4] = cr_interception,
[SVM_EXIT_WRITE_CR8] = cr8_write_interception,
@@ -3104,97 +3191,109 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_NPF] = pf_interception,
};
-void dump_vmcb(struct kvm_vcpu *vcpu)
+static void dump_vmcb(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
pr_err("VMCB Control Area:\n");
- pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff);
- pr_err("cr_write: %04x\n", control->intercept_cr >> 16);
- pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff);
- pr_err("dr_write: %04x\n", control->intercept_dr >> 16);
- pr_err("exceptions: %08x\n", control->intercept_exceptions);
- pr_err("intercepts: %016llx\n", control->intercept);
- pr_err("pause filter count: %d\n", control->pause_filter_count);
- pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa);
- pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa);
- pr_err("tsc_offset: %016llx\n", control->tsc_offset);
- pr_err("asid: %d\n", control->asid);
- pr_err("tlb_ctl: %d\n", control->tlb_ctl);
- pr_err("int_ctl: %08x\n", control->int_ctl);
- pr_err("int_vector: %08x\n", control->int_vector);
- pr_err("int_state: %08x\n", control->int_state);
- pr_err("exit_code: %08x\n", control->exit_code);
- pr_err("exit_info1: %016llx\n", control->exit_info_1);
- pr_err("exit_info2: %016llx\n", control->exit_info_2);
- pr_err("exit_int_info: %08x\n", control->exit_int_info);
- pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err);
- pr_err("nested_ctl: %lld\n", control->nested_ctl);
- pr_err("nested_cr3: %016llx\n", control->nested_cr3);
- pr_err("event_inj: %08x\n", control->event_inj);
- pr_err("event_inj_err: %08x\n", control->event_inj_err);
- pr_err("lbr_ctl: %lld\n", control->lbr_ctl);
- pr_err("next_rip: %016llx\n", control->next_rip);
+ pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
+ pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
+ pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
+ pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
+ pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
+ pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
+ pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
+ pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
+ pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
+ pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
+ pr_err("%-20s%d\n", "asid:", control->asid);
+ pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
+ pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
+ pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
+ pr_err("%-20s%08x\n", "int_state:", control->int_state);
+ pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
+ pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
+ pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
+ pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
+ pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
+ pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
+ pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+ pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
+ pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
+ pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
+ pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
pr_err("VMCB State Save Area:\n");
- pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n",
- save->es.selector, save->es.attrib,
- save->es.limit, save->es.base);
- pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n",
- save->cs.selector, save->cs.attrib,
- save->cs.limit, save->cs.base);
- pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n",
- save->ss.selector, save->ss.attrib,
- save->ss.limit, save->ss.base);
- pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n",
- save->ds.selector, save->ds.attrib,
- save->ds.limit, save->ds.base);
- pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n",
- save->fs.selector, save->fs.attrib,
- save->fs.limit, save->fs.base);
- pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n",
- save->gs.selector, save->gs.attrib,
- save->gs.limit, save->gs.base);
- pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
- save->gdtr.selector, save->gdtr.attrib,
- save->gdtr.limit, save->gdtr.base);
- pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
- save->ldtr.selector, save->ldtr.attrib,
- save->ldtr.limit, save->ldtr.base);
- pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
- save->idtr.selector, save->idtr.attrib,
- save->idtr.limit, save->idtr.base);
- pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n",
- save->tr.selector, save->tr.attrib,
- save->tr.limit, save->tr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "es:",
+ save->es.selector, save->es.attrib,
+ save->es.limit, save->es.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "cs:",
+ save->cs.selector, save->cs.attrib,
+ save->cs.limit, save->cs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ss:",
+ save->ss.selector, save->ss.attrib,
+ save->ss.limit, save->ss.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ds:",
+ save->ds.selector, save->ds.attrib,
+ save->ds.limit, save->ds.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "fs:",
+ save->fs.selector, save->fs.attrib,
+ save->fs.limit, save->fs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "gs:",
+ save->gs.selector, save->gs.attrib,
+ save->gs.limit, save->gs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "gdtr:",
+ save->gdtr.selector, save->gdtr.attrib,
+ save->gdtr.limit, save->gdtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ldtr:",
+ save->ldtr.selector, save->ldtr.attrib,
+ save->ldtr.limit, save->ldtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "idtr:",
+ save->idtr.selector, save->idtr.attrib,
+ save->idtr.limit, save->idtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "tr:",
+ save->tr.selector, save->tr.attrib,
+ save->tr.limit, save->tr.base);
pr_err("cpl: %d efer: %016llx\n",
save->cpl, save->efer);
- pr_err("cr0: %016llx cr2: %016llx\n",
- save->cr0, save->cr2);
- pr_err("cr3: %016llx cr4: %016llx\n",
- save->cr3, save->cr4);
- pr_err("dr6: %016llx dr7: %016llx\n",
- save->dr6, save->dr7);
- pr_err("rip: %016llx rflags: %016llx\n",
- save->rip, save->rflags);
- pr_err("rsp: %016llx rax: %016llx\n",
- save->rsp, save->rax);
- pr_err("star: %016llx lstar: %016llx\n",
- save->star, save->lstar);
- pr_err("cstar: %016llx sfmask: %016llx\n",
- save->cstar, save->sfmask);
- pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n",
- save->kernel_gs_base, save->sysenter_cs);
- pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n",
- save->sysenter_esp, save->sysenter_eip);
- pr_err("gpat: %016llx dbgctl: %016llx\n",
- save->g_pat, save->dbgctl);
- pr_err("br_from: %016llx br_to: %016llx\n",
- save->br_from, save->br_to);
- pr_err("excp_from: %016llx excp_to: %016llx\n",
- save->last_excp_from, save->last_excp_to);
-
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cr0:", save->cr0, "cr2:", save->cr2);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cr3:", save->cr3, "cr4:", save->cr4);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "dr6:", save->dr6, "dr7:", save->dr7);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rip:", save->rip, "rflags:", save->rflags);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rsp:", save->rsp, "rax:", save->rax);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "star:", save->star, "lstar:", save->lstar);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cstar:", save->cstar, "sfmask:", save->sfmask);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "kernel_gs_base:", save->kernel_gs_base,
+ "sysenter_cs:", save->sysenter_cs);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "sysenter_esp:", save->sysenter_esp,
+ "sysenter_eip:", save->sysenter_eip);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "br_from:", save->br_from, "br_to:", save->br_to);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "excp_from:", save->last_excp_from,
+ "excp_to:", save->last_excp_to);
}
static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
@@ -3384,7 +3483,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
return 0;
- ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
+ ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
if (is_guest_mode(vcpu))
return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
@@ -3871,6 +3970,186 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
update_cr0_intercept(svm);
}
+#define PRE_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_PRE_EXCEPT, }
+#define POST_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_EXCEPT, }
+#define POST_MEM(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_MEMACCESS, }
+
+static struct __x86_intercept {
+ u32 exit_code;
+ enum x86_intercept_stage stage;
+} x86_intercept_map[] = {
+ [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
+ [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
+ [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
+ [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
+ [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
+ [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
+ [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
+ [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
+ [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
+ [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
+ [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
+ [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
+ [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
+ [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
+ [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
+ [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
+ [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
+ [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
+ [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
+ [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
+ [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
+ [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
+ [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
+ [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
+ [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
+ [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
+ [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
+ [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
+ [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
+ [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
+ [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
+ [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
+ [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
+ [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
+ [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
+ [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
+};
+
+#undef PRE_EX
+#undef POST_EX
+#undef POST_MEM
+
+static int svm_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int vmexit, ret = X86EMUL_CONTINUE;
+ struct __x86_intercept icpt_info;
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
+ goto out;
+
+ icpt_info = x86_intercept_map[info->intercept];
+
+ if (stage != icpt_info.stage)
+ goto out;
+
+ switch (icpt_info.exit_code) {
+ case SVM_EXIT_READ_CR0:
+ if (info->intercept == x86_intercept_cr_read)
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_WRITE_CR0: {
+ unsigned long cr0, val;
+ u64 intercept;
+
+ if (info->intercept == x86_intercept_cr_write)
+ icpt_info.exit_code += info->modrm_reg;
+
+ if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0)
+ break;
+
+ intercept = svm->nested.intercept;
+
+ if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
+ break;
+
+ cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
+ val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
+
+ if (info->intercept == x86_intercept_lmsw) {
+ cr0 &= 0xfUL;
+ val &= 0xfUL;
+ /* lmsw can't clear PE - catch this here */
+ if (cr0 & X86_CR0_PE)
+ val |= X86_CR0_PE;
+ }
+
+ if (cr0 ^ val)
+ icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+
+ break;
+ }
+ case SVM_EXIT_READ_DR0:
+ case SVM_EXIT_WRITE_DR0:
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_MSR:
+ if (info->intercept == x86_intercept_wrmsr)
+ vmcb->control.exit_info_1 = 1;
+ else
+ vmcb->control.exit_info_1 = 0;
+ break;
+ case SVM_EXIT_PAUSE:
+ /*
+ * We get this for NOP only, but pause
+ * is rep not, check this here
+ */
+ if (info->rep_prefix != REPE_PREFIX)
+ goto out;
+ case SVM_EXIT_IOIO: {
+ u64 exit_info;
+ u32 bytes;
+
+ exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16;
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ exit_info |= SVM_IOIO_TYPE_MASK;
+ bytes = info->src_bytes;
+ } else {
+ bytes = info->dst_bytes;
+ }
+
+ if (info->intercept == x86_intercept_outs ||
+ info->intercept == x86_intercept_ins)
+ exit_info |= SVM_IOIO_STR_MASK;
+
+ if (info->rep_prefix)
+ exit_info |= SVM_IOIO_REP_MASK;
+
+ bytes = min(bytes, 4u);
+
+ exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
+
+ exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
+
+ vmcb->control.exit_info_1 = exit_info;
+ vmcb->control.exit_info_2 = info->next_rip;
+
+ break;
+ }
+ default:
+ break;
+ }
+
+ vmcb->control.next_rip = info->next_rip;
+ vmcb->control.exit_code = icpt_info.exit_code;
+ vmexit = nested_svm_exit_handled(svm);
+
+ ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
+ : X86EMUL_CONTINUE;
+
+out:
+ return ret;
+}
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -3952,10 +4231,14 @@ static struct kvm_x86_ops svm_x86_ops = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
+ .set_tsc_khz = svm_set_tsc_khz,
.write_tsc_offset = svm_write_tsc_offset,
.adjust_tsc_offset = svm_adjust_tsc_offset,
+ .compute_tsc_offset = svm_compute_tsc_offset,
.set_tdp_cr3 = set_tdp_cr3,
+
+ .check_intercept = svm_check_intercept,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5b4cdcbd154..d48ec60ea42 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -128,8 +128,11 @@ struct vcpu_vmx {
unsigned long host_rsp;
int launched;
u8 fail;
+ u8 cpl;
+ bool nmi_known_unmasked;
u32 exit_intr_info;
u32 idt_vectoring_info;
+ ulong rflags;
struct shared_msr_entry *guest_msrs;
int nmsrs;
int save_nmsrs;
@@ -159,6 +162,10 @@ struct vcpu_vmx {
u32 ar;
} tr, es, ds, fs, gs;
} rmode;
+ struct {
+ u32 bitmask; /* 4 bits per segment (1 bit per field) */
+ struct kvm_save_segment seg[8];
+ } segment_cache;
int vpid;
bool emulation_required;
@@ -171,6 +178,15 @@ struct vcpu_vmx {
bool rdtscp_enabled;
};
+enum segment_cache_field {
+ SEG_FIELD_SEL = 0,
+ SEG_FIELD_BASE = 1,
+ SEG_FIELD_LIMIT = 2,
+ SEG_FIELD_AR = 3,
+
+ SEG_FIELD_NR = 4
+};
+
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_vmx, vcpu);
@@ -643,6 +659,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask)
vmcs_writel(field, vmcs_readl(field) | mask);
}
+static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+ vmx->segment_cache.bitmask = 0;
+}
+
+static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
+ unsigned field)
+{
+ bool ret;
+ u32 mask = 1 << (seg * SEG_FIELD_NR + field);
+
+ if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
+ vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
+ vmx->segment_cache.bitmask = 0;
+ }
+ ret = vmx->segment_cache.bitmask & mask;
+ vmx->segment_cache.bitmask |= mask;
+ return ret;
+}
+
+static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u16 *p = &vmx->segment_cache.seg[seg].selector;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
+ *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
+ return *p;
+}
+
+static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
+{
+ ulong *p = &vmx->segment_cache.seg[seg].base;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
+ *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
+ return *p;
+}
+
+static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u32 *p = &vmx->segment_cache.seg[seg].limit;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
+ *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
+ return *p;
+}
+
+static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u32 *p = &vmx->segment_cache.seg[seg].ar;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
+ *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
+ return *p;
+}
+
static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
@@ -970,17 +1042,24 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags, save_rflags;
- rflags = vmcs_readl(GUEST_RFLAGS);
- if (to_vmx(vcpu)->rmode.vm86_active) {
- rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
- save_rflags = to_vmx(vcpu)->rmode.save_rflags;
- rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
+ __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
+ rflags = vmcs_readl(GUEST_RFLAGS);
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ save_rflags = to_vmx(vcpu)->rmode.save_rflags;
+ rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ }
+ to_vmx(vcpu)->rflags = rflags;
}
- return rflags;
+ return to_vmx(vcpu)->rflags;
}
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
+ __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
+ __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+ to_vmx(vcpu)->rflags = rflags;
if (to_vmx(vcpu)->rmode.vm86_active) {
to_vmx(vcpu)->rmode.save_rflags = rflags;
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
@@ -1053,7 +1132,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
}
if (vmx->rmode.vm86_active) {
- if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE)
+ int inc_eip = 0;
+ if (kvm_exception_is_soft(nr))
+ inc_eip = vcpu->arch.event_exit_inst_len;
+ if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
@@ -1151,6 +1233,16 @@ static u64 guest_read_tsc(void)
}
/*
+ * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
+ * ioctl. In this case the call-back should update internal vmx state to make
+ * the changes effective.
+ */
+static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+{
+ /* Nothing to do here */
+}
+
+/*
* writes 'offset' into guest's timestamp counter offset register
*/
static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@ -1164,6 +1256,11 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
vmcs_write64(TSC_OFFSET, offset + adjustment);
}
+static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+{
+ return target_tsc - native_read_tsc();
+}
+
/*
* Reads an msr value (of 'msr_index') into 'pdata'.
* Returns 0 on success, non-0 otherwise.
@@ -1243,9 +1340,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
break;
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
+ vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_FS_BASE, data);
break;
case MSR_GS_BASE:
+ vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_GS_BASE, data);
break;
case MSR_KERNEL_GS_BASE:
@@ -1689,6 +1788,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmx->emulation_required = 1;
vmx->rmode.vm86_active = 0;
+ vmx_segment_cache_clear(vmx);
+
vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
@@ -1712,6 +1813,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
+ vmx_segment_cache_clear(vmx);
+
vmcs_write16(GUEST_SS_SELECTOR, 0);
vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
@@ -1775,6 +1878,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
}
+ vmx_segment_cache_clear(vmx);
+
vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1851,6 +1956,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
{
u32 guest_tr_ar;
+ vmx_segment_cache_clear(to_vmx(vcpu));
+
guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
@@ -1940,7 +2047,8 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
unsigned long cr0,
struct kvm_vcpu *vcpu)
{
- vmx_decache_cr3(vcpu);
+ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ vmx_decache_cr3(vcpu);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1998,6 +2106,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, hw_cr0);
vcpu->arch.cr0 = cr0;
+ __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
}
static u64 construct_eptp(unsigned long root_hpa)
@@ -2053,7 +2162,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
struct kvm_save_segment *save;
u32 ar;
@@ -2075,13 +2183,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
var->limit = save->limit;
ar = save->ar;
if (seg == VCPU_SREG_TR
- || var->selector == vmcs_read16(sf->selector))
+ || var->selector == vmx_read_guest_seg_selector(vmx, seg))
goto use_saved_rmode_seg;
}
- var->base = vmcs_readl(sf->base);
- var->limit = vmcs_read32(sf->limit);
- var->selector = vmcs_read16(sf->selector);
- ar = vmcs_read32(sf->ar_bytes);
+ var->base = vmx_read_guest_seg_base(vmx, seg);
+ var->limit = vmx_read_guest_seg_limit(vmx, seg);
+ var->selector = vmx_read_guest_seg_selector(vmx, seg);
+ ar = vmx_read_guest_seg_ar(vmx, seg);
use_saved_rmode_seg:
if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
ar = 0;
@@ -2098,27 +2206,37 @@ use_saved_rmode_seg:
static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
struct kvm_segment s;
if (to_vmx(vcpu)->rmode.vm86_active) {
vmx_get_segment(vcpu, &s, seg);
return s.base;
}
- return vmcs_readl(sf->base);
+ return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
}
-static int vmx_get_cpl(struct kvm_vcpu *vcpu)
+static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
{
if (!is_protmode(vcpu))
return 0;
- if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
+ if (!is_long_mode(vcpu)
+ && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
return 3;
- return vmcs_read16(GUEST_CS_SELECTOR) & 3;
+ return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3;
}
+static int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+ if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
+ __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+ to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
+ }
+ return to_vmx(vcpu)->cpl;
+}
+
+
static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
@@ -2148,6 +2266,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
u32 ar;
+ vmx_segment_cache_clear(vmx);
+
if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
vmcs_write16(sf->selector, var->selector);
vmx->rmode.tr.selector = var->selector;
@@ -2184,11 +2304,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
ar |= 0x1; /* Accessed */
vmcs_write32(sf->ar_bytes, ar);
+ __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
- u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
+ u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
*db = (ar >> 14) & 1;
*l = (ar >> 13) & 1;
@@ -2775,6 +2896,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
if (ret != 0)
goto out;
+ vmx_segment_cache_clear(vmx);
+
seg_setup(VCPU_SREG_CS);
/*
* GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
@@ -2904,7 +3027,10 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
++vcpu->stat.irq_injections;
if (vmx->rmode.vm86_active) {
- if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE)
+ int inc_eip = 0;
+ if (vcpu->arch.interrupt.soft)
+ inc_eip = vcpu->arch.event_exit_inst_len;
+ if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
@@ -2937,8 +3063,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
++vcpu->stat.nmi_injections;
+ vmx->nmi_known_unmasked = false;
if (vmx->rmode.vm86_active) {
- if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE)
+ if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
@@ -2961,6 +3088,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
if (!cpu_has_virtual_nmis())
return to_vmx(vcpu)->soft_vnmi_blocked;
+ if (to_vmx(vcpu)->nmi_known_unmasked)
+ return false;
return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
}
@@ -2974,6 +3103,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
vmx->vnmi_blocked_time = 0;
}
} else {
+ vmx->nmi_known_unmasked = !masked;
if (masked)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
@@ -3091,7 +3221,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
enum emulation_result er;
vect_info = vmx->idt_vectoring_info;
- intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ intr_info = vmx->exit_intr_info;
if (is_machine_check(intr_info))
return handle_machine_check(vcpu);
@@ -3122,7 +3252,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
}
error_code = 0;
- rip = kvm_rip_read(vcpu);
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) {
@@ -3169,6 +3298,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
vmx->vcpu.arch.event_exit_inst_len =
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ rip = kvm_rip_read(vcpu);
kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
kvm_run->debug.arch.exception = ex_no;
break;
@@ -3505,9 +3635,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
switch (type) {
case INTR_TYPE_NMI_INTR:
vcpu->arch.nmi_injected = false;
- if (cpu_has_virtual_nmis())
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
+ vmx_set_nmi_mask(vcpu, true);
break;
case INTR_TYPE_EXT_INTR:
case INTR_TYPE_SOFT_INTR:
@@ -3867,12 +3995,17 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
{
- u32 exit_intr_info = vmx->exit_intr_info;
+ u32 exit_intr_info;
+
+ if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
+ || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
+ return;
+
+ vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ exit_intr_info = vmx->exit_intr_info;
/* Handle machine checks before interrupts are enabled */
- if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
- || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
- && is_machine_check(exit_intr_info)))
+ if (is_machine_check(exit_intr_info))
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
@@ -3886,7 +4019,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
- u32 exit_intr_info = vmx->exit_intr_info;
+ u32 exit_intr_info;
bool unblock_nmi;
u8 vector;
bool idtv_info_valid;
@@ -3894,6 +4027,13 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
if (cpu_has_virtual_nmis()) {
+ if (vmx->nmi_known_unmasked)
+ return;
+ /*
+ * Can't use vmx->exit_intr_info since we're not sure what
+ * the exit reason is.
+ */
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
/*
@@ -3910,6 +4050,10 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
vector != DF_VECTOR && !idtv_info_valid)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
+ else
+ vmx->nmi_known_unmasked =
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
+ & GUEST_INTR_STATE_NMI);
} else if (unlikely(vmx->soft_vnmi_blocked))
vmx->vnmi_blocked_time +=
ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
@@ -3946,8 +4090,7 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
* Clear bit "block by NMI" before VM entry if a NMI
* delivery faulted.
*/
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
+ vmx_set_nmi_mask(&vmx->vcpu, false);
break;
case INTR_TYPE_SOFT_EXCEPTION:
vmx->vcpu.arch.event_exit_inst_len =
@@ -4124,7 +4267,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
);
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+ | (1 << VCPU_EXREG_RFLAGS)
+ | (1 << VCPU_EXREG_CPL)
| (1 << VCPU_EXREG_PDPTR)
+ | (1 << VCPU_EXREG_SEGMENTS)
| (1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;
@@ -4134,7 +4280,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->launched = 1;
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
- vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
vmx_complete_atomic_exit(vmx);
vmx_recover_nmi_blocking(vmx);
@@ -4195,8 +4340,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vcpu;
vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ err = -ENOMEM;
if (!vmx->guest_msrs) {
- err = -ENOMEM;
goto uninit_vcpu;
}
@@ -4215,7 +4360,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (err)
goto free_vmcs;
if (vm_need_virtualize_apic_accesses(kvm))
- if (alloc_apic_access_page(kvm) != 0)
+ err = alloc_apic_access_page(kvm);
+ if (err)
goto free_vmcs;
if (enable_ept) {
@@ -4368,6 +4514,13 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
}
+static int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ return X86EMUL_CONTINUE;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -4449,10 +4602,14 @@ static struct kvm_x86_ops vmx_x86_ops = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+ .set_tsc_khz = vmx_set_tsc_khz,
.write_tsc_offset = vmx_write_tsc_offset,
.adjust_tsc_offset = vmx_adjust_tsc_offset,
+ .compute_tsc_offset = vmx_compute_tsc_offset,
.set_tdp_cr3 = vmx_set_cr3,
+
+ .check_intercept = vmx_check_intercept,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 934b4c6b0bf..77c9d8673dc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -60,22 +60,12 @@
#include <asm/div64.h>
#define MAX_IO_MSRS 256
-#define CR0_RESERVED_BITS \
- (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
- | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
- | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
-#define CR4_RESERVED_BITS \
- (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
- | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
- | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXSAVE \
- | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
-
-#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
-
#define KVM_MAX_MCE_BANKS 32
#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+#define emul_to_vcpu(ctxt) \
+ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
+
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
* - enable LME and LMA per default on 64 bit KVM
@@ -100,6 +90,11 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
int ignore_msrs = 0;
module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+bool kvm_has_tsc_control;
+EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
+u32 kvm_max_guest_tsc_khz;
+EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
+
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
@@ -157,6 +152,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
u64 __read_mostly host_xcr0;
+int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+
static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
int i;
@@ -361,8 +358,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
- kvm_make_request(KVM_REQ_NMI, vcpu);
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@@ -982,7 +979,15 @@ static inline int kvm_tsc_changes_freq(void)
return ret;
}
-static inline u64 nsec_to_cycles(u64 nsec)
+static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.virtual_tsc_khz)
+ return vcpu->arch.virtual_tsc_khz;
+ else
+ return __this_cpu_read(cpu_tsc_khz);
+}
+
+static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
u64 ret;
@@ -990,25 +995,24 @@ static inline u64 nsec_to_cycles(u64 nsec)
if (kvm_tsc_changes_freq())
printk_once(KERN_WARNING
"kvm: unreliable cycle conversion on adjustable rate TSC\n");
- ret = nsec * __this_cpu_read(cpu_tsc_khz);
+ ret = nsec * vcpu_tsc_khz(vcpu);
do_div(ret, USEC_PER_SEC);
return ret;
}
-static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
+static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
{
/* Compute a scale to convert nanoseconds in TSC cycles */
kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
- &kvm->arch.virtual_tsc_shift,
- &kvm->arch.virtual_tsc_mult);
- kvm->arch.virtual_tsc_khz = this_tsc_khz;
+ &vcpu->arch.tsc_catchup_shift,
+ &vcpu->arch.tsc_catchup_mult);
}
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
{
u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
- vcpu->kvm->arch.virtual_tsc_mult,
- vcpu->kvm->arch.virtual_tsc_shift);
+ vcpu->arch.tsc_catchup_mult,
+ vcpu->arch.tsc_catchup_shift);
tsc += vcpu->arch.last_tsc_write;
return tsc;
}
@@ -1021,7 +1025,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
s64 sdiff;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
- offset = data - native_read_tsc();
+ offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
ns = get_kernel_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
sdiff = data - kvm->arch.last_tsc_write;
@@ -1037,13 +1041,13 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
* In that case, for a reliable TSC, we can match TSC offsets,
* or make a best guest using elapsed value.
*/
- if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
+ if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
elapsed < 5ULL * NSEC_PER_SEC) {
if (!check_tsc_unstable()) {
offset = kvm->arch.last_tsc_offset;
pr_debug("kvm: matched tsc offset for %llu\n", data);
} else {
- u64 delta = nsec_to_cycles(elapsed);
+ u64 delta = nsec_to_cycles(vcpu, elapsed);
offset += delta;
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
@@ -1075,8 +1079,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
local_irq_save(flags);
kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
kernel_ns = get_kernel_ns();
- this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
-
+ this_tsc_khz = vcpu_tsc_khz(v);
if (unlikely(this_tsc_khz == 0)) {
local_irq_restore(flags);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -1993,6 +1996,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
+ case KVM_CAP_GET_TSC_KHZ:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2019,6 +2023,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_XCRS:
r = cpu_has_xsave;
break;
+ case KVM_CAP_TSC_CONTROL:
+ r = kvm_has_tsc_control;
+ break;
default:
r = 0;
break;
@@ -2120,8 +2127,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_x86_ops->vcpu_load(vcpu, cpu);
if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
/* Make sure TSC doesn't go backwards */
- s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
- native_read_tsc() - vcpu->arch.last_host_tsc;
+ s64 tsc_delta;
+ u64 tsc;
+
+ kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
+ tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
+ tsc - vcpu->arch.last_guest_tsc;
+
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
if (check_tsc_unstable()) {
@@ -2139,7 +2151,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
- vcpu->arch.last_host_tsc = native_read_tsc();
+ kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
}
static int is_efer_nx(void)
@@ -2324,6 +2336,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
+ /* cpuid 0xC0000001.edx */
+ const u32 kvm_supported_word5_x86_features =
+ F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
+ F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
+ F(PMM) | F(PMM_EN);
+
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
do_cpuid_1_ent(entry, function, index);
@@ -2418,6 +2436,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
(1 << KVM_FEATURE_NOP_IO_DELAY) |
(1 << KVM_FEATURE_CLOCKSOURCE2) |
+ (1 << KVM_FEATURE_ASYNC_PF) |
(1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
entry->ebx = 0;
entry->ecx = 0;
@@ -2432,6 +2451,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ecx &= kvm_supported_word6_x86_features;
cpuid_mask(&entry->ecx, 6);
break;
+ /*Add support for Centaur's CPUID instruction*/
+ case 0xC0000000:
+ /*Just support up to 0xC0000004 now*/
+ entry->eax = min(entry->eax, 0xC0000004);
+ break;
+ case 0xC0000001:
+ entry->edx &= kvm_supported_word5_x86_features;
+ cpuid_mask(&entry->edx, 5);
+ break;
+ case 0xC0000002:
+ case 0xC0000003:
+ case 0xC0000004:
+ /*Now nothing to do, reserved for the future*/
+ break;
}
kvm_x86_ops->set_supported_cpuid(function, entry);
@@ -2478,6 +2511,26 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
if (nent >= cpuid->nent)
goto out_free;
+ /* Add support for Centaur's CPUID instruction. */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
+ do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
+ &nent, cpuid->nent);
+
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+
+ limit = cpuid_entries[nent - 1].eax;
+ for (func = 0xC0000001;
+ func <= limit && nent < cpuid->nent; ++func)
+ do_cpuid_ent(&cpuid_entries[nent], func, 0,
+ &nent, cpuid->nent);
+
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+ }
+
do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
cpuid->nent);
@@ -3046,6 +3099,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
break;
}
+ case KVM_SET_TSC_KHZ: {
+ u32 user_tsc_khz;
+
+ r = -EINVAL;
+ if (!kvm_has_tsc_control)
+ break;
+
+ user_tsc_khz = (u32)arg;
+
+ if (user_tsc_khz >= kvm_max_guest_tsc_khz)
+ goto out;
+
+ kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
+
+ r = 0;
+ goto out;
+ }
+ case KVM_GET_TSC_KHZ: {
+ r = -EIO;
+ if (check_tsc_unstable())
+ goto out;
+
+ r = vcpu_tsc_khz(vcpu);
+
+ goto out;
+ }
default:
r = -EINVAL;
}
@@ -3595,20 +3674,43 @@ static void kvm_init_msr_list(void)
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
const void *v)
{
- if (vcpu->arch.apic &&
- !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
- return 0;
+ int handled = 0;
+ int n;
- return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
+ do {
+ n = min(len, 8);
+ if (!(vcpu->arch.apic &&
+ !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
+ && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
+ break;
+ handled += n;
+ addr += n;
+ len -= n;
+ v += n;
+ } while (len);
+
+ return handled;
}
static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
{
- if (vcpu->arch.apic &&
- !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
- return 0;
+ int handled = 0;
+ int n;
+
+ do {
+ n = min(len, 8);
+ if (!(vcpu->arch.apic &&
+ !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
+ && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
+ break;
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
+ handled += n;
+ addr += n;
+ len -= n;
+ v += n;
+ } while (len);
- return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
+ return handled;
}
static void kvm_set_segment(struct kvm_vcpu *vcpu,
@@ -3703,37 +3805,43 @@ out:
}
/* used for instruction fetching */
-static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu,
+static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
access | PFERR_FETCH_MASK,
exception);
}
-static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu,
+static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
exception);
}
-static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu,
+static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
}
-static int kvm_write_guest_virt_system(gva_t addr, void *val,
+static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val,
unsigned int bytes,
- struct kvm_vcpu *vcpu,
struct x86_exception *exception)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
void *data = val;
int r = X86EMUL_CONTINUE;
@@ -3761,13 +3869,15 @@ out:
return r;
}
-static int emulator_read_emulated(unsigned long addr,
+static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
void *val,
unsigned int bytes,
- struct x86_exception *exception,
- struct kvm_vcpu *vcpu)
+ struct x86_exception *exception)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
gpa_t gpa;
+ int handled;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -3786,7 +3896,7 @@ static int emulator_read_emulated(unsigned long addr,
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
- if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
+ if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
== X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
@@ -3794,18 +3904,24 @@ mmio:
/*
* Is this MMIO handled locally?
*/
- if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
- trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
+ handled = vcpu_mmio_read(vcpu, gpa, bytes, val);
+
+ if (handled == bytes)
return X86EMUL_CONTINUE;
- }
+
+ gpa += handled;
+ bytes -= handled;
+ val += handled;
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
vcpu->mmio_needed = 1;
vcpu->run->exit_reason = KVM_EXIT_MMIO;
vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
- vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+ vcpu->mmio_size = bytes;
+ vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
+ vcpu->mmio_index = 0;
return X86EMUL_IO_NEEDED;
}
@@ -3829,6 +3945,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ int handled;
gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
@@ -3847,25 +3964,35 @@ mmio:
/*
* Is this MMIO handled locally?
*/
- if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+ handled = vcpu_mmio_write(vcpu, gpa, bytes, val);
+ if (handled == bytes)
return X86EMUL_CONTINUE;
+ gpa += handled;
+ bytes -= handled;
+ val += handled;
+
vcpu->mmio_needed = 1;
+ memcpy(vcpu->mmio_data, val, bytes);
vcpu->run->exit_reason = KVM_EXIT_MMIO;
vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
- vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+ vcpu->mmio_size = bytes;
+ vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
- memcpy(vcpu->run->mmio.data, val, bytes);
+ memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+ vcpu->mmio_index = 0;
return X86EMUL_CONTINUE;
}
-int emulator_write_emulated(unsigned long addr,
+int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
const void *val,
unsigned int bytes,
- struct x86_exception *exception,
- struct kvm_vcpu *vcpu)
+ struct x86_exception *exception)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
/* Crossing a page boundary? */
if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
int rc, now;
@@ -3893,13 +4020,14 @@ int emulator_write_emulated(unsigned long addr,
(cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
#endif
-static int emulator_cmpxchg_emulated(unsigned long addr,
+static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
- struct x86_exception *exception,
- struct kvm_vcpu *vcpu)
+ struct x86_exception *exception)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
gpa_t gpa;
struct page *page;
char *kaddr;
@@ -3955,7 +4083,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
emul_write:
printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
- return emulator_write_emulated(addr, new, bytes, exception, vcpu);
+ return emulator_write_emulated(ctxt, addr, new, bytes, exception);
}
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3974,9 +4102,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
}
-static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
- unsigned int count, struct kvm_vcpu *vcpu)
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
if (vcpu->arch.pio.count)
goto data_avail;
@@ -4004,10 +4135,12 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
return 0;
}
-static int emulator_pio_out_emulated(int size, unsigned short port,
- const void *val, unsigned int count,
- struct kvm_vcpu *vcpu)
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port,
+ const void *val, unsigned int count)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
trace_kvm_pio(1, port, size, count);
vcpu->arch.pio.port = port;
@@ -4037,10 +4170,9 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
return kvm_x86_ops->get_segment_base(vcpu, seg);
}
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
+static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
{
- kvm_mmu_invlpg(vcpu, address);
- return X86EMUL_CONTINUE;
+ kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
}
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
@@ -4062,22 +4194,20 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
-int emulate_clts(struct kvm_vcpu *vcpu)
+static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
{
- kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
- kvm_x86_ops->fpu_activate(vcpu);
- return X86EMUL_CONTINUE;
+ kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
}
-int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
- return _kvm_get_dr(vcpu, dr, dest);
+ return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
}
-int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
+int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
- return __kvm_set_dr(vcpu, dr, value);
+ return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
}
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -4085,8 +4215,9 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val)
return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
}
-static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
+static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
unsigned long value;
switch (cr) {
@@ -4113,8 +4244,9 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
return value;
}
-static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int res = 0;
switch (cr) {
@@ -4141,33 +4273,45 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
return res;
}
-static int emulator_get_cpl(struct kvm_vcpu *vcpu)
+static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
+{
+ return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
+}
+
+static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
+}
+
+static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- return kvm_x86_ops->get_cpl(vcpu);
+ kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
}
-static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops->get_gdt(vcpu, dt);
+ kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
}
-static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops->get_idt(vcpu, dt);
+ kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
}
-static unsigned long emulator_get_cached_segment_base(int seg,
- struct kvm_vcpu *vcpu)
+static unsigned long emulator_get_cached_segment_base(
+ struct x86_emulate_ctxt *ctxt, int seg)
{
- return get_segment_base(vcpu, seg);
+ return get_segment_base(emul_to_vcpu(ctxt), seg);
}
-static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3,
- int seg, struct kvm_vcpu *vcpu)
+static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
+ struct desc_struct *desc, u32 *base3,
+ int seg)
{
struct kvm_segment var;
- kvm_get_segment(vcpu, &var, seg);
+ kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
+ *selector = var.selector;
if (var.unusable)
return false;
@@ -4192,14 +4336,14 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3,
return true;
}
-static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3,
- int seg, struct kvm_vcpu *vcpu)
+static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
+ struct desc_struct *desc, u32 base3,
+ int seg)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
struct kvm_segment var;
- /* needed to preserve selector */
- kvm_get_segment(vcpu, &var, seg);
-
+ var.selector = selector;
var.base = get_desc_base(desc);
#ifdef CONFIG_X86_64
var.base |= ((u64)base3) << 32;
@@ -4223,22 +4367,44 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3,
return;
}
-static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
+static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 *pdata)
{
- struct kvm_segment kvm_seg;
+ return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+}
- kvm_get_segment(vcpu, &kvm_seg, seg);
- return kvm_seg.selector;
+static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 data)
+{
+ return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
}
-static void emulator_set_segment_selector(u16 sel, int seg,
- struct kvm_vcpu *vcpu)
+static void emulator_halt(struct x86_emulate_ctxt *ctxt)
{
- struct kvm_segment kvm_seg;
+ emul_to_vcpu(ctxt)->arch.halt_request = 1;
+}
- kvm_get_segment(vcpu, &kvm_seg, seg);
- kvm_seg.selector = sel;
- kvm_set_segment(vcpu, &kvm_seg, seg);
+static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
+{
+ preempt_disable();
+ kvm_load_guest_fpu(emul_to_vcpu(ctxt));
+ /*
+ * CR0.TS may reference the host fpu state, not the guest fpu state,
+ * so it may be clear at this point.
+ */
+ clts();
+}
+
+static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
+{
+ preempt_enable();
+}
+
+static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
}
static struct x86_emulate_ops emulate_ops = {
@@ -4248,22 +4414,29 @@ static struct x86_emulate_ops emulate_ops = {
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
+ .invlpg = emulator_invlpg,
.pio_in_emulated = emulator_pio_in_emulated,
.pio_out_emulated = emulator_pio_out_emulated,
- .get_cached_descriptor = emulator_get_cached_descriptor,
- .set_cached_descriptor = emulator_set_cached_descriptor,
- .get_segment_selector = emulator_get_segment_selector,
- .set_segment_selector = emulator_set_segment_selector,
+ .get_segment = emulator_get_segment,
+ .set_segment = emulator_set_segment,
.get_cached_segment_base = emulator_get_cached_segment_base,
.get_gdt = emulator_get_gdt,
.get_idt = emulator_get_idt,
+ .set_gdt = emulator_set_gdt,
+ .set_idt = emulator_set_idt,
.get_cr = emulator_get_cr,
.set_cr = emulator_set_cr,
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
- .set_msr = kvm_set_msr,
- .get_msr = kvm_get_msr,
+ .set_msr = emulator_set_msr,
+ .get_msr = emulator_get_msr,
+ .halt = emulator_halt,
+ .wbinvd = emulator_wbinvd,
+ .fix_hypercall = emulator_fix_hypercall,
+ .get_fpu = emulator_get_fpu,
+ .put_fpu = emulator_put_fpu,
+ .intercept = emulator_intercept,
};
static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -4305,12 +4478,17 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
int cs_db, cs_l;
+ /*
+ * TODO: fix emulate.c to use guest_read/write_register
+ * instead of direct ->regs accesses, can save hundred cycles
+ * on Intel for instructions that don't read/change RSP, for
+ * for example.
+ */
cache_all_regs(vcpu);
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- vcpu->arch.emulate_ctxt.vcpu = vcpu;
- vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+ vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
vcpu->arch.emulate_ctxt.mode =
(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
@@ -4318,11 +4496,13 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
? X86EMUL_MODE_VM86 : cs_l
? X86EMUL_MODE_PROT64 : cs_db
? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu);
memset(c, 0, sizeof(struct decode_cache));
memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
}
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
{
struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
int ret;
@@ -4331,7 +4511,8 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
- vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip;
+ vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip +
+ inc_eip;
ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
if (ret != X86EMUL_CONTINUE)
@@ -4340,7 +4521,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
vcpu->arch.emulate_ctxt.eip = c->eip;
memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
if (irq == NMI_VECTOR)
vcpu->arch.nmi_pending = false;
@@ -4402,16 +4583,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
{
int r;
struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ bool writeback = true;
kvm_clear_exception_queue(vcpu);
- vcpu->arch.mmio_fault_cr2 = cr2;
- /*
- * TODO: fix emulate.c to use guest_read/write_register
- * instead of direct ->regs accesses, can save hundred cycles
- * on Intel for instructions that don't read/change RSP, for
- * for example.
- */
- cache_all_regs(vcpu);
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
init_emulate_ctxt(vcpu);
@@ -4442,13 +4616,19 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DONE;
}
- /* this is needed for vmware backdor interface to work since it
+ /* this is needed for vmware backdoor interface to work since it
changes registers values during IO operation */
- memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
+ memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+ }
restart:
r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
+ if (r == EMULATION_INTERCEPTED)
+ return EMULATE_DONE;
+
if (r == EMULATION_FAILED) {
if (reexecute_instruction(vcpu, cr2))
return EMULATE_DONE;
@@ -4462,21 +4642,28 @@ restart:
} else if (vcpu->arch.pio.count) {
if (!vcpu->arch.pio.in)
vcpu->arch.pio.count = 0;
+ else
+ writeback = false;
r = EMULATE_DO_MMIO;
} else if (vcpu->mmio_needed) {
- if (vcpu->mmio_is_write)
- vcpu->mmio_needed = 0;
+ if (!vcpu->mmio_is_write)
+ writeback = false;
r = EMULATE_DO_MMIO;
} else if (r == EMULATION_RESTART)
goto restart;
else
r = EMULATE_DONE;
- toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+ if (writeback) {
+ toggle_interruptibility(vcpu,
+ vcpu->arch.emulate_ctxt.interruptibility);
+ kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+ } else
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
return r;
}
@@ -4485,7 +4672,8 @@ EXPORT_SYMBOL_GPL(x86_emulate_instruction);
int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
{
unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
+ int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
+ size, port, &val, 1);
/* do not return to emulator after return from userspace */
vcpu->arch.pio.count = 0;
return ret;
@@ -4879,8 +5067,9 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
-int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
+int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
@@ -4893,21 +5082,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
kvm_x86_ops->patch_hypercall(vcpu, instruction);
- return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
-}
-
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-{
- struct desc_ptr dt = { limit, base };
-
- kvm_x86_ops->set_gdt(vcpu, &dt);
-}
-
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-{
- struct desc_ptr dt = { limit, base };
-
- kvm_x86_ops->set_idt(vcpu, &dt);
+ return emulator_write_emulated(&vcpu->arch.emulate_ctxt,
+ rip, instruction, 3, NULL);
}
static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
@@ -5170,6 +5346,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
+ bool nmi_pending;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
vcpu->run->request_interrupt_window;
@@ -5207,19 +5384,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 1;
goto out;
}
- if (kvm_check_request(KVM_REQ_NMI, vcpu))
- vcpu->arch.nmi_pending = true;
}
r = kvm_mmu_reload(vcpu);
if (unlikely(r))
goto out;
+ /*
+ * An NMI can be injected between local nmi_pending read and
+ * vcpu->arch.nmi_pending read inside inject_pending_event().
+ * But in that case, KVM_REQ_EVENT will be set, which makes
+ * the race described above benign.
+ */
+ nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
+
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
inject_pending_event(vcpu);
/* enable NMI/IRQ window open exits if needed */
- if (vcpu->arch.nmi_pending)
+ if (nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
kvm_x86_ops->enable_irq_window(vcpu);
@@ -5399,6 +5582,41 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
return r;
}
+static int complete_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ int r;
+
+ if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
+ return 1;
+
+ if (vcpu->mmio_needed) {
+ vcpu->mmio_needed = 0;
+ if (!vcpu->mmio_is_write)
+ memcpy(vcpu->mmio_data + vcpu->mmio_index,
+ run->mmio.data, 8);
+ vcpu->mmio_index += 8;
+ if (vcpu->mmio_index < vcpu->mmio_size) {
+ run->exit_reason = KVM_EXIT_MMIO;
+ run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index;
+ memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8);
+ run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
+ run->mmio.is_write = vcpu->mmio_is_write;
+ vcpu->mmio_needed = 1;
+ return 0;
+ }
+ if (vcpu->mmio_is_write)
+ return 1;
+ vcpu->mmio_read_completed = 1;
+ }
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ if (r != EMULATE_DONE)
+ return 0;
+ return 1;
+}
+
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
@@ -5425,20 +5643,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
}
- if (vcpu->arch.pio.count || vcpu->mmio_needed) {
- if (vcpu->mmio_needed) {
- memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
- vcpu->mmio_read_completed = 1;
- vcpu->mmio_needed = 0;
- }
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- if (r != EMULATE_DONE) {
- r = 0;
- goto out;
- }
- }
+ r = complete_mmio(vcpu);
+ if (r <= 0)
+ goto out;
+
if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
kvm_register_write(vcpu, VCPU_REGS_RAX,
kvm_run->hypercall.ret);
@@ -5455,6 +5663,18 @@ out:
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
+ /*
+ * We are here if userspace calls get_regs() in the middle of
+ * instruction emulation. Registers state needs to be copied
+ * back from emulation context to vcpu. Usrapace shouldn't do
+ * that usually, but some bad designed PV devices (vmware
+ * backdoor interface) need this to work
+ */
+ struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ }
regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -5482,6 +5702,9 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+
kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -5592,7 +5815,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
return EMULATE_DONE;
}
@@ -5974,8 +6197,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.pio_data = page_address(page);
- if (!kvm->arch.virtual_tsc_khz)
- kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
+ kvm_init_tsc_catchup(vcpu, max_tsc_khz);
r = kvm_mmu_create(vcpu);
if (r < 0)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index c600da830ce..e407ed3df81 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -77,7 +77,7 @@ static inline u32 bit(int bitno)
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 1cd608973ce..13ee258442a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -7,7 +7,7 @@
* kernel and insert a module (lg.ko) which allows us to run other Linux
* kernels the same way we'd run processes. We call the first kernel the Host,
* and the others the Guests. The program which sets up and configures Guests
- * (such as the example in Documentation/lguest/lguest.c) is called the
+ * (such as the example in Documentation/virtual/lguest/lguest.c) is called the
* Launcher.
*
* Secondly, we only run specially modified Guests, not normal kernels: setting
@@ -71,7 +71,8 @@
#include <asm/stackprotector.h>
#include <asm/reboot.h> /* for struct machine_ops */
-/*G:010 Welcome to the Guest!
+/*G:010
+ * Welcome to the Guest!
*
* The Guest in our tale is a simple creature: identical to the Host but
* behaving in simplified but equivalent ways. In particular, the Guest is the
@@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call,
#endif
/*G:036
- * When lazy mode is turned off reset the per-cpu lazy mode variable and then
- * issue the do-nothing hypercall to flush any stored calls.
-:*/
+ * When lazy mode is turned off, we issue the do-nothing hypercall to
+ * flush any stored calls, and call the generic helper to reset the
+ * per-cpu lazy mode variable.
+ */
static void lguest_leave_lazy_mmu_mode(void)
{
hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
paravirt_leave_lazy_mmu();
}
+/*
+ * We also catch the end of context switch; we enter lazy mode for much of
+ * that too, so again we need to flush here.
+ *
+ * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
+ * mode, but unlike Xen, lguest doesn't care about the difference).
+ */
static void lguest_end_context_switch(struct task_struct *next)
{
hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
@@ -391,7 +400,7 @@ static void lguest_load_tr_desc(void)
* giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
*
* This instruction even it has its own Wikipedia entry. The Wikipedia entry
- * has been translated into 5 languages. I am not making this up!
+ * has been translated into 6 languages. I am not making this up!
*
* We could get funky here and identify ourselves as "GenuineLguest", but
* instead we just use the real "cpuid" instruction. Then I pretty much turned
@@ -458,7 +467,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
/*
* PAE systems can mark pages as non-executable. Linux calls this the
* NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
- * Virus Protection). We just switch turn if off here, since we don't
+ * Virus Protection). We just switch it off here, since we don't
* support it.
*/
case 0x80000001:
@@ -520,17 +529,16 @@ static unsigned long lguest_read_cr2(void)
/* See lguest_set_pte() below. */
static bool cr3_changed = false;
+static unsigned long current_cr3;
/*
* cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0. Keep a local copy, and tell the Host when it changes. The only
- * difference is that our local copy is in lguest_data because the Host needs
- * to set it upon our initial hypercall.
+ * cr0. Keep a local copy, and tell the Host when it changes.
*/
static void lguest_write_cr3(unsigned long cr3)
{
- lguest_data.pgdir = cr3;
lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
+ current_cr3 = cr3;
/* These two page tables are simple, linear, and used during boot */
if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
@@ -539,7 +547,7 @@ static void lguest_write_cr3(unsigned long cr3)
static unsigned long lguest_read_cr3(void)
{
- return lguest_data.pgdir;
+ return current_cr3;
}
/* cr4 is used to enable and disable PGE, but we don't care. */
@@ -641,7 +649,7 @@ static void lguest_write_cr4(unsigned long val)
/*
* The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space. Wetell the Host the toplevel and
+ * a page into a process' address space. We tell the Host the toplevel and
* address this corresponds to. The Guest uses one pagetable per process, so
* we need to tell the Host which one we're changing (mm->pgd).
*/
@@ -758,7 +766,7 @@ static void lguest_pmd_clear(pmd_t *pmdp)
static void lguest_flush_tlb_single(unsigned long addr)
{
/* Simply set it to zero: if it was not, it will fault back in. */
- lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
+ lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
}
/*
@@ -913,8 +921,6 @@ static struct clocksource lguest_clock = {
.rating = 200,
.read = lguest_clock_read,
.mask = CLOCKSOURCE_MASK(64),
- .mult = 1 << 22,
- .shift = 22,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -995,9 +1001,10 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
static void lguest_time_init(void)
{
/* Set up the timer interrupt (0) to go to our simple timer routine */
+ lguest_setup_irq(0);
irq_set_handler(0, lguest_time_irq);
- clocksource_register(&lguest_clock);
+ clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
/* We can't set cpumask in the initializer: damn C limitations! Set it
* here and register our timer device. */
@@ -1141,7 +1148,7 @@ static struct notifier_block paniced = {
static __init char *lguest_memory_setup(void)
{
/*
- *The Linux bootloader header contains an "e820" memory map: the
+ * The Linux bootloader header contains an "e820" memory map: the
* Launcher populated the first entry with our memory limit.
*/
e820_add_region(boot_params.e820_map[0].addr,
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 4f420c2f2d5..6ddfe4fc23c 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -6,18 +6,22 @@
#include <asm/processor-flags.h>
/*G:020
- * Our story starts with the kernel booting into startup_32 in
- * arch/x86/kernel/head_32.S. It expects a boot header, which is created by
- * the bootloader (the Launcher in our case).
+
+ * Our story starts with the bzImage: booting starts at startup_32 in
+ * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real
+ * kernel in place and then jumps into it: startup_32 in
+ * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi
+ * register, which is created by the bootloader (the Launcher in our case).
*
* The startup_32 function does very little: it clears the uninitialized global
* C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe. Finally it checks the
- * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen:
- * if it's set to '1' (lguest's assigned number), then it calls us here.
+ * header and kernel command line somewhere safe, and populates some initial
+ * page tables. Finally it checks the 'hardware_subarch' field. This was
+ * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
+ * assigned number), then it calls us here.
*
* WARNING: be very careful here! We're running at addresses equal to physical
- * addesses (around 0), not above PAGE_OFFSET as most code expectes
+ * addresses (around 0), not above PAGE_OFFSET as most code expects
* (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
* data without remembering to subtract __PAGE_OFFSET!
*
@@ -27,13 +31,18 @@
.section .init.text, "ax", @progbits
ENTRY(lguest_entry)
/*
- * We make the "initialization" hypercall now to tell the Host about
- * us, and also find out where it put our page tables.
+ * We make the "initialization" hypercall now to tell the Host where
+ * our lguest_data struct is.
*/
movl $LHCALL_LGUEST_INIT, %eax
movl $lguest_data - __PAGE_OFFSET, %ebx
int $LGUEST_TRAP_ENTRY
+ /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
+ movl $LHCALL_NEW_PGTABLE, %eax
+ movl $(initial_page_table - __PAGE_OFFSET), %ebx
+ int $LGUEST_TRAP_ENTRY
+
/* Set up the initial stack so we can run C code. */
movl $(init_thread_union+THREAD_SIZE),%esp
@@ -96,12 +105,8 @@ send_interrupts:
*/
pushl %eax
movl $LHCALL_SEND_INTERRUPTS, %eax
- /*
- * This is a vmcall instruction (same thing that KVM uses). Older
- * assembler versions might not know the "vmcall" instruction, so we
- * create one manually here.
- */
- .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+ /* This is the actual hypercall trap. */
+ int $LGUEST_TRAP_ENTRY
/* Put eax back the way we found it. */
popl %eax
ret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f2479f19ddd..b00f6785da7 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -18,8 +18,10 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
lib-y := delay.o
lib-y += thunk_$(BITS).o
-lib-y += usercopy_$(BITS).o getuser.o putuser.o
+lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
+lib-$(CONFIG_SMP) += rwlock.o
+lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
obj-y += msr.o msr-reg.o msr-reg-export.o
@@ -29,7 +31,7 @@ ifeq ($(CONFIG_X86_32),y)
lib-y += atomic64_cx8_32.o
lib-y += checksum_32.o
lib-y += strstr_32.o
- lib-y += semaphore_32.o string_32.o
+ lib-y += string_32.o
lib-y += cmpxchg.o
ifneq ($(CONFIG_X86_CMPXCHG64),y)
lib-y += cmpxchg8b_emu.o atomic64_386_32.o
@@ -40,7 +42,6 @@ else
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
lib-y += thunk_64.o clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
- lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
- lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
+ lib-y += copy_user_64.o copy_user_nocache_64.o
lib-y += cmpxchg16b_emu.o
endif
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index aa4326bfb24..f2145cfa12a 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
/*
* Zero a page.
@@ -14,6 +15,15 @@ ENTRY(clear_page_c)
CFI_ENDPROC
ENDPROC(clear_page_c)
+ENTRY(clear_page_c_e)
+ CFI_STARTPROC
+ movl $4096,%ecx
+ xorl %eax,%eax
+ rep stosb
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_page_c_e)
+
ENTRY(clear_page)
CFI_STARTPROC
xorl %eax,%eax
@@ -38,21 +48,26 @@ ENTRY(clear_page)
.Lclear_page_end:
ENDPROC(clear_page)
- /* Some CPUs run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
+ /*
+ * Some CPUs support enhanced REP MOVSB/STOSB instructions.
+ * It is recommended to use this when possible.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original function.
+ *
+ */
#include <asm/cpufeature.h>
.section .altinstr_replacement,"ax"
1: .byte 0xeb /* jmp <disp8> */
.byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
-2:
+2: .byte 0xeb /* jmp <disp8> */
+ .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
+3:
.previous
.section .altinstructions,"a"
- .align 8
- .quad clear_page
- .quad 1b
- .word X86_FEATURE_REP_GOOD
- .byte .Lclear_page_end - clear_page
- .byte 2b - 1b
+ altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
+ .Lclear_page_end-clear_page, 2b-1b
+ altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
+ .Lclear_page_end-clear_page,3b-2b
.previous
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 6fec2d1cebe..01c805ba535 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,6 +2,7 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
ALIGN
copy_page_c:
@@ -110,10 +111,6 @@ ENDPROC(copy_page)
2:
.previous
.section .altinstructions,"a"
- .align 8
- .quad copy_page
- .quad 1b
- .word X86_FEATURE_REP_GOOD
- .byte .Lcopy_page_end - copy_page
- .byte 2b - 1b
+ altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
+ .Lcopy_page_end-copy_page, 2b-1b
.previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 99e48261519..024840266ba 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -15,23 +15,30 @@
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
- .macro ALTERNATIVE_JUMP feature,orig,alt
+/*
+ * By placing feature2 after feature1 in altinstructions section, we logically
+ * implement:
+ * If CPU has feature2, jmp to alt2 is used
+ * else if CPU has feature1, jmp to alt1 is used
+ * else jmp to orig is used.
+ */
+ .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
0:
.byte 0xe9 /* 32bit jump */
.long \orig-1f /* by default jump to orig */
1:
.section .altinstr_replacement,"ax"
2: .byte 0xe9 /* near jump with 32bit immediate */
- .long \alt-1b /* offset */ /* or alternatively to alt */
+ .long \alt1-1b /* offset */ /* or alternatively to alt1 */
+3: .byte 0xe9 /* near jump with 32bit immediate */
+ .long \alt2-1b /* offset */ /* or alternatively to alt2 */
.previous
+
.section .altinstructions,"a"
- .align 8
- .quad 0b
- .quad 2b
- .word \feature /* when feature is set */
- .byte 5
- .byte 5
+ altinstruction_entry 0b,2b,\feature1,5,5
+ altinstruction_entry 0b,3b,\feature2,5,5
.previous
.endm
@@ -72,8 +79,10 @@ ENTRY(_copy_to_user)
addq %rdx,%rcx
jc bad_to_user
cmpq TI_addr_limit(%rax),%rcx
- jae bad_to_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ ja bad_to_user
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
+ copy_user_generic_unrolled,copy_user_generic_string, \
+ copy_user_enhanced_fast_string
CFI_ENDPROC
ENDPROC(_copy_to_user)
@@ -85,8 +94,10 @@ ENTRY(_copy_from_user)
addq %rdx,%rcx
jc bad_from_user
cmpq TI_addr_limit(%rax),%rcx
- jae bad_from_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ ja bad_from_user
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
+ copy_user_generic_unrolled,copy_user_generic_string, \
+ copy_user_enhanced_fast_string
CFI_ENDPROC
ENDPROC(_copy_from_user)
@@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string)
.previous
CFI_ENDPROC
ENDPROC(copy_user_generic_string)
+
+/*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_enhanced_fast_string)
+ CFI_STARTPROC
+ andl %edx,%edx
+ jz 2f
+ movl %edx,%ecx
+1: rep
+ movsb
+2: xorl %eax,%eax
+ ret
+
+ .section .fixup,"ax"
+12: movl %ecx,%edx /* ecx is zerorest also */
+ jmp copy_user_handle_tail
+ .previous
+
+ .section __ex_table,"a"
+ .align 8
+ .quad 1b,12b
+ .previous
+ CFI_ENDPROC
+ENDPROC(copy_user_enhanced_fast_string)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e35e3..efbf2a0ecde 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
/*
* memcpy - Copy a memory block.
@@ -37,6 +38,23 @@
.Lmemcpy_e:
.previous
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
+ * memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+ movq %rdi, %rax
+
+ movl %edx, %ecx
+ rep movsb
+ ret
+.Lmemcpy_e_e:
+ .previous
+
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
@@ -49,7 +67,7 @@ ENTRY(memcpy)
jb .Lhandle_tail
/*
- * We check whether memory false dependece could occur,
+ * We check whether memory false dependence could occur,
* then jump to corresponding copy mode.
*/
cmp %dil, %sil
@@ -171,21 +189,22 @@ ENDPROC(memcpy)
ENDPROC(__memcpy)
/*
- * Some CPUs run faster using the string copy instructions.
- * It is also a lot simpler. Use this when possible:
- */
-
- .section .altinstructions, "a"
- .align 8
- .quad memcpy
- .quad .Lmemcpy_c
- .word X86_FEATURE_REP_GOOD
-
- /*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB feature
+ * If the feature is supported, memcpy_c_e() is the first choice.
+ * If enhanced rep movsb copy is not available, use fast string copy
+ * memcpy_c() when possible. This is faster and code is simpler than
+ * original memcpy().
+ * Otherwise, original memcpy() is used.
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ *
* Replace only beginning, memcpy is used to apply alternatives,
* so it is silly to overwrite itself with nops - reboot is the
* only outcome...
*/
- .byte .Lmemcpy_e - .Lmemcpy_c
- .byte .Lmemcpy_e - .Lmemcpy_c
+ .section .altinstructions, "a"
+ altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+ .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+ altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+ .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
.previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 0ecb8433e5a..ee164610ec4 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -8,6 +8,8 @@
#define _STRING_C
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
#undef memmove
@@ -24,6 +26,7 @@
*/
ENTRY(memmove)
CFI_STARTPROC
+
/* Handle more 32bytes in loop */
mov %rdi, %rax
cmp $0x20, %rdx
@@ -31,8 +34,13 @@ ENTRY(memmove)
/* Decide forward/backward copy mode */
cmp %rdi, %rsi
- jb 2f
+ jge .Lmemmove_begin_forward
+ mov %rsi, %r8
+ add %rdx, %r8
+ cmp %rdi, %r8
+ jg 2f
+.Lmemmove_begin_forward:
/*
* movsq instruction have many startup latency
* so we handle small size by general register.
@@ -78,6 +86,8 @@ ENTRY(memmove)
rep movsq
movq %r11, (%r10)
jmp 13f
+.Lmemmove_end_forward:
+
/*
* Handle data backward by movsq.
*/
@@ -194,4 +204,20 @@ ENTRY(memmove)
13:
retq
CFI_ENDPROC
+
+ .section .altinstr_replacement,"ax"
+.Lmemmove_begin_forward_efs:
+ /* Forward moving data. */
+ movq %rdx, %rcx
+ rep movsb
+ retq
+.Lmemmove_end_forward_efs:
+ .previous
+
+ .section .altinstructions,"a"
+ altinstruction_entry .Lmemmove_begin_forward, \
+ .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
+ .Lmemmove_end_forward-.Lmemmove_begin_forward, \
+ .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+ .previous
ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 09d34426965..79bd454b78a 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -2,9 +2,13 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
/*
- * ISO C memset - set a memory block to a byte value.
+ * ISO C memset - set a memory block to a byte value. This function uses fast
+ * string to get better performance than the original function. The code is
+ * simpler and shorter than the orignal function as well.
*
* rdi destination
* rsi value (char)
@@ -31,6 +35,28 @@
.Lmemset_e:
.previous
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * enhanced rep stosb to override the fast string function.
+ * The code is simpler and shorter than the fast string function as well.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemset_c_e:
+ movq %rdi,%r9
+ movb %sil,%al
+ movl %edx,%ecx
+ rep stosb
+ movq %r9,%rax
+ ret
+.Lmemset_e_e:
+ .previous
+
ENTRY(memset)
ENTRY(__memset)
CFI_STARTPROC
@@ -112,16 +138,20 @@ ENTRY(__memset)
ENDPROC(memset)
ENDPROC(__memset)
- /* Some CPUs run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
+ /* Some CPUs support enhanced REP MOVSB/STOSB feature.
+ * It is recommended to use this when possible.
+ *
+ * If enhanced REP MOVSB/STOSB feature is not available, use fast string
+ * instructions.
+ *
+ * Otherwise, use original memset function.
+ *
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ */
.section .altinstructions,"a"
- .align 8
- .quad memset
- .quad .Lmemset_c
- .word X86_FEATURE_REP_GOOD
- .byte .Lfinal - memset
- .byte .Lmemset_e - .Lmemset_c
+ altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+ .Lfinal-memset,.Lmemset_e-.Lmemset_c
+ altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+ .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
.previous
diff --git a/arch/x86/lib/rwlock.S b/arch/x86/lib/rwlock.S
new file mode 100644
index 00000000000..1cad22139c8
--- /dev/null
+++ b/arch/x86/lib/rwlock.S
@@ -0,0 +1,44 @@
+/* Slow paths of read/write spinlocks. */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/rwlock.h>
+
+#ifdef CONFIG_X86_32
+# define __lock_ptr eax
+#else
+# define __lock_ptr rdi
+#endif
+
+ENTRY(__write_lock_failed)
+ CFI_STARTPROC
+ FRAME
+0: LOCK_PREFIX
+ WRITE_LOCK_ADD($RW_LOCK_BIAS) (%__lock_ptr)
+1: rep; nop
+ cmpl $WRITE_LOCK_CMP, (%__lock_ptr)
+ jne 1b
+ LOCK_PREFIX
+ WRITE_LOCK_SUB($RW_LOCK_BIAS) (%__lock_ptr)
+ jnz 0b
+ ENDFRAME
+ ret
+ CFI_ENDPROC
+END(__write_lock_failed)
+
+ENTRY(__read_lock_failed)
+ CFI_STARTPROC
+ FRAME
+0: LOCK_PREFIX
+ READ_LOCK_SIZE(inc) (%__lock_ptr)
+1: rep; nop
+ READ_LOCK_SIZE(cmp) $1, (%__lock_ptr)
+ js 1b
+ LOCK_PREFIX
+ READ_LOCK_SIZE(dec) (%__lock_ptr)
+ js 0b
+ ENDFRAME
+ ret
+ CFI_ENDPROC
+END(__read_lock_failed)
diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S
deleted file mode 100644
index 05ea55f7140..00000000000
--- a/arch/x86/lib/rwlock_64.S
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Slow paths of read/write spinlocks. */
-
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
-
-/* rdi: pointer to rwlock_t */
-ENTRY(__write_lock_failed)
- CFI_STARTPROC
- LOCK_PREFIX
- addl $RW_LOCK_BIAS,(%rdi)
-1: rep
- nop
- cmpl $RW_LOCK_BIAS,(%rdi)
- jne 1b
- LOCK_PREFIX
- subl $RW_LOCK_BIAS,(%rdi)
- jnz __write_lock_failed
- ret
- CFI_ENDPROC
-END(__write_lock_failed)
-
-/* rdi: pointer to rwlock_t */
-ENTRY(__read_lock_failed)
- CFI_STARTPROC
- LOCK_PREFIX
- incl (%rdi)
-1: rep
- nop
- cmpl $1,(%rdi)
- js 1b
- LOCK_PREFIX
- decl (%rdi)
- js __read_lock_failed
- ret
- CFI_ENDPROC
-END(__read_lock_failed)
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem.S
index 67743977398..5dff5f04246 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem.S
@@ -1,4 +1,51 @@
/*
+ * x86 semaphore implementation.
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Portions Copyright 1999 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+#define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg)
+#define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l)
+
+#ifdef CONFIG_X86_32
+
+/*
+ * The semaphore operations have a special calling sequence that
+ * allow us to do a simpler in-line version of them. These routines
+ * need to convert that sequence back into the C sequence when
+ * there is contention on the semaphore.
+ *
+ * %eax contains the semaphore pointer on entry. Save the C-clobbered
+ * registers (%eax, %edx and %ecx) except %eax whish is either a return
+ * value or just clobbered..
+ */
+
+#define save_common_regs \
+ pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0
+
+#define restore_common_regs \
+ popl_cfi %ecx; CFI_RESTORE ecx
+
+ /* Avoid uglifying the argument copying x86-64 needs to do. */
+ .macro movq src, dst
+ .endm
+
+#else
+
+/*
* x86-64 rwsem wrappers
*
* This interfaces the inline asm code to the slow-path
@@ -16,12 +63,6 @@
* but %rdi, %rsi, %rcx, %r8-r11 always need saving.
*/
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/frame.h>
-#include <asm/dwarf2.h>
-
#define save_common_regs \
pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
@@ -40,16 +81,18 @@
popq_cfi %rsi; CFI_RESTORE rsi; \
popq_cfi %rdi; CFI_RESTORE rdi
+#endif
+
/* Fix up special calling conventions */
ENTRY(call_rwsem_down_read_failed)
CFI_STARTPROC
save_common_regs
- pushq_cfi %rdx
- CFI_REL_OFFSET rdx, 0
+ __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
+ CFI_REL_OFFSET __ASM_REG(dx), 0
movq %rax,%rdi
call rwsem_down_read_failed
- popq_cfi %rdx
- CFI_RESTORE rdx
+ __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
+ CFI_RESTORE __ASM_REG(dx)
restore_common_regs
ret
CFI_ENDPROC
@@ -67,7 +110,8 @@ ENDPROC(call_rwsem_down_write_failed)
ENTRY(call_rwsem_wake)
CFI_STARTPROC
- decl %edx /* do nothing if still outstanding active readers */
+ /* do nothing if still outstanding active readers */
+ __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx)
jnz 1f
save_common_regs
movq %rax,%rdi
@@ -77,16 +121,15 @@ ENTRY(call_rwsem_wake)
CFI_ENDPROC
ENDPROC(call_rwsem_wake)
-/* Fix up special calling conventions */
ENTRY(call_rwsem_downgrade_wake)
CFI_STARTPROC
save_common_regs
- pushq_cfi %rdx
- CFI_REL_OFFSET rdx, 0
+ __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
+ CFI_REL_OFFSET __ASM_REG(dx), 0
movq %rax,%rdi
call rwsem_downgrade_wake
- popq_cfi %rdx
- CFI_RESTORE rdx
+ __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
+ CFI_RESTORE __ASM_REG(dx)
restore_common_regs
ret
CFI_ENDPROC
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
deleted file mode 100644
index 06691daa410..00000000000
--- a/arch/x86/lib/semaphore_32.S
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * i386 semaphore implementation.
- *
- * (C) Copyright 1999 Linus Torvalds
- *
- * Portions Copyright 1999 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/frame.h>
-#include <asm/dwarf2.h>
-
-/*
- * The semaphore operations have a special calling sequence that
- * allow us to do a simpler in-line version of them. These routines
- * need to convert that sequence back into the C sequence when
- * there is contention on the semaphore.
- *
- * %eax contains the semaphore pointer on entry. Save the C-clobbered
- * registers (%eax, %edx and %ecx) except %eax whish is either a return
- * value or just clobbered..
- */
- .section .sched.text, "ax"
-
-/*
- * rw spinlock fallbacks
- */
-#ifdef CONFIG_SMP
-ENTRY(__write_lock_failed)
- CFI_STARTPROC
- FRAME
-2: LOCK_PREFIX
- addl $ RW_LOCK_BIAS,(%eax)
-1: rep; nop
- cmpl $ RW_LOCK_BIAS,(%eax)
- jne 1b
- LOCK_PREFIX
- subl $ RW_LOCK_BIAS,(%eax)
- jnz 2b
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__write_lock_failed)
-
-ENTRY(__read_lock_failed)
- CFI_STARTPROC
- FRAME
-2: LOCK_PREFIX
- incl (%eax)
-1: rep; nop
- cmpl $1,(%eax)
- js 1b
- LOCK_PREFIX
- decl (%eax)
- js 2b
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__read_lock_failed)
-
-#endif
-
-#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
-
-/* Fix up special calling conventions */
-ENTRY(call_rwsem_down_read_failed)
- CFI_STARTPROC
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx,0
- pushl_cfi %edx
- CFI_REL_OFFSET edx,0
- call rwsem_down_read_failed
- popl_cfi %edx
- popl_cfi %ecx
- ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_down_read_failed)
-
-ENTRY(call_rwsem_down_write_failed)
- CFI_STARTPROC
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx,0
- calll rwsem_down_write_failed
- popl_cfi %ecx
- ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_down_write_failed)
-
-ENTRY(call_rwsem_wake)
- CFI_STARTPROC
- decw %dx /* do nothing if still outstanding active readers */
- jnz 1f
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx,0
- call rwsem_wake
- popl_cfi %ecx
-1: ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_wake)
-
-/* Fix up special calling conventions */
-ENTRY(call_rwsem_downgrade_wake)
- CFI_STARTPROC
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx,0
- pushl_cfi %edx
- CFI_REL_OFFSET edx,0
- call rwsem_downgrade_wake
- popl_cfi %edx
- popl_cfi %ecx
- ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_downgrade_wake)
-
-#endif
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index 782b082c9ff..a63efd6bb6a 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -5,50 +5,41 @@
* Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
* Subject to the GNU public license, v.2. No warranty of any kind.
*/
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
- #include <linux/linkage.h>
- #include <asm/dwarf2.h>
- #include <asm/calling.h>
- #include <asm/rwlock.h>
-
- /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
- .macro thunk name,func
- .globl \name
-\name:
- CFI_STARTPROC
- SAVE_ARGS
- call \func
- jmp restore
- CFI_ENDPROC
- .endm
-
-#ifdef CONFIG_TRACE_IRQFLAGS
- /* put return address in rdi (arg1) */
- .macro thunk_ra name,func
+ /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
+ .macro THUNK name, func, put_ret_addr_in_rdi=0
.globl \name
\name:
CFI_STARTPROC
+
+ /* this one pushes 9 elems, the next one would be %rIP */
SAVE_ARGS
- /* SAVE_ARGS pushs 9 elements */
- /* the next element would be the rip */
- movq 9*8(%rsp), %rdi
+
+ .if \put_ret_addr_in_rdi
+ movq_cfi_restore 9*8, rdi
+ .endif
+
call \func
jmp restore
CFI_ENDPROC
.endm
- thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
- thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
+#ifdef CONFIG_TRACE_IRQFLAGS
+ THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
+ THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
- thunk lockdep_sys_exit_thunk,lockdep_sys_exit
+ THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
#endif
-
+
/* SAVE_ARGS below is used only for the .cfi directives it contains. */
CFI_STARTPROC
SAVE_ARGS
restore:
RESTORE_ARGS
- ret
+ ret
CFI_ENDPROC
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
new file mode 100644
index 00000000000..97be9cb5448
--- /dev/null
+++ b/arch/x86/lib/usercopy.c
@@ -0,0 +1,43 @@
+/*
+ * User address space access functions.
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/highmem.h>
+#include <linux/module.h>
+
+/*
+ * best effort, GUP based copy_from_user() that is NMI-safe
+ */
+unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+ unsigned long offset, addr = (unsigned long)from;
+ unsigned long size, len = 0;
+ struct page *page;
+ void *map;
+ int ret;
+
+ do {
+ ret = __get_user_pages_fast(addr, 1, 0, &page);
+ if (!ret)
+ break;
+
+ offset = addr & (PAGE_SIZE - 1);
+ size = min(PAGE_SIZE - offset, n - len);
+
+ map = kmap_atomic(page);
+ memcpy(to, map+offset, size);
+ kunmap_atomic(map);
+ put_page(page);
+
+ len += size;
+ to += size;
+ addr += size;
+
+ } while (len < n);
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nmi);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 3e608edf995..3d11327c9ab 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,8 +23,8 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
-obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
-obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
+obj-$(CONFIG_AMD_NUMA) += amdtopology.o
+obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology.c
index 0919c26820d..5247d01329c 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology.c
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
+#include <linux/bootmem.h>
#include <asm/io.h>
#include <linux/pci_ids.h>
@@ -69,10 +70,10 @@ static __init void early_get_boot_cpu_id(void)
int __init amd_numa_init(void)
{
- unsigned long start = PFN_PHYS(0);
- unsigned long end = PFN_PHYS(max_pfn);
+ u64 start = PFN_PHYS(0);
+ u64 end = PFN_PHYS(max_pfn);
unsigned numnodes;
- unsigned long prevbase;
+ u64 prevbase;
int i, j, nb;
u32 nodeid, reg;
unsigned int bits, cores, apicid_base;
@@ -95,7 +96,7 @@ int __init amd_numa_init(void)
prevbase = 0;
for (i = 0; i < 8; i++) {
- unsigned long base, limit;
+ u64 base, limit;
base = read_pci_config(0, nb, 1, 0x40 + i*8);
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -107,18 +108,18 @@ int __init amd_numa_init(void)
continue;
}
if (nodeid >= numnodes) {
- pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+ pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
base, limit);
continue;
}
if (!limit) {
- pr_info("Skipping node entry %d (base %lx)\n",
+ pr_info("Skipping node entry %d (base %Lx)\n",
i, base);
continue;
}
if ((base >> 8) & 3 || (limit >> 8) & 3) {
- pr_err("Node %d using interleaving mode %lx/%lx\n",
+ pr_err("Node %d using interleaving mode %Lx/%Lx\n",
nodeid, (base >> 8) & 3, (limit >> 8) & 3);
return -EINVAL;
}
@@ -150,19 +151,19 @@ int __init amd_numa_init(void)
continue;
}
if (limit < base) {
- pr_err("Node %d bogus settings %lx-%lx.\n",
+ pr_err("Node %d bogus settings %Lx-%Lx.\n",
nodeid, base, limit);
continue;
}
/* Could sort here, but pun for now. Should not happen anyroads. */
if (prevbase > base) {
- pr_err("Node map not sorted %lx,%lx\n",
+ pr_err("Node map not sorted %Lx,%Lx\n",
prevbase, base);
return -EINVAL;
}
- pr_info("Node %d MemBase %016lx Limit %016lx\n",
+ pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
nodeid, base, limit);
prevbase = base;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 20e3f8702d1..4d09df054e3 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -12,6 +12,7 @@
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
+#include <linux/prefetch.h> /* prefetchw */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -822,16 +823,30 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
force_sig_info_fault(SIGBUS, code, address, tsk, fault);
}
-static noinline void
+static noinline int
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, unsigned int fault)
{
+ /*
+ * Pagefault was interrupted by SIGKILL. We have no reason to
+ * continue pagefault.
+ */
+ if (fatal_signal_pending(current)) {
+ if (!(fault & VM_FAULT_RETRY))
+ up_read(&current->mm->mmap_sem);
+ if (!(error_code & PF_USER))
+ no_context(regs, error_code, address);
+ return 1;
+ }
+ if (!(fault & VM_FAULT_ERROR))
+ return 0;
+
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
up_read(&current->mm->mmap_sem);
no_context(regs, error_code, address);
- return;
+ return 1;
}
out_of_memory(regs, error_code, address);
@@ -842,6 +857,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
else
BUG();
}
+ return 1;
}
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -964,7 +980,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
struct mm_struct *mm;
int fault;
int write = error_code & PF_WRITE;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0);
tsk = current;
@@ -1043,7 +1059,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
/*
* If we're in an interrupt, have no user context or are running
@@ -1132,9 +1148,9 @@ good_area:
*/
fault = handle_mm_fault(mm, vma, address, flags);
- if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, error_code, address, fault);
- return;
+ if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
+ if (mm_fault_error(regs, error_code, address, fault))
+ return;
}
/*
@@ -1145,11 +1161,11 @@ good_area:
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
regs, address);
} else {
tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
regs, address);
}
if (fault & VM_FAULT_RETRY) {
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index d4203988504..f581a18c0d4 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -72,7 +72,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return;
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -97,7 +97,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
put_page(virt_to_page(spte));
spin_unlock(&mm->page_table_lock);
out:
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
/*
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 37b8b0fe832..30326443ab8 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -16,8 +16,6 @@
#include <asm/tlb.h>
#include <asm/proto.h>
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
unsigned long __initdata pgt_buf_start;
unsigned long __meminitdata pgt_buf_end;
unsigned long __meminitdata pgt_buf_top;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 80088f99419..29f7c6d9817 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -678,8 +678,10 @@ static void __init zone_sizes_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+#ifdef CONFIG_ZONE_DMA
max_zone_pfns[ZONE_DMA] =
virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+#endif
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
@@ -716,6 +718,7 @@ void __init paging_init(void)
* NOTE: at this point the bootmem allocator is fully available.
*/
olpc_dt_build_devicetree();
+ sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
zone_sizes_init();
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 79423358728..bbaaa005bf0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -28,6 +28,7 @@
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
+#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>
#include <linux/gfp.h>
@@ -616,7 +617,9 @@ void __init paging_init(void)
unsigned long max_zone_pfns[MAX_NR_ZONES];
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+#ifdef CONFIG_ZONE_DMA
max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+#endif
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
max_zone_pfns[ZONE_NORMAL] = max_pfn;
@@ -679,14 +682,6 @@ int arch_add_memory(int nid, u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(arch_add_memory);
-#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
-int memory_add_physaddr_to_nid(u64 start)
-{
- return 0;
-}
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-#endif
-
#endif /* CONFIG_MEMORY_HOTPLUG */
static struct kcore_list kcore_vsyscall;
@@ -901,8 +896,6 @@ const char *arch_vma_name(struct vm_area_struct *vma)
}
#ifdef CONFIG_X86_UV
-#define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS)
-
unsigned long memory_block_size_bytes(void)
{
if (is_uv_system()) {
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 0369843511d..be1ef574ce9 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -91,13 +91,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
return (__force void __iomem *)phys_to_virt(phys_addr);
/*
- * Check if the request spans more than any BAR in the iomem resource
- * tree.
- */
- WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
- KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
-
- /*
* Don't allow anybody to remap normal RAM that we're using..
*/
last_pfn = last_addr >> PAGE_SHIFT;
@@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
ret_addr = (void __iomem *) (vaddr + offset);
mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
+ /*
+ * Check if the request spans more than any BAR in the iomem resource
+ * tree.
+ */
+ WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
+ KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
+
return ret_addr;
err_free_area:
free_vm_area(area);
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 704a37ceddd..dab41876cdd 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
e->trace.entries = e->trace_entries;
e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
e->trace.skip = 0;
- save_stack_trace_regs(&e->trace, regs);
+ save_stack_trace_regs(regs, &e->trace);
/* Round address down to nearest 16 bytes */
shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c
index aa1169392b8..992da5ec5a6 100644
--- a/arch/x86/mm/memblock.c
+++ b/arch/x86/mm/memblock.c
@@ -8,7 +8,7 @@
#include <linux/range.h>
/* Check for already reserved areas */
-static bool __init check_with_memblock_reserved_size(u64 *addrp, u64 *sizep, u64 align)
+bool __init memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align)
{
struct memblock_region *r;
u64 addr = *addrp, last;
@@ -59,7 +59,7 @@ u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align)
if (addr >= ei_last)
continue;
*sizep = ei_last - addr;
- while (check_with_memblock_reserved_size(&addr, sizep, align))
+ while (memblock_x86_check_reserved_size(&addr, sizep, align))
;
if (*sizep)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 745258dfc4d..fbeaaf41661 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -1,11 +1,39 @@
/* Common code for 32 and 64-bit NUMA */
-#include <linux/topology.h>
-#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/init.h>
#include <linux/bootmem.h>
-#include <asm/numa.h>
+#include <linux/memblock.h>
+#include <linux/mmzone.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/sched.h>
+#include <linux/topology.h>
+
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/dma.h>
#include <asm/acpi.h>
+#include <asm/amd_nb.h>
+
+#include "numa_internal.h"
int __initdata numa_off;
+nodemask_t numa_nodes_parsed __initdata;
+
+struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_data);
+
+static struct numa_meminfo numa_meminfo
+#ifndef CONFIG_MEMORY_HOTPLUG
+__initdata
+#endif
+;
+
+static int numa_distance_cnt;
+static u8 *numa_distance;
static __init int numa_setup(char *opt)
{
@@ -32,6 +60,15 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
+int __cpuinit numa_cpu_node(int cpu)
+{
+ int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+
+ if (apicid != BAD_APICID)
+ return __apicid_to_node[apicid];
+ return NUMA_NO_NODE;
+}
+
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);
@@ -95,6 +132,422 @@ void __init setup_node_to_cpumask_map(void)
pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
}
+static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+ struct numa_meminfo *mi)
+{
+ /* ignore zero length blks */
+ if (start == end)
+ return 0;
+
+ /* whine about and ignore invalid blks */
+ if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
+ pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
+ nid, start, end);
+ return 0;
+ }
+
+ if (mi->nr_blks >= NR_NODE_MEMBLKS) {
+ pr_err("NUMA: too many memblk ranges\n");
+ return -EINVAL;
+ }
+
+ mi->blk[mi->nr_blks].start = start;
+ mi->blk[mi->nr_blks].end = end;
+ mi->blk[mi->nr_blks].nid = nid;
+ mi->nr_blks++;
+ return 0;
+}
+
+/**
+ * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
+ * @idx: Index of memblk to remove
+ * @mi: numa_meminfo to remove memblk from
+ *
+ * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
+ * decrementing @mi->nr_blks.
+ */
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
+{
+ mi->nr_blks--;
+ memmove(&mi->blk[idx], &mi->blk[idx + 1],
+ (mi->nr_blks - idx) * sizeof(mi->blk[0]));
+}
+
+/**
+ * numa_add_memblk - Add one numa_memblk to numa_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the default numa_meminfo.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_memblk(int nid, u64 start, u64 end)
+{
+ return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+}
+
+/* Initialize NODE_DATA for a node on the local memory */
+static void __init setup_node_data(int nid, u64 start, u64 end)
+{
+ const u64 nd_low = PFN_PHYS(MAX_DMA_PFN);
+ const u64 nd_high = PFN_PHYS(max_pfn_mapped);
+ const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+ bool remapped = false;
+ u64 nd_pa;
+ void *nd;
+ int tnid;
+
+ /*
+ * Don't confuse VM with a node that doesn't have the
+ * minimum amount of memory:
+ */
+ if (end && (end - start) < NODE_MIN_SIZE)
+ return;
+
+ /* initialize remap allocator before aligning to ZONE_ALIGN */
+ init_alloc_remap(nid, start, end);
+
+ start = roundup(start, ZONE_ALIGN);
+
+ printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
+ nid, start, end);
+
+ /*
+ * Allocate node data. Try remap allocator first, node-local
+ * memory and then any node. Never allocate in DMA zone.
+ */
+ nd = alloc_remap(nid, nd_size);
+ if (nd) {
+ nd_pa = __pa(nd);
+ remapped = true;
+ } else {
+ nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high,
+ nd_size, SMP_CACHE_BYTES);
+ if (nd_pa == MEMBLOCK_ERROR)
+ nd_pa = memblock_find_in_range(nd_low, nd_high,
+ nd_size, SMP_CACHE_BYTES);
+ if (nd_pa == MEMBLOCK_ERROR) {
+ pr_err("Cannot find %zu bytes in node %d\n",
+ nd_size, nid);
+ return;
+ }
+ memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
+ nd = __va(nd_pa);
+ }
+
+ /* report and initialize */
+ printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n",
+ nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
+ tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
+ if (!remapped && tnid != nid)
+ printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
+
+ node_data[nid] = nd;
+ memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+ NODE_DATA(nid)->node_id = nid;
+ NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
+ NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
+
+ node_set_online(nid);
+}
+
+/**
+ * numa_cleanup_meminfo - Cleanup a numa_meminfo
+ * @mi: numa_meminfo to clean up
+ *
+ * Sanitize @mi by merging and removing unncessary memblks. Also check for
+ * conflicts and clear unused memblks.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
+{
+ const u64 low = 0;
+ const u64 high = PFN_PHYS(max_pfn);
+ int i, j, k;
+
+ /* first, trim all entries */
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *bi = &mi->blk[i];
+
+ /* make sure all blocks are inside the limits */
+ bi->start = max(bi->start, low);
+ bi->end = min(bi->end, high);
+
+ /* and there's no empty block */
+ if (bi->start >= bi->end)
+ numa_remove_memblk_from(i--, mi);
+ }
+
+ /* merge neighboring / overlapping entries */
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *bi = &mi->blk[i];
+
+ for (j = i + 1; j < mi->nr_blks; j++) {
+ struct numa_memblk *bj = &mi->blk[j];
+ u64 start, end;
+
+ /*
+ * See whether there are overlapping blocks. Whine
+ * about but allow overlaps of the same nid. They
+ * will be merged below.
+ */
+ if (bi->end > bj->start && bi->start < bj->end) {
+ if (bi->nid != bj->nid) {
+ pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
+ bi->nid, bi->start, bi->end,
+ bj->nid, bj->start, bj->end);
+ return -EINVAL;
+ }
+ pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
+ bi->nid, bi->start, bi->end,
+ bj->start, bj->end);
+ }
+
+ /*
+ * Join together blocks on the same node, holes
+ * between which don't overlap with memory on other
+ * nodes.
+ */
+ if (bi->nid != bj->nid)
+ continue;
+ start = min(bi->start, bj->start);
+ end = max(bi->end, bj->end);
+ for (k = 0; k < mi->nr_blks; k++) {
+ struct numa_memblk *bk = &mi->blk[k];
+
+ if (bi->nid == bk->nid)
+ continue;
+ if (start < bk->end && end > bk->start)
+ break;
+ }
+ if (k < mi->nr_blks)
+ continue;
+ printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n",
+ bi->nid, bi->start, bi->end, bj->start, bj->end,
+ start, end);
+ bi->start = start;
+ bi->end = end;
+ numa_remove_memblk_from(j--, mi);
+ }
+ }
+
+ /* clear unused ones */
+ for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
+ mi->blk[i].start = mi->blk[i].end = 0;
+ mi->blk[i].nid = NUMA_NO_NODE;
+ }
+
+ return 0;
+}
+
+/*
+ * Set nodes, which have memory in @mi, in *@nodemask.
+ */
+static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
+ const struct numa_meminfo *mi)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
+ if (mi->blk[i].start != mi->blk[i].end &&
+ mi->blk[i].nid != NUMA_NO_NODE)
+ node_set(mi->blk[i].nid, *nodemask);
+}
+
+/**
+ * numa_reset_distance - Reset NUMA distance table
+ *
+ * The current table is freed. The next numa_set_distance() call will
+ * create a new one.
+ */
+void __init numa_reset_distance(void)
+{
+ size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
+
+ /* numa_distance could be 1LU marking allocation failure, test cnt */
+ if (numa_distance_cnt)
+ memblock_x86_free_range(__pa(numa_distance),
+ __pa(numa_distance) + size);
+ numa_distance_cnt = 0;
+ numa_distance = NULL; /* enable table creation */
+}
+
+static int __init numa_alloc_distance(void)
+{
+ nodemask_t nodes_parsed;
+ size_t size;
+ int i, j, cnt = 0;
+ u64 phys;
+
+ /* size the new table and allocate it */
+ nodes_parsed = numa_nodes_parsed;
+ numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
+
+ for_each_node_mask(i, nodes_parsed)
+ cnt = i;
+ cnt++;
+ size = cnt * cnt * sizeof(numa_distance[0]);
+
+ phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+ size, PAGE_SIZE);
+ if (phys == MEMBLOCK_ERROR) {
+ pr_warning("NUMA: Warning: can't allocate distance table!\n");
+ /* don't retry until explicitly reset */
+ numa_distance = (void *)1LU;
+ return -ENOMEM;
+ }
+ memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
+
+ numa_distance = __va(phys);
+ numa_distance_cnt = cnt;
+
+ /* fill with the default distances */
+ for (i = 0; i < cnt; i++)
+ for (j = 0; j < cnt; j++)
+ numa_distance[i * cnt + j] = i == j ?
+ LOCAL_DISTANCE : REMOTE_DISTANCE;
+ printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
+
+ return 0;
+}
+
+/**
+ * numa_set_distance - Set NUMA distance from one NUMA to another
+ * @from: the 'from' node to set distance
+ * @to: the 'to' node to set distance
+ * @distance: NUMA distance
+ *
+ * Set the distance from node @from to @to to @distance. If distance table
+ * doesn't exist, one which is large enough to accommodate all the currently
+ * known nodes will be created.
+ *
+ * If such table cannot be allocated, a warning is printed and further
+ * calls are ignored until the distance table is reset with
+ * numa_reset_distance().
+ *
+ * If @from or @to is higher than the highest known node at the time of
+ * table creation or @distance doesn't make sense, the call is ignored.
+ * This is to allow simplification of specific NUMA config implementations.
+ */
+void __init numa_set_distance(int from, int to, int distance)
+{
+ if (!numa_distance && numa_alloc_distance() < 0)
+ return;
+
+ if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
+ printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
+ from, to, distance);
+ return;
+ }
+
+ if ((u8)distance != distance ||
+ (from == to && distance != LOCAL_DISTANCE)) {
+ pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+ from, to, distance);
+ return;
+ }
+
+ numa_distance[from * numa_distance_cnt + to] = distance;
+}
+
+int __node_distance(int from, int to)
+{
+ if (from >= numa_distance_cnt || to >= numa_distance_cnt)
+ return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+ return numa_distance[from * numa_distance_cnt + to];
+}
+EXPORT_SYMBOL(__node_distance);
+
+/*
+ * Sanity check to catch more bad NUMA configurations (they are amazingly
+ * common). Make sure the nodes cover all memory.
+ */
+static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
+{
+ u64 numaram, e820ram;
+ int i;
+
+ numaram = 0;
+ for (i = 0; i < mi->nr_blks; i++) {
+ u64 s = mi->blk[i].start >> PAGE_SHIFT;
+ u64 e = mi->blk[i].end >> PAGE_SHIFT;
+ numaram += e - s;
+ numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
+ if ((s64)numaram < 0)
+ numaram = 0;
+ }
+
+ e820ram = max_pfn - (memblock_x86_hole_size(0,
+ PFN_PHYS(max_pfn)) >> PAGE_SHIFT);
+ /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+ if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
+ printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
+ (numaram << PAGE_SHIFT) >> 20,
+ (e820ram << PAGE_SHIFT) >> 20);
+ return false;
+ }
+ return true;
+}
+
+static int __init numa_register_memblks(struct numa_meminfo *mi)
+{
+ unsigned long uninitialized_var(pfn_align);
+ int i, nid;
+
+ /* Account for nodes with cpus and no memory */
+ node_possible_map = numa_nodes_parsed;
+ numa_nodemask_from_meminfo(&node_possible_map, mi);
+ if (WARN_ON(nodes_empty(node_possible_map)))
+ return -EINVAL;
+
+ for (i = 0; i < mi->nr_blks; i++)
+ memblock_x86_register_active_regions(mi->blk[i].nid,
+ mi->blk[i].start >> PAGE_SHIFT,
+ mi->blk[i].end >> PAGE_SHIFT);
+
+ /* for out of order entries */
+ sort_node_map();
+
+ /*
+ * If sections array is gonna be used for pfn -> nid mapping, check
+ * whether its granularity is fine enough.
+ */
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+ pfn_align = node_map_pfn_alignment();
+ if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+ printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
+ PFN_PHYS(pfn_align) >> 20,
+ PFN_PHYS(PAGES_PER_SECTION) >> 20);
+ return -EINVAL;
+ }
+#endif
+ if (!numa_meminfo_cover_memory(mi))
+ return -EINVAL;
+
+ /* Finally register nodes. */
+ for_each_node_mask(nid, node_possible_map) {
+ u64 start = PFN_PHYS(max_pfn);
+ u64 end = 0;
+
+ for (i = 0; i < mi->nr_blks; i++) {
+ if (nid != mi->blk[i].nid)
+ continue;
+ start = min(mi->blk[i].start, start);
+ end = max(mi->blk[i].end, end);
+ }
+
+ if (start < end)
+ setup_node_data(nid, start, end);
+ }
+
+ return 0;
+}
+
/*
* There are unfortunately some poorly designed mainboards around that
* only connect memory to a single CPU. This breaks the 1:1 cpu->node
@@ -102,7 +555,7 @@ void __init setup_node_to_cpumask_map(void)
* as the number of CPUs is not known yet. We round robin the existing
* nodes.
*/
-void __init numa_init_array(void)
+static void __init numa_init_array(void)
{
int rr, i;
@@ -117,6 +570,95 @@ void __init numa_init_array(void)
}
}
+static int __init numa_init(int (*init_func)(void))
+{
+ int i;
+ int ret;
+
+ for (i = 0; i < MAX_LOCAL_APIC; i++)
+ set_apicid_to_node(i, NUMA_NO_NODE);
+
+ nodes_clear(numa_nodes_parsed);
+ nodes_clear(node_possible_map);
+ nodes_clear(node_online_map);
+ memset(&numa_meminfo, 0, sizeof(numa_meminfo));
+ remove_all_active_ranges();
+ numa_reset_distance();
+
+ ret = init_func();
+ if (ret < 0)
+ return ret;
+ ret = numa_cleanup_meminfo(&numa_meminfo);
+ if (ret < 0)
+ return ret;
+
+ numa_emulation(&numa_meminfo, numa_distance_cnt);
+
+ ret = numa_register_memblks(&numa_meminfo);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < nr_cpu_ids; i++) {
+ int nid = early_cpu_to_node(i);
+
+ if (nid == NUMA_NO_NODE)
+ continue;
+ if (!node_online(nid))
+ numa_clear_node(i);
+ }
+ numa_init_array();
+ return 0;
+}
+
+/**
+ * dummy_numa_init - Fallback dummy NUMA init
+ *
+ * Used if there's no underlying NUMA architecture, NUMA initialization
+ * fails, or NUMA is disabled on the command line.
+ *
+ * Must online at least one node and add memory blocks that cover all
+ * allowed memory. This function must not fail.
+ */
+static int __init dummy_numa_init(void)
+{
+ printk(KERN_INFO "%s\n",
+ numa_off ? "NUMA turned off" : "No NUMA configuration found");
+ printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
+ 0LLU, PFN_PHYS(max_pfn));
+
+ node_set(0, numa_nodes_parsed);
+ numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
+
+ return 0;
+}
+
+/**
+ * x86_numa_init - Initialize NUMA
+ *
+ * Try each configured NUMA initialization method until one succeeds. The
+ * last fallback is dummy single node config encomapssing whole memory and
+ * never fails.
+ */
+void __init x86_numa_init(void)
+{
+ if (!numa_off) {
+#ifdef CONFIG_X86_NUMAQ
+ if (!numa_init(numaq_numa_init))
+ return;
+#endif
+#ifdef CONFIG_ACPI_NUMA
+ if (!numa_init(x86_acpi_numa_init))
+ return;
+#endif
+#ifdef CONFIG_AMD_NUMA
+ if (!numa_init(amd_numa_init))
+ return;
+#endif
+ }
+
+ numa_init(dummy_numa_init);
+}
+
static __init int find_near_online_node(int node)
{
int n, val;
@@ -282,3 +824,18 @@ const struct cpumask *cpumask_of_node(int node)
EXPORT_SYMBOL(cpumask_of_node);
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int memory_add_physaddr_to_nid(u64 start)
+{
+ struct numa_meminfo *mi = &numa_meminfo;
+ int nid = mi->blk[0].nid;
+ int i;
+
+ for (i = 0; i < mi->nr_blks; i++)
+ if (mi->blk[i].start <= start && mi->blk[i].end > start)
+ nid = mi->blk[i].nid;
+ return nid;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index bde3906420d..3adebe7e536 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -22,39 +22,11 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
-#include <linux/mmzone.h>
-#include <linux/highmem.h>
-#include <linux/initrd.h>
-#include <linux/nodemask.h>
#include <linux/module.h>
-#include <linux/kexec.h>
-#include <linux/pfn.h>
-#include <linux/swap.h>
-#include <linux/acpi.h>
-
-#include <asm/e820.h>
-#include <asm/setup.h>
-#include <asm/mmzone.h>
-#include <asm/bios_ebda.h>
-#include <asm/proto.h>
-
-struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
-EXPORT_SYMBOL(node_data);
-
-/*
- * numa interface - we expect the numa architecture specific code to have
- * populated the following initialisation.
- *
- * 1) node_online_map - the map of all nodes configured (online) in the system
- * 2) node_start_pfn - the starting page frame number for a node
- * 3) node_end_pfn - the ending page fram number for a node
- */
-unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
-unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
+#include "numa_internal.h"
#ifdef CONFIG_DISCONTIGMEM
/*
@@ -69,7 +41,7 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
* physnode_map[16-31] = 1;
* physnode_map[32- ] = -1;
*/
-s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
+s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1};
EXPORT_SYMBOL(physnode_map);
void memory_present(int nid, unsigned long start, unsigned long end)
@@ -80,8 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
nid, start, end);
printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
printk(KERN_DEBUG " ");
- for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
- physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+ physnode_map[pfn / PAGES_PER_SECTION] = nid;
printk(KERN_CONT "%lx ", pfn);
}
printk(KERN_CONT "\n");
@@ -99,108 +71,46 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
}
#endif
-extern unsigned long find_max_low_pfn(void);
extern unsigned long highend_pfn, highstart_pfn;
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-unsigned long node_remap_size[MAX_NUMNODES];
static void *node_remap_start_vaddr[MAX_NUMNODES];
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-static unsigned long kva_start_pfn;
-static unsigned long kva_pages;
-
-int __cpuinit numa_cpu_node(int cpu)
-{
- return apic->x86_32_numa_cpu_node(cpu);
-}
-
-/*
- * FLAT - support for basic PC memory model with discontig enabled, essentially
- * a single node with all available processors in it with a flat
- * memory map.
- */
-int __init get_memcfg_numa_flat(void)
-{
- printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
-
- node_start_pfn[0] = 0;
- node_end_pfn[0] = max_pfn;
- memblock_x86_register_active_regions(0, 0, max_pfn);
- memory_present(0, 0, max_pfn);
- node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
-
- /* Indicate there is one node available. */
- nodes_clear(node_online_map);
- node_set_online(0);
- return 1;
-}
-
-/*
- * Find the highest page frame number we have available for the node
- */
-static void __init propagate_e820_map_node(int nid)
-{
- if (node_end_pfn[nid] > max_pfn)
- node_end_pfn[nid] = max_pfn;
- /*
- * if a user has given mem=XXXX, then we need to make sure
- * that the node _starts_ before that, too, not just ends
- */
- if (node_start_pfn[nid] > max_pfn)
- node_start_pfn[nid] = max_pfn;
- BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
-}
-
-/*
- * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
- * method. For node zero take this from the bottom of memory, for
- * subsequent nodes place them at node_remap_start_vaddr which contains
- * node local data in physically node local memory. See setup_memory()
- * for details.
- */
-static void __init allocate_pgdat(int nid)
-{
- char buf[16];
-
- if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
- NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
- else {
- unsigned long pgdat_phys;
- pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
- max_pfn_mapped<<PAGE_SHIFT,
- sizeof(pg_data_t),
- PAGE_SIZE);
- NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
- memset(buf, 0, sizeof(buf));
- sprintf(buf, "NODE_DATA %d", nid);
- memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
- }
- printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
- nid, (unsigned long)NODE_DATA(nid));
-}
-
/*
- * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
- * virtual address space (KVA) is reserved and portions of nodes are mapped
- * using it. This is to allow node-local memory to be allocated for
- * structures that would normally require ZONE_NORMAL. The memory is
- * allocated with alloc_remap() and callers should be prepared to allocate
- * from the bootmem allocator instead.
+ * Remap memory allocator
*/
static unsigned long node_remap_start_pfn[MAX_NUMNODES];
static void *node_remap_end_vaddr[MAX_NUMNODES];
static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-static unsigned long node_remap_offset[MAX_NUMNODES];
+/**
+ * alloc_remap - Allocate remapped memory
+ * @nid: NUMA node to allocate memory from
+ * @size: The size of allocation
+ *
+ * Allocate @size bytes from the remap area of NUMA node @nid. The
+ * size of the remap area is predetermined by init_alloc_remap() and
+ * only the callers considered there should call this function. For
+ * more info, please read the comment on top of init_alloc_remap().
+ *
+ * The caller must be ready to handle allocation failure from this
+ * function and fall back to regular memory allocator in such cases.
+ *
+ * CONTEXT:
+ * Single CPU early boot context.
+ *
+ * RETURNS:
+ * Pointer to the allocated memory on success, %NULL on failure.
+ */
void *alloc_remap(int nid, unsigned long size)
{
void *allocation = node_remap_alloc_vaddr[nid];
size = ALIGN(size, L1_CACHE_BYTES);
- if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
+ if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
return NULL;
node_remap_alloc_vaddr[nid] += size;
@@ -209,26 +119,6 @@ void *alloc_remap(int nid, unsigned long size)
return allocation;
}
-static void __init remap_numa_kva(void)
-{
- void *vaddr;
- unsigned long pfn;
- int node;
-
- for_each_online_node(node) {
- printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
- for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
- vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
- printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
- (unsigned long)vaddr,
- node_remap_start_pfn[node] + pfn);
- set_pmd_pfn((ulong) vaddr,
- node_remap_start_pfn[node] + pfn,
- PAGE_KERNEL_LARGE);
- }
- }
-}
-
#ifdef CONFIG_HIBERNATION
/**
* resume_map_numa_kva - add KVA mapping to the temporary page tables created
@@ -240,15 +130,16 @@ void resume_map_numa_kva(pgd_t *pgd_base)
int node;
for_each_online_node(node) {
- unsigned long start_va, start_pfn, size, pfn;
+ unsigned long start_va, start_pfn, nr_pages, pfn;
start_va = (unsigned long)node_remap_start_vaddr[node];
start_pfn = node_remap_start_pfn[node];
- size = node_remap_size[node];
+ nr_pages = (node_remap_end_vaddr[node] -
+ node_remap_start_vaddr[node]) >> PAGE_SHIFT;
printk(KERN_DEBUG "%s: node %d\n", __func__, node);
- for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) {
+ for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
pgd_t *pgd = pgd_base + pgd_index(vaddr);
pud_t *pud = pud_offset(pgd, vaddr);
@@ -264,132 +155,89 @@ void resume_map_numa_kva(pgd_t *pgd_base)
}
#endif
-static __init unsigned long calculate_numa_remap_pages(void)
+/**
+ * init_alloc_remap - Initialize remap allocator for a NUMA node
+ * @nid: NUMA node to initizlie remap allocator for
+ *
+ * NUMA nodes may end up without any lowmem. As allocating pgdat and
+ * memmap on a different node with lowmem is inefficient, a special
+ * remap allocator is implemented which can be used by alloc_remap().
+ *
+ * For each node, the amount of memory which will be necessary for
+ * pgdat and memmap is calculated and two memory areas of the size are
+ * allocated - one in the node and the other in lowmem; then, the area
+ * in the node is remapped to the lowmem area.
+ *
+ * As pgdat and memmap must be allocated in lowmem anyway, this
+ * doesn't waste lowmem address space; however, the actual lowmem
+ * which gets remapped over is wasted. The amount shouldn't be
+ * problematic on machines this feature will be used.
+ *
+ * Initialization failure isn't fatal. alloc_remap() is used
+ * opportunistically and the callers will fall back to other memory
+ * allocation mechanisms on failure.
+ */
+void __init init_alloc_remap(int nid, u64 start, u64 end)
{
- int nid;
- unsigned long size, reserve_pages = 0;
-
- for_each_online_node(nid) {
- u64 node_kva_target;
- u64 node_kva_final;
-
- /*
- * The acpi/srat node info can show hot-add memroy zones
- * where memory could be added but not currently present.
- */
- printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
- nid, node_start_pfn[nid], node_end_pfn[nid]);
- if (node_start_pfn[nid] > max_pfn)
- continue;
- if (!node_end_pfn[nid])
- continue;
- if (node_end_pfn[nid] > max_pfn)
- node_end_pfn[nid] = max_pfn;
-
- /* ensure the remap includes space for the pgdat. */
- size = node_remap_size[nid] + sizeof(pg_data_t);
-
- /* convert size to large (pmd size) pages, rounding up */
- size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
- /* now the roundup is correct, convert to PAGE_SIZE pages */
- size = size * PTRS_PER_PTE;
-
- node_kva_target = round_down(node_end_pfn[nid] - size,
- PTRS_PER_PTE);
- node_kva_target <<= PAGE_SHIFT;
- do {
- node_kva_final = memblock_find_in_range(node_kva_target,
- ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
- ((u64)size)<<PAGE_SHIFT,
- LARGE_PAGE_BYTES);
- node_kva_target -= LARGE_PAGE_BYTES;
- } while (node_kva_final == MEMBLOCK_ERROR &&
- (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
-
- if (node_kva_final == MEMBLOCK_ERROR)
- panic("Can not get kva ram\n");
-
- node_remap_size[nid] = size;
- node_remap_offset[nid] = reserve_pages;
- reserve_pages += size;
- printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
- " node %d at %llx\n",
- size, nid, node_kva_final>>PAGE_SHIFT);
-
- /*
- * prevent kva address below max_low_pfn want it on system
- * with less memory later.
- * layout will be: KVA address , KVA RAM
- *
- * we are supposed to only record the one less then max_low_pfn
- * but we could have some hole in high memory, and it will only
- * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
- * to use it as free.
- * So memblock_x86_reserve_range here, hope we don't run out of that array
- */
- memblock_x86_reserve_range(node_kva_final,
- node_kva_final+(((u64)size)<<PAGE_SHIFT),
- "KVA RAM");
-
- node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
- }
- printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
- reserve_pages);
- return reserve_pages;
-}
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long end_pfn = end >> PAGE_SHIFT;
+ unsigned long size, pfn;
+ u64 node_pa, remap_pa;
+ void *remap_va;
-static void init_remap_allocator(int nid)
-{
- node_remap_start_vaddr[nid] = pfn_to_kaddr(
- kva_start_pfn + node_remap_offset[nid]);
- node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
- (node_remap_size[nid] * PAGE_SIZE);
- node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
- ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-
- printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
- (ulong) node_remap_start_vaddr[nid],
- (ulong) node_remap_end_vaddr[nid]);
+ /*
+ * The acpi/srat node info can show hot-add memroy zones where
+ * memory could be added but not currently present.
+ */
+ printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
+ nid, start_pfn, end_pfn);
+
+ /* calculate the necessary space aligned to large page size */
+ size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
+ size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+ size = ALIGN(size, LARGE_PAGE_BYTES);
+
+ /* allocate node memory and the lowmem remap area */
+ node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
+ if (node_pa == MEMBLOCK_ERROR) {
+ pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
+ size, nid);
+ return;
+ }
+ memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
+
+ remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
+ max_low_pfn << PAGE_SHIFT,
+ size, LARGE_PAGE_BYTES);
+ if (remap_pa == MEMBLOCK_ERROR) {
+ pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
+ size, nid);
+ memblock_x86_free_range(node_pa, node_pa + size);
+ return;
+ }
+ memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
+ remap_va = phys_to_virt(remap_pa);
+
+ /* perform actual remap */
+ for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
+ set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
+ (node_pa >> PAGE_SHIFT) + pfn,
+ PAGE_KERNEL_LARGE);
+
+ /* initialize remap allocator parameters */
+ node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
+ node_remap_start_vaddr[nid] = remap_va;
+ node_remap_end_vaddr[nid] = remap_va + size;
+ node_remap_alloc_vaddr[nid] = remap_va;
+
+ printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
+ nid, node_pa, node_pa + size, remap_va, remap_va + size);
}
void __init initmem_init(void)
{
- int nid;
- long kva_target_pfn;
-
- /*
- * When mapping a NUMA machine we allocate the node_mem_map arrays
- * from node local memory. They are then mapped directly into KVA
- * between zone normal and vmalloc space. Calculate the size of
- * this space and use it to adjust the boundary between ZONE_NORMAL
- * and ZONE_HIGHMEM.
- */
-
- get_memcfg_numa();
- numa_init_array();
-
- kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
+ x86_numa_init();
- kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
- do {
- kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT,
- max_low_pfn<<PAGE_SHIFT,
- kva_pages<<PAGE_SHIFT,
- PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
- kva_target_pfn -= PTRS_PER_PTE;
- } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn);
-
- if (kva_start_pfn == MEMBLOCK_ERROR)
- panic("Can not get kva space\n");
-
- printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
- kva_start_pfn, max_low_pfn);
- printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
-
- /* avoid clash with initrd */
- memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT,
- (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
- "KVA PG");
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn)
@@ -409,51 +257,9 @@ void __init initmem_init(void)
printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
(ulong) pfn_to_kaddr(max_low_pfn));
- for_each_online_node(nid) {
- init_remap_allocator(nid);
-
- allocate_pgdat(nid);
- }
- remap_numa_kva();
printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
(ulong) pfn_to_kaddr(highstart_pfn));
- for_each_online_node(nid)
- propagate_e820_map_node(nid);
-
- for_each_online_node(nid) {
- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
- NODE_DATA(nid)->node_id = nid;
- }
setup_bootmem_allocator();
}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int paddr_to_nid(u64 addr)
-{
- int nid;
- unsigned long pfn = PFN_DOWN(addr);
-
- for_each_node(nid)
- if (node_start_pfn[nid] <= pfn &&
- pfn < node_end_pfn[nid])
- return nid;
-
- return -1;
-}
-
-/*
- * This function is used to ask node id BEFORE memmap and mem_section's
- * initialization (pfn_to_nid() can't be used yet).
- * If _PXM is not defined on ACPI's DSDT, node id must be found by this.
- */
-int memory_add_physaddr_to_nid(u64 addr)
-{
- int nid = paddr_to_nid(addr);
- return (nid >= 0) ? nid : 0;
-}
-
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-#endif
-
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 85b52fc0308..dd27f401f0a 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -2,646 +2,13 @@
* Generic VM initialization for x86-64 NUMA setups.
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
*/
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/init.h>
#include <linux/bootmem.h>
-#include <linux/memblock.h>
-#include <linux/mmzone.h>
-#include <linux/ctype.h>
-#include <linux/module.h>
-#include <linux/nodemask.h>
-#include <linux/sched.h>
-#include <linux/acpi.h>
-
-#include <asm/e820.h>
-#include <asm/proto.h>
-#include <asm/dma.h>
-#include <asm/acpi.h>
-#include <asm/amd_nb.h>
#include "numa_internal.h"
-struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
-EXPORT_SYMBOL(node_data);
-
-nodemask_t numa_nodes_parsed __initdata;
-
-struct memnode memnode;
-
-static unsigned long __initdata nodemap_addr;
-static unsigned long __initdata nodemap_size;
-
-static struct numa_meminfo numa_meminfo __initdata;
-
-static int numa_distance_cnt;
-static u8 *numa_distance;
-
-/*
- * Given a shift value, try to populate memnodemap[]
- * Returns :
- * 1 if OK
- * 0 if memnodmap[] too small (of shift too small)
- * -1 if node overlap or lost ram (shift too big)
- */
-static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
-{
- unsigned long addr, end;
- int i, res = -1;
-
- memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
- for (i = 0; i < mi->nr_blks; i++) {
- addr = mi->blk[i].start;
- end = mi->blk[i].end;
- if (addr >= end)
- continue;
- if ((end >> shift) >= memnodemapsize)
- return 0;
- do {
- if (memnodemap[addr >> shift] != NUMA_NO_NODE)
- return -1;
- memnodemap[addr >> shift] = mi->blk[i].nid;
- addr += (1UL << shift);
- } while (addr < end);
- res = 1;
- }
- return res;
-}
-
-static int __init allocate_cachealigned_memnodemap(void)
-{
- unsigned long addr;
-
- memnodemap = memnode.embedded_map;
- if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
- return 0;
-
- addr = 0x8000;
- nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
- nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
- nodemap_size, L1_CACHE_BYTES);
- if (nodemap_addr == MEMBLOCK_ERROR) {
- printk(KERN_ERR
- "NUMA: Unable to allocate Memory to Node hash map\n");
- nodemap_addr = nodemap_size = 0;
- return -1;
- }
- memnodemap = phys_to_virt(nodemap_addr);
- memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
-
- printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
- nodemap_addr, nodemap_addr + nodemap_size);
- return 0;
-}
-
-/*
- * The LSB of all start and end addresses in the node map is the value of the
- * maximum possible shift.
- */
-static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
-{
- int i, nodes_used = 0;
- unsigned long start, end;
- unsigned long bitfield = 0, memtop = 0;
-
- for (i = 0; i < mi->nr_blks; i++) {
- start = mi->blk[i].start;
- end = mi->blk[i].end;
- if (start >= end)
- continue;
- bitfield |= start;
- nodes_used++;
- if (end > memtop)
- memtop = end;
- }
- if (nodes_used <= 1)
- i = 63;
- else
- i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
- memnodemapsize = (memtop >> i)+1;
- return i;
-}
-
-static int __init compute_hash_shift(const struct numa_meminfo *mi)
-{
- int shift;
-
- shift = extract_lsb_from_nodes(mi);
- if (allocate_cachealigned_memnodemap())
- return -1;
- printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
- shift);
-
- if (populate_memnodemap(mi, shift) != 1) {
- printk(KERN_INFO "Your memory is not aligned you need to "
- "rebuild your kernel with a bigger NODEMAPSIZE "
- "shift=%d\n", shift);
- return -1;
- }
- return shift;
-}
-
-int __meminit __early_pfn_to_nid(unsigned long pfn)
-{
- return phys_to_nid(pfn << PAGE_SHIFT);
-}
-
-static void * __init early_node_mem(int nodeid, unsigned long start,
- unsigned long end, unsigned long size,
- unsigned long align)
-{
- unsigned long mem;
-
- /*
- * put it on high as possible
- * something will go with NODE_DATA
- */
- if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
- start = MAX_DMA_PFN<<PAGE_SHIFT;
- if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
- end > (MAX_DMA32_PFN<<PAGE_SHIFT))
- start = MAX_DMA32_PFN<<PAGE_SHIFT;
- mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
- if (mem != MEMBLOCK_ERROR)
- return __va(mem);
-
- /* extend the search scope */
- end = max_pfn_mapped << PAGE_SHIFT;
- start = MAX_DMA_PFN << PAGE_SHIFT;
- mem = memblock_find_in_range(start, end, size, align);
- if (mem != MEMBLOCK_ERROR)
- return __va(mem);
-
- printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
- size, nodeid);
-
- return NULL;
-}
-
-static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
- struct numa_meminfo *mi)
-{
- /* ignore zero length blks */
- if (start == end)
- return 0;
-
- /* whine about and ignore invalid blks */
- if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
- pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
- nid, start, end);
- return 0;
- }
-
- if (mi->nr_blks >= NR_NODE_MEMBLKS) {
- pr_err("NUMA: too many memblk ranges\n");
- return -EINVAL;
- }
-
- mi->blk[mi->nr_blks].start = start;
- mi->blk[mi->nr_blks].end = end;
- mi->blk[mi->nr_blks].nid = nid;
- mi->nr_blks++;
- return 0;
-}
-
-/**
- * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
- * @idx: Index of memblk to remove
- * @mi: numa_meminfo to remove memblk from
- *
- * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
- * decrementing @mi->nr_blks.
- */
-void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
-{
- mi->nr_blks--;
- memmove(&mi->blk[idx], &mi->blk[idx + 1],
- (mi->nr_blks - idx) * sizeof(mi->blk[0]));
-}
-
-/**
- * numa_add_memblk - Add one numa_memblk to numa_meminfo
- * @nid: NUMA node ID of the new memblk
- * @start: Start address of the new memblk
- * @end: End address of the new memblk
- *
- * Add a new memblk to the default numa_meminfo.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int __init numa_add_memblk(int nid, u64 start, u64 end)
-{
- return numa_add_memblk_to(nid, start, end, &numa_meminfo);
-}
-
-/* Initialize bootmem allocator for a node */
-void __init
-setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
-{
- unsigned long start_pfn, last_pfn, nodedata_phys;
- const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- int nid;
-
- if (!end)
- return;
-
- /*
- * Don't confuse VM with a node that doesn't have the
- * minimum amount of memory:
- */
- if (end && (end - start) < NODE_MIN_SIZE)
- return;
-
- start = roundup(start, ZONE_ALIGN);
-
- printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
- start, end);
-
- start_pfn = start >> PAGE_SHIFT;
- last_pfn = end >> PAGE_SHIFT;
-
- node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
- SMP_CACHE_BYTES);
- if (node_data[nodeid] == NULL)
- return;
- nodedata_phys = __pa(node_data[nodeid]);
- memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
- printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
- nodedata_phys + pgdat_size - 1);
- nid = phys_to_nid(nodedata_phys);
- if (nid != nodeid)
- printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
-
- memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
- NODE_DATA(nodeid)->node_id = nodeid;
- NODE_DATA(nodeid)->node_start_pfn = start_pfn;
- NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
-
- node_set_online(nodeid);
-}
-
-/**
- * numa_cleanup_meminfo - Cleanup a numa_meminfo
- * @mi: numa_meminfo to clean up
- *
- * Sanitize @mi by merging and removing unncessary memblks. Also check for
- * conflicts and clear unused memblks.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
-{
- const u64 low = 0;
- const u64 high = (u64)max_pfn << PAGE_SHIFT;
- int i, j, k;
-
- for (i = 0; i < mi->nr_blks; i++) {
- struct numa_memblk *bi = &mi->blk[i];
-
- /* make sure all blocks are inside the limits */
- bi->start = max(bi->start, low);
- bi->end = min(bi->end, high);
-
- /* and there's no empty block */
- if (bi->start >= bi->end) {
- numa_remove_memblk_from(i--, mi);
- continue;
- }
-
- for (j = i + 1; j < mi->nr_blks; j++) {
- struct numa_memblk *bj = &mi->blk[j];
- unsigned long start, end;
-
- /*
- * See whether there are overlapping blocks. Whine
- * about but allow overlaps of the same nid. They
- * will be merged below.
- */
- if (bi->end > bj->start && bi->start < bj->end) {
- if (bi->nid != bj->nid) {
- pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
- bi->nid, bi->start, bi->end,
- bj->nid, bj->start, bj->end);
- return -EINVAL;
- }
- pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
- bi->nid, bi->start, bi->end,
- bj->start, bj->end);
- }
-
- /*
- * Join together blocks on the same node, holes
- * between which don't overlap with memory on other
- * nodes.
- */
- if (bi->nid != bj->nid)
- continue;
- start = max(min(bi->start, bj->start), low);
- end = min(max(bi->end, bj->end), high);
- for (k = 0; k < mi->nr_blks; k++) {
- struct numa_memblk *bk = &mi->blk[k];
-
- if (bi->nid == bk->nid)
- continue;
- if (start < bk->end && end > bk->start)
- break;
- }
- if (k < mi->nr_blks)
- continue;
- printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
- bi->nid, bi->start, bi->end, bj->start, bj->end,
- start, end);
- bi->start = start;
- bi->end = end;
- numa_remove_memblk_from(j--, mi);
- }
- }
-
- for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
- mi->blk[i].start = mi->blk[i].end = 0;
- mi->blk[i].nid = NUMA_NO_NODE;
- }
-
- return 0;
-}
-
-/*
- * Set nodes, which have memory in @mi, in *@nodemask.
- */
-static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
- const struct numa_meminfo *mi)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
- if (mi->blk[i].start != mi->blk[i].end &&
- mi->blk[i].nid != NUMA_NO_NODE)
- node_set(mi->blk[i].nid, *nodemask);
-}
-
-/**
- * numa_reset_distance - Reset NUMA distance table
- *
- * The current table is freed. The next numa_set_distance() call will
- * create a new one.
- */
-void __init numa_reset_distance(void)
-{
- size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
-
- /* numa_distance could be 1LU marking allocation failure, test cnt */
- if (numa_distance_cnt)
- memblock_x86_free_range(__pa(numa_distance),
- __pa(numa_distance) + size);
- numa_distance_cnt = 0;
- numa_distance = NULL; /* enable table creation */
-}
-
-static int __init numa_alloc_distance(void)
-{
- nodemask_t nodes_parsed;
- size_t size;
- int i, j, cnt = 0;
- u64 phys;
-
- /* size the new table and allocate it */
- nodes_parsed = numa_nodes_parsed;
- numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
-
- for_each_node_mask(i, nodes_parsed)
- cnt = i;
- cnt++;
- size = cnt * cnt * sizeof(numa_distance[0]);
-
- phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
- size, PAGE_SIZE);
- if (phys == MEMBLOCK_ERROR) {
- pr_warning("NUMA: Warning: can't allocate distance table!\n");
- /* don't retry until explicitly reset */
- numa_distance = (void *)1LU;
- return -ENOMEM;
- }
- memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
-
- numa_distance = __va(phys);
- numa_distance_cnt = cnt;
-
- /* fill with the default distances */
- for (i = 0; i < cnt; i++)
- for (j = 0; j < cnt; j++)
- numa_distance[i * cnt + j] = i == j ?
- LOCAL_DISTANCE : REMOTE_DISTANCE;
- printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
-
- return 0;
-}
-
-/**
- * numa_set_distance - Set NUMA distance from one NUMA to another
- * @from: the 'from' node to set distance
- * @to: the 'to' node to set distance
- * @distance: NUMA distance
- *
- * Set the distance from node @from to @to to @distance. If distance table
- * doesn't exist, one which is large enough to accommodate all the currently
- * known nodes will be created.
- *
- * If such table cannot be allocated, a warning is printed and further
- * calls are ignored until the distance table is reset with
- * numa_reset_distance().
- *
- * If @from or @to is higher than the highest known node at the time of
- * table creation or @distance doesn't make sense, the call is ignored.
- * This is to allow simplification of specific NUMA config implementations.
- */
-void __init numa_set_distance(int from, int to, int distance)
-{
- if (!numa_distance && numa_alloc_distance() < 0)
- return;
-
- if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
- printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
- from, to, distance);
- return;
- }
-
- if ((u8)distance != distance ||
- (from == to && distance != LOCAL_DISTANCE)) {
- pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
- from, to, distance);
- return;
- }
-
- numa_distance[from * numa_distance_cnt + to] = distance;
-}
-
-int __node_distance(int from, int to)
-{
- if (from >= numa_distance_cnt || to >= numa_distance_cnt)
- return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
- return numa_distance[from * numa_distance_cnt + to];
-}
-EXPORT_SYMBOL(__node_distance);
-
-/*
- * Sanity check to catch more bad NUMA configurations (they are amazingly
- * common). Make sure the nodes cover all memory.
- */
-static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
-{
- unsigned long numaram, e820ram;
- int i;
-
- numaram = 0;
- for (i = 0; i < mi->nr_blks; i++) {
- unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
- unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
- numaram += e - s;
- numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
- if ((long)numaram < 0)
- numaram = 0;
- }
-
- e820ram = max_pfn - (memblock_x86_hole_size(0,
- max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
- /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
- if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
- printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
- (numaram << PAGE_SHIFT) >> 20,
- (e820ram << PAGE_SHIFT) >> 20);
- return false;
- }
- return true;
-}
-
-static int __init numa_register_memblks(struct numa_meminfo *mi)
-{
- int i, nid;
-
- /* Account for nodes with cpus and no memory */
- node_possible_map = numa_nodes_parsed;
- numa_nodemask_from_meminfo(&node_possible_map, mi);
- if (WARN_ON(nodes_empty(node_possible_map)))
- return -EINVAL;
-
- memnode_shift = compute_hash_shift(mi);
- if (memnode_shift < 0) {
- printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
- return -EINVAL;
- }
-
- for (i = 0; i < mi->nr_blks; i++)
- memblock_x86_register_active_regions(mi->blk[i].nid,
- mi->blk[i].start >> PAGE_SHIFT,
- mi->blk[i].end >> PAGE_SHIFT);
-
- /* for out of order entries */
- sort_node_map();
- if (!numa_meminfo_cover_memory(mi))
- return -EINVAL;
-
- /* Finally register nodes. */
- for_each_node_mask(nid, node_possible_map) {
- u64 start = (u64)max_pfn << PAGE_SHIFT;
- u64 end = 0;
-
- for (i = 0; i < mi->nr_blks; i++) {
- if (nid != mi->blk[i].nid)
- continue;
- start = min(mi->blk[i].start, start);
- end = max(mi->blk[i].end, end);
- }
-
- if (start < end)
- setup_node_bootmem(nid, start, end);
- }
-
- return 0;
-}
-
-/**
- * dummy_numma_init - Fallback dummy NUMA init
- *
- * Used if there's no underlying NUMA architecture, NUMA initialization
- * fails, or NUMA is disabled on the command line.
- *
- * Must online at least one node and add memory blocks that cover all
- * allowed memory. This function must not fail.
- */
-static int __init dummy_numa_init(void)
-{
- printk(KERN_INFO "%s\n",
- numa_off ? "NUMA turned off" : "No NUMA configuration found");
- printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
- 0LU, max_pfn << PAGE_SHIFT);
-
- node_set(0, numa_nodes_parsed);
- numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
-
- return 0;
-}
-
-static int __init numa_init(int (*init_func)(void))
-{
- int i;
- int ret;
-
- for (i = 0; i < MAX_LOCAL_APIC; i++)
- set_apicid_to_node(i, NUMA_NO_NODE);
-
- nodes_clear(numa_nodes_parsed);
- nodes_clear(node_possible_map);
- nodes_clear(node_online_map);
- memset(&numa_meminfo, 0, sizeof(numa_meminfo));
- remove_all_active_ranges();
- numa_reset_distance();
-
- ret = init_func();
- if (ret < 0)
- return ret;
- ret = numa_cleanup_meminfo(&numa_meminfo);
- if (ret < 0)
- return ret;
-
- numa_emulation(&numa_meminfo, numa_distance_cnt);
-
- ret = numa_register_memblks(&numa_meminfo);
- if (ret < 0)
- return ret;
-
- for (i = 0; i < nr_cpu_ids; i++) {
- int nid = early_cpu_to_node(i);
-
- if (nid == NUMA_NO_NODE)
- continue;
- if (!node_online(nid))
- numa_clear_node(i);
- }
- numa_init_array();
- return 0;
-}
-
void __init initmem_init(void)
{
- int ret;
-
- if (!numa_off) {
-#ifdef CONFIG_ACPI_NUMA
- ret = numa_init(x86_acpi_numa_init);
- if (!ret)
- return;
-#endif
-#ifdef CONFIG_AMD_NUMA
- ret = numa_init(amd_numa_init);
- if (!ret)
- return;
-#endif
- }
-
- numa_init(dummy_numa_init);
+ x86_numa_init();
}
unsigned long __init numa_free_all_bootmem(void)
@@ -656,12 +23,3 @@ unsigned long __init numa_free_all_bootmem(void)
return pages;
}
-
-int __cpuinit numa_cpu_node(int cpu)
-{
- int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
-
- if (apicid != BAD_APICID)
- return __apicid_to_node[apicid];
- return NUMA_NO_NODE;
-}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index de84cc14037..d0ed086b624 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -5,6 +5,7 @@
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
+#include <linux/bootmem.h>
#include <asm/dma.h>
#include "numa_internal.h"
@@ -84,7 +85,13 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
nr_nodes = MAX_NUMNODES;
}
- size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
+ /*
+ * Calculate target node size. x86_32 freaks on __udivdi3() so do
+ * the division in ulong number of pages and convert back.
+ */
+ size = max_addr - addr - memblock_x86_hole_size(addr, max_addr);
+ size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
+
/*
* Calculate the number of big nodes that can be allocated as a result
* of consolidating the remainder.
@@ -226,7 +233,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
*/
while (nodes_weight(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
- u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
+ u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
u64 start, limit, end;
int phys_blk;
@@ -298,7 +305,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
static struct numa_meminfo ei __initdata;
static struct numa_meminfo pi __initdata;
- const u64 max_addr = max_pfn << PAGE_SHIFT;
+ const u64 max_addr = PFN_PHYS(max_pfn);
u8 *phys_dist = NULL;
size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
int max_emu_nid, dfl_phys_nid;
@@ -342,8 +349,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
if (numa_dist_cnt) {
u64 phys;
- phys = memblock_find_in_range(0,
- (u64)max_pfn_mapped << PAGE_SHIFT,
+ phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
phys_size, PAGE_SIZE);
if (phys == MEMBLOCK_ERROR) {
pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index ef2d97377d7..7178c3afe05 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -19,6 +19,14 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
void __init numa_reset_distance(void);
+void __init x86_numa_init(void);
+
+#ifdef CONFIG_X86_64
+static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
+#else
+void __init init_alloc_remap(int nid, u64 start, u64 end);
+#endif
+
#ifdef CONFIG_NUMA_EMU
void __init numa_emulation(struct numa_meminfo *numa_meminfo,
int numa_dist_cnt);
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index e1d10690921..b0086567271 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -123,12 +123,11 @@ static int pageattr_test(void)
if (print)
printk(KERN_INFO "CPA self-test:\n");
- bm = vmalloc((max_pfn_mapped + 7) / 8);
+ bm = vzalloc((max_pfn_mapped + 7) / 8);
if (!bm) {
printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
return -ENOMEM;
}
- memset(bm, 0, (max_pfn_mapped + 7) / 8);
failed += print_split(&sa);
srandom32(100);
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index 38e6d174c49..9f0614daea8 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -414,22 +414,17 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
unsigned char *p;
struct prefix_bits prf;
int i;
- unsigned long rv;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
- if (reg_rop[i] == opcode) {
- rv = REG_READ;
+ if (reg_rop[i] == opcode)
goto do_work;
- }
for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
- if (reg_wop[i] == opcode) {
- rv = REG_WRITE;
+ if (reg_wop[i] == opcode)
goto do_work;
- }
printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
"0x%02x\n", opcode);
@@ -474,16 +469,13 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)
unsigned char *p;
struct prefix_bits prf;
int i;
- unsigned long rv;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
- if (imm_wop[i] == opcode) {
- rv = IMM_WRITE;
+ if (imm_wop[i] == opcode)
goto do_work;
- }
printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
"0x%02x\n", opcode);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat.c
index 8e9d3394f6d..81dbfdeb080 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat.c
@@ -26,8 +26,6 @@
int acpi_numa __initdata;
-static struct bootnode nodes_add[MAX_NUMNODES];
-
static __init int setup_node(int pxm)
{
return acpi_map_pxm_to_node(pxm);
@@ -37,7 +35,6 @@ static __init void bad_srat(void)
{
printk(KERN_ERR "SRAT: SRAT not used.\n");
acpi_numa = -1;
- memset(nodes_add, 0, sizeof(nodes_add));
}
static __init inline int srat_disabled(void)
@@ -131,73 +128,17 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
pxm, apic_id, node);
}
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
-/*
- * Update nodes_add[]
- * This code supports one contiguous hot add area per node
- */
-static void __init
-update_nodes_add(int node, unsigned long start, unsigned long end)
-{
- unsigned long s_pfn = start >> PAGE_SHIFT;
- unsigned long e_pfn = end >> PAGE_SHIFT;
- int changed = 0;
- struct bootnode *nd = &nodes_add[node];
-
- /* I had some trouble with strange memory hotadd regions breaking
- the boot. Be very strict here and reject anything unexpected.
- If you want working memory hotadd write correct SRATs.
-
- The node size check is a basic sanity check to guard against
- mistakes */
- if ((signed long)(end - start) < NODE_MIN_SIZE) {
- printk(KERN_ERR "SRAT: Hotplug area too small\n");
- return;
- }
-
- /* This check might be a bit too strict, but I'm keeping it for now. */
- if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
- printk(KERN_ERR
- "SRAT: Hotplug area %lu -> %lu has existing memory\n",
- s_pfn, e_pfn);
- return;
- }
-
- /* Looks good */
-
- if (nd->start == nd->end) {
- nd->start = start;
- nd->end = end;
- changed = 1;
- } else {
- if (nd->start == end) {
- nd->start = start;
- changed = 1;
- }
- if (nd->end == start) {
- nd->end = end;
- changed = 1;
- }
- if (!changed)
- printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
- }
-
- if (changed) {
- node_set(node, numa_nodes_parsed);
- printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
- nd->start, nd->end);
- }
-}
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
- unsigned long start, end;
+ u64 start, end;
int node, pxm;
if (srat_disabled())
@@ -226,11 +167,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
return;
}
- printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
+ printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
start, end);
-
- if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
- update_nodes_add(node, start, end);
}
void __init acpi_numa_arch_fixup(void) {}
@@ -244,17 +182,3 @@ int __init x86_acpi_numa_init(void)
return ret;
return srat_disabled() ? -EINVAL : 0;
}
-
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
-int memory_add_physaddr_to_nid(u64 start)
-{
- int i, ret = 0;
-
- for_each_node(i)
- if (nodes_add[i].start <= start && nodes_add[i].end > start)
- ret = i;
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-#endif
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
deleted file mode 100644
index 364f36bdfad..00000000000
--- a/arch/x86/mm/srat_32.c
+++ /dev/null
@@ -1,288 +0,0 @@
-/*
- * Some of the code in this file has been gleaned from the 64 bit
- * discontigmem support code base.
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to Pat Gaughen <gone@us.ibm.com>
- */
-#include <linux/mm.h>
-#include <linux/bootmem.h>
-#include <linux/memblock.h>
-#include <linux/mmzone.h>
-#include <linux/acpi.h>
-#include <linux/nodemask.h>
-#include <asm/srat.h>
-#include <asm/topology.h>
-#include <asm/smp.h>
-#include <asm/e820.h>
-
-/*
- * proximity macros and definitions
- */
-#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
-#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
-#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
-#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
-/* bitmap length; _PXM is at most 255 */
-#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
-static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
-
-#define MAX_CHUNKS_PER_NODE 3
-#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
-struct node_memory_chunk_s {
- unsigned long start_pfn;
- unsigned long end_pfn;
- u8 pxm; // proximity domain of node
- u8 nid; // which cnode contains this chunk?
- u8 bank; // which mem bank on this node
-};
-static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
-
-static int __initdata num_memory_chunks; /* total number of memory chunks */
-static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC];
-
-int acpi_numa __initdata;
-
-static __init void bad_srat(void)
-{
- printk(KERN_ERR "SRAT: SRAT not used.\n");
- acpi_numa = -1;
- num_memory_chunks = 0;
-}
-
-static __init inline int srat_disabled(void)
-{
- return numa_off || acpi_numa < 0;
-}
-
-/* Identify CPU proximity domains */
-void __init
-acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
-{
- if (srat_disabled())
- return;
- if (cpu_affinity->header.length !=
- sizeof(struct acpi_srat_cpu_affinity)) {
- bad_srat();
- return;
- }
-
- if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
- return; /* empty entry */
-
- /* mark this node as "seen" in node bitmap */
- BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
-
- /* don't need to check apic_id here, because it is always 8 bits */
- apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
-
- printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
- cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
-}
-
-/*
- * Identify memory proximity domains and hot-remove capabilities.
- * Fill node memory chunk list structure.
- */
-void __init
-acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
-{
- unsigned long long paddr, size;
- unsigned long start_pfn, end_pfn;
- u8 pxm;
- struct node_memory_chunk_s *p, *q, *pend;
-
- if (srat_disabled())
- return;
- if (memory_affinity->header.length !=
- sizeof(struct acpi_srat_mem_affinity)) {
- bad_srat();
- return;
- }
-
- if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
- return; /* empty entry */
-
- pxm = memory_affinity->proximity_domain & 0xff;
-
- /* mark this node as "seen" in node bitmap */
- BMAP_SET(pxm_bitmap, pxm);
-
- /* calculate info for memory chunk structure */
- paddr = memory_affinity->base_address;
- size = memory_affinity->length;
-
- start_pfn = paddr >> PAGE_SHIFT;
- end_pfn = (paddr + size) >> PAGE_SHIFT;
-
-
- if (num_memory_chunks >= MAXCHUNKS) {
- printk(KERN_WARNING "Too many mem chunks in SRAT."
- " Ignoring %lld MBytes at %llx\n",
- size/(1024*1024), paddr);
- return;
- }
-
- /* Insertion sort based on base address */
- pend = &node_memory_chunk[num_memory_chunks];
- for (p = &node_memory_chunk[0]; p < pend; p++) {
- if (start_pfn < p->start_pfn)
- break;
- }
- if (p < pend) {
- for (q = pend; q >= p; q--)
- *(q + 1) = *q;
- }
- p->start_pfn = start_pfn;
- p->end_pfn = end_pfn;
- p->pxm = pxm;
-
- num_memory_chunks++;
-
- printk(KERN_DEBUG "Memory range %08lx to %08lx"
- " in proximity domain %02x %s\n",
- start_pfn, end_pfn,
- pxm,
- ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
- "enabled and removable" : "enabled" ) );
-}
-
-/* Callback for SLIT parsing */
-void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
-{
-}
-
-void acpi_numa_arch_fixup(void)
-{
-}
-/*
- * The SRAT table always lists ascending addresses, so can always
- * assume that the first "start" address that you see is the real
- * start of the node, and that the current "end" address is after
- * the previous one.
- */
-static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
-{
- /*
- * Only add present memory as told by the e820.
- * There is no guarantee from the SRAT that the memory it
- * enumerates is present at boot time because it represents
- * *possible* memory hotplug areas the same as normal RAM.
- */
- if (memory_chunk->start_pfn >= max_pfn) {
- printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
- memory_chunk->start_pfn, memory_chunk->end_pfn);
- return -1;
- }
- if (memory_chunk->nid != nid)
- return -1;
-
- if (!node_has_online_mem(nid))
- node_start_pfn[nid] = memory_chunk->start_pfn;
-
- if (node_start_pfn[nid] > memory_chunk->start_pfn)
- node_start_pfn[nid] = memory_chunk->start_pfn;
-
- if (node_end_pfn[nid] < memory_chunk->end_pfn)
- node_end_pfn[nid] = memory_chunk->end_pfn;
-
- return 0;
-}
-
-int __init get_memcfg_from_srat(void)
-{
- int i, j, nid;
-
- if (srat_disabled())
- goto out_fail;
-
- if (acpi_numa_init() < 0)
- goto out_fail;
-
- if (num_memory_chunks == 0) {
- printk(KERN_DEBUG
- "could not find any ACPI SRAT memory areas.\n");
- goto out_fail;
- }
-
- /* Calculate total number of nodes in system from PXM bitmap and create
- * a set of sequential node IDs starting at zero. (ACPI doesn't seem
- * to specify the range of _PXM values.)
- */
- /*
- * MCD - we no longer HAVE to number nodes sequentially. PXM domain
- * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
- * 32, so we will continue numbering them in this manner until MAX_NUMNODES
- * approaches MAX_PXM_DOMAINS for i386.
- */
- nodes_clear(node_online_map);
- for (i = 0; i < MAX_PXM_DOMAINS; i++) {
- if (BMAP_TEST(pxm_bitmap, i)) {
- int nid = acpi_map_pxm_to_node(i);
- node_set_online(nid);
- }
- }
- BUG_ON(num_online_nodes() == 0);
-
- /* set cnode id in memory chunk structure */
- for (i = 0; i < num_memory_chunks; i++)
- node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
-
- printk(KERN_DEBUG "pxm bitmap: ");
- for (i = 0; i < sizeof(pxm_bitmap); i++) {
- printk(KERN_CONT "%02x ", pxm_bitmap[i]);
- }
- printk(KERN_CONT "\n");
- printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
- num_online_nodes());
- printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
- num_memory_chunks);
-
- for (i = 0; i < MAX_LOCAL_APIC; i++)
- set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i]));
-
- for (j = 0; j < num_memory_chunks; j++){
- struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
- printk(KERN_DEBUG
- "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
- j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
- if (node_read_chunk(chunk->nid, chunk))
- continue;
-
- memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn,
- min(chunk->end_pfn, max_pfn));
- }
- /* for out of order entries in SRAT */
- sort_node_map();
-
- for_each_online_node(nid) {
- unsigned long start = node_start_pfn[nid];
- unsigned long end = min(node_end_pfn[nid], max_pfn);
-
- memory_present(nid, start, end);
- node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
- }
- return 1;
-out_fail:
- printk(KERN_DEBUG "failed to get NUMA memory information from SRAT"
- " table\n");
- return 0;
-}
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
new file mode 100644
index 00000000000..90568c33ddb
--- /dev/null
+++ b/arch/x86/net/Makefile
@@ -0,0 +1,4 @@
+#
+# Arch-specific network modules
+#
+obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
new file mode 100644
index 00000000000..66870223f8c
--- /dev/null
+++ b/arch/x86/net/bpf_jit.S
@@ -0,0 +1,140 @@
+/* bpf_jit.S : BPF JIT helper functions
+ *
+ * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * Calling convention :
+ * rdi : skb pointer
+ * esi : offset of byte(s) to fetch in skb (can be scratched)
+ * r8 : copy of skb->data
+ * r9d : hlen = skb->len - skb->data_len
+ */
+#define SKBDATA %r8
+
+sk_load_word_ind:
+ .globl sk_load_word_ind
+
+ add %ebx,%esi /* offset += X */
+# test %esi,%esi /* if (offset < 0) goto bpf_error; */
+ js bpf_error
+
+sk_load_word:
+ .globl sk_load_word
+
+ mov %r9d,%eax # hlen
+ sub %esi,%eax # hlen - offset
+ cmp $3,%eax
+ jle bpf_slow_path_word
+ mov (SKBDATA,%rsi),%eax
+ bswap %eax /* ntohl() */
+ ret
+
+
+sk_load_half_ind:
+ .globl sk_load_half_ind
+
+ add %ebx,%esi /* offset += X */
+ js bpf_error
+
+sk_load_half:
+ .globl sk_load_half
+
+ mov %r9d,%eax
+ sub %esi,%eax # hlen - offset
+ cmp $1,%eax
+ jle bpf_slow_path_half
+ movzwl (SKBDATA,%rsi),%eax
+ rol $8,%ax # ntohs()
+ ret
+
+sk_load_byte_ind:
+ .globl sk_load_byte_ind
+ add %ebx,%esi /* offset += X */
+ js bpf_error
+
+sk_load_byte:
+ .globl sk_load_byte
+
+ cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */
+ jle bpf_slow_path_byte
+ movzbl (SKBDATA,%rsi),%eax
+ ret
+
+/**
+ * sk_load_byte_msh - BPF_S_LDX_B_MSH helper
+ *
+ * Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf)
+ * Must preserve A accumulator (%eax)
+ * Inputs : %esi is the offset value, already known positive
+ */
+ENTRY(sk_load_byte_msh)
+ CFI_STARTPROC
+ cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */
+ jle bpf_slow_path_byte_msh
+ movzbl (SKBDATA,%rsi),%ebx
+ and $15,%bl
+ shl $2,%bl
+ ret
+ CFI_ENDPROC
+ENDPROC(sk_load_byte_msh)
+
+bpf_error:
+# force a return 0 from jit handler
+ xor %eax,%eax
+ mov -8(%rbp),%rbx
+ leaveq
+ ret
+
+/* rsi contains offset and can be scratched */
+#define bpf_slow_path_common(LEN) \
+ push %rdi; /* save skb */ \
+ push %r9; \
+ push SKBDATA; \
+/* rsi already has offset */ \
+ mov $LEN,%ecx; /* len */ \
+ lea -12(%rbp),%rdx; \
+ call skb_copy_bits; \
+ test %eax,%eax; \
+ pop SKBDATA; \
+ pop %r9; \
+ pop %rdi
+
+
+bpf_slow_path_word:
+ bpf_slow_path_common(4)
+ js bpf_error
+ mov -12(%rbp),%eax
+ bswap %eax
+ ret
+
+bpf_slow_path_half:
+ bpf_slow_path_common(2)
+ js bpf_error
+ mov -12(%rbp),%ax
+ rol $8,%ax
+ movzwl %ax,%eax
+ ret
+
+bpf_slow_path_byte:
+ bpf_slow_path_common(1)
+ js bpf_error
+ movzbl -12(%rbp),%eax
+ ret
+
+bpf_slow_path_byte_msh:
+ xchg %eax,%ebx /* dont lose A , X is about to be scratched */
+ bpf_slow_path_common(1)
+ js bpf_error
+ movzbl -12(%rbp),%eax
+ and $15,%al
+ shl $2,%al
+ xchg %eax,%ebx
+ ret
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
new file mode 100644
index 00000000000..bfab3fa10ed
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -0,0 +1,654 @@
+/* bpf_jit_comp.c : BPF JIT compiler
+ *
+ * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/moduleloader.h>
+#include <asm/cacheflush.h>
+#include <linux/netdevice.h>
+#include <linux/filter.h>
+
+/*
+ * Conventions :
+ * EAX : BPF A accumulator
+ * EBX : BPF X accumulator
+ * RDI : pointer to skb (first argument given to JIT function)
+ * RBP : frame pointer (even if CONFIG_FRAME_POINTER=n)
+ * ECX,EDX,ESI : scratch registers
+ * r9d : skb->len - skb->data_len (headlen)
+ * r8 : skb->data
+ * -8(RBP) : saved RBX value
+ * -16(RBP)..-80(RBP) : BPF_MEMWORDS values
+ */
+int bpf_jit_enable __read_mostly;
+
+/*
+ * assembly code in arch/x86/net/bpf_jit.S
+ */
+extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[];
+extern u8 sk_load_word_ind[], sk_load_half_ind[], sk_load_byte_ind[];
+
+static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+{
+ if (len == 1)
+ *ptr = bytes;
+ else if (len == 2)
+ *(u16 *)ptr = bytes;
+ else {
+ *(u32 *)ptr = bytes;
+ barrier();
+ }
+ return ptr + len;
+}
+
+#define EMIT(bytes, len) do { prog = emit_code(prog, bytes, len); } while (0)
+
+#define EMIT1(b1) EMIT(b1, 1)
+#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
+#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
+#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+#define EMIT1_off32(b1, off) do { EMIT1(b1); EMIT(off, 4);} while (0)
+
+#define CLEAR_A() EMIT2(0x31, 0xc0) /* xor %eax,%eax */
+#define CLEAR_X() EMIT2(0x31, 0xdb) /* xor %ebx,%ebx */
+
+static inline bool is_imm8(int value)
+{
+ return value <= 127 && value >= -128;
+}
+
+static inline bool is_near(int offset)
+{
+ return offset <= 127 && offset >= -128;
+}
+
+#define EMIT_JMP(offset) \
+do { \
+ if (offset) { \
+ if (is_near(offset)) \
+ EMIT2(0xeb, offset); /* jmp .+off8 */ \
+ else \
+ EMIT1_off32(0xe9, offset); /* jmp .+off32 */ \
+ } \
+} while (0)
+
+/* list of x86 cond jumps opcodes (. + s8)
+ * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
+ */
+#define X86_JB 0x72
+#define X86_JAE 0x73
+#define X86_JE 0x74
+#define X86_JNE 0x75
+#define X86_JBE 0x76
+#define X86_JA 0x77
+
+#define EMIT_COND_JMP(op, offset) \
+do { \
+ if (is_near(offset)) \
+ EMIT2(op, offset); /* jxx .+off8 */ \
+ else { \
+ EMIT2(0x0f, op + 0x10); \
+ EMIT(offset, 4); /* jxx .+off32 */ \
+ } \
+} while (0)
+
+#define COND_SEL(CODE, TOP, FOP) \
+ case CODE: \
+ t_op = TOP; \
+ f_op = FOP; \
+ goto cond_branch
+
+
+#define SEEN_DATAREF 1 /* might call external helpers */
+#define SEEN_XREG 2 /* ebx is used */
+#define SEEN_MEM 4 /* use mem[] for temporary storage */
+
+static inline void bpf_flush_icache(void *start, void *end)
+{
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ smp_wmb();
+ flush_icache_range((unsigned long)start, (unsigned long)end);
+ set_fs(old_fs);
+}
+
+
+void bpf_jit_compile(struct sk_filter *fp)
+{
+ u8 temp[64];
+ u8 *prog;
+ unsigned int proglen, oldproglen = 0;
+ int ilen, i;
+ int t_offset, f_offset;
+ u8 t_op, f_op, seen = 0, pass;
+ u8 *image = NULL;
+ u8 *func;
+ int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */
+ unsigned int cleanup_addr; /* epilogue code offset */
+ unsigned int *addrs;
+ const struct sock_filter *filter = fp->insns;
+ int flen = fp->len;
+
+ if (!bpf_jit_enable)
+ return;
+
+ addrs = kmalloc(flen * sizeof(*addrs), GFP_KERNEL);
+ if (addrs == NULL)
+ return;
+
+ /* Before first pass, make a rough estimation of addrs[]
+ * each bpf instruction is translated to less than 64 bytes
+ */
+ for (proglen = 0, i = 0; i < flen; i++) {
+ proglen += 64;
+ addrs[i] = proglen;
+ }
+ cleanup_addr = proglen; /* epilogue address */
+
+ for (pass = 0; pass < 10; pass++) {
+ /* no prologue/epilogue for trivial filters (RET something) */
+ proglen = 0;
+ prog = temp;
+
+ if (seen) {
+ EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */
+ EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */
+ /* note : must save %rbx in case bpf_error is hit */
+ if (seen & (SEEN_XREG | SEEN_DATAREF))
+ EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */
+ if (seen & SEEN_XREG)
+ CLEAR_X(); /* make sure we dont leek kernel memory */
+
+ /*
+ * If this filter needs to access skb data,
+ * loads r9 and r8 with :
+ * r9 = skb->len - skb->data_len
+ * r8 = skb->data
+ */
+ if (seen & SEEN_DATAREF) {
+ if (offsetof(struct sk_buff, len) <= 127)
+ /* mov off8(%rdi),%r9d */
+ EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len));
+ else {
+ /* mov off32(%rdi),%r9d */
+ EMIT3(0x44, 0x8b, 0x8f);
+ EMIT(offsetof(struct sk_buff, len), 4);
+ }
+ if (is_imm8(offsetof(struct sk_buff, data_len)))
+ /* sub off8(%rdi),%r9d */
+ EMIT4(0x44, 0x2b, 0x4f, offsetof(struct sk_buff, data_len));
+ else {
+ EMIT3(0x44, 0x2b, 0x8f);
+ EMIT(offsetof(struct sk_buff, data_len), 4);
+ }
+
+ if (is_imm8(offsetof(struct sk_buff, data)))
+ /* mov off8(%rdi),%r8 */
+ EMIT4(0x4c, 0x8b, 0x47, offsetof(struct sk_buff, data));
+ else {
+ /* mov off32(%rdi),%r8 */
+ EMIT3(0x4c, 0x8b, 0x87);
+ EMIT(offsetof(struct sk_buff, data), 4);
+ }
+ }
+ }
+
+ switch (filter[0].code) {
+ case BPF_S_RET_K:
+ case BPF_S_LD_W_LEN:
+ case BPF_S_ANC_PROTOCOL:
+ case BPF_S_ANC_IFINDEX:
+ case BPF_S_ANC_MARK:
+ case BPF_S_ANC_RXHASH:
+ case BPF_S_ANC_CPU:
+ case BPF_S_ANC_QUEUE:
+ case BPF_S_LD_W_ABS:
+ case BPF_S_LD_H_ABS:
+ case BPF_S_LD_B_ABS:
+ /* first instruction sets A register (or is RET 'constant') */
+ break;
+ default:
+ /* make sure we dont leak kernel information to user */
+ CLEAR_A(); /* A = 0 */
+ }
+
+ for (i = 0; i < flen; i++) {
+ unsigned int K = filter[i].k;
+
+ switch (filter[i].code) {
+ case BPF_S_ALU_ADD_X: /* A += X; */
+ seen |= SEEN_XREG;
+ EMIT2(0x01, 0xd8); /* add %ebx,%eax */
+ break;
+ case BPF_S_ALU_ADD_K: /* A += K; */
+ if (!K)
+ break;
+ if (is_imm8(K))
+ EMIT3(0x83, 0xc0, K); /* add imm8,%eax */
+ else
+ EMIT1_off32(0x05, K); /* add imm32,%eax */
+ break;
+ case BPF_S_ALU_SUB_X: /* A -= X; */
+ seen |= SEEN_XREG;
+ EMIT2(0x29, 0xd8); /* sub %ebx,%eax */
+ break;
+ case BPF_S_ALU_SUB_K: /* A -= K */
+ if (!K)
+ break;
+ if (is_imm8(K))
+ EMIT3(0x83, 0xe8, K); /* sub imm8,%eax */
+ else
+ EMIT1_off32(0x2d, K); /* sub imm32,%eax */
+ break;
+ case BPF_S_ALU_MUL_X: /* A *= X; */
+ seen |= SEEN_XREG;
+ EMIT3(0x0f, 0xaf, 0xc3); /* imul %ebx,%eax */
+ break;
+ case BPF_S_ALU_MUL_K: /* A *= K */
+ if (is_imm8(K))
+ EMIT3(0x6b, 0xc0, K); /* imul imm8,%eax,%eax */
+ else {
+ EMIT2(0x69, 0xc0); /* imul imm32,%eax */
+ EMIT(K, 4);
+ }
+ break;
+ case BPF_S_ALU_DIV_X: /* A /= X; */
+ seen |= SEEN_XREG;
+ EMIT2(0x85, 0xdb); /* test %ebx,%ebx */
+ if (pc_ret0 != -1)
+ EMIT_COND_JMP(X86_JE, addrs[pc_ret0] - (addrs[i] - 4));
+ else {
+ EMIT_COND_JMP(X86_JNE, 2 + 5);
+ CLEAR_A();
+ EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */
+ }
+ EMIT4(0x31, 0xd2, 0xf7, 0xf3); /* xor %edx,%edx; div %ebx */
+ break;
+ case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */
+ EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */
+ EMIT(K, 4);
+ EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */
+ break;
+ case BPF_S_ALU_AND_X:
+ seen |= SEEN_XREG;
+ EMIT2(0x21, 0xd8); /* and %ebx,%eax */
+ break;
+ case BPF_S_ALU_AND_K:
+ if (K >= 0xFFFFFF00) {
+ EMIT2(0x24, K & 0xFF); /* and imm8,%al */
+ } else if (K >= 0xFFFF0000) {
+ EMIT2(0x66, 0x25); /* and imm16,%ax */
+ EMIT2(K, 2);
+ } else {
+ EMIT1_off32(0x25, K); /* and imm32,%eax */
+ }
+ break;
+ case BPF_S_ALU_OR_X:
+ seen |= SEEN_XREG;
+ EMIT2(0x09, 0xd8); /* or %ebx,%eax */
+ break;
+ case BPF_S_ALU_OR_K:
+ if (is_imm8(K))
+ EMIT3(0x83, 0xc8, K); /* or imm8,%eax */
+ else
+ EMIT1_off32(0x0d, K); /* or imm32,%eax */
+ break;
+ case BPF_S_ALU_LSH_X: /* A <<= X; */
+ seen |= SEEN_XREG;
+ EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */
+ break;
+ case BPF_S_ALU_LSH_K:
+ if (K == 0)
+ break;
+ else if (K == 1)
+ EMIT2(0xd1, 0xe0); /* shl %eax */
+ else
+ EMIT3(0xc1, 0xe0, K);
+ break;
+ case BPF_S_ALU_RSH_X: /* A >>= X; */
+ seen |= SEEN_XREG;
+ EMIT4(0x89, 0xd9, 0xd3, 0xe8); /* mov %ebx,%ecx; shr %cl,%eax */
+ break;
+ case BPF_S_ALU_RSH_K: /* A >>= K; */
+ if (K == 0)
+ break;
+ else if (K == 1)
+ EMIT2(0xd1, 0xe8); /* shr %eax */
+ else
+ EMIT3(0xc1, 0xe8, K);
+ break;
+ case BPF_S_ALU_NEG:
+ EMIT2(0xf7, 0xd8); /* neg %eax */
+ break;
+ case BPF_S_RET_K:
+ if (!K) {
+ if (pc_ret0 == -1)
+ pc_ret0 = i;
+ CLEAR_A();
+ } else {
+ EMIT1_off32(0xb8, K); /* mov $imm32,%eax */
+ }
+ /* fallinto */
+ case BPF_S_RET_A:
+ if (seen) {
+ if (i != flen - 1) {
+ EMIT_JMP(cleanup_addr - addrs[i]);
+ break;
+ }
+ if (seen & SEEN_XREG)
+ EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */
+ EMIT1(0xc9); /* leaveq */
+ }
+ EMIT1(0xc3); /* ret */
+ break;
+ case BPF_S_MISC_TAX: /* X = A */
+ seen |= SEEN_XREG;
+ EMIT2(0x89, 0xc3); /* mov %eax,%ebx */
+ break;
+ case BPF_S_MISC_TXA: /* A = X */
+ seen |= SEEN_XREG;
+ EMIT2(0x89, 0xd8); /* mov %ebx,%eax */
+ break;
+ case BPF_S_LD_IMM: /* A = K */
+ if (!K)
+ CLEAR_A();
+ else
+ EMIT1_off32(0xb8, K); /* mov $imm32,%eax */
+ break;
+ case BPF_S_LDX_IMM: /* X = K */
+ seen |= SEEN_XREG;
+ if (!K)
+ CLEAR_X();
+ else
+ EMIT1_off32(0xbb, K); /* mov $imm32,%ebx */
+ break;
+ case BPF_S_LD_MEM: /* A = mem[K] : mov off8(%rbp),%eax */
+ seen |= SEEN_MEM;
+ EMIT3(0x8b, 0x45, 0xf0 - K*4);
+ break;
+ case BPF_S_LDX_MEM: /* X = mem[K] : mov off8(%rbp),%ebx */
+ seen |= SEEN_XREG | SEEN_MEM;
+ EMIT3(0x8b, 0x5d, 0xf0 - K*4);
+ break;
+ case BPF_S_ST: /* mem[K] = A : mov %eax,off8(%rbp) */
+ seen |= SEEN_MEM;
+ EMIT3(0x89, 0x45, 0xf0 - K*4);
+ break;
+ case BPF_S_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */
+ seen |= SEEN_XREG | SEEN_MEM;
+ EMIT3(0x89, 0x5d, 0xf0 - K*4);
+ break;
+ case BPF_S_LD_W_LEN: /* A = skb->len; */
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+ if (is_imm8(offsetof(struct sk_buff, len)))
+ /* mov off8(%rdi),%eax */
+ EMIT3(0x8b, 0x47, offsetof(struct sk_buff, len));
+ else {
+ EMIT2(0x8b, 0x87);
+ EMIT(offsetof(struct sk_buff, len), 4);
+ }
+ break;
+ case BPF_S_LDX_W_LEN: /* X = skb->len; */
+ seen |= SEEN_XREG;
+ if (is_imm8(offsetof(struct sk_buff, len)))
+ /* mov off8(%rdi),%ebx */
+ EMIT3(0x8b, 0x5f, offsetof(struct sk_buff, len));
+ else {
+ EMIT2(0x8b, 0x9f);
+ EMIT(offsetof(struct sk_buff, len), 4);
+ }
+ break;
+ case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
+ if (is_imm8(offsetof(struct sk_buff, protocol))) {
+ /* movzwl off8(%rdi),%eax */
+ EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, protocol));
+ } else {
+ EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
+ EMIT(offsetof(struct sk_buff, protocol), 4);
+ }
+ EMIT2(0x86, 0xc4); /* ntohs() : xchg %al,%ah */
+ break;
+ case BPF_S_ANC_IFINDEX:
+ if (is_imm8(offsetof(struct sk_buff, dev))) {
+ /* movq off8(%rdi),%rax */
+ EMIT4(0x48, 0x8b, 0x47, offsetof(struct sk_buff, dev));
+ } else {
+ EMIT3(0x48, 0x8b, 0x87); /* movq off32(%rdi),%rax */
+ EMIT(offsetof(struct sk_buff, dev), 4);
+ }
+ EMIT3(0x48, 0x85, 0xc0); /* test %rax,%rax */
+ EMIT_COND_JMP(X86_JE, cleanup_addr - (addrs[i] - 6));
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
+ EMIT2(0x8b, 0x80); /* mov off32(%rax),%eax */
+ EMIT(offsetof(struct net_device, ifindex), 4);
+ break;
+ case BPF_S_ANC_MARK:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+ if (is_imm8(offsetof(struct sk_buff, mark))) {
+ /* mov off8(%rdi),%eax */
+ EMIT3(0x8b, 0x47, offsetof(struct sk_buff, mark));
+ } else {
+ EMIT2(0x8b, 0x87);
+ EMIT(offsetof(struct sk_buff, mark), 4);
+ }
+ break;
+ case BPF_S_ANC_RXHASH:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4);
+ if (is_imm8(offsetof(struct sk_buff, rxhash))) {
+ /* mov off8(%rdi),%eax */
+ EMIT3(0x8b, 0x47, offsetof(struct sk_buff, rxhash));
+ } else {
+ EMIT2(0x8b, 0x87);
+ EMIT(offsetof(struct sk_buff, rxhash), 4);
+ }
+ break;
+ case BPF_S_ANC_QUEUE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
+ if (is_imm8(offsetof(struct sk_buff, queue_mapping))) {
+ /* movzwl off8(%rdi),%eax */
+ EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, queue_mapping));
+ } else {
+ EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
+ EMIT(offsetof(struct sk_buff, queue_mapping), 4);
+ }
+ break;
+ case BPF_S_ANC_CPU:
+#ifdef CONFIG_SMP
+ EMIT4(0x65, 0x8b, 0x04, 0x25); /* mov %gs:off32,%eax */
+ EMIT((u32)(unsigned long)&cpu_number, 4); /* A = smp_processor_id(); */
+#else
+ CLEAR_A();
+#endif
+ break;
+ case BPF_S_LD_W_ABS:
+ func = sk_load_word;
+common_load: seen |= SEEN_DATAREF;
+ if ((int)K < 0)
+ goto out;
+ t_offset = func - (image + addrs[i]);
+ EMIT1_off32(0xbe, K); /* mov imm32,%esi */
+ EMIT1_off32(0xe8, t_offset); /* call */
+ break;
+ case BPF_S_LD_H_ABS:
+ func = sk_load_half;
+ goto common_load;
+ case BPF_S_LD_B_ABS:
+ func = sk_load_byte;
+ goto common_load;
+ case BPF_S_LDX_B_MSH:
+ if ((int)K < 0) {
+ if (pc_ret0 != -1) {
+ EMIT_JMP(addrs[pc_ret0] - addrs[i]);
+ break;
+ }
+ CLEAR_A();
+ EMIT_JMP(cleanup_addr - addrs[i]);
+ break;
+ }
+ seen |= SEEN_DATAREF | SEEN_XREG;
+ t_offset = sk_load_byte_msh - (image + addrs[i]);
+ EMIT1_off32(0xbe, K); /* mov imm32,%esi */
+ EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */
+ break;
+ case BPF_S_LD_W_IND:
+ func = sk_load_word_ind;
+common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG;
+ t_offset = func - (image + addrs[i]);
+ EMIT1_off32(0xbe, K); /* mov imm32,%esi */
+ EMIT1_off32(0xe8, t_offset); /* call sk_load_xxx_ind */
+ break;
+ case BPF_S_LD_H_IND:
+ func = sk_load_half_ind;
+ goto common_load_ind;
+ case BPF_S_LD_B_IND:
+ func = sk_load_byte_ind;
+ goto common_load_ind;
+ case BPF_S_JMP_JA:
+ t_offset = addrs[i + K] - addrs[i];
+ EMIT_JMP(t_offset);
+ break;
+ COND_SEL(BPF_S_JMP_JGT_K, X86_JA, X86_JBE);
+ COND_SEL(BPF_S_JMP_JGE_K, X86_JAE, X86_JB);
+ COND_SEL(BPF_S_JMP_JEQ_K, X86_JE, X86_JNE);
+ COND_SEL(BPF_S_JMP_JSET_K,X86_JNE, X86_JE);
+ COND_SEL(BPF_S_JMP_JGT_X, X86_JA, X86_JBE);
+ COND_SEL(BPF_S_JMP_JGE_X, X86_JAE, X86_JB);
+ COND_SEL(BPF_S_JMP_JEQ_X, X86_JE, X86_JNE);
+ COND_SEL(BPF_S_JMP_JSET_X,X86_JNE, X86_JE);
+
+cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
+ t_offset = addrs[i + filter[i].jt] - addrs[i];
+
+ /* same targets, can avoid doing the test :) */
+ if (filter[i].jt == filter[i].jf) {
+ EMIT_JMP(t_offset);
+ break;
+ }
+
+ switch (filter[i].code) {
+ case BPF_S_JMP_JGT_X:
+ case BPF_S_JMP_JGE_X:
+ case BPF_S_JMP_JEQ_X:
+ seen |= SEEN_XREG;
+ EMIT2(0x39, 0xd8); /* cmp %ebx,%eax */
+ break;
+ case BPF_S_JMP_JSET_X:
+ seen |= SEEN_XREG;
+ EMIT2(0x85, 0xd8); /* test %ebx,%eax */
+ break;
+ case BPF_S_JMP_JEQ_K:
+ if (K == 0) {
+ EMIT2(0x85, 0xc0); /* test %eax,%eax */
+ break;
+ }
+ case BPF_S_JMP_JGT_K:
+ case BPF_S_JMP_JGE_K:
+ if (K <= 127)
+ EMIT3(0x83, 0xf8, K); /* cmp imm8,%eax */
+ else
+ EMIT1_off32(0x3d, K); /* cmp imm32,%eax */
+ break;
+ case BPF_S_JMP_JSET_K:
+ if (K <= 0xFF)
+ EMIT2(0xa8, K); /* test imm8,%al */
+ else if (!(K & 0xFFFF00FF))
+ EMIT3(0xf6, 0xc4, K >> 8); /* test imm8,%ah */
+ else if (K <= 0xFFFF) {
+ EMIT2(0x66, 0xa9); /* test imm16,%ax */
+ EMIT(K, 2);
+ } else {
+ EMIT1_off32(0xa9, K); /* test imm32,%eax */
+ }
+ break;
+ }
+ if (filter[i].jt != 0) {
+ if (filter[i].jf)
+ t_offset += is_near(f_offset) ? 2 : 6;
+ EMIT_COND_JMP(t_op, t_offset);
+ if (filter[i].jf)
+ EMIT_JMP(f_offset);
+ break;
+ }
+ EMIT_COND_JMP(f_op, f_offset);
+ break;
+ default:
+ /* hmm, too complex filter, give up with jit compiler */
+ goto out;
+ }
+ ilen = prog - temp;
+ if (image) {
+ if (unlikely(proglen + ilen > oldproglen)) {
+ pr_err("bpb_jit_compile fatal error\n");
+ kfree(addrs);
+ module_free(NULL, image);
+ return;
+ }
+ memcpy(image + proglen, temp, ilen);
+ }
+ proglen += ilen;
+ addrs[i] = proglen;
+ prog = temp;
+ }
+ /* last bpf instruction is always a RET :
+ * use it to give the cleanup instruction(s) addr
+ */
+ cleanup_addr = proglen - 1; /* ret */
+ if (seen)
+ cleanup_addr -= 1; /* leaveq */
+ if (seen & SEEN_XREG)
+ cleanup_addr -= 4; /* mov -8(%rbp),%rbx */
+
+ if (image) {
+ WARN_ON(proglen != oldproglen);
+ break;
+ }
+ if (proglen == oldproglen) {
+ image = module_alloc(max_t(unsigned int,
+ proglen,
+ sizeof(struct work_struct)));
+ if (!image)
+ goto out;
+ }
+ oldproglen = proglen;
+ }
+ if (bpf_jit_enable > 1)
+ pr_err("flen=%d proglen=%u pass=%d image=%p\n",
+ flen, proglen, pass, image);
+
+ if (image) {
+ if (bpf_jit_enable > 1)
+ print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_ADDRESS,
+ 16, 1, image, proglen, false);
+
+ bpf_flush_icache(image, image + proglen);
+
+ fp->bpf_func = (void *)image;
+ }
+out:
+ kfree(addrs);
+ return;
+}
+
+static void jit_free_defer(struct work_struct *arg)
+{
+ module_free(NULL, arg);
+}
+
+/* run from softirq, we must use a work_struct to call
+ * module_free() from process context
+ */
+void bpf_jit_free(struct sk_filter *fp)
+{
+ if (fp->bpf_func != sk_run_filter) {
+ struct work_struct *work = (struct work_struct *)fp->bpf_func;
+
+ INIT_WORK(work, jit_free_defer);
+ schedule_work(work);
+ }
+}
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 2d49d4e19a3..bff89dfe361 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -11,21 +11,11 @@
#include <linux/oprofile.h>
#include <linux/sched.h>
#include <linux/mm.h>
-#include <asm/ptrace.h>
-#include <asm/uaccess.h>
-#include <asm/stacktrace.h>
#include <linux/compat.h>
+#include <linux/uaccess.h>
-static void backtrace_warning_symbol(void *data, char *msg,
- unsigned long symbol)
-{
- /* Ignore warnings */
-}
-
-static void backtrace_warning(void *data, char *msg)
-{
- /* Ignore warnings */
-}
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
static int backtrace_stack(void *data, char *name)
{
@@ -42,8 +32,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
}
static struct stacktrace_ops backtrace_ops = {
- .warning = backtrace_warning,
- .warning_symbol = backtrace_warning_symbol,
.stack = backtrace_stack,
.address = backtrace_address,
.walk_stack = print_context_stack,
@@ -53,13 +41,13 @@ static struct stacktrace_ops backtrace_ops = {
static struct stack_frame_ia32 *
dump_user_backtrace_32(struct stack_frame_ia32 *head)
{
+ /* Also check accessibility of one struct frame_head beyond: */
struct stack_frame_ia32 bufhead[2];
struct stack_frame_ia32 *fp;
+ unsigned long bytes;
- /* Also check accessibility of one struct frame_head beyond */
- if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
- return NULL;
- if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
+ bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
+ if (bytes != sizeof(bufhead))
return NULL;
fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
@@ -100,12 +88,12 @@ x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
{
+ /* Also check accessibility of one struct frame_head beyond: */
struct stack_frame bufhead[2];
+ unsigned long bytes;
- /* Also check accessibility of one struct stack_frame beyond */
- if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
- return NULL;
- if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
+ bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
+ if (bytes != sizeof(bufhead))
return NULL;
oprofile_add_trace(bufhead[0].return_address);
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index cf9750004a0..68894fdc034 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -112,8 +112,10 @@ static void nmi_cpu_start(void *dummy)
static int nmi_start(void)
{
get_online_cpus();
- on_each_cpu(nmi_cpu_start, NULL, 1);
ctr_running = 1;
+ /* make ctr_running visible to the nmi handler: */
+ smp_mb();
+ on_each_cpu(nmi_cpu_start, NULL, 1);
put_online_cpus();
return 0;
}
@@ -504,15 +506,18 @@ static int nmi_setup(void)
nmi_enabled = 0;
ctr_running = 0;
- barrier();
+ /* make variables visible to the nmi handler: */
+ smp_mb();
err = register_die_notifier(&profile_exceptions_nb);
if (err)
goto fail;
get_online_cpus();
register_cpu_notifier(&oprofile_cpu_nb);
- on_each_cpu(nmi_cpu_setup, NULL, 1);
nmi_enabled = 1;
+ /* make nmi_enabled visible to the nmi handler: */
+ smp_mb();
+ on_each_cpu(nmi_cpu_setup, NULL, 1);
put_online_cpus();
return 0;
@@ -531,7 +536,8 @@ static void nmi_shutdown(void)
nmi_enabled = 0;
ctr_running = 0;
put_online_cpus();
- barrier();
+ /* make variables visible to the nmi handler: */
+ smp_mb();
unregister_die_notifier(&profile_exceptions_nb);
msrs = &get_cpu_var(cpu_msrs);
model->shutdown(msrs);
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index c3b8e24f2b1..9cbb710dc94 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -316,16 +316,23 @@ static void op_amd_stop_ibs(void)
wrmsrl(MSR_AMD64_IBSOPCTL, 0);
}
-static inline int eilvt_is_available(int offset)
+static inline int get_eilvt(int offset)
{
- /* check if we may assign a vector */
return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
}
+static inline int put_eilvt(int offset)
+{
+ return !setup_APIC_eilvt(offset, 0, 0, 1);
+}
+
static inline int ibs_eilvt_valid(void)
{
int offset;
u64 val;
+ int valid = 0;
+
+ preempt_disable();
rdmsrl(MSR_AMD64_IBSCTL, val);
offset = val & IBSCTL_LVT_OFFSET_MASK;
@@ -333,16 +340,20 @@ static inline int ibs_eilvt_valid(void)
if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
- return 0;
+ goto out;
}
- if (!eilvt_is_available(offset)) {
+ if (!get_eilvt(offset)) {
pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
- return 0;
+ goto out;
}
- return 1;
+ valid = 1;
+out:
+ preempt_enable();
+
+ return valid;
}
static inline int get_ibs_offset(void)
@@ -598,69 +609,76 @@ static int setup_ibs_ctl(int ibs_eilvt_off)
return 0;
}
+/*
+ * This runs only on the current cpu. We try to find an LVT offset and
+ * setup the local APIC. For this we must disable preemption. On
+ * success we initialize all nodes with this offset. This updates then
+ * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
+ * the IBS interrupt vector is called from op_amd_setup_ctrs()/op_-
+ * amd_cpu_shutdown() using the new offset.
+ */
static int force_ibs_eilvt_setup(void)
{
- int i;
+ int offset;
int ret;
- /* find the next free available EILVT entry */
- for (i = 1; i < 4; i++) {
- if (!eilvt_is_available(i))
- continue;
- ret = setup_ibs_ctl(i);
- if (ret)
- return ret;
- pr_err(FW_BUG "using offset %d for IBS interrupts\n", i);
- return 0;
+ preempt_disable();
+ /* find the next free available EILVT entry, skip offset 0 */
+ for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
+ if (get_eilvt(offset))
+ break;
}
+ preempt_enable();
- printk(KERN_DEBUG "No EILVT entry available\n");
-
- return -EBUSY;
-}
-
-static int __init_ibs_nmi(void)
-{
- int ret;
-
- if (ibs_eilvt_valid())
- return 0;
+ if (offset == APIC_EILVT_NR_MAX) {
+ printk(KERN_DEBUG "No EILVT entry available\n");
+ return -EBUSY;
+ }
- ret = force_ibs_eilvt_setup();
+ ret = setup_ibs_ctl(offset);
if (ret)
- return ret;
+ goto out;
- if (!ibs_eilvt_valid())
- return -EFAULT;
+ if (!ibs_eilvt_valid()) {
+ ret = -EFAULT;
+ goto out;
+ }
+ pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset);
pr_err(FW_BUG "workaround enabled for IBS LVT offset\n");
return 0;
+out:
+ preempt_disable();
+ put_eilvt(offset);
+ preempt_enable();
+ return ret;
}
/*
* check and reserve APIC extended interrupt LVT offset for IBS if
* available
- *
- * init_ibs() preforms implicitly cpu-local operations, so pin this
- * thread to its current CPU
*/
static void init_ibs(void)
{
- preempt_disable();
-
ibs_caps = get_ibs_caps();
+
if (!ibs_caps)
+ return;
+
+ if (ibs_eilvt_valid())
goto out;
- if (__init_ibs_nmi() < 0)
- ibs_caps = 0;
- else
- printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
+ if (!force_ibs_eilvt_setup())
+ goto out;
+
+ /* Failed to setup ibs */
+ ibs_caps = 0;
+ return;
out:
- preempt_enable();
+ printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
}
static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 0972315c386..68c3c139520 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -188,7 +188,7 @@ static bool resource_contains(struct resource *res, resource_size_t point)
return false;
}
-static void coalesce_windows(struct pci_root_info *info, int type)
+static void coalesce_windows(struct pci_root_info *info, unsigned long type)
{
int i, j;
struct resource *res1, *res2;
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index bd33620b007..e6fd8473fb7 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -280,12 +280,9 @@ void __init pci_direct_init(int type)
int __init pci_direct_probe(void)
{
- struct resource *region, *region2;
-
if ((pci_probe & PCI_PROBE_CONF1) == 0)
goto type2;
- region = request_region(0xCF8, 8, "PCI conf1");
- if (!region)
+ if (!request_region(0xCF8, 8, "PCI conf1"))
goto type2;
if (pci_check_type1()) {
@@ -293,16 +290,14 @@ int __init pci_direct_probe(void)
port_cf9_safe = true;
return 1;
}
- release_resource(region);
+ release_region(0xCF8, 8);
type2:
if ((pci_probe & PCI_PROBE_CONF2) == 0)
return 0;
- region = request_region(0xCF8, 4, "PCI conf2");
- if (!region)
+ if (!request_region(0xCF8, 4, "PCI conf2"))
return 0;
- region2 = request_region(0xC000, 0x1000, "PCI conf2");
- if (!region2)
+ if (!request_region(0xC000, 0x1000, "PCI conf2"))
goto fail2;
if (pci_check_type2()) {
@@ -311,8 +306,8 @@ int __init pci_direct_probe(void)
return 2;
}
- release_resource(region2);
+ release_region(0xC000, 0x1000);
fail2:
- release_resource(region);
+ release_region(0xCF8, 4);
return 0;
}
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 8201165bae2..372e9b8989b 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -602,7 +602,9 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
|| (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN &&
device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)
|| (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN &&
- device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)) {
+ device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)
+ || (device >= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MIN &&
+ device <= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MAX)) {
r->name = "PIIX/ICH";
r->get = pirq_piix_get;
r->set = pirq_piix_set;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index e282886616a..301e325992f 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -519,7 +519,8 @@ static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
if (cfg->address < 0xFFFFFFFF)
return 0;
- if (!strcmp(mcfg->header.oem_id, "SGI"))
+ if (!strcmp(mcfg->header.oem_id, "SGI") ||
+ !strcmp(mcfg->header.oem_id, "SGI2"))
return 0;
if (mcfg->header.revision >= 1) {
@@ -606,6 +607,16 @@ static void __init __pci_mmcfg_init(int early)
if (list_empty(&pci_mmcfg_list))
return;
+ if (pcibios_last_bus < 0) {
+ const struct pci_mmcfg_region *cfg;
+
+ list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+ if (cfg->segment)
+ break;
+ pcibios_last_bus = cfg->end_bus;
+ }
+ }
+
if (pci_mmcfg_arch_init())
pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
else {
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index e37b407a0ee..1017c7bee38 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -1,8 +1,13 @@
/*
- * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux
- * x86 PCI core to support the Xen PCI Frontend
+ * Xen PCI - handle PCI (INTx) and MSI infrastructure calls for PV, HVM and
+ * initial domain support. We also handle the DSDT _PRT callbacks for GSI's
+ * used in HVM and initial domain mode (PV does not parse ACPI, so it has no
+ * concept of GSIs). Under PV we hook under the pnbbios API for IRQs and
+ * 0xcf8 PCI configuration read/write.
*
* Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ * Stefano Stabellini <stefano.stabellini@eu.citrix.com>
*/
#include <linux/module.h>
#include <linux/init.h>
@@ -19,22 +24,53 @@
#include <xen/events.h>
#include <asm/xen/pci.h>
+static int xen_pcifront_enable_irq(struct pci_dev *dev)
+{
+ int rc;
+ int share = 1;
+ int pirq;
+ u8 gsi;
+
+ rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
+ if (rc < 0) {
+ dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
+ rc);
+ return rc;
+ }
+ /* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/
+ pirq = gsi;
+
+ if (gsi < NR_IRQS_LEGACY)
+ share = 0;
+
+ rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
+ if (rc < 0) {
+ dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
+ gsi, pirq, rc);
+ return rc;
+ }
+
+ dev->irq = rc;
+ dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
+ return 0;
+}
+
#ifdef CONFIG_ACPI
-static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
- int trigger, int polarity)
+static int xen_register_pirq(u32 gsi, int gsi_override, int triggering,
+ bool set_pirq)
{
- int rc, irq;
+ int rc, pirq = -1, irq = -1;
struct physdev_map_pirq map_irq;
int shareable = 0;
char *name;
- if (!xen_hvm_domain())
- return -1;
+ if (set_pirq)
+ pirq = gsi;
map_irq.domid = DOMID_SELF;
map_irq.type = MAP_PIRQ_TYPE_GSI;
map_irq.index = gsi;
- map_irq.pirq = -1;
+ map_irq.pirq = pirq;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
if (rc) {
@@ -42,7 +78,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
return -1;
}
- if (trigger == ACPI_EDGE_SENSITIVE) {
+ if (triggering == ACPI_EDGE_SENSITIVE) {
shareable = 0;
name = "ioapic-edge";
} else {
@@ -50,12 +86,63 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
name = "ioapic-level";
}
+ if (gsi_override >= 0)
+ gsi = gsi_override;
+
irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name);
+ if (irq < 0)
+ goto out;
- printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
+ printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", map_irq.pirq, irq, gsi);
+out:
+ return irq;
+}
+
+static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+ if (!xen_hvm_domain())
+ return -1;
+
+ return xen_register_pirq(gsi, -1 /* no GSI override */, trigger,
+ false /* no mapping of GSI to PIRQ */);
+}
+
+#ifdef CONFIG_XEN_DOM0
+static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity)
+{
+ int rc, irq;
+ struct physdev_setup_gsi setup_gsi;
+
+ if (!xen_pv_domain())
+ return -1;
+
+ printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
+ gsi, triggering, polarity);
+
+ irq = xen_register_pirq(gsi, gsi_override, triggering, true);
+
+ setup_gsi.gsi = gsi;
+ setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
+ setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
+ if (rc == -EEXIST)
+ printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
+ else if (rc) {
+ printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
+ gsi, rc);
+ }
return irq;
}
+
+static int acpi_register_gsi_xen(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+ return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity);
+}
+#endif
#endif
#if defined(CONFIG_PCI_MSI)
@@ -65,6 +152,43 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
struct xen_pci_frontend_ops *xen_pci_frontend;
EXPORT_SYMBOL_GPL(xen_pci_frontend);
+static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ int irq, ret, i;
+ struct msi_desc *msidesc;
+ int *v;
+
+ v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
+ if (!v)
+ return -ENOMEM;
+
+ if (type == PCI_CAP_ID_MSIX)
+ ret = xen_pci_frontend_enable_msix(dev, v, nvec);
+ else
+ ret = xen_pci_frontend_enable_msi(dev, v);
+ if (ret)
+ goto error;
+ i = 0;
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
+ (type == PCI_CAP_ID_MSIX) ?
+ "pcifront-msi-x" :
+ "pcifront-msi",
+ DOMID_SELF);
+ if (irq < 0)
+ goto free;
+ i++;
+ }
+ kfree(v);
+ return 0;
+
+error:
+ dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
+free:
+ kfree(v);
+ return ret;
+}
+
#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \
MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0))
@@ -108,7 +232,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
}
irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0,
(type == PCI_CAP_ID_MSIX) ?
- "msi-x" : "msi");
+ "msi-x" : "msi",
+ DOMID_SELF);
if (irq < 0)
goto error;
dev_dbg(&dev->dev,
@@ -122,66 +247,6 @@ error:
return -ENODEV;
}
-/*
- * For MSI interrupts we have to use drivers/xen/event.s functions to
- * allocate an irq_desc and setup the right */
-
-
-static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
- int irq, ret, i;
- struct msi_desc *msidesc;
- int *v;
-
- v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
- if (!v)
- return -ENOMEM;
-
- if (type == PCI_CAP_ID_MSIX)
- ret = xen_pci_frontend_enable_msix(dev, v, nvec);
- else
- ret = xen_pci_frontend_enable_msi(dev, v);
- if (ret)
- goto error;
- i = 0;
- list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
- (type == PCI_CAP_ID_MSIX) ?
- "pcifront-msi-x" :
- "pcifront-msi");
- if (irq < 0)
- goto free;
- i++;
- }
- kfree(v);
- return 0;
-
-error:
- dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
-free:
- kfree(v);
- return ret;
-}
-
-static void xen_teardown_msi_irqs(struct pci_dev *dev)
-{
- struct msi_desc *msidesc;
-
- msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
- if (msidesc->msi_attrib.is_msix)
- xen_pci_frontend_disable_msix(dev);
- else
- xen_pci_frontend_disable_msi(dev);
-
- /* Free the IRQ's and the msidesc using the generic code. */
- default_teardown_msi_irqs(dev);
-}
-
-static void xen_teardown_msi_irq(unsigned int irq)
-{
- xen_destroy_irq(irq);
-}
-
#ifdef CONFIG_XEN_DOM0
static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
@@ -190,9 +255,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
list_for_each_entry(msidesc, &dev->msi_list, list) {
struct physdev_map_pirq map_irq;
+ domid_t domid;
+
+ domid = ret = xen_find_device_domain_owner(dev);
+ /* N.B. Casting int's -ENODEV to uint16_t results in 0xFFED,
+ * hence check ret value for < 0. */
+ if (ret < 0)
+ domid = DOMID_SELF;
memset(&map_irq, 0, sizeof(map_irq));
- map_irq.domid = DOMID_SELF;
+ map_irq.domid = domid;
map_irq.type = MAP_PIRQ_TYPE_MSI;
map_irq.index = -1;
map_irq.pirq = -1;
@@ -215,14 +287,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
if (ret) {
- dev_warn(&dev->dev, "xen map irq failed %d\n", ret);
+ dev_warn(&dev->dev, "xen map irq failed %d for %d domain\n",
+ ret, domid);
goto out;
}
ret = xen_bind_pirq_msi_to_irq(dev, msidesc,
map_irq.pirq, map_irq.index,
(type == PCI_CAP_ID_MSIX) ?
- "msi-x" : "msi");
+ "msi-x" : "msi",
+ domid);
if (ret < 0)
goto out;
}
@@ -231,45 +305,28 @@ out:
return ret;
}
#endif
-#endif
-static int xen_pcifront_enable_irq(struct pci_dev *dev)
+static void xen_teardown_msi_irqs(struct pci_dev *dev)
{
- int rc;
- int share = 1;
- int pirq;
- u8 gsi;
-
- rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
- if (rc < 0) {
- dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
- rc);
- return rc;
- }
-
- rc = xen_allocate_pirq_gsi(gsi);
- if (rc < 0) {
- dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n",
- gsi, rc);
- return rc;
- }
- pirq = rc;
+ struct msi_desc *msidesc;
- if (gsi < NR_IRQS_LEGACY)
- share = 0;
+ msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
+ if (msidesc->msi_attrib.is_msix)
+ xen_pci_frontend_disable_msix(dev);
+ else
+ xen_pci_frontend_disable_msi(dev);
- rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
- if (rc < 0) {
- dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
- gsi, pirq, rc);
- return rc;
- }
+ /* Free the IRQ's and the msidesc using the generic code. */
+ default_teardown_msi_irqs(dev);
+}
- dev->irq = rc;
- dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
- return 0;
+static void xen_teardown_msi_irq(unsigned int irq)
+{
+ xen_destroy_irq(irq);
}
+#endif
+
int __init pci_xen_init(void)
{
if (!xen_pv_domain() || xen_initial_domain())
@@ -316,82 +373,13 @@ int __init pci_xen_hvm_init(void)
}
#ifdef CONFIG_XEN_DOM0
-static int xen_register_pirq(u32 gsi, int triggering)
-{
- int rc, pirq, irq = -1;
- struct physdev_map_pirq map_irq;
- int shareable = 0;
- char *name;
-
- if (!xen_pv_domain())
- return -1;
-
- if (triggering == ACPI_EDGE_SENSITIVE) {
- shareable = 0;
- name = "ioapic-edge";
- } else {
- shareable = 1;
- name = "ioapic-level";
- }
-
- pirq = xen_allocate_pirq_gsi(gsi);
- if (pirq < 0)
- goto out;
-
- irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name);
- if (irq < 0)
- goto out;
-
- printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d\n", pirq, irq);
-
- map_irq.domid = DOMID_SELF;
- map_irq.type = MAP_PIRQ_TYPE_GSI;
- map_irq.index = gsi;
- map_irq.pirq = pirq;
-
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
- if (rc) {
- printk(KERN_WARNING "xen map irq failed %d\n", rc);
- return -1;
- }
-
-out:
- return irq;
-}
-
-static int xen_register_gsi(u32 gsi, int triggering, int polarity)
-{
- int rc, irq;
- struct physdev_setup_gsi setup_gsi;
-
- if (!xen_pv_domain())
- return -1;
-
- printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
- gsi, triggering, polarity);
-
- irq = xen_register_pirq(gsi, triggering);
-
- setup_gsi.gsi = gsi;
- setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
- setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
- if (rc == -EEXIST)
- printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
- else if (rc) {
- printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
- gsi, rc);
- }
-
- return irq;
-}
-
static __init void xen_setup_acpi_sci(void)
{
int rc;
int trigger, polarity;
int gsi = acpi_sci_override_gsi;
+ int irq = -1;
+ int gsi_override = -1;
if (!gsi)
return;
@@ -404,60 +392,131 @@ static __init void xen_setup_acpi_sci(void)
}
trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
-
+
printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d "
"polarity=%d\n", gsi, trigger, polarity);
- gsi = xen_register_gsi(gsi, trigger, polarity);
+ /* Before we bind the GSI to a Linux IRQ, check whether
+ * we need to override it with bus_irq (IRQ) value. Usually for
+ * IRQs below IRQ_LEGACY_IRQ this holds IRQ == GSI, as so:
+ * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 low level)
+ * but there are oddballs where the IRQ != GSI:
+ * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 20 low level)
+ * which ends up being: gsi_to_irq[9] == 20
+ * (which is what acpi_gsi_to_irq ends up calling when starting the
+ * the ACPI interpreter and keels over since IRQ 9 has not been
+ * setup as we had setup IRQ 20 for it).
+ */
+ if (acpi_gsi_to_irq(gsi, &irq) == 0) {
+ /* Use the provided value if it's valid. */
+ if (irq >= 0)
+ gsi_override = irq;
+ }
+
+ gsi = xen_register_gsi(gsi, gsi_override, trigger, polarity);
printk(KERN_INFO "xen: acpi sci %d\n", gsi);
return;
}
-static int acpi_register_gsi_xen(struct device *dev, u32 gsi,
- int trigger, int polarity)
+int __init pci_xen_initial_domain(void)
{
- return xen_register_gsi(gsi, trigger, polarity);
-}
+ int irq;
-static int __init pci_xen_initial_domain(void)
-{
#ifdef CONFIG_PCI_MSI
x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs;
x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
#endif
xen_setup_acpi_sci();
__acpi_register_gsi = acpi_register_gsi_xen;
+ /* Pre-allocate legacy irqs */
+ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
+ int trigger, polarity;
+
+ if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
+ continue;
+ xen_register_pirq(irq, -1 /* no GSI override */,
+ trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE,
+ true /* Map GSI to PIRQ */);
+ }
+ if (0 == nr_ioapics) {
+ for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
+ xen_bind_pirq_gsi_to_irq(irq, irq, 0, "xt-pic");
+ }
return 0;
}
-void __init xen_setup_pirqs(void)
-{
- int pirq, irq;
+struct xen_device_domain_owner {
+ domid_t domain;
+ struct pci_dev *dev;
+ struct list_head list;
+};
- pci_xen_initial_domain();
+static DEFINE_SPINLOCK(dev_domain_list_spinlock);
+static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list);
- if (0 == nr_ioapics) {
- for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
- pirq = xen_allocate_pirq_gsi(irq);
- if (WARN(pirq < 0,
- "Could not allocate PIRQ for legacy interrupt\n"))
- break;
- irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic");
- }
- return;
+static struct xen_device_domain_owner *find_device(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+
+ list_for_each_entry(owner, &dev_domain_list, list) {
+ if (owner->dev == dev)
+ return owner;
}
+ return NULL;
+}
- /* Pre-allocate legacy irqs */
- for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
- int trigger, polarity;
+int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+ int domain = -ENODEV;
+
+ spin_lock(&dev_domain_list_spinlock);
+ owner = find_device(dev);
+ if (owner)
+ domain = owner->domain;
+ spin_unlock(&dev_domain_list_spinlock);
+ return domain;
+}
+EXPORT_SYMBOL_GPL(xen_find_device_domain_owner);
- if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
- continue;
+int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain)
+{
+ struct xen_device_domain_owner *owner;
- xen_register_pirq(irq,
- trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE);
+ owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL);
+ if (!owner)
+ return -ENODEV;
+
+ spin_lock(&dev_domain_list_spinlock);
+ if (find_device(dev)) {
+ spin_unlock(&dev_domain_list_spinlock);
+ kfree(owner);
+ return -EEXIST;
+ }
+ owner->domain = domain;
+ owner->dev = dev;
+ list_add_tail(&owner->list, &dev_domain_list);
+ spin_unlock(&dev_domain_list_spinlock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xen_register_device_domain_owner);
+
+int xen_unregister_device_domain_owner(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+
+ spin_lock(&dev_domain_list_spinlock);
+ owner = find_device(dev);
+ if (!owner) {
+ spin_unlock(&dev_domain_list_spinlock);
+ return -ENODEV;
}
+ list_del(&owner->list);
+ spin_unlock(&dev_domain_list_spinlock);
+ kfree(owner);
+ return 0;
}
+EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner);
#endif
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 0fe27d7c625..3ae4128013e 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -51,7 +51,17 @@
int efi_enabled;
EXPORT_SYMBOL(efi_enabled);
-struct efi efi;
+struct efi __read_mostly efi = {
+ .mps = EFI_INVALID_TABLE_ADDR,
+ .acpi = EFI_INVALID_TABLE_ADDR,
+ .acpi20 = EFI_INVALID_TABLE_ADDR,
+ .smbios = EFI_INVALID_TABLE_ADDR,
+ .sal_systab = EFI_INVALID_TABLE_ADDR,
+ .boot_info = EFI_INVALID_TABLE_ADDR,
+ .hcdp = EFI_INVALID_TABLE_ADDR,
+ .uga = EFI_INVALID_TABLE_ADDR,
+ .uv_systab = EFI_INVALID_TABLE_ADDR,
+};
EXPORT_SYMBOL(efi);
struct efi_memory_map memmap;
@@ -79,26 +89,50 @@ early_param("add_efi_memmap", setup_add_efi_memmap);
static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
{
- return efi_call_virt2(get_time, tm, tc);
+ unsigned long flags;
+ efi_status_t status;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ status = efi_call_virt2(get_time, tm, tc);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return status;
}
static efi_status_t virt_efi_set_time(efi_time_t *tm)
{
- return efi_call_virt1(set_time, tm);
+ unsigned long flags;
+ efi_status_t status;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ status = efi_call_virt1(set_time, tm);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return status;
}
static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
efi_bool_t *pending,
efi_time_t *tm)
{
- return efi_call_virt3(get_wakeup_time,
- enabled, pending, tm);
+ unsigned long flags;
+ efi_status_t status;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ status = efi_call_virt3(get_wakeup_time,
+ enabled, pending, tm);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return status;
}
static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
{
- return efi_call_virt2(set_wakeup_time,
- enabled, tm);
+ unsigned long flags;
+ efi_status_t status;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ status = efi_call_virt2(set_wakeup_time,
+ enabled, tm);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return status;
}
static efi_status_t virt_efi_get_variable(efi_char16_t *name,
@@ -122,7 +156,7 @@ static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
static efi_status_t virt_efi_set_variable(efi_char16_t *name,
efi_guid_t *vendor,
- unsigned long attr,
+ u32 attr,
unsigned long data_size,
void *data)
{
@@ -131,6 +165,18 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name,
data_size, data);
}
+static efi_status_t virt_efi_query_variable_info(u32 attr,
+ u64 *storage_space,
+ u64 *remaining_space,
+ u64 *max_variable_size)
+{
+ if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+ return EFI_UNSUPPORTED;
+
+ return efi_call_virt4(query_variable_info, attr, storage_space,
+ remaining_space, max_variable_size);
+}
+
static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
{
return efi_call_virt1(get_next_high_mono_count, count);
@@ -145,15 +191,26 @@ static void virt_efi_reset_system(int reset_type,
data_size, data);
}
-static efi_status_t virt_efi_set_virtual_address_map(
- unsigned long memory_map_size,
- unsigned long descriptor_size,
- u32 descriptor_version,
- efi_memory_desc_t *virtual_map)
+static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules,
+ unsigned long count,
+ unsigned long sg_list)
{
- return efi_call_virt4(set_virtual_address_map,
- memory_map_size, descriptor_size,
- descriptor_version, virtual_map);
+ if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+ return EFI_UNSUPPORTED;
+
+ return efi_call_virt3(update_capsule, capsules, count, sg_list);
+}
+
+static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules,
+ unsigned long count,
+ u64 *max_size,
+ int *reset_type)
+{
+ if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+ return EFI_UNSUPPORTED;
+
+ return efi_call_virt4(query_capsule_caps, capsules, count, max_size,
+ reset_type);
}
static efi_status_t __init phys_efi_set_virtual_address_map(
@@ -175,11 +232,14 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
efi_time_cap_t *tc)
{
+ unsigned long flags;
efi_status_t status;
+ spin_lock_irqsave(&rtc_lock, flags);
efi_call_phys_prelog();
status = efi_call_phys2(efi_phys.get_time, tm, tc);
efi_call_phys_epilog();
+ spin_unlock_irqrestore(&rtc_lock, flags);
return status;
}
@@ -315,6 +375,61 @@ static void __init print_efi_memmap(void)
}
#endif /* EFI_DEBUG */
+void __init efi_reserve_boot_services(void)
+{
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ u64 start = md->phys_addr;
+ u64 size = md->num_pages << EFI_PAGE_SHIFT;
+
+ if (md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA)
+ continue;
+ /* Only reserve where possible:
+ * - Not within any already allocated areas
+ * - Not over any memory area (really needed, if above?)
+ * - Not within any part of the kernel
+ * - Not the bios reserved area
+ */
+ if ((start+size >= virt_to_phys(_text)
+ && start <= virt_to_phys(_end)) ||
+ !e820_all_mapped(start, start+size, E820_RAM) ||
+ memblock_x86_check_reserved_size(&start, &size,
+ 1<<EFI_PAGE_SHIFT)) {
+ /* Could not reserve, skip it */
+ md->num_pages = 0;
+ memblock_dbg(PFX "Could not reserve boot range "
+ "[0x%010llx-0x%010llx]\n",
+ start, start+size-1);
+ } else
+ memblock_x86_reserve_range(start, start+size,
+ "EFI Boot");
+ }
+}
+
+static void __init efi_free_boot_services(void)
+{
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ unsigned long long start = md->phys_addr;
+ unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+
+ if (md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA)
+ continue;
+
+ /* Could not reserve boot area */
+ if (!size)
+ continue;
+
+ free_bootmem_late(start, size);
+ }
+}
+
void __init efi_init(void)
{
efi_config_table_t *config_tables;
@@ -460,19 +575,30 @@ void __init efi_init(void)
x86_platform.set_wallclock = efi_set_rtc_mmss;
#endif
- /* Setup for EFI runtime service */
- reboot_type = BOOT_EFI;
-
#if EFI_DEBUG
print_efi_memmap();
#endif
}
+void __init efi_set_executable(efi_memory_desc_t *md, bool executable)
+{
+ u64 addr, npages;
+
+ addr = md->virt_addr;
+ npages = md->num_pages;
+
+ memrange_efi_to_native(&addr, &npages);
+
+ if (executable)
+ set_memory_x(addr, npages);
+ else
+ set_memory_nx(addr, npages);
+}
+
static void __init runtime_code_page_mkexec(void)
{
efi_memory_desc_t *md;
void *p;
- u64 addr, npages;
/* Make EFI runtime service code area executable */
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
@@ -481,10 +607,7 @@ static void __init runtime_code_page_mkexec(void)
if (md->type != EFI_RUNTIME_SERVICES_CODE)
continue;
- addr = md->virt_addr;
- npages = md->num_pages;
- memrange_efi_to_native(&addr, &npages);
- set_memory_x(addr, npages);
+ efi_set_executable(md, true);
}
}
@@ -498,16 +621,47 @@ static void __init runtime_code_page_mkexec(void)
*/
void __init efi_enter_virtual_mode(void)
{
- efi_memory_desc_t *md;
+ efi_memory_desc_t *md, *prev_md = NULL;
efi_status_t status;
unsigned long size;
u64 end, systab, addr, npages, end_pfn;
- void *p, *va;
+ void *p, *va, *new_memmap = NULL;
+ int count = 0;
efi.systab = NULL;
+
+ /* Merge contiguous regions of the same type and attribute */
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ u64 prev_size;
+ md = p;
+
+ if (!prev_md) {
+ prev_md = md;
+ continue;
+ }
+
+ if (prev_md->type != md->type ||
+ prev_md->attribute != md->attribute) {
+ prev_md = md;
+ continue;
+ }
+
+ prev_size = prev_md->num_pages << EFI_PAGE_SHIFT;
+
+ if (md->phys_addr == (prev_md->phys_addr + prev_size)) {
+ prev_md->num_pages += md->num_pages;
+ md->type = EFI_RESERVED_TYPE;
+ md->attribute = 0;
+ continue;
+ }
+ prev_md = md;
+ }
+
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
- if (!(md->attribute & EFI_MEMORY_RUNTIME))
+ if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
+ md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA)
continue;
size = md->num_pages << EFI_PAGE_SHIFT;
@@ -541,15 +695,21 @@ void __init efi_enter_virtual_mode(void)
systab += md->virt_addr - md->phys_addr;
efi.systab = (efi_system_table_t *) (unsigned long) systab;
}
+ new_memmap = krealloc(new_memmap,
+ (count + 1) * memmap.desc_size,
+ GFP_KERNEL);
+ memcpy(new_memmap + (count * memmap.desc_size), md,
+ memmap.desc_size);
+ count++;
}
BUG_ON(!efi.systab);
status = phys_efi_set_virtual_address_map(
- memmap.desc_size * memmap.nr_map,
+ memmap.desc_size * count,
memmap.desc_size,
memmap.desc_version,
- memmap.phys_map);
+ (efi_memory_desc_t *)__pa(new_memmap));
if (status != EFI_SUCCESS) {
printk(KERN_ALERT "Unable to switch EFI into virtual mode "
@@ -558,6 +718,13 @@ void __init efi_enter_virtual_mode(void)
}
/*
+ * Thankfully, it does seem that no runtime services other than
+ * SetVirtualAddressMap() will touch boot services code, so we can
+ * get rid of it all at this point
+ */
+ efi_free_boot_services();
+
+ /*
* Now that EFI is in virtual mode, update the function
* pointers in the runtime service table to the new virtual addresses.
*
@@ -572,11 +739,15 @@ void __init efi_enter_virtual_mode(void)
efi.set_variable = virt_efi_set_variable;
efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
efi.reset_system = virt_efi_reset_system;
- efi.set_virtual_address_map = virt_efi_set_virtual_address_map;
+ efi.set_virtual_address_map = NULL;
+ efi.query_variable_info = virt_efi_query_variable_info;
+ efi.update_capsule = virt_efi_update_capsule;
+ efi.query_capsule_caps = virt_efi_query_capsule_caps;
if (__supported_pte_mask & _PAGE_NX)
runtime_code_page_mkexec();
early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
memmap.map = NULL;
+ kfree(new_memmap);
}
/*
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ac0621a7ac3..ac3aa54e265 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -41,22 +41,7 @@
static pgd_t save_pgd __initdata;
static unsigned long efi_flags __initdata;
-static void __init early_mapping_set_exec(unsigned long start,
- unsigned long end,
- int executable)
-{
- unsigned long num_pages;
-
- start &= PMD_MASK;
- end = (end + PMD_SIZE - 1) & PMD_MASK;
- num_pages = (end - start) >> PAGE_SHIFT;
- if (executable)
- set_memory_x((unsigned long)__va(start), num_pages);
- else
- set_memory_nx((unsigned long)__va(start), num_pages);
-}
-
-static void __init early_runtime_code_mapping_set_exec(int executable)
+static void __init early_code_mapping_set_exec(int executable)
{
efi_memory_desc_t *md;
void *p;
@@ -64,14 +49,12 @@ static void __init early_runtime_code_mapping_set_exec(int executable)
if (!(__supported_pte_mask & _PAGE_NX))
return;
- /* Make EFI runtime service code area executable */
+ /* Make EFI service code area executable */
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
- if (md->type == EFI_RUNTIME_SERVICES_CODE) {
- unsigned long end;
- end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
- early_mapping_set_exec(md->phys_addr, end, executable);
- }
+ if (md->type == EFI_RUNTIME_SERVICES_CODE ||
+ md->type == EFI_BOOT_SERVICES_CODE)
+ efi_set_executable(md, executable);
}
}
@@ -79,7 +62,7 @@ void __init efi_call_phys_prelog(void)
{
unsigned long vaddress;
- early_runtime_code_mapping_set_exec(1);
+ early_code_mapping_set_exec(1);
local_irq_save(efi_flags);
vaddress = (unsigned long)__va(0x0UL);
save_pgd = *pgd_offset_k(0x0UL);
@@ -95,7 +78,7 @@ void __init efi_call_phys_epilog(void)
set_pgd(pgd_offset_k(0x0UL), save_pgd);
__flush_tlb_all();
local_irq_restore(efi_flags);
- early_runtime_code_mapping_set_exec(0);
+ early_code_mapping_set_exec(0);
}
void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
@@ -107,8 +90,10 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
return ioremap(phys_addr, size);
last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
- if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
- return NULL;
+ if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) {
+ unsigned long top = last_map_pfn << PAGE_SHIFT;
+ efi_ioremap(top, size - (top - phys_addr), type);
+ }
return (void __iomem *)__va(phys_addr);
}
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 275dbc19e2c..7000e74b308 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -194,7 +194,7 @@ static unsigned long __init mrst_calibrate_tsc(void)
return 0;
}
-void __init mrst_time_init(void)
+static void __init mrst_time_init(void)
{
sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
switch (mrst_timer_options) {
@@ -216,7 +216,7 @@ void __init mrst_time_init(void)
apbt_time_init();
}
-void __cpuinit mrst_arch_setup(void)
+static void __cpuinit mrst_arch_setup(void)
{
if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
__mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index c2a8cab65e5..81c5e2165c2 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,4 +1,2 @@
-obj-$(CONFIG_OLPC) += olpc.o
+obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o
obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
-obj-$(CONFIG_OLPC) += olpc_ofw.o
-obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index edaf3fe8dc5..0060fd59ea0 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -18,6 +18,7 @@
#include <linux/io.h>
#include <linux/string.h>
#include <linux/platform_device.h>
+#include <linux/of.h>
#include <asm/geode.h>
#include <asm/setup.h>
@@ -187,41 +188,43 @@ err:
}
EXPORT_SYMBOL_GPL(olpc_ec_cmd);
-static bool __init check_ofw_architecture(void)
+static bool __init check_ofw_architecture(struct device_node *root)
{
- size_t propsize;
- char olpc_arch[5];
- const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 };
- void *res[] = { &propsize };
+ const char *olpc_arch;
+ int propsize;
- if (olpc_ofw("getprop", args, res)) {
- printk(KERN_ERR "ofw: getprop call failed!\n");
- return false;
- }
+ olpc_arch = of_get_property(root, "architecture", &propsize);
return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0;
}
-static u32 __init get_board_revision(void)
+static u32 __init get_board_revision(struct device_node *root)
{
- size_t propsize;
- __be32 rev;
- const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
- void *res[] = { &propsize };
-
- if (olpc_ofw("getprop", args, res) || propsize != 4) {
- printk(KERN_ERR "ofw: getprop call failed!\n");
- return cpu_to_be32(0);
- }
- return be32_to_cpu(rev);
+ int propsize;
+ const __be32 *rev;
+
+ rev = of_get_property(root, "board-revision-int", &propsize);
+ if (propsize != 4)
+ return 0;
+
+ return be32_to_cpu(*rev);
}
static bool __init platform_detect(void)
{
- if (!check_ofw_architecture())
+ struct device_node *root = of_find_node_by_path("/");
+ bool success;
+
+ if (!root)
return false;
- olpc_platform_info.flags |= OLPC_F_PRESENT;
- olpc_platform_info.boardrev = get_board_revision();
- return true;
+
+ success = check_ofw_architecture(root);
+ if (success) {
+ olpc_platform_info.boardrev = get_board_revision(root);
+ olpc_platform_info.flags |= OLPC_F_PRESENT;
+ }
+
+ of_node_put(root);
+ return success;
}
static int __init add_xo1_platform_devices(void)
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
index 044bda5b317..d39f63d017d 100644
--- a/arch/x86/platform/olpc/olpc_dt.c
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -19,7 +19,9 @@
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/of.h>
+#include <linux/of_platform.h>
#include <linux/of_pdt.h>
+#include <asm/olpc.h>
#include <asm/olpc_ofw.h>
static phandle __init olpc_dt_getsibling(phandle node)
@@ -180,3 +182,20 @@ void __init olpc_dt_build_devicetree(void)
pr_info("PROM DT: Built device tree with %u bytes of memory.\n",
prom_early_allocated);
}
+
+/* A list of DT node/bus matches that we want to expose as platform devices */
+static struct of_device_id __initdata of_ids[] = {
+ { .compatible = "olpc,xo1-battery" },
+ { .compatible = "olpc,xo1-dcon" },
+ { .compatible = "olpc,xo1-rtc" },
+ {},
+};
+
+static int __init olpc_create_platform_devices(void)
+{
+ if (machine_is_olpc())
+ return of_platform_bus_probe(NULL, of_ids, NULL);
+ else
+ return 0;
+}
+device_initcall(olpc_create_platform_devices);
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index c58e0ea39ef..db8b915f54b 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1,7 +1,7 @@
/*
* SGI UltraViolet TLB flush routines.
*
- * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
+ * (c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI.
*
* This code is released under the GNU General Public License version 2 or
* later.
@@ -35,6 +35,7 @@ static int timeout_base_ns[] = {
5242880,
167772160
};
+
static int timeout_us;
static int nobau;
static int baudisabled;
@@ -42,20 +43,70 @@ static spinlock_t disable_lock;
static cycles_t congested_cycles;
/* tunables: */
-static int max_bau_concurrent = MAX_BAU_CONCURRENT;
-static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
-static int plugged_delay = PLUGGED_DELAY;
-static int plugsb4reset = PLUGSB4RESET;
-static int timeoutsb4reset = TIMEOUTSB4RESET;
-static int ipi_reset_limit = IPI_RESET_LIMIT;
-static int complete_threshold = COMPLETE_THRESHOLD;
-static int congested_response_us = CONGESTED_RESPONSE_US;
-static int congested_reps = CONGESTED_REPS;
-static int congested_period = CONGESTED_PERIOD;
+static int max_concurr = MAX_BAU_CONCURRENT;
+static int max_concurr_const = MAX_BAU_CONCURRENT;
+static int plugged_delay = PLUGGED_DELAY;
+static int plugsb4reset = PLUGSB4RESET;
+static int timeoutsb4reset = TIMEOUTSB4RESET;
+static int ipi_reset_limit = IPI_RESET_LIMIT;
+static int complete_threshold = COMPLETE_THRESHOLD;
+static int congested_respns_us = CONGESTED_RESPONSE_US;
+static int congested_reps = CONGESTED_REPS;
+static int congested_period = CONGESTED_PERIOD;
+
+static struct tunables tunables[] = {
+ {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
+ {&plugged_delay, PLUGGED_DELAY},
+ {&plugsb4reset, PLUGSB4RESET},
+ {&timeoutsb4reset, TIMEOUTSB4RESET},
+ {&ipi_reset_limit, IPI_RESET_LIMIT},
+ {&complete_threshold, COMPLETE_THRESHOLD},
+ {&congested_respns_us, CONGESTED_RESPONSE_US},
+ {&congested_reps, CONGESTED_REPS},
+ {&congested_period, CONGESTED_PERIOD}
+};
+
static struct dentry *tunables_dir;
static struct dentry *tunables_file;
-static int __init setup_nobau(char *arg)
+/* these correspond to the statistics printed by ptc_seq_show() */
+static char *stat_description[] = {
+ "sent: number of shootdown messages sent",
+ "stime: time spent sending messages",
+ "numuvhubs: number of hubs targeted with shootdown",
+ "numuvhubs16: number times 16 or more hubs targeted",
+ "numuvhubs8: number times 8 or more hubs targeted",
+ "numuvhubs4: number times 4 or more hubs targeted",
+ "numuvhubs2: number times 2 or more hubs targeted",
+ "numuvhubs1: number times 1 hub targeted",
+ "numcpus: number of cpus targeted with shootdown",
+ "dto: number of destination timeouts",
+ "retries: destination timeout retries sent",
+ "rok: : destination timeouts successfully retried",
+ "resetp: ipi-style resource resets for plugs",
+ "resett: ipi-style resource resets for timeouts",
+ "giveup: fall-backs to ipi-style shootdowns",
+ "sto: number of source timeouts",
+ "bz: number of stay-busy's",
+ "throt: number times spun in throttle",
+ "swack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE",
+ "recv: shootdown messages received",
+ "rtime: time spent processing messages",
+ "all: shootdown all-tlb messages",
+ "one: shootdown one-tlb messages",
+ "mult: interrupts that found multiple messages",
+ "none: interrupts that found no messages",
+ "retry: number of retry messages processed",
+ "canc: number messages canceled by retries",
+ "nocan: number retries that found nothing to cancel",
+ "reset: number of ipi-style reset requests processed",
+ "rcan: number messages canceled by reset requests",
+ "disable: number times use of the BAU was disabled",
+ "enable: number times use of the BAU was re-enabled"
+};
+
+static int __init
+setup_nobau(char *arg)
{
nobau = 1;
return 0;
@@ -63,7 +114,7 @@ static int __init setup_nobau(char *arg)
early_param("nobau", setup_nobau);
/* base pnode in this partition */
-static int uv_partition_base_pnode __read_mostly;
+static int uv_base_pnode __read_mostly;
/* position of pnode (which is nasid>>1): */
static int uv_nshift __read_mostly;
static unsigned long uv_mmask __read_mostly;
@@ -109,60 +160,52 @@ static int __init uvhub_to_first_apicid(int uvhub)
* clear of the Timeout bit (as well) will free the resource. No reply will
* be sent (the hardware will only do one reply per message).
*/
-static inline void uv_reply_to_message(struct msg_desc *mdp,
- struct bau_control *bcp)
+static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp)
{
unsigned long dw;
- struct bau_payload_queue_entry *msg;
+ struct bau_pq_entry *msg;
msg = mdp->msg;
if (!msg->canceled) {
- dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
- msg->sw_ack_vector;
- uv_write_local_mmr(
- UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
+ dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
+ write_mmr_sw_ack(dw);
}
msg->replied_to = 1;
- msg->sw_ack_vector = 0;
+ msg->swack_vec = 0;
}
/*
* Process the receipt of a RETRY message
*/
-static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
- struct bau_control *bcp)
+static void bau_process_retry_msg(struct msg_desc *mdp,
+ struct bau_control *bcp)
{
int i;
int cancel_count = 0;
- int slot2;
unsigned long msg_res;
unsigned long mmr = 0;
- struct bau_payload_queue_entry *msg;
- struct bau_payload_queue_entry *msg2;
- struct ptc_stats *stat;
+ struct bau_pq_entry *msg = mdp->msg;
+ struct bau_pq_entry *msg2;
+ struct ptc_stats *stat = bcp->statp;
- msg = mdp->msg;
- stat = bcp->statp;
stat->d_retries++;
/*
* cancel any message from msg+1 to the retry itself
*/
for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
- if (msg2 > mdp->va_queue_last)
- msg2 = mdp->va_queue_first;
+ if (msg2 > mdp->queue_last)
+ msg2 = mdp->queue_first;
if (msg2 == msg)
break;
- /* same conditions for cancellation as uv_do_reset */
+ /* same conditions for cancellation as do_reset */
if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
- (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
- msg->sw_ack_vector) == 0) &&
+ (msg2->swack_vec) && ((msg2->swack_vec &
+ msg->swack_vec) == 0) &&
(msg2->sending_cpu == msg->sending_cpu) &&
(msg2->msg_type != MSG_NOOP)) {
- slot2 = msg2 - mdp->va_queue_first;
- mmr = uv_read_local_mmr
- (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
- msg_res = msg2->sw_ack_vector;
+ mmr = read_mmr_sw_ack();
+ msg_res = msg2->swack_vec;
/*
* This is a message retry; clear the resources held
* by the previous message only if they timed out.
@@ -170,6 +213,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
* situation to report.
*/
if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
+ unsigned long mr;
/*
* is the resource timed out?
* make everyone ignore the cancelled message.
@@ -177,10 +221,8 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
msg2->canceled = 1;
stat->d_canceled++;
cancel_count++;
- uv_write_local_mmr(
- UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
- (msg_res << UV_SW_ACK_NPENDING) |
- msg_res);
+ mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
+ write_mmr_sw_ack(mr);
}
}
}
@@ -192,20 +234,19 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
* Do all the things a cpu should do for a TLB shootdown message.
* Other cpu's may come here at the same time for this message.
*/
-static void uv_bau_process_message(struct msg_desc *mdp,
- struct bau_control *bcp)
+static void bau_process_message(struct msg_desc *mdp,
+ struct bau_control *bcp)
{
- int msg_ack_count;
short socket_ack_count = 0;
- struct ptc_stats *stat;
- struct bau_payload_queue_entry *msg;
+ short *sp;
+ struct atomic_short *asp;
+ struct ptc_stats *stat = bcp->statp;
+ struct bau_pq_entry *msg = mdp->msg;
struct bau_control *smaster = bcp->socket_master;
/*
* This must be a normal message, or retry of a normal message
*/
- msg = mdp->msg;
- stat = bcp->statp;
if (msg->address == TLB_FLUSH_ALL) {
local_flush_tlb();
stat->d_alltlb++;
@@ -222,30 +263,32 @@ static void uv_bau_process_message(struct msg_desc *mdp,
* cpu number.
*/
if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
- uv_bau_process_retry_msg(mdp, bcp);
+ bau_process_retry_msg(mdp, bcp);
/*
- * This is a sw_ack message, so we have to reply to it.
+ * This is a swack message, so we have to reply to it.
* Count each responding cpu on the socket. This avoids
* pinging the count's cache line back and forth between
* the sockets.
*/
- socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
- &smaster->socket_acknowledge_count[mdp->msg_slot]);
+ sp = &smaster->socket_acknowledge_count[mdp->msg_slot];
+ asp = (struct atomic_short *)sp;
+ socket_ack_count = atom_asr(1, asp);
if (socket_ack_count == bcp->cpus_in_socket) {
+ int msg_ack_count;
/*
* Both sockets dump their completed count total into
* the message's count.
*/
smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
- msg_ack_count = atomic_add_short_return(socket_ack_count,
- (struct atomic_short *)&msg->acknowledge_count);
+ asp = (struct atomic_short *)&msg->acknowledge_count;
+ msg_ack_count = atom_asr(socket_ack_count, asp);
if (msg_ack_count == bcp->cpus_in_uvhub) {
/*
* All cpus in uvhub saw it; reply
*/
- uv_reply_to_message(mdp, bcp);
+ reply_to_message(mdp, bcp);
}
}
@@ -253,14 +296,18 @@ static void uv_bau_process_message(struct msg_desc *mdp,
}
/*
- * Determine the first cpu on a uvhub.
+ * Determine the first cpu on a pnode.
*/
-static int uvhub_to_first_cpu(int uvhub)
+static int pnode_to_first_cpu(int pnode, struct bau_control *smaster)
{
int cpu;
- for_each_present_cpu(cpu)
- if (uvhub == uv_cpu_to_blade_id(cpu))
+ struct hub_and_pnode *hpp;
+
+ for_each_present_cpu(cpu) {
+ hpp = &smaster->thp[cpu];
+ if (pnode == hpp->pnode)
return cpu;
+ }
return -1;
}
@@ -268,62 +315,51 @@ static int uvhub_to_first_cpu(int uvhub)
* Last resort when we get a large number of destination timeouts is
* to clear resources held by a given cpu.
* Do this with IPI so that all messages in the BAU message queue
- * can be identified by their nonzero sw_ack_vector field.
+ * can be identified by their nonzero swack_vec field.
*
* This is entered for a single cpu on the uvhub.
* The sender want's this uvhub to free a specific message's
- * sw_ack resources.
+ * swack resources.
*/
-static void
-uv_do_reset(void *ptr)
+static void do_reset(void *ptr)
{
int i;
- int slot;
- int count = 0;
- unsigned long mmr;
- unsigned long msg_res;
- struct bau_control *bcp;
- struct reset_args *rap;
- struct bau_payload_queue_entry *msg;
- struct ptc_stats *stat;
+ struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id());
+ struct reset_args *rap = (struct reset_args *)ptr;
+ struct bau_pq_entry *msg;
+ struct ptc_stats *stat = bcp->statp;
- bcp = &per_cpu(bau_control, smp_processor_id());
- rap = (struct reset_args *)ptr;
- stat = bcp->statp;
stat->d_resets++;
-
/*
* We're looking for the given sender, and
- * will free its sw_ack resource.
+ * will free its swack resource.
* If all cpu's finally responded after the timeout, its
* message 'replied_to' was set.
*/
- for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
- /* uv_do_reset: same conditions for cancellation as
- uv_bau_process_retry_msg() */
+ for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
+ unsigned long msg_res;
+ /* do_reset: same conditions for cancellation as
+ bau_process_retry_msg() */
if ((msg->replied_to == 0) &&
(msg->canceled == 0) &&
(msg->sending_cpu == rap->sender) &&
- (msg->sw_ack_vector) &&
+ (msg->swack_vec) &&
(msg->msg_type != MSG_NOOP)) {
+ unsigned long mmr;
+ unsigned long mr;
/*
* make everyone else ignore this message
*/
msg->canceled = 1;
- slot = msg - bcp->va_queue_first;
- count++;
/*
* only reset the resource if it is still pending
*/
- mmr = uv_read_local_mmr
- (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
- msg_res = msg->sw_ack_vector;
+ mmr = read_mmr_sw_ack();
+ msg_res = msg->swack_vec;
+ mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
if (mmr & msg_res) {
stat->d_rcanceled++;
- uv_write_local_mmr(
- UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
- (msg_res << UV_SW_ACK_NPENDING) |
- msg_res);
+ write_mmr_sw_ack(mr);
}
}
}
@@ -334,39 +370,42 @@ uv_do_reset(void *ptr)
* Use IPI to get all target uvhubs to release resources held by
* a given sending cpu number.
*/
-static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
- int sender)
+static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
{
- int uvhub;
- int cpu;
- cpumask_t mask;
+ int pnode;
+ int apnode;
+ int maskbits;
+ int sender = bcp->cpu;
+ cpumask_t *mask = bcp->uvhub_master->cpumask;
+ struct bau_control *smaster = bcp->socket_master;
struct reset_args reset_args;
reset_args.sender = sender;
-
- cpus_clear(mask);
+ cpus_clear(*mask);
/* find a single cpu for each uvhub in this distribution mask */
- for (uvhub = 0;
- uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
- uvhub++) {
- if (!bau_uvhub_isset(uvhub, distribution))
+ maskbits = sizeof(struct pnmask) * BITSPERBYTE;
+ /* each bit is a pnode relative to the partition base pnode */
+ for (pnode = 0; pnode < maskbits; pnode++) {
+ int cpu;
+ if (!bau_uvhub_isset(pnode, distribution))
continue;
- /* find a cpu for this uvhub */
- cpu = uvhub_to_first_cpu(uvhub);
- cpu_set(cpu, mask);
+ apnode = pnode + bcp->partition_base_pnode;
+ cpu = pnode_to_first_cpu(apnode, smaster);
+ cpu_set(cpu, *mask);
}
- /* IPI all cpus; Preemption is already disabled */
- smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
+
+ /* IPI all cpus; preemption is already disabled */
+ smp_call_function_many(mask, do_reset, (void *)&reset_args, 1);
return;
}
-static inline unsigned long
-cycles_2_us(unsigned long long cyc)
+static inline unsigned long cycles_2_us(unsigned long long cyc)
{
unsigned long long ns;
unsigned long us;
- ns = (cyc * per_cpu(cyc2ns, smp_processor_id()))
- >> CYC2NS_SCALE_FACTOR;
+ int cpu = smp_processor_id();
+
+ ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR;
us = ns / 1000;
return us;
}
@@ -376,56 +415,56 @@ cycles_2_us(unsigned long long cyc)
* leaves uvhub_quiesce set so that no new broadcasts are started by
* bau_flush_send_and_wait()
*/
-static inline void
-quiesce_local_uvhub(struct bau_control *hmaster)
+static inline void quiesce_local_uvhub(struct bau_control *hmaster)
{
- atomic_add_short_return(1, (struct atomic_short *)
- &hmaster->uvhub_quiesce);
+ atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce);
}
/*
* mark this quiet-requestor as done
*/
-static inline void
-end_uvhub_quiesce(struct bau_control *hmaster)
+static inline void end_uvhub_quiesce(struct bau_control *hmaster)
{
- atomic_add_short_return(-1, (struct atomic_short *)
- &hmaster->uvhub_quiesce);
+ atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce);
+}
+
+static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift)
+{
+ unsigned long descriptor_status;
+
+ descriptor_status = uv_read_local_mmr(mmr_offset);
+ descriptor_status >>= right_shift;
+ descriptor_status &= UV_ACT_STATUS_MASK;
+ return descriptor_status;
}
/*
* Wait for completion of a broadcast software ack message
* return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
*/
-static int uv_wait_completion(struct bau_desc *bau_desc,
- unsigned long mmr_offset, int right_shift, int this_cpu,
- struct bau_control *bcp, struct bau_control *smaster, long try)
+static int uv1_wait_completion(struct bau_desc *bau_desc,
+ unsigned long mmr_offset, int right_shift,
+ struct bau_control *bcp, long try)
{
unsigned long descriptor_status;
- cycles_t ttime;
+ cycles_t ttm;
struct ptc_stats *stat = bcp->statp;
- struct bau_control *hmaster;
-
- hmaster = bcp->uvhub_master;
+ descriptor_status = uv1_read_status(mmr_offset, right_shift);
/* spin on the status MMR, waiting for it to go idle */
- while ((descriptor_status = (((unsigned long)
- uv_read_local_mmr(mmr_offset) >>
- right_shift) & UV_ACT_STATUS_MASK)) !=
- DESC_STATUS_IDLE) {
+ while ((descriptor_status != DS_IDLE)) {
/*
- * Our software ack messages may be blocked because there are
- * no swack resources available. As long as none of them
- * has timed out hardware will NACK our message and its
- * state will stay IDLE.
+ * Our software ack messages may be blocked because
+ * there are no swack resources available. As long
+ * as none of them has timed out hardware will NACK
+ * our message and its state will stay IDLE.
*/
- if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
+ if (descriptor_status == DS_SOURCE_TIMEOUT) {
stat->s_stimeout++;
return FLUSH_GIVEUP;
- } else if (descriptor_status ==
- DESC_STATUS_DESTINATION_TIMEOUT) {
+ } else if (descriptor_status == DS_DESTINATION_TIMEOUT) {
stat->s_dtimeout++;
- ttime = get_cycles();
+ ttm = get_cycles();
/*
* Our retries may be blocked by all destination
@@ -433,8 +472,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
* pending. In that case hardware returns the
* ERROR that looks like a destination timeout.
*/
- if (cycles_2_us(ttime - bcp->send_message) <
- timeout_us) {
+ if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
bcp->conseccompletes = 0;
return FLUSH_RETRY_PLUGGED;
}
@@ -447,80 +485,160 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
*/
cpu_relax();
}
+ descriptor_status = uv1_read_status(mmr_offset, right_shift);
}
bcp->conseccompletes++;
return FLUSH_COMPLETE;
}
-static inline cycles_t
-sec_2_cycles(unsigned long sec)
+/*
+ * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
+ */
+static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu)
{
- unsigned long ns;
- cycles_t cyc;
+ unsigned long descriptor_status;
+ unsigned long descriptor_status2;
- ns = sec * 1000000000;
- cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
- return cyc;
+ descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
+ descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL;
+ descriptor_status = (descriptor_status << 1) | descriptor_status2;
+ return descriptor_status;
+}
+
+static int uv2_wait_completion(struct bau_desc *bau_desc,
+ unsigned long mmr_offset, int right_shift,
+ struct bau_control *bcp, long try)
+{
+ unsigned long descriptor_stat;
+ cycles_t ttm;
+ int cpu = bcp->uvhub_cpu;
+ struct ptc_stats *stat = bcp->statp;
+
+ descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+
+ /* spin on the status MMR, waiting for it to go idle */
+ while (descriptor_stat != UV2H_DESC_IDLE) {
+ /*
+ * Our software ack messages may be blocked because
+ * there are no swack resources available. As long
+ * as none of them has timed out hardware will NACK
+ * our message and its state will stay IDLE.
+ */
+ if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) ||
+ (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) ||
+ (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) {
+ stat->s_stimeout++;
+ return FLUSH_GIVEUP;
+ } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
+ stat->s_dtimeout++;
+ ttm = get_cycles();
+ /*
+ * Our retries may be blocked by all destination
+ * swack resources being consumed, and a timeout
+ * pending. In that case hardware returns the
+ * ERROR that looks like a destination timeout.
+ */
+ if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_PLUGGED;
+ }
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_TIMEOUT;
+ } else {
+ /*
+ * descriptor_stat is still BUSY
+ */
+ cpu_relax();
+ }
+ descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+ }
+ bcp->conseccompletes++;
+ return FLUSH_COMPLETE;
}
/*
- * conditionally add 1 to *v, unless *v is >= u
- * return 0 if we cannot add 1 to *v because it is >= u
- * return 1 if we can add 1 to *v because it is < u
- * the add is atomic
- *
- * This is close to atomic_add_unless(), but this allows the 'u' value
- * to be lowered below the current 'v'. atomic_add_unless can only stop
- * on equal.
+ * There are 2 status registers; each and array[32] of 2 bits. Set up for
+ * which register to read and position in that register based on cpu in
+ * current hub.
*/
-static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
+static int wait_completion(struct bau_desc *bau_desc,
+ struct bau_control *bcp, long try)
{
- spin_lock(lock);
- if (atomic_read(v) >= u) {
- spin_unlock(lock);
- return 0;
+ int right_shift;
+ unsigned long mmr_offset;
+ int cpu = bcp->uvhub_cpu;
+
+ if (cpu < UV_CPUS_PER_AS) {
+ mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
+ right_shift = cpu * UV_ACT_STATUS_SIZE;
+ } else {
+ mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+ right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
}
- atomic_inc(v);
- spin_unlock(lock);
- return 1;
+
+ if (is_uv1_hub())
+ return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
+ bcp, try);
+ else
+ return uv2_wait_completion(bau_desc, mmr_offset, right_shift,
+ bcp, try);
+}
+
+static inline cycles_t sec_2_cycles(unsigned long sec)
+{
+ unsigned long ns;
+ cycles_t cyc;
+
+ ns = sec * 1000000000;
+ cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+ return cyc;
}
/*
- * Our retries are blocked by all destination swack resources being
+ * Our retries are blocked by all destination sw ack resources being
* in use, and a timeout is pending. In that case hardware immediately
* returns the ERROR that looks like a destination timeout.
*/
-static void
-destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
+static void destination_plugged(struct bau_desc *bau_desc,
+ struct bau_control *bcp,
struct bau_control *hmaster, struct ptc_stats *stat)
{
udelay(bcp->plugged_delay);
bcp->plugged_tries++;
+
if (bcp->plugged_tries >= bcp->plugsb4reset) {
bcp->plugged_tries = 0;
+
quiesce_local_uvhub(hmaster);
+
spin_lock(&hmaster->queue_lock);
- uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+ reset_with_ipi(&bau_desc->distribution, bcp);
spin_unlock(&hmaster->queue_lock);
+
end_uvhub_quiesce(hmaster);
+
bcp->ipi_attempts++;
stat->s_resets_plug++;
}
}
-static void
-destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
- struct bau_control *hmaster, struct ptc_stats *stat)
+static void destination_timeout(struct bau_desc *bau_desc,
+ struct bau_control *bcp, struct bau_control *hmaster,
+ struct ptc_stats *stat)
{
- hmaster->max_bau_concurrent = 1;
+ hmaster->max_concurr = 1;
bcp->timeout_tries++;
if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
bcp->timeout_tries = 0;
+
quiesce_local_uvhub(hmaster);
+
spin_lock(&hmaster->queue_lock);
- uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+ reset_with_ipi(&bau_desc->distribution, bcp);
spin_unlock(&hmaster->queue_lock);
+
end_uvhub_quiesce(hmaster);
+
bcp->ipi_attempts++;
stat->s_resets_timeout++;
}
@@ -530,34 +648,104 @@ destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
* Completions are taking a very long time due to a congested numalink
* network.
*/
-static void
-disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
+static void disable_for_congestion(struct bau_control *bcp,
+ struct ptc_stats *stat)
{
- int tcpu;
- struct bau_control *tbcp;
-
/* let only one cpu do this disabling */
spin_lock(&disable_lock);
+
if (!baudisabled && bcp->period_requests &&
((bcp->period_time / bcp->period_requests) > congested_cycles)) {
+ int tcpu;
+ struct bau_control *tbcp;
/* it becomes this cpu's job to turn on the use of the
BAU again */
baudisabled = 1;
bcp->set_bau_off = 1;
- bcp->set_bau_on_time = get_cycles() +
- sec_2_cycles(bcp->congested_period);
+ bcp->set_bau_on_time = get_cycles();
+ bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period);
stat->s_bau_disabled++;
for_each_present_cpu(tcpu) {
tbcp = &per_cpu(bau_control, tcpu);
- tbcp->baudisabled = 1;
+ tbcp->baudisabled = 1;
}
}
+
spin_unlock(&disable_lock);
}
-/**
- * uv_flush_send_and_wait
- *
+static void count_max_concurr(int stat, struct bau_control *bcp,
+ struct bau_control *hmaster)
+{
+ bcp->plugged_tries = 0;
+ bcp->timeout_tries = 0;
+ if (stat != FLUSH_COMPLETE)
+ return;
+ if (bcp->conseccompletes <= bcp->complete_threshold)
+ return;
+ if (hmaster->max_concurr >= hmaster->max_concurr_const)
+ return;
+ hmaster->max_concurr++;
+}
+
+static void record_send_stats(cycles_t time1, cycles_t time2,
+ struct bau_control *bcp, struct ptc_stats *stat,
+ int completion_status, int try)
+{
+ cycles_t elapsed;
+
+ if (time2 > time1) {
+ elapsed = time2 - time1;
+ stat->s_time += elapsed;
+
+ if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
+ bcp->period_requests++;
+ bcp->period_time += elapsed;
+ if ((elapsed > congested_cycles) &&
+ (bcp->period_requests > bcp->cong_reps))
+ disable_for_congestion(bcp, stat);
+ }
+ } else
+ stat->s_requestor--;
+
+ if (completion_status == FLUSH_COMPLETE && try > 1)
+ stat->s_retriesok++;
+ else if (completion_status == FLUSH_GIVEUP)
+ stat->s_giveup++;
+}
+
+/*
+ * Because of a uv1 hardware bug only a limited number of concurrent
+ * requests can be made.
+ */
+static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
+{
+ spinlock_t *lock = &hmaster->uvhub_lock;
+ atomic_t *v;
+
+ v = &hmaster->active_descriptor_count;
+ if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) {
+ stat->s_throttles++;
+ do {
+ cpu_relax();
+ } while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr));
+ }
+}
+
+/*
+ * Handle the completion status of a message send.
+ */
+static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
+ struct bau_control *bcp, struct bau_control *hmaster,
+ struct ptc_stats *stat)
+{
+ if (completion_status == FLUSH_RETRY_PLUGGED)
+ destination_plugged(bau_desc, bcp, hmaster, stat);
+ else if (completion_status == FLUSH_RETRY_TIMEOUT)
+ destination_timeout(bau_desc, bcp, hmaster, stat);
+}
+
+/*
* Send a broadcast and wait for it to complete.
*
* The flush_mask contains the cpus the broadcast is to be sent to including
@@ -568,44 +756,23 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
* returned to the kernel.
*/
int uv_flush_send_and_wait(struct bau_desc *bau_desc,
- struct cpumask *flush_mask, struct bau_control *bcp)
+ struct cpumask *flush_mask, struct bau_control *bcp)
{
- int right_shift;
- int completion_status = 0;
int seq_number = 0;
+ int completion_stat = 0;
long try = 0;
- int cpu = bcp->uvhub_cpu;
- int this_cpu = bcp->cpu;
- unsigned long mmr_offset;
unsigned long index;
cycles_t time1;
cycles_t time2;
- cycles_t elapsed;
struct ptc_stats *stat = bcp->statp;
- struct bau_control *smaster = bcp->socket_master;
struct bau_control *hmaster = bcp->uvhub_master;
- if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
- &hmaster->active_descriptor_count,
- hmaster->max_bau_concurrent)) {
- stat->s_throttles++;
- do {
- cpu_relax();
- } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
- &hmaster->active_descriptor_count,
- hmaster->max_bau_concurrent));
- }
+ if (is_uv1_hub())
+ uv1_throttle(hmaster, stat);
+
while (hmaster->uvhub_quiesce)
cpu_relax();
- if (cpu < UV_CPUS_PER_ACT_STATUS) {
- mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
- right_shift = cpu * UV_ACT_STATUS_SIZE;
- } else {
- mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
- right_shift =
- ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
- }
time1 = get_cycles();
do {
if (try == 0) {
@@ -615,64 +782,134 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
bau_desc->header.msg_type = MSG_RETRY;
stat->s_retry_messages++;
}
+
bau_desc->header.sequence = seq_number;
- index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
- bcp->uvhub_cpu;
+ index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
bcp->send_message = get_cycles();
- uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+
+ write_mmr_activation(index);
+
try++;
- completion_status = uv_wait_completion(bau_desc, mmr_offset,
- right_shift, this_cpu, bcp, smaster, try);
+ completion_stat = wait_completion(bau_desc, bcp, try);
+
+ handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
- if (completion_status == FLUSH_RETRY_PLUGGED) {
- destination_plugged(bau_desc, bcp, hmaster, stat);
- } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
- destination_timeout(bau_desc, bcp, hmaster, stat);
- }
if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
bcp->ipi_attempts = 0;
- completion_status = FLUSH_GIVEUP;
+ completion_stat = FLUSH_GIVEUP;
break;
}
cpu_relax();
- } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
- (completion_status == FLUSH_RETRY_TIMEOUT));
+ } while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
+ (completion_stat == FLUSH_RETRY_TIMEOUT));
+
time2 = get_cycles();
- bcp->plugged_tries = 0;
- bcp->timeout_tries = 0;
- if ((completion_status == FLUSH_COMPLETE) &&
- (bcp->conseccompletes > bcp->complete_threshold) &&
- (hmaster->max_bau_concurrent <
- hmaster->max_bau_concurrent_constant))
- hmaster->max_bau_concurrent++;
+
+ count_max_concurr(completion_stat, bcp, hmaster);
+
while (hmaster->uvhub_quiesce)
cpu_relax();
+
atomic_dec(&hmaster->active_descriptor_count);
- if (time2 > time1) {
- elapsed = time2 - time1;
- stat->s_time += elapsed;
- if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
- bcp->period_requests++;
- bcp->period_time += elapsed;
- if ((elapsed > congested_cycles) &&
- (bcp->period_requests > bcp->congested_reps)) {
- disable_for_congestion(bcp, stat);
+
+ record_send_stats(time1, time2, bcp, stat, completion_stat, try);
+
+ if (completion_stat == FLUSH_GIVEUP)
+ return 1;
+ return 0;
+}
+
+/*
+ * The BAU is disabled. When the disabled time period has expired, the cpu
+ * that disabled it must re-enable it.
+ * Return 0 if it is re-enabled for all cpus.
+ */
+static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
+{
+ int tcpu;
+ struct bau_control *tbcp;
+
+ if (bcp->set_bau_off) {
+ if (get_cycles() >= bcp->set_bau_on_time) {
+ stat->s_bau_reenabled++;
+ baudisabled = 0;
+ for_each_present_cpu(tcpu) {
+ tbcp = &per_cpu(bau_control, tcpu);
+ tbcp->baudisabled = 0;
+ tbcp->period_requests = 0;
+ tbcp->period_time = 0;
}
+ return 0;
}
+ }
+ return -1;
+}
+
+static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs,
+ int remotes, struct bau_desc *bau_desc)
+{
+ stat->s_requestor++;
+ stat->s_ntargcpu += remotes + locals;
+ stat->s_ntargremotes += remotes;
+ stat->s_ntarglocals += locals;
+
+ /* uvhub statistics */
+ hubs = bau_uvhub_weight(&bau_desc->distribution);
+ if (locals) {
+ stat->s_ntarglocaluvhub++;
+ stat->s_ntargremoteuvhub += (hubs - 1);
} else
- stat->s_requestor--;
- if (completion_status == FLUSH_COMPLETE && try > 1)
- stat->s_retriesok++;
- else if (completion_status == FLUSH_GIVEUP) {
- stat->s_giveup++;
- return 1;
+ stat->s_ntargremoteuvhub += hubs;
+
+ stat->s_ntarguvhub += hubs;
+
+ if (hubs >= 16)
+ stat->s_ntarguvhub16++;
+ else if (hubs >= 8)
+ stat->s_ntarguvhub8++;
+ else if (hubs >= 4)
+ stat->s_ntarguvhub4++;
+ else if (hubs >= 2)
+ stat->s_ntarguvhub2++;
+ else
+ stat->s_ntarguvhub1++;
+}
+
+/*
+ * Translate a cpu mask to the uvhub distribution mask in the BAU
+ * activation descriptor.
+ */
+static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
+ struct bau_desc *bau_desc, int *localsp, int *remotesp)
+{
+ int cpu;
+ int pnode;
+ int cnt = 0;
+ struct hub_and_pnode *hpp;
+
+ for_each_cpu(cpu, flush_mask) {
+ /*
+ * The distribution vector is a bit map of pnodes, relative
+ * to the partition base pnode (and the partition base nasid
+ * in the header).
+ * Translate cpu to pnode and hub using a local memory array.
+ */
+ hpp = &bcp->socket_master->thp[cpu];
+ pnode = hpp->pnode - bcp->partition_base_pnode;
+ bau_uvhub_set(pnode, &bau_desc->distribution);
+ cnt++;
+ if (hpp->uvhub == bcp->uvhub)
+ (*localsp)++;
+ else
+ (*remotesp)++;
}
+ if (!cnt)
+ return 1;
return 0;
}
-/**
- * uv_flush_tlb_others - globally purge translation cache of a virtual
- * address or all TLB's
+/*
+ * globally purge translation cache of a virtual address or all TLB's
* @cpumask: mask of all cpu's in which the address is to be removed
* @mm: mm_struct containing virtual address range
* @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
@@ -696,20 +933,16 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
* done. The returned pointer is valid till preemption is re-enabled.
*/
const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long va, unsigned int cpu)
+ struct mm_struct *mm, unsigned long va,
+ unsigned int cpu)
{
int locals = 0;
int remotes = 0;
int hubs = 0;
- int tcpu;
- int tpnode;
struct bau_desc *bau_desc;
struct cpumask *flush_mask;
struct ptc_stats *stat;
struct bau_control *bcp;
- struct bau_control *tbcp;
- struct hub_and_pnode *hpp;
/* kernel was booted 'nobau' */
if (nobau)
@@ -720,20 +953,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
/* bau was disabled due to slow response */
if (bcp->baudisabled) {
- /* the cpu that disabled it must re-enable it */
- if (bcp->set_bau_off) {
- if (get_cycles() >= bcp->set_bau_on_time) {
- stat->s_bau_reenabled++;
- baudisabled = 0;
- for_each_present_cpu(tcpu) {
- tbcp = &per_cpu(bau_control, tcpu);
- tbcp->baudisabled = 0;
- tbcp->period_requests = 0;
- tbcp->period_time = 0;
- }
- }
- }
- return cpumask;
+ if (check_enable(bcp, stat))
+ return cpumask;
}
/*
@@ -744,59 +965,20 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
/* don't actually do a shootdown of the local cpu */
cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+
if (cpu_isset(cpu, *cpumask))
stat->s_ntargself++;
bau_desc = bcp->descriptor_base;
- bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
+ bau_desc += ITEMS_PER_DESC * bcp->uvhub_cpu;
bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
-
- for_each_cpu(tcpu, flush_mask) {
- /*
- * The distribution vector is a bit map of pnodes, relative
- * to the partition base pnode (and the partition base nasid
- * in the header).
- * Translate cpu to pnode and hub using an array stored
- * in local memory.
- */
- hpp = &bcp->socket_master->target_hub_and_pnode[tcpu];
- tpnode = hpp->pnode - bcp->partition_base_pnode;
- bau_uvhub_set(tpnode, &bau_desc->distribution);
- if (hpp->uvhub == bcp->uvhub)
- locals++;
- else
- remotes++;
- }
- if ((locals + remotes) == 0)
+ if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
return NULL;
- stat->s_requestor++;
- stat->s_ntargcpu += remotes + locals;
- stat->s_ntargremotes += remotes;
- stat->s_ntarglocals += locals;
- remotes = bau_uvhub_weight(&bau_desc->distribution);
- /* uvhub statistics */
- hubs = bau_uvhub_weight(&bau_desc->distribution);
- if (locals) {
- stat->s_ntarglocaluvhub++;
- stat->s_ntargremoteuvhub += (hubs - 1);
- } else
- stat->s_ntargremoteuvhub += hubs;
- stat->s_ntarguvhub += hubs;
- if (hubs >= 16)
- stat->s_ntarguvhub16++;
- else if (hubs >= 8)
- stat->s_ntarguvhub8++;
- else if (hubs >= 4)
- stat->s_ntarguvhub4++;
- else if (hubs >= 2)
- stat->s_ntarguvhub2++;
- else
- stat->s_ntarguvhub1++;
+ record_send_statistics(stat, locals, hubs, remotes, bau_desc);
bau_desc->payload.address = va;
bau_desc->payload.sending_cpu = cpu;
-
/*
* uv_flush_send_and_wait returns 0 if all cpu's were messaged,
* or 1 if it gave up and the original cpumask should be returned.
@@ -825,26 +1007,31 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
{
int count = 0;
cycles_t time_start;
- struct bau_payload_queue_entry *msg;
+ struct bau_pq_entry *msg;
struct bau_control *bcp;
struct ptc_stats *stat;
struct msg_desc msgdesc;
time_start = get_cycles();
+
bcp = &per_cpu(bau_control, smp_processor_id());
stat = bcp->statp;
- msgdesc.va_queue_first = bcp->va_queue_first;
- msgdesc.va_queue_last = bcp->va_queue_last;
+
+ msgdesc.queue_first = bcp->queue_first;
+ msgdesc.queue_last = bcp->queue_last;
+
msg = bcp->bau_msg_head;
- while (msg->sw_ack_vector) {
+ while (msg->swack_vec) {
count++;
- msgdesc.msg_slot = msg - msgdesc.va_queue_first;
- msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
+
+ msgdesc.msg_slot = msg - msgdesc.queue_first;
+ msgdesc.swack_slot = ffs(msg->swack_vec) - 1;
msgdesc.msg = msg;
- uv_bau_process_message(&msgdesc, bcp);
+ bau_process_message(&msgdesc, bcp);
+
msg++;
- if (msg > msgdesc.va_queue_last)
- msg = msgdesc.va_queue_first;
+ if (msg > msgdesc.queue_last)
+ msg = msgdesc.queue_first;
bcp->bau_msg_head = msg;
}
stat->d_time += (get_cycles() - time_start);
@@ -852,18 +1039,17 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
stat->d_nomsg++;
else if (count > 1)
stat->d_multmsg++;
+
ack_APIC_irq();
}
/*
- * uv_enable_timeouts
- *
- * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have
+ * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
* shootdown message timeouts enabled. The timeout does not cause
* an interrupt, but causes an error message to be returned to
* the sender.
*/
-static void __init uv_enable_timeouts(void)
+static void __init enable_timeouts(void)
{
int uvhub;
int nuvhubs;
@@ -877,47 +1063,44 @@ static void __init uv_enable_timeouts(void)
continue;
pnode = uv_blade_to_pnode(uvhub);
- mmr_image =
- uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
+ mmr_image = read_mmr_misc_control(pnode);
/*
* Set the timeout period and then lock it in, in three
* steps; captures and locks in the period.
*
* To program the period, the SOFT_ACK_MODE must be off.
*/
- mmr_image &= ~((unsigned long)1 <<
- UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
- uv_write_global_mmr64
- (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+ mmr_image &= ~(1L << SOFTACK_MSHIFT);
+ write_mmr_misc_control(pnode, mmr_image);
/*
* Set the 4-bit period.
*/
- mmr_image &= ~((unsigned long)0xf <<
- UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
- mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
- UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
- uv_write_global_mmr64
- (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+ mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT);
+ mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT);
+ write_mmr_misc_control(pnode, mmr_image);
/*
+ * UV1:
* Subsequent reversals of the timebase bit (3) cause an
* immediate timeout of one or all INTD resources as
* indicated in bits 2:0 (7 causes all of them to timeout).
*/
- mmr_image |= ((unsigned long)1 <<
- UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
- uv_write_global_mmr64
- (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+ mmr_image |= (1L << SOFTACK_MSHIFT);
+ if (is_uv2_hub()) {
+ mmr_image |= (1L << UV2_LEG_SHFT);
+ mmr_image |= (1L << UV2_EXT_SHFT);
+ }
+ write_mmr_misc_control(pnode, mmr_image);
}
}
-static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
+static void *ptc_seq_start(struct seq_file *file, loff_t *offset)
{
if (*offset < num_possible_cpus())
return offset;
return NULL;
}
-static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
+static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
{
(*offset)++;
if (*offset < num_possible_cpus())
@@ -925,12 +1108,11 @@ static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
return NULL;
}
-static void uv_ptc_seq_stop(struct seq_file *file, void *data)
+static void ptc_seq_stop(struct seq_file *file, void *data)
{
}
-static inline unsigned long long
-microsec_2_cycles(unsigned long microsec)
+static inline unsigned long long usec_2_cycles(unsigned long microsec)
{
unsigned long ns;
unsigned long long cyc;
@@ -941,29 +1123,27 @@ microsec_2_cycles(unsigned long microsec)
}
/*
- * Display the statistics thru /proc.
+ * Display the statistics thru /proc/sgi_uv/ptc_statistics
* 'data' points to the cpu number
+ * Note: see the descriptions in stat_description[].
*/
-static int uv_ptc_seq_show(struct seq_file *file, void *data)
+static int ptc_seq_show(struct seq_file *file, void *data)
{
struct ptc_stats *stat;
int cpu;
cpu = *(loff_t *)data;
-
if (!cpu) {
seq_printf(file,
"# cpu sent stime self locals remotes ncpus localhub ");
seq_printf(file,
"remotehub numuvhubs numuvhubs16 numuvhubs8 ");
seq_printf(file,
- "numuvhubs4 numuvhubs2 numuvhubs1 dto ");
+ "numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok ");
seq_printf(file,
- "retries rok resetp resett giveup sto bz throt ");
+ "resetp resett giveup sto bz throt swack recv rtime ");
seq_printf(file,
- "sw_ack recv rtime all ");
- seq_printf(file,
- "one mult none retry canc nocan reset rcan ");
+ "all one mult none retry canc nocan reset rcan ");
seq_printf(file,
"disable enable\n");
}
@@ -990,8 +1170,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
/* destination side statistics */
seq_printf(file,
"%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
- uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
- UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
+ read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
stat->d_requestee, cycles_2_us(stat->d_time),
stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
stat->d_nomsg, stat->d_retries, stat->d_canceled,
@@ -1000,7 +1179,6 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
seq_printf(file, "%ld %ld\n",
stat->s_bau_disabled, stat->s_bau_reenabled);
}
-
return 0;
}
@@ -1008,18 +1186,18 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
* Display the tunables thru debugfs
*/
static ssize_t tunables_read(struct file *file, char __user *userbuf,
- size_t count, loff_t *ppos)
+ size_t count, loff_t *ppos)
{
char *buf;
int ret;
buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
- "max_bau_concurrent plugged_delay plugsb4reset",
+ "max_concur plugged_delay plugsb4reset",
"timeoutsb4reset ipi_reset_limit complete_threshold",
"congested_response_us congested_reps congested_period",
- max_bau_concurrent, plugged_delay, plugsb4reset,
+ max_concurr, plugged_delay, plugsb4reset,
timeoutsb4reset, ipi_reset_limit, complete_threshold,
- congested_response_us, congested_reps, congested_period);
+ congested_respns_us, congested_reps, congested_period);
if (!buf)
return -ENOMEM;
@@ -1030,13 +1208,16 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf,
}
/*
- * -1: resetf the statistics
+ * handle a write to /proc/sgi_uv/ptc_statistics
+ * -1: reset the statistics
* 0: display meaning of the statistics
*/
-static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
- size_t count, loff_t *data)
+static ssize_t ptc_proc_write(struct file *file, const char __user *user,
+ size_t count, loff_t *data)
{
int cpu;
+ int i;
+ int elements;
long input_arg;
char optstr[64];
struct ptc_stats *stat;
@@ -1046,79 +1227,18 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
if (copy_from_user(optstr, user, count))
return -EFAULT;
optstr[count - 1] = '\0';
+
if (strict_strtol(optstr, 10, &input_arg) < 0) {
printk(KERN_DEBUG "%s is invalid\n", optstr);
return -EINVAL;
}
if (input_arg == 0) {
+ elements = sizeof(stat_description)/sizeof(*stat_description);
printk(KERN_DEBUG "# cpu: cpu number\n");
printk(KERN_DEBUG "Sender statistics:\n");
- printk(KERN_DEBUG
- "sent: number of shootdown messages sent\n");
- printk(KERN_DEBUG
- "stime: time spent sending messages\n");
- printk(KERN_DEBUG
- "numuvhubs: number of hubs targeted with shootdown\n");
- printk(KERN_DEBUG
- "numuvhubs16: number times 16 or more hubs targeted\n");
- printk(KERN_DEBUG
- "numuvhubs8: number times 8 or more hubs targeted\n");
- printk(KERN_DEBUG
- "numuvhubs4: number times 4 or more hubs targeted\n");
- printk(KERN_DEBUG
- "numuvhubs2: number times 2 or more hubs targeted\n");
- printk(KERN_DEBUG
- "numuvhubs1: number times 1 hub targeted\n");
- printk(KERN_DEBUG
- "numcpus: number of cpus targeted with shootdown\n");
- printk(KERN_DEBUG
- "dto: number of destination timeouts\n");
- printk(KERN_DEBUG
- "retries: destination timeout retries sent\n");
- printk(KERN_DEBUG
- "rok: : destination timeouts successfully retried\n");
- printk(KERN_DEBUG
- "resetp: ipi-style resource resets for plugs\n");
- printk(KERN_DEBUG
- "resett: ipi-style resource resets for timeouts\n");
- printk(KERN_DEBUG
- "giveup: fall-backs to ipi-style shootdowns\n");
- printk(KERN_DEBUG
- "sto: number of source timeouts\n");
- printk(KERN_DEBUG
- "bz: number of stay-busy's\n");
- printk(KERN_DEBUG
- "throt: number times spun in throttle\n");
- printk(KERN_DEBUG "Destination side statistics:\n");
- printk(KERN_DEBUG
- "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
- printk(KERN_DEBUG
- "recv: shootdown messages received\n");
- printk(KERN_DEBUG
- "rtime: time spent processing messages\n");
- printk(KERN_DEBUG
- "all: shootdown all-tlb messages\n");
- printk(KERN_DEBUG
- "one: shootdown one-tlb messages\n");
- printk(KERN_DEBUG
- "mult: interrupts that found multiple messages\n");
- printk(KERN_DEBUG
- "none: interrupts that found no messages\n");
- printk(KERN_DEBUG
- "retry: number of retry messages processed\n");
- printk(KERN_DEBUG
- "canc: number messages canceled by retries\n");
- printk(KERN_DEBUG
- "nocan: number retries that found nothing to cancel\n");
- printk(KERN_DEBUG
- "reset: number of ipi-style reset requests processed\n");
- printk(KERN_DEBUG
- "rcan: number messages canceled by reset requests\n");
- printk(KERN_DEBUG
- "disable: number times use of the BAU was disabled\n");
- printk(KERN_DEBUG
- "enable: number times use of the BAU was re-enabled\n");
+ for (i = 0; i < elements; i++)
+ printk(KERN_DEBUG "%s\n", stat_description[i]);
} else if (input_arg == -1) {
for_each_present_cpu(cpu) {
stat = &per_cpu(ptcstats, cpu);
@@ -1145,27 +1265,18 @@ static int local_atoi(const char *name)
}
/*
- * set the tunables
- * 0 values reset them to defaults
+ * Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables.
+ * Zero values reset them to defaults.
*/
-static ssize_t tunables_write(struct file *file, const char __user *user,
- size_t count, loff_t *data)
+static int parse_tunables_write(struct bau_control *bcp, char *instr,
+ int count)
{
- int cpu;
- int cnt = 0;
- int val;
char *p;
char *q;
- char instr[64];
- struct bau_control *bcp;
-
- if (count == 0 || count > sizeof(instr)-1)
- return -EINVAL;
- if (copy_from_user(instr, user, count))
- return -EFAULT;
+ int cnt = 0;
+ int val;
+ int e = sizeof(tunables) / sizeof(*tunables);
- instr[count] = '\0';
- /* count the fields */
p = instr + strspn(instr, WHITESPACE);
q = p;
for (; *p; p = q + strspn(q, WHITESPACE)) {
@@ -1174,8 +1285,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
if (q == p)
break;
}
- if (cnt != 9) {
- printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
+ if (cnt != e) {
+ printk(KERN_INFO "bau tunable error: should be %d values\n", e);
return -EINVAL;
}
@@ -1187,97 +1298,81 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
switch (cnt) {
case 0:
if (val == 0) {
- max_bau_concurrent = MAX_BAU_CONCURRENT;
- max_bau_concurrent_constant =
- MAX_BAU_CONCURRENT;
+ max_concurr = MAX_BAU_CONCURRENT;
+ max_concurr_const = MAX_BAU_CONCURRENT;
continue;
}
- bcp = &per_cpu(bau_control, smp_processor_id());
if (val < 1 || val > bcp->cpus_in_uvhub) {
printk(KERN_DEBUG
"Error: BAU max concurrent %d is invalid\n",
val);
return -EINVAL;
}
- max_bau_concurrent = val;
- max_bau_concurrent_constant = val;
- continue;
- case 1:
- if (val == 0)
- plugged_delay = PLUGGED_DELAY;
- else
- plugged_delay = val;
- continue;
- case 2:
- if (val == 0)
- plugsb4reset = PLUGSB4RESET;
- else
- plugsb4reset = val;
+ max_concurr = val;
+ max_concurr_const = val;
continue;
- case 3:
- if (val == 0)
- timeoutsb4reset = TIMEOUTSB4RESET;
- else
- timeoutsb4reset = val;
- continue;
- case 4:
- if (val == 0)
- ipi_reset_limit = IPI_RESET_LIMIT;
- else
- ipi_reset_limit = val;
- continue;
- case 5:
- if (val == 0)
- complete_threshold = COMPLETE_THRESHOLD;
- else
- complete_threshold = val;
- continue;
- case 6:
- if (val == 0)
- congested_response_us = CONGESTED_RESPONSE_US;
- else
- congested_response_us = val;
- continue;
- case 7:
- if (val == 0)
- congested_reps = CONGESTED_REPS;
- else
- congested_reps = val;
- continue;
- case 8:
+ default:
if (val == 0)
- congested_period = CONGESTED_PERIOD;
+ *tunables[cnt].tunp = tunables[cnt].deflt;
else
- congested_period = val;
+ *tunables[cnt].tunp = val;
continue;
}
if (q == p)
break;
}
+ return 0;
+}
+
+/*
+ * Handle a write to debugfs. (/sys/kernel/debug/sgi_uv/bau_tunables)
+ */
+static ssize_t tunables_write(struct file *file, const char __user *user,
+ size_t count, loff_t *data)
+{
+ int cpu;
+ int ret;
+ char instr[100];
+ struct bau_control *bcp;
+
+ if (count == 0 || count > sizeof(instr)-1)
+ return -EINVAL;
+ if (copy_from_user(instr, user, count))
+ return -EFAULT;
+
+ instr[count] = '\0';
+
+ cpu = get_cpu();
+ bcp = &per_cpu(bau_control, cpu);
+ ret = parse_tunables_write(bcp, instr, count);
+ put_cpu();
+ if (ret)
+ return ret;
+
for_each_present_cpu(cpu) {
bcp = &per_cpu(bau_control, cpu);
- bcp->max_bau_concurrent = max_bau_concurrent;
- bcp->max_bau_concurrent_constant = max_bau_concurrent;
- bcp->plugged_delay = plugged_delay;
- bcp->plugsb4reset = plugsb4reset;
- bcp->timeoutsb4reset = timeoutsb4reset;
- bcp->ipi_reset_limit = ipi_reset_limit;
- bcp->complete_threshold = complete_threshold;
- bcp->congested_response_us = congested_response_us;
- bcp->congested_reps = congested_reps;
- bcp->congested_period = congested_period;
+ bcp->max_concurr = max_concurr;
+ bcp->max_concurr_const = max_concurr;
+ bcp->plugged_delay = plugged_delay;
+ bcp->plugsb4reset = plugsb4reset;
+ bcp->timeoutsb4reset = timeoutsb4reset;
+ bcp->ipi_reset_limit = ipi_reset_limit;
+ bcp->complete_threshold = complete_threshold;
+ bcp->cong_response_us = congested_respns_us;
+ bcp->cong_reps = congested_reps;
+ bcp->cong_period = congested_period;
}
return count;
}
static const struct seq_operations uv_ptc_seq_ops = {
- .start = uv_ptc_seq_start,
- .next = uv_ptc_seq_next,
- .stop = uv_ptc_seq_stop,
- .show = uv_ptc_seq_show
+ .start = ptc_seq_start,
+ .next = ptc_seq_next,
+ .stop = ptc_seq_stop,
+ .show = ptc_seq_show
};
-static int uv_ptc_proc_open(struct inode *inode, struct file *file)
+static int ptc_proc_open(struct inode *inode, struct file *file)
{
return seq_open(file, &uv_ptc_seq_ops);
}
@@ -1288,9 +1383,9 @@ static int tunables_open(struct inode *inode, struct file *file)
}
static const struct file_operations proc_uv_ptc_operations = {
- .open = uv_ptc_proc_open,
+ .open = ptc_proc_open,
.read = seq_read,
- .write = uv_ptc_proc_write,
+ .write = ptc_proc_write,
.llseek = seq_lseek,
.release = seq_release,
};
@@ -1324,7 +1419,7 @@ static int __init uv_ptc_init(void)
return -EINVAL;
}
tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
- tunables_dir, NULL, &tunables_fops);
+ tunables_dir, NULL, &tunables_fops);
if (!tunables_file) {
printk(KERN_ERR "unable to create debugfs file %s\n",
UV_BAU_TUNABLES_FILE);
@@ -1336,24 +1431,24 @@ static int __init uv_ptc_init(void)
/*
* Initialize the sending side's sending buffers.
*/
-static void
-uv_activation_descriptor_init(int node, int pnode, int base_pnode)
+static void activation_descriptor_init(int node, int pnode, int base_pnode)
{
int i;
int cpu;
unsigned long pa;
unsigned long m;
unsigned long n;
+ size_t dsize;
struct bau_desc *bau_desc;
struct bau_desc *bd2;
struct bau_control *bcp;
/*
- * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
- * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE)
+ * each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC)
+ * per cpu; and one per cpu on the uvhub (ADP_SZ)
*/
- bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE
- * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
+ dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC;
+ bau_desc = kmalloc_node(dsize, GFP_KERNEL, node);
BUG_ON(!bau_desc);
pa = uv_gpa(bau_desc); /* need the real nasid*/
@@ -1361,27 +1456,25 @@ uv_activation_descriptor_init(int node, int pnode, int base_pnode)
m = pa & uv_mmask;
/* the 14-bit pnode */
- uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
- (n << UV_DESC_BASE_PNODE_SHIFT | m));
+ write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));
/*
- * Initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
+ * Initializing all 8 (ITEMS_PER_DESC) descriptors for each
* cpu even though we only use the first one; one descriptor can
* describe a broadcast to 256 uv hubs.
*/
- for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
- i++, bd2++) {
+ for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {
memset(bd2, 0, sizeof(struct bau_desc));
- bd2->header.sw_ack_flag = 1;
+ bd2->header.swack_flag = 1;
/*
* The base_dest_nasid set in the message header is the nasid
* of the first uvhub in the partition. The bit map will
* indicate destination pnode numbers relative to that base.
* They may not be consecutive if nasid striding is being used.
*/
- bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode);
- bd2->header.dest_subnodeid = UV_LB_SUBNODEID;
- bd2->header.command = UV_NET_ENDPOINT_INTD;
- bd2->header.int_both = 1;
+ bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode);
+ bd2->header.dest_subnodeid = UV_LB_SUBNODEID;
+ bd2->header.command = UV_NET_ENDPOINT_INTD;
+ bd2->header.int_both = 1;
/*
* all others need to be set to zero:
* fairness chaining multilevel count replied_to
@@ -1401,57 +1494,55 @@ uv_activation_descriptor_init(int node, int pnode, int base_pnode)
* - node is first node (kernel memory notion) on the uvhub
* - pnode is the uvhub's physical identifier
*/
-static void
-uv_payload_queue_init(int node, int pnode)
+static void pq_init(int node, int pnode)
{
- int pn;
int cpu;
+ size_t plsize;
char *cp;
- unsigned long pa;
- struct bau_payload_queue_entry *pqp;
- struct bau_payload_queue_entry *pqp_malloc;
+ void *vp;
+ unsigned long pn;
+ unsigned long first;
+ unsigned long pn_first;
+ unsigned long last;
+ struct bau_pq_entry *pqp;
struct bau_control *bcp;
- pqp = kmalloc_node((DEST_Q_SIZE + 1)
- * sizeof(struct bau_payload_queue_entry),
- GFP_KERNEL, node);
+ plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry);
+ vp = kmalloc_node(plsize, GFP_KERNEL, node);
+ pqp = (struct bau_pq_entry *)vp;
BUG_ON(!pqp);
- pqp_malloc = pqp;
cp = (char *)pqp + 31;
- pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
+ pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5);
for_each_present_cpu(cpu) {
if (pnode != uv_cpu_to_pnode(cpu))
continue;
/* for every cpu on this pnode: */
bcp = &per_cpu(bau_control, cpu);
- bcp->va_queue_first = pqp;
- bcp->bau_msg_head = pqp;
- bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
+ bcp->queue_first = pqp;
+ bcp->bau_msg_head = pqp;
+ bcp->queue_last = pqp + (DEST_Q_SIZE - 1);
}
/*
* need the pnode of where the memory was really allocated
*/
- pa = uv_gpa(pqp);
- pn = pa >> uv_nshift;
- uv_write_global_mmr64(pnode,
- UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
- ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
- uv_physnodeaddr(pqp));
- uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
- uv_physnodeaddr(pqp));
- uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
- (unsigned long)
- uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
+ pn = uv_gpa(pqp) >> uv_nshift;
+ first = uv_physnodeaddr(pqp);
+ pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first;
+ last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1));
+ write_mmr_payload_first(pnode, pn_first);
+ write_mmr_payload_tail(pnode, first);
+ write_mmr_payload_last(pnode, last);
+
/* in effect, all msg_type's are set to MSG_NOOP */
- memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
+ memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
}
/*
* Initialization of each UV hub's structures
*/
-static void __init uv_init_uvhub(int uvhub, int vector, int base_pnode)
+static void __init init_uvhub(int uvhub, int vector, int base_pnode)
{
int node;
int pnode;
@@ -1459,24 +1550,24 @@ static void __init uv_init_uvhub(int uvhub, int vector, int base_pnode)
node = uvhub_to_first_node(uvhub);
pnode = uv_blade_to_pnode(uvhub);
- uv_activation_descriptor_init(node, pnode, base_pnode);
- uv_payload_queue_init(node, pnode);
+
+ activation_descriptor_init(node, pnode, base_pnode);
+
+ pq_init(node, pnode);
/*
* The below initialization can't be in firmware because the
* messaging IRQ will be determined by the OS.
*/
apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
- uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
- ((apicid << 32) | vector));
+ write_mmr_data_config(pnode, ((apicid << 32) | vector));
}
/*
* We will set BAU_MISC_CONTROL with a timeout period.
* But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
- * So the destination timeout period has be be calculated from them.
+ * So the destination timeout period has to be calculated from them.
*/
-static int
-calculate_destination_timeout(void)
+static int calculate_destination_timeout(void)
{
unsigned long mmr_image;
int mult1;
@@ -1486,73 +1577,92 @@ calculate_destination_timeout(void)
int ret;
unsigned long ts_ns;
- mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
- mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
- index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
- mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
- mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
- base = timeout_base_ns[index];
- ts_ns = base * mult1 * mult2;
- ret = ts_ns / 1000;
+ if (is_uv1_hub()) {
+ mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
+ mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
+ index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
+ mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
+ mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
+ base = timeout_base_ns[index];
+ ts_ns = base * mult1 * mult2;
+ ret = ts_ns / 1000;
+ } else {
+ /* 4 bits 0/1 for 10/80us, 3 bits of multiplier */
+ mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
+ mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
+ if (mmr_image & (1L << UV2_ACK_UNITS_SHFT))
+ mult1 = 80;
+ else
+ mult1 = 10;
+ base = mmr_image & UV2_ACK_MASK;
+ ret = mult1 * base;
+ }
return ret;
}
+static void __init init_per_cpu_tunables(void)
+{
+ int cpu;
+ struct bau_control *bcp;
+
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->baudisabled = 0;
+ bcp->statp = &per_cpu(ptcstats, cpu);
+ /* time interval to catch a hardware stay-busy bug */
+ bcp->timeout_interval = usec_2_cycles(2*timeout_us);
+ bcp->max_concurr = max_concurr;
+ bcp->max_concurr_const = max_concurr;
+ bcp->plugged_delay = plugged_delay;
+ bcp->plugsb4reset = plugsb4reset;
+ bcp->timeoutsb4reset = timeoutsb4reset;
+ bcp->ipi_reset_limit = ipi_reset_limit;
+ bcp->complete_threshold = complete_threshold;
+ bcp->cong_response_us = congested_respns_us;
+ bcp->cong_reps = congested_reps;
+ bcp->cong_period = congested_period;
+ }
+}
+
/*
- * initialize the bau_control structure for each cpu
+ * Scan all cpus to collect blade and socket summaries.
*/
-static int __init uv_init_per_cpu(int nuvhubs, int base_part_pnode)
+static int __init get_cpu_topology(int base_pnode,
+ struct uvhub_desc *uvhub_descs,
+ unsigned char *uvhub_mask)
{
- int i;
int cpu;
- int tcpu;
int pnode;
int uvhub;
- int have_hmaster;
- short socket = 0;
- unsigned short socket_mask;
- unsigned char *uvhub_mask;
+ int socket;
struct bau_control *bcp;
struct uvhub_desc *bdp;
struct socket_desc *sdp;
- struct bau_control *hmaster = NULL;
- struct bau_control *smaster = NULL;
- struct socket_desc {
- short num_cpus;
- short cpu_number[MAX_CPUS_PER_SOCKET];
- };
- struct uvhub_desc {
- unsigned short socket_mask;
- short num_cpus;
- short uvhub;
- short pnode;
- struct socket_desc socket[2];
- };
- struct uvhub_desc *uvhub_descs;
- timeout_us = calculate_destination_timeout();
-
- uvhub_descs = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
- memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
- uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
for_each_present_cpu(cpu) {
bcp = &per_cpu(bau_control, cpu);
+
memset(bcp, 0, sizeof(struct bau_control));
+
pnode = uv_cpu_hub_info(cpu)->pnode;
- if ((pnode - base_part_pnode) >= UV_DISTRIBUTION_SIZE) {
+ if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) {
printk(KERN_EMERG
"cpu %d pnode %d-%d beyond %d; BAU disabled\n",
- cpu, pnode, base_part_pnode,
- UV_DISTRIBUTION_SIZE);
+ cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE);
return 1;
}
+
bcp->osnode = cpu_to_node(cpu);
- bcp->partition_base_pnode = uv_partition_base_pnode;
+ bcp->partition_base_pnode = base_pnode;
+
uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
*(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
bdp = &uvhub_descs[uvhub];
+
bdp->num_cpus++;
bdp->uvhub = uvhub;
bdp->pnode = pnode;
+
/* kludge: 'assuming' one node per socket, and assuming that
disabling a socket just leaves a gap in node numbers */
socket = bcp->osnode & 1;
@@ -1561,85 +1671,146 @@ static int __init uv_init_per_cpu(int nuvhubs, int base_part_pnode)
sdp->cpu_number[sdp->num_cpus] = cpu;
sdp->num_cpus++;
if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
- printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus);
+ printk(KERN_EMERG "%d cpus per socket invalid\n",
+ sdp->num_cpus);
return 1;
}
}
+ return 0;
+}
+
+/*
+ * Each socket is to get a local array of pnodes/hubs.
+ */
+static void make_per_cpu_thp(struct bau_control *smaster)
+{
+ int cpu;
+ size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus();
+
+ smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
+ memset(smaster->thp, 0, hpsz);
+ for_each_present_cpu(cpu) {
+ smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode;
+ smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+ }
+}
+
+/*
+ * Each uvhub is to get a local cpumask.
+ */
+static void make_per_hub_cpumask(struct bau_control *hmaster)
+{
+ int sz = sizeof(cpumask_t);
+
+ hmaster->cpumask = kzalloc_node(sz, GFP_KERNEL, hmaster->osnode);
+}
+
+/*
+ * Initialize all the per_cpu information for the cpu's on a given socket,
+ * given what has been gathered into the socket_desc struct.
+ * And reports the chosen hub and socket masters back to the caller.
+ */
+static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
+ struct bau_control **smasterp,
+ struct bau_control **hmasterp)
+{
+ int i;
+ int cpu;
+ struct bau_control *bcp;
+
+ for (i = 0; i < sdp->num_cpus; i++) {
+ cpu = sdp->cpu_number[i];
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->cpu = cpu;
+ if (i == 0) {
+ *smasterp = bcp;
+ if (!(*hmasterp))
+ *hmasterp = bcp;
+ }
+ bcp->cpus_in_uvhub = bdp->num_cpus;
+ bcp->cpus_in_socket = sdp->num_cpus;
+ bcp->socket_master = *smasterp;
+ bcp->uvhub = bdp->uvhub;
+ bcp->uvhub_master = *hmasterp;
+ bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+ if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
+ printk(KERN_EMERG "%d cpus per uvhub invalid\n",
+ bcp->uvhub_cpu);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Summarize the blade and socket topology into the per_cpu structures.
+ */
+static int __init summarize_uvhub_sockets(int nuvhubs,
+ struct uvhub_desc *uvhub_descs,
+ unsigned char *uvhub_mask)
+{
+ int socket;
+ int uvhub;
+ unsigned short socket_mask;
+
for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+ struct uvhub_desc *bdp;
+ struct bau_control *smaster = NULL;
+ struct bau_control *hmaster = NULL;
+
if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
continue;
- have_hmaster = 0;
+
bdp = &uvhub_descs[uvhub];
socket_mask = bdp->socket_mask;
socket = 0;
while (socket_mask) {
- if (!(socket_mask & 1))
- goto nextsocket;
- sdp = &bdp->socket[socket];
- for (i = 0; i < sdp->num_cpus; i++) {
- cpu = sdp->cpu_number[i];
- bcp = &per_cpu(bau_control, cpu);
- bcp->cpu = cpu;
- if (i == 0) {
- smaster = bcp;
- if (!have_hmaster) {
- have_hmaster++;
- hmaster = bcp;
- }
- }
- bcp->cpus_in_uvhub = bdp->num_cpus;
- bcp->cpus_in_socket = sdp->num_cpus;
- bcp->socket_master = smaster;
- bcp->uvhub = bdp->uvhub;
- bcp->uvhub_master = hmaster;
- bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
- blade_processor_id;
- if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
- printk(KERN_EMERG
- "%d cpus per uvhub invalid\n",
- bcp->uvhub_cpu);
+ struct socket_desc *sdp;
+ if ((socket_mask & 1)) {
+ sdp = &bdp->socket[socket];
+ if (scan_sock(sdp, bdp, &smaster, &hmaster))
return 1;
- }
+ make_per_cpu_thp(smaster);
}
-nextsocket:
socket++;
socket_mask = (socket_mask >> 1);
- /* each socket gets a local array of pnodes/hubs */
- bcp = smaster;
- bcp->target_hub_and_pnode = kmalloc_node(
- sizeof(struct hub_and_pnode) *
- num_possible_cpus(), GFP_KERNEL, bcp->osnode);
- memset(bcp->target_hub_and_pnode, 0,
- sizeof(struct hub_and_pnode) *
- num_possible_cpus());
- for_each_present_cpu(tcpu) {
- bcp->target_hub_and_pnode[tcpu].pnode =
- uv_cpu_hub_info(tcpu)->pnode;
- bcp->target_hub_and_pnode[tcpu].uvhub =
- uv_cpu_hub_info(tcpu)->numa_blade_id;
- }
}
+ make_per_hub_cpumask(hmaster);
}
+ return 0;
+}
+
+/*
+ * initialize the bau_control structure for each cpu
+ */
+static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
+{
+ unsigned char *uvhub_mask;
+ void *vp;
+ struct uvhub_desc *uvhub_descs;
+
+ timeout_us = calculate_destination_timeout();
+
+ vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
+ uvhub_descs = (struct uvhub_desc *)vp;
+ memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+ uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
+
+ if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
+ goto fail;
+
+ if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask))
+ goto fail;
+
kfree(uvhub_descs);
kfree(uvhub_mask);
- for_each_present_cpu(cpu) {
- bcp = &per_cpu(bau_control, cpu);
- bcp->baudisabled = 0;
- bcp->statp = &per_cpu(ptcstats, cpu);
- /* time interval to catch a hardware stay-busy bug */
- bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
- bcp->max_bau_concurrent = max_bau_concurrent;
- bcp->max_bau_concurrent_constant = max_bau_concurrent;
- bcp->plugged_delay = plugged_delay;
- bcp->plugsb4reset = plugsb4reset;
- bcp->timeoutsb4reset = timeoutsb4reset;
- bcp->ipi_reset_limit = ipi_reset_limit;
- bcp->complete_threshold = complete_threshold;
- bcp->congested_response_us = congested_response_us;
- bcp->congested_reps = congested_reps;
- bcp->congested_period = congested_period;
- }
+ init_per_cpu_tunables();
return 0;
+
+fail:
+ kfree(uvhub_descs);
+ kfree(uvhub_mask);
+ return 1;
}
/*
@@ -1651,8 +1822,9 @@ static int __init uv_bau_init(void)
int pnode;
int nuvhubs;
int cur_cpu;
+ int cpus;
int vector;
- unsigned long mmr;
+ cpumask_var_t *mask;
if (!is_uv_system())
return 0;
@@ -1660,24 +1832,25 @@ static int __init uv_bau_init(void)
if (nobau)
return 0;
- for_each_possible_cpu(cur_cpu)
- zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
- GFP_KERNEL, cpu_to_node(cur_cpu));
+ for_each_possible_cpu(cur_cpu) {
+ mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
+ zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
+ }
uv_nshift = uv_hub_info->m_val;
uv_mmask = (1UL << uv_hub_info->m_val) - 1;
nuvhubs = uv_num_possible_blades();
spin_lock_init(&disable_lock);
- congested_cycles = microsec_2_cycles(congested_response_us);
+ congested_cycles = usec_2_cycles(congested_respns_us);
- uv_partition_base_pnode = 0x7fffffff;
+ uv_base_pnode = 0x7fffffff;
for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
- if (uv_blade_nr_possible_cpus(uvhub) &&
- (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
- uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
+ cpus = uv_blade_nr_possible_cpus(uvhub);
+ if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode))
+ uv_base_pnode = uv_blade_to_pnode(uvhub);
}
- if (uv_init_per_cpu(nuvhubs, uv_partition_base_pnode)) {
+ if (init_per_cpu(nuvhubs, uv_base_pnode)) {
nobau = 1;
return 0;
}
@@ -1685,21 +1858,21 @@ static int __init uv_bau_init(void)
vector = UV_BAU_MESSAGE;
for_each_possible_blade(uvhub)
if (uv_blade_nr_possible_cpus(uvhub))
- uv_init_uvhub(uvhub, vector, uv_partition_base_pnode);
+ init_uvhub(uvhub, vector, uv_base_pnode);
- uv_enable_timeouts();
+ enable_timeouts();
alloc_intr_gate(vector, uv_bau_message_intr1);
for_each_possible_blade(uvhub) {
if (uv_blade_nr_possible_cpus(uvhub)) {
+ unsigned long val;
+ unsigned long mmr;
pnode = uv_blade_to_pnode(uvhub);
/* INIT the bau */
- uv_write_global_mmr64(pnode,
- UVH_LB_BAU_SB_ACTIVATION_CONTROL,
- ((unsigned long)1 << 63));
+ val = 1L << 63;
+ write_gmmr_activation(pnode, val);
mmr = 1; /* should be 1 to broadcast to both sockets */
- uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
- mmr);
+ write_mmr_data_broadcast(pnode, mmr);
}
}
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 9daf5d1af9f..9f29a01ee1b 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -40,7 +40,6 @@ static struct clocksource clocksource_uv = {
.rating = 400,
.read = uv_read_rtc,
.mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
- .shift = 10,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -100,8 +99,12 @@ static void uv_rtc_send_IPI(int cpu)
/* Check for an RTC interrupt pending */
static int uv_intr_pending(int pnode)
{
- return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
- UVH_EVENT_OCCURRED0_RTC1_MASK;
+ if (is_uv1_hub())
+ return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
+ UV1H_EVENT_OCCURRED0_RTC1_MASK;
+ else
+ return uv_read_global_mmr64(pnode, UV2H_EVENT_OCCURRED2) &
+ UV2H_EVENT_OCCURRED2_RTC_1_MASK;
}
/* Setup interrupt and return non-zero if early expiration occurred. */
@@ -115,8 +118,12 @@ static int uv_setup_intr(int cpu, u64 expires)
UVH_RTC1_INT_CONFIG_M_MASK);
uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
- uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
- UVH_EVENT_OCCURRED0_RTC1_MASK);
+ if (is_uv1_hub())
+ uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
+ UV1H_EVENT_OCCURRED0_RTC1_MASK);
+ else
+ uv_write_global_mmr64(pnode, UV2H_EVENT_OCCURRED2_ALIAS,
+ UV2H_EVENT_OCCURRED2_RTC_1_MASK);
val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
@@ -372,14 +379,11 @@ static __init int uv_rtc_setup_clock(void)
if (!is_uv_system())
return -ENODEV;
- clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
- clocksource_uv.shift);
-
/* If single blade, prefer tsc */
if (uv_num_possible_blades() == 1)
clocksource_uv.rating = 250;
- rc = clocksource_register(&clocksource_uv);
+ rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second);
if (rc)
printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
else
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b6552b189bc..5d179502a52 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -11,7 +11,7 @@ vdso-install-$(VDSO32-y) += $(vdso32-images)
# files to link into the vdso
-vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
# files to link into kernel
obj-$(VDSO64-y) += vma.o vdso.o
@@ -26,6 +26,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
export CPPFLAGS_vdso.lds += -P -C
VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
+ -Wl,--no-undefined \
-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
@@ -37,11 +38,24 @@ $(obj)/%.so: OBJCOPYFLAGS := -S
$(obj)/%.so: $(obj)/%.so.dbg FORCE
$(call if_changed,objcopy)
+#
+# Don't omit frame pointers for ease of userspace debugging, but do
+# optimize sibling calls.
+#
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
- $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector)
+ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
+ -fno-omit-frame-pointer -foptimize-sibling-calls
$(vobjs): KBUILD_CFLAGS += $(CFL)
+#
+# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
+#
+CFLAGS_REMOVE_vdso-note.o = -pg
+CFLAGS_REMOVE_vclock_gettime.o = -pg
+CFLAGS_REMOVE_vgetcpu.o = -pg
+CFLAGS_REMOVE_vvar.o = -pg
+
targets += vdso-syms.lds
obj-$(VDSO64-y) += vdso-syms.lds
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index ee55754cc3c..6bc0e723b6e 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -2,11 +2,10 @@
* Copyright 2006 Andi Kleen, SUSE Labs.
* Subject to the GNU Public License, v.2
*
- * Fast user context implementation of clock_gettime and gettimeofday.
+ * Fast user context implementation of clock_gettime, gettimeofday, and time.
*
* The code should have no internal unresolved relocations.
* Check with readelf after changing.
- * Also alternative() doesn't work.
*/
/* Disable profiling for userspace code: */
@@ -17,14 +16,51 @@
#include <linux/time.h>
#include <linux/string.h>
#include <asm/vsyscall.h>
+#include <asm/fixmap.h>
#include <asm/vgtod.h>
#include <asm/timex.h>
#include <asm/hpet.h>
#include <asm/unistd.h>
#include <asm/io.h>
-#include "vextern.h"
-#define gtod vdso_vsyscall_gtod_data
+#define gtod (&VVAR(vsyscall_gtod_data))
+
+notrace static cycle_t vread_tsc(void)
+{
+ cycle_t ret;
+ u64 last;
+
+ /*
+ * Empirically, a fence (of type that depends on the CPU)
+ * before rdtsc is enough to ensure that rdtsc is ordered
+ * with respect to loads. The various CPU manuals are unclear
+ * as to whether rdtsc can be reordered with later loads,
+ * but no one has ever seen it happen.
+ */
+ rdtsc_barrier();
+ ret = (cycle_t)vget_cycles();
+
+ last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ /*
+ * GCC likes to generate cmov here, but this branch is extremely
+ * predictable (it's just a funciton of time and the likely is
+ * very likely) and there's a data dependence, so force GCC
+ * to generate a branch instead. I don't barrier() because
+ * we don't actually need a barrier, and if this function
+ * ever gets inlined it will generate worse code.
+ */
+ asm volatile ("");
+ return last;
+}
+
+static notrace cycle_t vread_hpet(void)
+{
+ return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
+}
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
@@ -37,9 +73,12 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
notrace static inline long vgetns(void)
{
long v;
- cycles_t (*vread)(void);
- vread = gtod->clock.vread;
- v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask;
+ cycles_t cycles;
+ if (gtod->clock.vclock_mode == VCLOCK_TSC)
+ cycles = vread_tsc();
+ else
+ cycles = vread_hpet();
+ v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
return (v * gtod->clock.mult) >> gtod->clock.shift;
}
@@ -56,22 +95,6 @@ notrace static noinline int do_realtime(struct timespec *ts)
return 0;
}
-/* Copy of the version in kernel/time.c which we cannot directly access */
-notrace static void
-vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
-{
- while (nsec >= NSEC_PER_SEC) {
- nsec -= NSEC_PER_SEC;
- ++sec;
- }
- while (nsec < 0) {
- nsec += NSEC_PER_SEC;
- --sec;
- }
- ts->tv_sec = sec;
- ts->tv_nsec = nsec;
-}
-
notrace static noinline int do_monotonic(struct timespec *ts)
{
unsigned long seq, ns, secs;
@@ -82,7 +105,17 @@ notrace static noinline int do_monotonic(struct timespec *ts)
secs += gtod->wall_to_monotonic.tv_sec;
ns += gtod->wall_to_monotonic.tv_nsec;
} while (unlikely(read_seqretry(&gtod->lock, seq)));
- vset_normalized_timespec(ts, secs, ns);
+
+ /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec
+ * are all guaranteed to be nonnegative.
+ */
+ while (ns >= NSEC_PER_SEC) {
+ ns -= NSEC_PER_SEC;
+ ++secs;
+ }
+ ts->tv_sec = secs;
+ ts->tv_nsec = ns;
+
return 0;
}
@@ -107,27 +140,37 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
secs += gtod->wall_to_monotonic.tv_sec;
ns += gtod->wall_to_monotonic.tv_nsec;
} while (unlikely(read_seqretry(&gtod->lock, seq)));
- vset_normalized_timespec(ts, secs, ns);
+
+ /* wall_time_nsec and wall_to_monotonic.tv_nsec are
+ * guaranteed to be between 0 and NSEC_PER_SEC.
+ */
+ if (ns >= NSEC_PER_SEC) {
+ ns -= NSEC_PER_SEC;
+ ++secs;
+ }
+ ts->tv_sec = secs;
+ ts->tv_nsec = ns;
+
return 0;
}
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
- if (likely(gtod->sysctl_enabled))
- switch (clock) {
- case CLOCK_REALTIME:
- if (likely(gtod->clock.vread))
- return do_realtime(ts);
- break;
- case CLOCK_MONOTONIC:
- if (likely(gtod->clock.vread))
- return do_monotonic(ts);
- break;
- case CLOCK_REALTIME_COARSE:
- return do_realtime_coarse(ts);
- case CLOCK_MONOTONIC_COARSE:
- return do_monotonic_coarse(ts);
- }
+ switch (clock) {
+ case CLOCK_REALTIME:
+ if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
+ return do_realtime(ts);
+ break;
+ case CLOCK_MONOTONIC:
+ if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
+ return do_monotonic(ts);
+ break;
+ case CLOCK_REALTIME_COARSE:
+ return do_realtime_coarse(ts);
+ case CLOCK_MONOTONIC_COARSE:
+ return do_monotonic_coarse(ts);
+ }
+
return vdso_fallback_gettime(clock, ts);
}
int clock_gettime(clockid_t, struct timespec *)
@@ -136,7 +179,7 @@ int clock_gettime(clockid_t, struct timespec *)
notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
{
long ret;
- if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
+ if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) {
if (likely(tv != NULL)) {
BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
offsetof(struct timespec, tv_nsec) ||
@@ -157,3 +200,19 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
}
int gettimeofday(struct timeval *, struct timezone *)
__attribute__((weak, alias("__vdso_gettimeofday")));
+
+/*
+ * This will break when the xtime seconds get inaccurate, but that is
+ * unlikely
+ */
+notrace time_t __vdso_time(time_t *t)
+{
+ /* This is atomic on x86_64 so we don't need any locks. */
+ time_t result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec);
+
+ if (t)
+ *t = result;
+ return result;
+}
+int time(time_t *t)
+ __attribute__((weak, alias("__vdso_time")));
diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S
index 1d3aa6b8718..1b979c12ba8 100644
--- a/arch/x86/vdso/vdso.S
+++ b/arch/x86/vdso/vdso.S
@@ -1,10 +1,21 @@
+#include <asm/page_types.h>
+#include <linux/linkage.h>
#include <linux/init.h>
-__INITDATA
+__PAGE_ALIGNED_DATA
.globl vdso_start, vdso_end
+ .align PAGE_SIZE
vdso_start:
.incbin "arch/x86/vdso/vdso.so"
vdso_end:
-__FINIT
+.previous
+
+ .globl vdso_pages
+ .bss
+ .align 8
+ .type vdso_pages, @object
+vdso_pages:
+ .zero (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE * 8
+ .size vdso_pages, .-vdso_pages
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
index 4e5dd3b4de7..b96b2677cad 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/vdso/vdso.lds.S
@@ -23,15 +23,10 @@ VERSION {
__vdso_gettimeofday;
getcpu;
__vdso_getcpu;
+ time;
+ __vdso_time;
local: *;
};
}
VDSO64_PRELINK = VDSO_PRELINK;
-
-/*
- * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL.
- */
-#define VEXTERN(x) VDSO64_ ## x = vdso_ ## x;
-#include "vextern.h"
-#undef VEXTERN
diff --git a/arch/x86/vdso/vextern.h b/arch/x86/vdso/vextern.h
deleted file mode 100644
index 1683ba2ae3e..00000000000
--- a/arch/x86/vdso/vextern.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef VEXTERN
-#include <asm/vsyscall.h>
-#define VEXTERN(x) \
- extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden")));
-#endif
-
-#define VMAGIC 0xfeedbabeabcdefabUL
-
-/* Any kernel variables used in the vDSO must be exported in the main
- kernel's vmlinux.lds.S/vsyscall.h/proper __section and
- put into vextern.h and be referenced as a pointer with vdso prefix.
- The main kernel later fills in the values. */
-
-VEXTERN(jiffies)
-VEXTERN(vgetcpu_mode)
-VEXTERN(vsyscall_gtod_data)
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 9fbc6b20026..5463ad55857 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -11,14 +11,13 @@
#include <linux/time.h>
#include <asm/vsyscall.h>
#include <asm/vgtod.h>
-#include "vextern.h"
notrace long
__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
{
unsigned int p;
- if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) {
+ if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
/* Load per CPU data from RDTSCP */
native_read_tscp(&p);
} else {
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 4b5d26f108b..316fbca3490 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -14,68 +14,61 @@
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
-
-#include "vextern.h" /* Just for VMAGIC. */
-#undef VEXTERN
+#include <asm/page.h>
unsigned int __read_mostly vdso_enabled = 1;
extern char vdso_start[], vdso_end[];
extern unsigned short vdso_sync_cpuid;
-static struct page **vdso_pages;
+extern struct page *vdso_pages[];
static unsigned vdso_size;
-static inline void *var_ref(void *p, char *name)
+static void __init patch_vdso(void *vdso, size_t len)
{
- if (*(void **)p != (void *)VMAGIC) {
- printk("VDSO: variable %s broken\n", name);
- vdso_enabled = 0;
+ Elf64_Ehdr *hdr = vdso;
+ Elf64_Shdr *sechdrs, *alt_sec = 0;
+ char *secstrings;
+ void *alt_data;
+ int i;
+
+ BUG_ON(len < sizeof(Elf64_Ehdr));
+ BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0);
+
+ sechdrs = (void *)hdr + hdr->e_shoff;
+ secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+ for (i = 1; i < hdr->e_shnum; i++) {
+ Elf64_Shdr *shdr = &sechdrs[i];
+ if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) {
+ alt_sec = shdr;
+ goto found;
+ }
}
- return p;
+
+ /* If we get here, it's probably a bug. */
+ pr_warning("patch_vdso: .altinstructions not found\n");
+ return; /* nothing to patch */
+
+found:
+ alt_data = (void *)hdr + alt_sec->sh_offset;
+ apply_alternatives(alt_data, alt_data + alt_sec->sh_size);
}
-static int __init init_vdso_vars(void)
+static int __init init_vdso(void)
{
int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
int i;
- char *vbase;
- vdso_size = npages << PAGE_SHIFT;
- vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
- if (!vdso_pages)
- goto oom;
- for (i = 0; i < npages; i++) {
- struct page *p;
- p = alloc_page(GFP_KERNEL);
- if (!p)
- goto oom;
- vdso_pages[i] = p;
- copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
- }
-
- vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL);
- if (!vbase)
- goto oom;
+ patch_vdso(vdso_start, vdso_end - vdso_start);
- if (memcmp(vbase, "\177ELF", 4)) {
- printk("VDSO: I'm broken; not ELF\n");
- vdso_enabled = 0;
- }
+ vdso_size = npages << PAGE_SHIFT;
+ for (i = 0; i < npages; i++)
+ vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE);
-#define VEXTERN(x) \
- *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
-#include "vextern.h"
-#undef VEXTERN
- vunmap(vbase);
return 0;
-
- oom:
- printk("Cannot allocate vdso\n");
- vdso_enabled = 0;
- return -ENOMEM;
}
-subsys_initcall(init_vdso_vars);
+subsys_initcall(init_vdso);
struct linux_binprm;
diff --git a/arch/x86/vdso/vvar.c b/arch/x86/vdso/vvar.c
deleted file mode 100644
index 1b7e703684f..00000000000
--- a/arch/x86/vdso/vvar.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/* Define pointer to external vDSO variables.
- These are part of the vDSO. The kernel fills in the real addresses
- at boot time. This is done because when the vdso is linked the
- kernel isn't yet and we don't know the final addresses. */
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <asm/vsyscall.h>
-#include <asm/timex.h>
-#include <asm/vgtod.h>
-
-#define VEXTERN(x) typeof (__ ## x) *const vdso_ ## x = (void *)VMAGIC;
-#include "vextern.h"
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 17c565de3d6..a6575b949b1 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -18,5 +18,5 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
-
+obj-$(CONFIG_XEN_DOM0) += vga.o
obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index e3c6a06cf72..53257421082 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -235,7 +235,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
*dx &= maskedx;
}
-static __init void xen_init_cpuid_mask(void)
+static void __init xen_init_cpuid_mask(void)
{
unsigned int ax, bx, cx, dx;
unsigned int xsave_mask;
@@ -400,7 +400,7 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
/*
* load_gdt for early boot, when the gdt is only mapped once
*/
-static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
+static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
{
unsigned long va = dtr->address;
unsigned int size = dtr->size + 1;
@@ -662,7 +662,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
* Version of write_gdt_entry for use at early boot-time needed to
* update an entry as simply as possible.
*/
-static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
+static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
const void *desc, int type)
{
switch (type) {
@@ -933,18 +933,18 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
return ret;
}
-static const struct pv_info xen_info __initdata = {
+static const struct pv_info xen_info __initconst = {
.paravirt_enabled = 1,
.shared_kernel_pmd = 0,
.name = "Xen",
};
-static const struct pv_init_ops xen_init_ops __initdata = {
+static const struct pv_init_ops xen_init_ops __initconst = {
.patch = xen_patch,
};
-static const struct pv_cpu_ops xen_cpu_ops __initdata = {
+static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.cpuid = xen_cpuid,
.set_debugreg = xen_set_debugreg,
@@ -1004,7 +1004,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.end_context_switch = xen_end_context_switch,
};
-static const struct pv_apic_ops xen_apic_ops __initdata = {
+static const struct pv_apic_ops xen_apic_ops __initconst = {
#ifdef CONFIG_X86_LOCAL_APIC
.startup_ipi_hook = paravirt_nop,
#endif
@@ -1033,6 +1033,13 @@ static void xen_machine_halt(void)
xen_reboot(SHUTDOWN_poweroff);
}
+static void xen_machine_power_off(void)
+{
+ if (pm_power_off)
+ pm_power_off();
+ xen_reboot(SHUTDOWN_poweroff);
+}
+
static void xen_crash_shutdown(struct pt_regs *regs)
{
xen_reboot(SHUTDOWN_crash);
@@ -1055,10 +1062,10 @@ int xen_panic_handler_init(void)
return 0;
}
-static const struct machine_ops __initdata xen_machine_ops = {
+static const struct machine_ops xen_machine_ops __initconst = {
.restart = xen_restart,
.halt = xen_machine_halt,
- .power_off = xen_machine_halt,
+ .power_off = xen_machine_power_off,
.shutdown = xen_machine_halt,
.crash_shutdown = xen_crash_shutdown,
.emergency_restart = xen_emergency_restart,
@@ -1241,6 +1248,14 @@ asmlinkage void __init xen_start_kernel(void)
if (pci_xen)
x86_init.pci.arch_init = pci_xen_init;
} else {
+ const struct dom0_vga_console_info *info =
+ (void *)((char *)xen_start_info +
+ xen_start_info->console.dom0.info_off);
+
+ xen_init_vga(info, xen_start_info->console.dom0.info_size);
+ xen_start_info->console.domU.mfn = 0;
+ xen_start_info->console.domU.evtchn = 0;
+
/* Make sure ACS will be enabled */
pci_request_acs();
}
@@ -1332,7 +1347,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
+static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
.notifier_call = xen_hvm_cpu_notify,
};
@@ -1381,7 +1396,7 @@ bool xen_hvm_need_lapic(void)
}
EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
-const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
+const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
.name = "Xen HVM",
.detect = xen_hvm_platform,
.init_platform = xen_hvm_guest_init,
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 6a6fe893964..8bbb465b6f0 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -113,7 +113,7 @@ static void xen_halt(void)
xen_safe_halt();
}
-static const struct pv_irq_ops xen_irq_ops __initdata = {
+static const struct pv_irq_ops xen_irq_ops __initconst = {
.save_fl = PV_CALLEE_SAVE(xen_save_fl),
.restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0684f3c74d5..0ccccb67a99 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -59,6 +59,7 @@
#include <asm/page.h>
#include <asm/init.h>
#include <asm/pat.h>
+#include <asm/smp.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
@@ -75,67 +76,12 @@
#include "mmu.h"
#include "debugfs.h"
-#define MMU_UPDATE_HISTO 30
-
/*
* Protects atomic reservation decrease/increase against concurrent increases.
* Also protects non-atomic updates of current_pages and balloon lists.
*/
DEFINE_SPINLOCK(xen_reservation_lock);
-#ifdef CONFIG_XEN_DEBUG_FS
-
-static struct {
- u32 pgd_update;
- u32 pgd_update_pinned;
- u32 pgd_update_batched;
-
- u32 pud_update;
- u32 pud_update_pinned;
- u32 pud_update_batched;
-
- u32 pmd_update;
- u32 pmd_update_pinned;
- u32 pmd_update_batched;
-
- u32 pte_update;
- u32 pte_update_pinned;
- u32 pte_update_batched;
-
- u32 mmu_update;
- u32 mmu_update_extended;
- u32 mmu_update_histo[MMU_UPDATE_HISTO];
-
- u32 prot_commit;
- u32 prot_commit_batched;
-
- u32 set_pte_at;
- u32 set_pte_at_batched;
- u32 set_pte_at_pinned;
- u32 set_pte_at_current;
- u32 set_pte_at_kernel;
-} mmu_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
- if (unlikely(zero_stats)) {
- memset(&mmu_stats, 0, sizeof(mmu_stats));
- zero_stats = 0;
- }
-}
-
-#define ADD_STATS(elem, val) \
- do { check_zero(); mmu_stats.elem += (val); } while(0)
-
-#else /* !CONFIG_XEN_DEBUG_FS */
-
-#define ADD_STATS(elem, val) do { (void)(val); } while(0)
-
-#endif /* CONFIG_XEN_DEBUG_FS */
-
-
/*
* Identity map, in addition to plain kernel map. This needs to be
* large enough to allocate page table pages to allocate the rest.
@@ -243,11 +189,6 @@ static bool xen_page_pinned(void *ptr)
return PagePinned(page);
}
-static bool xen_iomap_pte(pte_t pte)
-{
- return pte_flags(pte) & _PAGE_IOMAP;
-}
-
void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
{
struct multicall_space mcs;
@@ -257,7 +198,7 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
u = mcs.args;
/* ptep might be kmapped when using 32-bit HIGHPTE */
- u->ptr = arbitrary_virt_to_machine(ptep).maddr;
+ u->ptr = virt_to_machine(ptep).maddr;
u->val = pte_val_ma(pteval);
MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
@@ -266,11 +207,6 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
}
EXPORT_SYMBOL_GPL(xen_set_domain_pte);
-static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
-{
- xen_set_domain_pte(ptep, pteval, DOMID_IO);
-}
-
static void xen_extend_mmu_update(const struct mmu_update *update)
{
struct multicall_space mcs;
@@ -279,27 +215,17 @@ static void xen_extend_mmu_update(const struct mmu_update *update)
mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
if (mcs.mc != NULL) {
- ADD_STATS(mmu_update_extended, 1);
- ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
-
mcs.mc->args[1]++;
-
- if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
- ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
- else
- ADD_STATS(mmu_update_histo[0], 1);
} else {
- ADD_STATS(mmu_update, 1);
mcs = __xen_mc_entry(sizeof(*u));
MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
- ADD_STATS(mmu_update_histo[1], 1);
}
u = mcs.args;
*u = *update;
}
-void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
+static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
{
struct mmu_update u;
@@ -312,17 +238,13 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
u.val = pmd_val_ma(val);
xen_extend_mmu_update(&u);
- ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
-void xen_set_pmd(pmd_t *ptr, pmd_t val)
+static void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
- ADD_STATS(pmd_update, 1);
-
/* If page is not pinned, we can just update the entry
directly */
if (!xen_page_pinned(ptr)) {
@@ -330,8 +252,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
return;
}
- ADD_STATS(pmd_update_pinned, 1);
-
xen_set_pmd_hyper(ptr, val);
}
@@ -344,35 +264,34 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
}
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
+static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
{
- if (xen_iomap_pte(pteval)) {
- xen_set_iomap_pte(ptep, pteval);
- goto out;
- }
+ struct mmu_update u;
- ADD_STATS(set_pte_at, 1);
-// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
- ADD_STATS(set_pte_at_current, mm == current->mm);
- ADD_STATS(set_pte_at_kernel, mm == &init_mm);
+ if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
+ return false;
- if (mm == current->mm || mm == &init_mm) {
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- struct multicall_space mcs;
- mcs = xen_mc_entry(0);
+ xen_mc_batch();
- MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
- ADD_STATS(set_pte_at_batched, 1);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
- goto out;
- } else
- if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
- goto out;
- }
- xen_set_pte(ptep, pteval);
+ u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+ u.val = pte_val_ma(pteval);
+ xen_extend_mmu_update(&u);
-out: return;
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ return true;
+}
+
+static void xen_set_pte(pte_t *ptep, pte_t pteval)
+{
+ if (!xen_batched_set_pte(ptep, pteval))
+ native_set_pte(ptep, pteval);
+}
+
+static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval)
+{
+ xen_set_pte(ptep, pteval);
}
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
@@ -389,13 +308,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
xen_mc_batch();
- u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
+ u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
u.val = pte_val_ma(pte);
xen_extend_mmu_update(&u);
- ADD_STATS(prot_commit, 1);
- ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
@@ -463,7 +379,7 @@ static pteval_t iomap_pte(pteval_t val)
return val;
}
-pteval_t xen_pte_val(pte_t pte)
+static pteval_t xen_pte_val(pte_t pte)
{
pteval_t pteval = pte.pte;
@@ -480,7 +396,7 @@ pteval_t xen_pte_val(pte_t pte)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
-pgdval_t xen_pgd_val(pgd_t pgd)
+static pgdval_t xen_pgd_val(pgd_t pgd)
{
return pte_mfn_to_pfn(pgd.pgd);
}
@@ -511,7 +427,7 @@ void xen_set_pat(u64 pat)
WARN_ON(pat != 0x0007010600070106ull);
}
-pte_t xen_make_pte(pteval_t pte)
+static pte_t xen_make_pte(pteval_t pte)
{
phys_addr_t addr = (pte & PTE_PFN_MASK);
@@ -581,20 +497,20 @@ pte_t xen_make_pte_debug(pteval_t pte)
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
#endif
-pgd_t xen_make_pgd(pgdval_t pgd)
+static pgd_t xen_make_pgd(pgdval_t pgd)
{
pgd = pte_pfn_to_mfn(pgd);
return native_make_pgd(pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
-pmdval_t xen_pmd_val(pmd_t pmd)
+static pmdval_t xen_pmd_val(pmd_t pmd)
{
return pte_mfn_to_pfn(pmd.pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
-void xen_set_pud_hyper(pud_t *ptr, pud_t val)
+static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
struct mmu_update u;
@@ -607,17 +523,13 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
u.val = pud_val_ma(val);
xen_extend_mmu_update(&u);
- ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
-void xen_set_pud(pud_t *ptr, pud_t val)
+static void xen_set_pud(pud_t *ptr, pud_t val)
{
- ADD_STATS(pud_update, 1);
-
/* If page is not pinned, we can just update the entry
directly */
if (!xen_page_pinned(ptr)) {
@@ -625,56 +537,28 @@ void xen_set_pud(pud_t *ptr, pud_t val)
return;
}
- ADD_STATS(pud_update_pinned, 1);
-
xen_set_pud_hyper(ptr, val);
}
-void xen_set_pte(pte_t *ptep, pte_t pte)
-{
- if (xen_iomap_pte(pte)) {
- xen_set_iomap_pte(ptep, pte);
- return;
- }
-
- ADD_STATS(pte_update, 1);
-// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
- ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
#ifdef CONFIG_X86_PAE
- ptep->pte_high = pte.pte_high;
- smp_wmb();
- ptep->pte_low = pte.pte_low;
-#else
- *ptep = pte;
-#endif
-}
-
-#ifdef CONFIG_X86_PAE
-void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
- if (xen_iomap_pte(pte)) {
- xen_set_iomap_pte(ptep, pte);
- return;
- }
-
set_64bit((u64 *)ptep, native_pte_val(pte));
}
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
- ptep->pte_low = 0;
- smp_wmb(); /* make sure low gets written first */
- ptep->pte_high = 0;
+ if (!xen_batched_set_pte(ptep, native_make_pte(0)))
+ native_pte_clear(mm, addr, ptep);
}
-void xen_pmd_clear(pmd_t *pmdp)
+static void xen_pmd_clear(pmd_t *pmdp)
{
set_pmd(pmdp, __pmd(0));
}
#endif /* CONFIG_X86_PAE */
-pmd_t xen_make_pmd(pmdval_t pmd)
+static pmd_t xen_make_pmd(pmdval_t pmd)
{
pmd = pte_pfn_to_mfn(pmd);
return native_make_pmd(pmd);
@@ -682,13 +566,13 @@ pmd_t xen_make_pmd(pmdval_t pmd)
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
#if PAGETABLE_LEVELS == 4
-pudval_t xen_pud_val(pud_t pud)
+static pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
-pud_t xen_make_pud(pudval_t pud)
+static pud_t xen_make_pud(pudval_t pud)
{
pud = pte_pfn_to_mfn(pud);
@@ -696,7 +580,7 @@ pud_t xen_make_pud(pudval_t pud)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
-pgd_t *xen_get_user_pgd(pgd_t *pgd)
+static pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
unsigned offset = pgd - pgd_page;
@@ -728,7 +612,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
* 2. It is always pinned
* 3. It has no user pagetable attached to it
*/
-void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
preempt_disable();
@@ -741,12 +625,10 @@ void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
preempt_enable();
}
-void xen_set_pgd(pgd_t *ptr, pgd_t val)
+static void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
pgd_t *user_ptr = xen_get_user_pgd(ptr);
- ADD_STATS(pgd_update, 1);
-
/* If page is not pinned, we can just update the entry
directly */
if (!xen_page_pinned(ptr)) {
@@ -758,9 +640,6 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
return;
}
- ADD_STATS(pgd_update_pinned, 1);
- ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
/* If it's pinned, then we can at least batch the kernel and
user updates together. */
xen_mc_batch();
@@ -1054,7 +933,7 @@ void xen_mm_pin_all(void)
* that's before we have page structures to store the bits. So do all
* the book-keeping now.
*/
-static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
+static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
enum pt_level level)
{
SetPagePinned(page);
@@ -1162,14 +1041,14 @@ void xen_mm_unpin_all(void)
spin_unlock(&pgd_lock);
}
-void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
+static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
spin_lock(&next->page_table_lock);
xen_pgd_pin(next);
spin_unlock(&next->page_table_lock);
}
-void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
spin_lock(&mm->page_table_lock);
xen_pgd_pin(mm);
@@ -1187,7 +1066,7 @@ static void drop_other_mm_ref(void *info)
active_mm = percpu_read(cpu_tlbstate.active_mm);
- if (active_mm == mm)
+ if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
leave_mm(smp_processor_id());
/* If this cpu still has a stale cr3 reference, then make sure
@@ -1256,7 +1135,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
* pagetable because of lazy tlb flushing. This means we need need to
* switch all CPUs off this pagetable before we can unpin it.
*/
-void xen_exit_mmap(struct mm_struct *mm)
+static void xen_exit_mmap(struct mm_struct *mm)
{
get_cpu(); /* make sure we don't move around */
xen_drop_mm_ref(mm);
@@ -1271,7 +1150,7 @@ void xen_exit_mmap(struct mm_struct *mm)
spin_unlock(&mm->page_table_lock);
}
-static __init void xen_pagetable_setup_start(pgd_t *base)
+static void __init xen_pagetable_setup_start(pgd_t *base)
{
}
@@ -1291,7 +1170,7 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
static void xen_post_allocator_init(void);
-static __init void xen_pagetable_setup_done(pgd_t *base)
+static void __init xen_pagetable_setup_done(pgd_t *base)
{
xen_setup_shared_info();
xen_post_allocator_init();
@@ -1353,7 +1232,11 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
{
struct {
struct mmuext_op op;
+#ifdef CONFIG_SMP
+ DECLARE_BITMAP(mask, num_processors);
+#else
DECLARE_BITMAP(mask, NR_CPUS);
+#endif
} *args;
struct multicall_space mcs;
@@ -1488,7 +1371,7 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
}
#ifdef CONFIG_X86_32
-static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
{
/* If there's an existing pte, then don't allow _PAGE_RW to be set */
if (pte_val_ma(*ptep) & _PAGE_PRESENT)
@@ -1498,7 +1381,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
return pte;
}
#else /* CONFIG_X86_64 */
-static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
{
unsigned long pfn = pte_pfn(pte);
@@ -1519,7 +1402,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
/* Init-time set_pte while constructing initial pagetables, which
doesn't allow RO pagetable pages to be remapped RW */
-static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
{
pte = mask_rw_pte(ptep, pte);
@@ -1537,7 +1420,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
-static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
+static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
{
#ifdef CONFIG_FLATMEM
BUG_ON(mem_map); /* should only be used early */
@@ -1547,7 +1430,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
}
/* Used for pmd and pud */
-static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
+static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
{
#ifdef CONFIG_FLATMEM
BUG_ON(mem_map); /* should only be used early */
@@ -1557,13 +1440,13 @@ static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
/* Early release_pte assumes that all pts are pinned, since there's
only init_mm and anything attached to that is pinned. */
-static __init void xen_release_pte_init(unsigned long pfn)
+static void __init xen_release_pte_init(unsigned long pfn)
{
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
-static __init void xen_release_pmd_init(unsigned long pfn)
+static void __init xen_release_pmd_init(unsigned long pfn)
{
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
@@ -1689,7 +1572,7 @@ static void set_page_prot(void *addr, pgprot_t prot)
BUG();
}
-static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
unsigned pmdidx, pteidx;
unsigned ident_pte;
@@ -1721,6 +1604,11 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
pte_t pte;
+#ifdef CONFIG_X86_32
+ if (pfn > max_pfn_mapped)
+ max_pfn_mapped = pfn;
+#endif
+
if (!pte_none(pte_page[pteidx]))
continue;
@@ -1772,7 +1660,7 @@ static void convert_pfn_mfn(void *v)
* of the physical mapping once some sort of allocator has been set
* up.
*/
-__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pud_t *l3;
@@ -1843,7 +1731,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
-static __init void xen_write_cr3_init(unsigned long cr3)
+static void __init xen_write_cr3_init(unsigned long cr3)
{
unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
@@ -1880,7 +1768,7 @@ static __init void xen_write_cr3_init(unsigned long cr3)
pv_mmu_ops.write_cr3 = &xen_write_cr3;
}
-__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pmd_t *kernel_pmd;
@@ -1888,7 +1776,9 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
initial_kernel_pmd =
extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
- max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
+ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
+ xen_start_info->nr_pt_frames * PAGE_SIZE +
+ 512*1024);
kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
@@ -1986,7 +1876,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
}
-__init void xen_ident_map_ISA(void)
+void __init xen_ident_map_ISA(void)
{
unsigned long pa;
@@ -2009,7 +1899,7 @@ __init void xen_ident_map_ISA(void)
xen_flush_tlb();
}
-static __init void xen_post_allocator_init(void)
+static void __init xen_post_allocator_init(void)
{
#ifdef CONFIG_XEN_DEBUG
pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
@@ -2046,7 +1936,7 @@ static void xen_leave_lazy_mmu(void)
preempt_enable();
}
-static const struct pv_mmu_ops xen_mmu_ops __initdata = {
+static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.read_cr2 = xen_read_cr2,
.write_cr2 = xen_write_cr2,
@@ -2371,7 +2261,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
struct remap_data *rmd = data;
pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
- rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
+ rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
rmd->mmu_update->val = pte_val_ma(pte);
rmd->mmu_update++;
@@ -2425,7 +2315,6 @@ out:
EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
#ifdef CONFIG_XEN_DEBUG_FS
-
static int p2m_dump_open(struct inode *inode, struct file *filp)
{
return single_open(filp, p2m_dump_show, NULL);
@@ -2437,65 +2326,4 @@ static const struct file_operations p2m_dump_fops = {
.llseek = seq_lseek,
.release = single_release,
};
-
-static struct dentry *d_mmu_debug;
-
-static int __init xen_mmu_debugfs(void)
-{
- struct dentry *d_xen = xen_init_debugfs();
-
- if (d_xen == NULL)
- return -ENOMEM;
-
- d_mmu_debug = debugfs_create_dir("mmu", d_xen);
-
- debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
-
- debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
- debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
- &mmu_stats.pgd_update_pinned);
- debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
- &mmu_stats.pgd_update_pinned);
-
- debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
- debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
- &mmu_stats.pud_update_pinned);
- debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
- &mmu_stats.pud_update_pinned);
-
- debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
- debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
- &mmu_stats.pmd_update_pinned);
- debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
- &mmu_stats.pmd_update_pinned);
-
- debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
-// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
-// &mmu_stats.pte_update_pinned);
- debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
- &mmu_stats.pte_update_pinned);
-
- debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
- debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
- &mmu_stats.mmu_update_extended);
- xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
- mmu_stats.mmu_update_histo, 20);
-
- debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
- debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
- &mmu_stats.set_pte_at_batched);
- debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
- &mmu_stats.set_pte_at_current);
- debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
- &mmu_stats.set_pte_at_kernel);
-
- debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
- debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
- &mmu_stats.prot_commit_batched);
-
- debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
- return 0;
-}
-fs_initcall(xen_mmu_debugfs);
-
-#endif /* CONFIG_XEN_DEBUG_FS */
+#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 537bb9aab77..73809bb951b 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -15,43 +15,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
-void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
-void xen_exit_mmap(struct mm_struct *mm);
-
-pteval_t xen_pte_val(pte_t);
-pmdval_t xen_pmd_val(pmd_t);
-pgdval_t xen_pgd_val(pgd_t);
-
-pte_t xen_make_pte(pteval_t);
-pmd_t xen_make_pmd(pmdval_t);
-pgd_t xen_make_pgd(pgdval_t);
-
-void xen_set_pte(pte_t *ptep, pte_t pteval);
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval);
-
-#ifdef CONFIG_X86_PAE
-void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_pmd_clear(pmd_t *pmdp);
-#endif /* CONFIG_X86_PAE */
-
-void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
-void xen_set_pud(pud_t *ptr, pud_t val);
-void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
-void xen_set_pud_hyper(pud_t *ptr, pud_t val);
-
-#if PAGETABLE_LEVELS == 4
-pudval_t xen_pud_val(pud_t pud);
-pud_t xen_make_pud(pudval_t pudval);
-void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
-void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
-#endif
-
-pgd_t *xen_get_user_pgd(pgd_t *pgd);
-
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 8bff7e7c290..1b2b73ff0a6 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -189,10 +189,10 @@ struct multicall_space __xen_mc_entry(size_t args)
unsigned argidx = roundup(b->argidx, sizeof(u64));
BUG_ON(preemptible());
- BUG_ON(b->argidx > MC_ARGS);
+ BUG_ON(b->argidx >= MC_ARGS);
if (b->mcidx == MC_BATCH ||
- (argidx + args) > MC_ARGS) {
+ (argidx + args) >= MC_ARGS) {
mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
xen_mc_flush();
argidx = roundup(b->argidx, sizeof(u64));
@@ -206,7 +206,7 @@ struct multicall_space __xen_mc_entry(size_t args)
ret.args = &b->args[argidx];
b->argidx = argidx + args;
- BUG_ON(b->argidx > MC_ARGS);
+ BUG_ON(b->argidx >= MC_ARGS);
return ret;
}
@@ -216,7 +216,7 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
struct multicall_space ret = { NULL, NULL };
BUG_ON(preemptible());
- BUG_ON(b->argidx > MC_ARGS);
+ BUG_ON(b->argidx >= MC_ARGS);
if (b->mcidx == 0)
return ret;
@@ -224,14 +224,14 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
if (b->entries[b->mcidx - 1].op != op)
return ret;
- if ((b->argidx + size) > MC_ARGS)
+ if ((b->argidx + size) >= MC_ARGS)
return ret;
ret.mc = &b->entries[b->mcidx - 1];
ret.args = &b->args[b->argidx];
b->argidx += size;
- BUG_ON(b->argidx > MC_ARGS);
+ BUG_ON(b->argidx >= MC_ARGS);
return ret;
}
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 141eb0de8b0..58efeb9d544 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -522,11 +522,20 @@ static bool __init __early_alloc_p2m(unsigned long pfn)
/* Boundary cross-over for the edges: */
if (idx) {
unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ unsigned long *mid_mfn_p;
p2m_init(p2m);
p2m_top[topidx][mididx] = p2m;
+ /* For save/restore we need to MFN of the P2M saved */
+
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+ WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
+ "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
+ topidx, mididx);
+ mid_mfn_p[mididx] = virt_to_mfn(p2m);
+
}
return idx != 0;
}
@@ -549,12 +558,29 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
{
unsigned topidx = p2m_top_index(pfn);
- if (p2m_top[topidx] == p2m_mid_missing) {
- unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ unsigned long *mid_mfn_p;
+ unsigned long **mid;
+
+ mid = p2m_top[topidx];
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+ if (mid == p2m_mid_missing) {
+ mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_init(mid);
p2m_top[topidx] = mid;
+
+ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+ }
+ /* And the save/restore P2M tables.. */
+ if (mid_mfn_p == p2m_mid_missing_mfn) {
+ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(mid_mfn_p);
+
+ p2m_top_mfn_p[topidx] = mid_mfn_p;
+ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+ /* Note: we don't set mid_mfn_p[midix] here,
+ * look in __early_alloc_p2m */
}
}
@@ -650,7 +676,7 @@ static unsigned long mfn_hash(unsigned long mfn)
}
/* Add an MFN override for a particular page */
-int m2p_add_override(unsigned long mfn, struct page *page)
+int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte)
{
unsigned long flags;
unsigned long pfn;
@@ -662,7 +688,6 @@ int m2p_add_override(unsigned long mfn, struct page *page)
if (!PageHighMem(page)) {
address = (unsigned long)__va(pfn << PAGE_SHIFT);
ptep = lookup_address(address, &level);
-
if (WARN(ptep == NULL || level != PG_LEVEL_4K,
"m2p_add_override: pfn %lx not mapped", pfn))
return -EINVAL;
@@ -674,18 +699,17 @@ int m2p_add_override(unsigned long mfn, struct page *page)
if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
return -ENOMEM;
- if (!PageHighMem(page))
+ if (clear_pte && !PageHighMem(page))
/* Just zap old mapping for now */
pte_clear(&init_mm, address, ptep);
-
spin_lock_irqsave(&m2p_override_lock, flags);
list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
spin_unlock_irqrestore(&m2p_override_lock, flags);
return 0;
}
-
-int m2p_remove_override(struct page *page)
+EXPORT_SYMBOL_GPL(m2p_add_override);
+int m2p_remove_override(struct page *page, bool clear_pte)
{
unsigned long flags;
unsigned long mfn;
@@ -713,7 +737,7 @@ int m2p_remove_override(struct page *page)
spin_unlock_irqrestore(&m2p_override_lock, flags);
set_phys_to_machine(pfn, page->index);
- if (!PageHighMem(page))
+ if (clear_pte && !PageHighMem(page))
set_pte_at(&init_mm, address, ptep,
pfn_pte(pfn, PAGE_KERNEL));
/* No tlb flush necessary because the caller already
@@ -721,6 +745,7 @@ int m2p_remove_override(struct page *page)
return 0;
}
+EXPORT_SYMBOL_GPL(m2p_remove_override);
struct page *m2p_find_override(unsigned long mfn)
{
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index bfd0632fe65..b480d4207a4 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -36,7 +36,7 @@ int __init pci_xen_swiotlb_detect(void)
/* If running as PV guest, either iommu=soft, or swiotlb=force will
* activate this IOMMU. If running as PV privileged, activate it
- * irregardlesss.
+ * irregardless.
*/
if ((xen_initial_domain() || swiotlb || swiotlb_force) &&
(xen_pv_domain()))
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 25c52f94a27..ffcf2615640 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -35,7 +35,7 @@ EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
#ifdef CONFIG_XEN_PVHVM
static int xen_emul_unplug;
-static int __init check_platform_magic(void)
+static int check_platform_magic(void)
{
short magic;
char protocol;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 90bac0aac3a..60aeeb56948 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -50,7 +50,7 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
*/
#define EXTRA_MEM_RATIO (10)
-static __init void xen_add_extra_mem(unsigned long pages)
+static void __init xen_add_extra_mem(unsigned long pages)
{
unsigned long pfn;
@@ -166,7 +166,7 @@ static unsigned long __init xen_set_identity(const struct e820entry *list,
if (last > end)
continue;
- if (entry->type == E820_RAM) {
+ if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) {
if (start > start_pci)
identity += set_phys_range_identity(
PFN_UP(start_pci), PFN_DOWN(start));
@@ -227,7 +227,7 @@ char * __init xen_memory_setup(void)
memcpy(map_raw, map, sizeof(map));
e820.nr_map = 0;
- xen_extra_mem_start = max((1ULL << 32), mem_end);
+ xen_extra_mem_start = mem_end;
for (i = 0; i < memmap.nr_entries; i++) {
unsigned long long end;
@@ -262,6 +262,12 @@ char * __init xen_memory_setup(void)
if (map[i].size > 0)
e820_add_region(map[i].addr, map[i].size, map[i].type);
}
+ /* Align the balloon area so that max_low_pfn does not get set
+ * to be at the _end_ of the PCI gap at the far end (fee01000).
+ * Note that xen_extra_mem_start gets set in the loop above to be
+ * past the last E820 region. */
+ if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32)))
+ xen_extra_mem_start = (1ULL<<32);
/*
* In domU, the ISA region is normal, usable memory, but we
@@ -336,7 +342,7 @@ static void __init fiddle_vdso(void)
#endif
}
-static __cpuinit int register_callback(unsigned type, const void *func)
+static int __cpuinit register_callback(unsigned type, const void *func)
{
struct callback_register callback = {
.type = type,
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 30612441ed9..b4533a86d7e 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -46,18 +46,17 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back.
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
inc_irq_stat(irq_resched_count);
+ scheduler_ipi();
return IRQ_HANDLED;
}
-static __cpuinit void cpu_bringup(void)
+static void __cpuinit cpu_bringup(void)
{
int cpu = smp_processor_id();
@@ -85,7 +84,7 @@ static __cpuinit void cpu_bringup(void)
wmb(); /* make sure everything is out */
}
-static __cpuinit void cpu_bringup_and_idle(void)
+static void __cpuinit cpu_bringup_and_idle(void)
{
cpu_bringup();
cpu_idle();
@@ -206,11 +205,18 @@ static void __init xen_smp_prepare_boot_cpu(void)
static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
+ unsigned int i;
xen_init_lock_cpu(0);
smp_store_cpu_info(0);
cpu_data(0).x86_max_cores = 1;
+
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+ }
set_cpu_sibling_map(0);
if (xen_smp_intr_init(0))
@@ -242,7 +248,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
}
}
-static __cpuinit int
+static int __cpuinit
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
@@ -486,7 +492,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
-static const struct smp_ops xen_smp_ops __initdata = {
+static const struct smp_ops xen_smp_ops __initconst = {
.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
.smp_prepare_cpus = xen_smp_prepare_cpus,
.smp_cpus_done = xen_smp_cpus_done,
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 2e2d370a47b..5158c505bef 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -26,8 +26,6 @@
#include "xen-ops.h"
-#define XEN_SHIFT 22
-
/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP 100000
#define NS_PER_TICK (1000000000LL / HZ)
@@ -211,8 +209,6 @@ static struct clocksource xen_clocksource __read_mostly = {
.rating = 400,
.read = xen_clocksource_get_cycles,
.mask = ~0,
- .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
- .shift = XEN_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -439,16 +435,16 @@ void xen_timer_resume(void)
}
}
-static const struct pv_time_ops xen_time_ops __initdata = {
+static const struct pv_time_ops xen_time_ops __initconst = {
.sched_clock = xen_clocksource_read,
};
-static __init void xen_time_init(void)
+static void __init xen_time_init(void)
{
int cpu = smp_processor_id();
struct timespec tp;
- clocksource_register(&xen_clocksource);
+ clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
/* Successfully turned off 100Hz tick, so we have the
@@ -468,7 +464,7 @@ static __init void xen_time_init(void)
xen_setup_cpu_clockevents();
}
-__init void xen_init_time_ops(void)
+void __init xen_init_time_ops(void)
{
pv_time_ops = xen_time_ops;
@@ -490,7 +486,7 @@ static void xen_hvm_setup_cpu_clockevents(void)
xen_setup_cpu_clockevents();
}
-__init void xen_hvm_init_time_ops(void)
+void __init xen_hvm_init_time_ops(void)
{
/* vector callback is needed otherwise we cannot receive interrupts
* on cpu > 0 and at this point we don't know how many cpus are
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
new file mode 100644
index 00000000000..1cd7f4d11e2
--- /dev/null
+++ b/arch/x86/xen/vga.c
@@ -0,0 +1,67 @@
+#include <linux/screen_info.h>
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+
+#include <xen/interface/xen.h>
+
+#include "xen-ops.h"
+
+void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
+{
+ struct screen_info *screen_info = &boot_params.screen_info;
+
+ /* This is drawn from a dump from vgacon:startup in
+ * standard Linux. */
+ screen_info->orig_video_mode = 3;
+ screen_info->orig_video_isVGA = 1;
+ screen_info->orig_video_lines = 25;
+ screen_info->orig_video_cols = 80;
+ screen_info->orig_video_ega_bx = 3;
+ screen_info->orig_video_points = 16;
+ screen_info->orig_y = screen_info->orig_video_lines - 1;
+
+ switch (info->video_type) {
+ case XEN_VGATYPE_TEXT_MODE_3:
+ if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
+ + sizeof(info->u.text_mode_3))
+ break;
+ screen_info->orig_video_lines = info->u.text_mode_3.rows;
+ screen_info->orig_video_cols = info->u.text_mode_3.columns;
+ screen_info->orig_x = info->u.text_mode_3.cursor_x;
+ screen_info->orig_y = info->u.text_mode_3.cursor_y;
+ screen_info->orig_video_points =
+ info->u.text_mode_3.font_height;
+ break;
+
+ case XEN_VGATYPE_VESA_LFB:
+ if (size < offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps))
+ break;
+ screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB;
+ screen_info->lfb_width = info->u.vesa_lfb.width;
+ screen_info->lfb_height = info->u.vesa_lfb.height;
+ screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
+ screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
+ screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
+ screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
+ screen_info->red_size = info->u.vesa_lfb.red_size;
+ screen_info->red_pos = info->u.vesa_lfb.red_pos;
+ screen_info->green_size = info->u.vesa_lfb.green_size;
+ screen_info->green_pos = info->u.vesa_lfb.green_pos;
+ screen_info->blue_size = info->u.vesa_lfb.blue_size;
+ screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
+ screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
+ screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps)
+ + sizeof(info->u.vesa_lfb.gbl_caps))
+ screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.mode_attrs)
+ + sizeof(info->u.vesa_lfb.mode_attrs))
+ screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
+ break;
+ }
+}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 3112f55638c..b095739ccd4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -74,7 +74,7 @@ static inline void xen_hvm_smp_init(void) {}
#ifdef CONFIG_PARAVIRT_SPINLOCKS
void __init xen_init_spinlocks(void);
-__cpuinit void xen_init_lock_cpu(int cpu);
+void __cpuinit xen_init_lock_cpu(int cpu);
void xen_uninit_lock_cpu(int cpu);
#else
static inline void xen_init_spinlocks(void)
@@ -88,6 +88,17 @@ static inline void xen_uninit_lock_cpu(int cpu)
}
#endif
+struct dom0_vga_console_info;
+
+#ifdef CONFIG_XEN_DOM0
+void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+#else
+static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
+ size_t size)
+{
+}
+#endif
+
/* Declare an asm function, along with symbols needed to make it
inlineable */
#define DECL_ASM(ret, name, ...) \