From e487683990972bf9aa4e688434c46ead76748bca Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 20 Jun 2009 23:27:16 -0700 Subject: x86, mce: fix typo in comment in asm/mce.h Fix comment to match the actual declaration. Signed-off-by: Borislav Petkov Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5cdd8d100ec..b50b9e9042c 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -9,7 +9,7 @@ */ #define MCG_BANKCNT_MASK 0xff /* Number of Banks */ -#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ +#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ -- cgit v1.2.3 From a95436e44a76a32dcbe7c8df59701ddde53017c1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 20 Jun 2009 23:28:22 -0700 Subject: x86, mce: use atomic_inc_return() instead of add by 1 Use atomic_inc_return() instead of atomic_add_return() by 1. Signed-off-by: Borislav Petkov Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968b..7da8fec9ca8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -242,7 +242,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) /* * Make sure only one CPU runs in machine check panic */ - if (atomic_add_return(1, &mce_paniced) > 1) + if (atomic_inc_return(&mce_paniced) > 1) wait_for_panic(); barrier(); @@ -705,7 +705,7 @@ static int mce_start(int *no_way_out) * global_nwo should be updated before mce_callin */ smp_wmb(); - order = atomic_add_return(1, &mce_callin); + order = atomic_inc_return(&mce_callin); /* * Wait for everyone. -- cgit v1.2.3 From e74e396204bfcb67570ba4517b08f5918e69afea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Mar 2009 19:07:44 +0900 Subject: percpu: use dynamic percpu allocator as the default percpu allocator This patch makes most !CONFIG_HAVE_SETUP_PER_CPU_AREA archs use dynamic percpu allocator. The first chunk is allocated using embedding helper and 8k is reserved for modules. This ensures that the new allocator behaves almost identically to the original allocator as long as static percpu variables are concerned, so it shouldn't introduce much breakage. s390 and alpha use custom SHIFT_PERCPU_PTR() to work around addressing range limit the addressing model imposes. Unfortunately, this breaks if the address is specified using a variable, so for now, the two archs aren't converted. The following architectures are affected by this change. * sh * arm * cris * mips * sparc(32) * blackfin * avr32 * parisc (broken, under investigation) * m32r * powerpc(32) As this change makes the dynamic allocator the default one, CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is replaced with its invert - CONFIG_HAVE_LEGACY_PER_CPU_AREA, which is added to yet-to-be converted archs. These archs implement their own setup_per_cpu_areas() and the conversion is not trivial. * powerpc(64) * sparc(64) * ia64 * alpha * s390 Boot and batch alloc/free tests on x86_32 with debug code (x86_32 doesn't use default first chunk initialization). 
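For reference, a minimal usage sketch (not part of this patch; the struct and function names are made up) of the allocator interface, which stays the same for callers no matter which first-chunk setup an arch ends up using:

	#include <linux/init.h>
	#include <linux/percpu.h>

	struct hit_stats {
		unsigned long hits;
	};

	static struct hit_stats *stats;	/* one instance per possible CPU */

	static int __init stats_init(void)
	{
		int cpu;

		/* behaves identically under allocpercpu.c and percpu.c */
		stats = alloc_percpu(struct hit_stats);
		if (!stats)
			return -ENOMEM;

		for_each_possible_cpu(cpu)
			per_cpu_ptr(stats, cpu)->hits = 0;
		return 0;
	}

	static void __exit stats_exit(void)
	{
		free_percpu(stats);
	}
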
Compile tested on sparc(32), powerpc(32), arm and alpha. Kyle McMartin reported that this change breaks parisc. The problem is still under investigation and he is okay with pushing this patch forward and fixing parisc later. [ Impact: use dynamic allocator for most archs w/o custom percpu setup ] Signed-off-by: Tejun Heo Acked-by: Rusty Russell Acked-by: David S. Miller Acked-by: Benjamin Herrenschmidt Acked-by: Martin Schwidefsky Reviewed-by: Christoph Lameter Cc: Paul Mundt Cc: Russell King Cc: Mikael Starvik Cc: Ralf Baechle Cc: Bryan Wu Cc: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Hirokazu Takata Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Heiko Carstens Cc: Ingo Molnar --- arch/alpha/Kconfig | 3 +++ arch/ia64/Kconfig | 3 +++ arch/powerpc/Kconfig | 3 +++ arch/s390/Kconfig | 3 +++ arch/sparc/Kconfig | 3 +++ arch/x86/Kconfig | 3 --- include/linux/percpu.h | 12 +++++++++--- init/main.c | 24 ------------------------ kernel/module.c | 6 +++--- mm/Makefile | 2 +- mm/allocpercpu.c | 28 ++++++++++++++++++++++++++++ mm/percpu.c | 40 +++++++++++++++++++++++++++++++++++++++- 12 files changed, 95 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 9fb8aae5c39..05d86407188 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -70,6 +70,9 @@ config AUTO_IRQ_AFFINITY depends on SMP default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 170042b420d..328d2f8b8c3 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -89,6 +89,9 @@ config GENERIC_TIME_VSYSCALL bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index bf6cedfa05d..a774c2acbe6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -46,6 +46,9 @@ config GENERIC_HARDIRQS_NO__DO_IRQ bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool PPC64 + config HAVE_SETUP_PER_CPU_AREA def_bool PPC64 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a14dba0e4d6..f4a3cc62d28 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -75,6 +75,9 @@ config VIRT_CPU_ACCOUNTING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + mainmenu "Linux Kernel Configuration" config S390 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 3f8b6a92eab..7a8698b913f 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,6 +92,9 @@ config AUDIT_ARCH bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y if SPARC64 + config HAVE_SETUP_PER_CPU_AREA def_bool y if SPARC64 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d1430ef6b4f..a48a90076d8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -149,9 +149,6 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y -config HAVE_DYNAMIC_PER_CPU_AREA - def_bool y - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 26fd9d12f05..e5000343dd6 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -34,7 +34,7 @@ #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10) @@ -80,7 +80,7 @@ extern ssize_t __init pcpu_embed_first_chunk( extern void *__alloc_reserved_percpu(size_t 
size, size_t align); -#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ struct percpu_data { void *ptrs[1]; @@ -99,11 +99,15 @@ struct percpu_data { (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ extern void *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void *__pdata); +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +extern void __init setup_per_cpu_areas(void); +#endif + #else /* CONFIG_SMP */ #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) @@ -124,6 +128,8 @@ static inline void free_percpu(void *p) kfree(p); } +static inline void __init setup_per_cpu_areas(void) { } + #endif /* CONFIG_SMP */ #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ diff --git a/init/main.c b/init/main.c index 09131ec090c..602d724afa5 100644 --- a/init/main.c +++ b/init/main.c @@ -357,7 +357,6 @@ static void __init smp_init(void) #define smp_init() do { } while (0) #endif -static inline void setup_per_cpu_areas(void) { } static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } @@ -378,29 +377,6 @@ static void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; - -EXPORT_SYMBOL(__per_cpu_offset); - -static void __init setup_per_cpu_areas(void) -{ - unsigned long size, i; - char *ptr; - unsigned long nr_possible_cpus = num_possible_cpus(); - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); - ptr = alloc_bootmem_pages(size * nr_possible_cpus); - - for_each_possible_cpu(i) { - __per_cpu_offset[i] = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - ptr += size; - } -} -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ - /* Called by boot processor to activate the rest. */ static void __init smp_init(void) { diff --git a/kernel/module.c b/kernel/module.c index 38928fcaff2..f5934954fa9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) @@ -389,7 +389,7 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ /* Number of blocks used and allocated. 
*/ static unsigned int pcpu_num_used, pcpu_num_allocated; @@ -535,7 +535,7 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/mm/Makefile b/mm/Makefile index 5e0bd642669..c77c6487552 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA obj-$(CONFIG_SMP) += percpu.o else obj-$(CONFIG_SMP) += allocpercpu.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index dfdee6a4735..df34ceae0c6 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -5,6 +5,8 @@ */ #include #include +#include +#include #ifndef cache_line_size #define cache_line_size() L1_CACHE_BYTES @@ -147,3 +149,29 @@ void free_percpu(void *__pdata) kfree(__percpu_disguise(__pdata)); } EXPORT_SYMBOL_GPL(free_percpu); + +/* + * Generic percpu area setup. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; + +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + unsigned long size, i; + char *ptr; + unsigned long nr_possible_cpus = num_possible_cpus(); + + /* Copy section for each CPU (we discard the original) */ + size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ptr = alloc_bootmem_pages(size * nr_possible_cpus); + + for_each_possible_cpu(i) { + __per_cpu_offset[i] = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + ptr += size; + } +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/percpu.c b/mm/percpu.c index b70f2acd885..b14984566f5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -43,7 +43,7 @@ * * To use this allocator, arch code should do the followings. * - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be @@ -1275,3 +1275,41 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, reserved_size, dyn_size, pcpue_unit_size, pcpue_ptr, NULL); } + +/* + * Generic percpu area setup. + * + * The embedding helper is used because its behavior closely resembles + * the original non-dynamic generic percpu area setup. This is + * important because many archs have addressing restrictions and might + * fail if the percpu area is located far away from the previous + * location. As an added bonus, in non-NUMA cases, embedding is + * generally a good idea TLB-wise because percpu area can piggy back + * on the physical linear memory mapping which uses large page + * mappings on applicable archs. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + size_t static_size = __per_cpu_end - __per_cpu_start; + ssize_t unit_size; + unsigned long delta; + unsigned int cpu; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. 
+ */ + unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, -1); + if (unit_size < 0) + panic("Failed to initialized percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + cpu * unit_size; +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ -- cgit v1.2.3 From 204fba4aa303ea4a7bb726a539bf4a5b9e3203d0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:45 +0900 Subject: percpu: cleanup percpu array definitions Currently, the following three different ways to define percpu arrays are in use. 1. DEFINE_PER_CPU(elem_type[array_len], array_name); 2. DEFINE_PER_CPU(elem_type, array_name[array_len]); 3. DEFINE_PER_CPU(elem_type, array_name)[array_len]; Unify to #1 which correctly separates the roles of the two parameters and thus allows more flexibility in the way percpu variables are defined. [ Impact: cleanup ] Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Cc: Ingo Molnar Cc: Tony Luck Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Jeremy Fitzhardinge Cc: linux-mm@kvack.org Cc: Christoph Lameter Cc: David S. Miller --- arch/ia64/kernel/smp.c | 2 +- arch/ia64/sn/kernel/setup.c | 2 +- arch/powerpc/mm/stab.c | 2 +- arch/powerpc/platforms/ps3/smp.c | 2 +- arch/x86/kernel/cpu/cpu_debug.c | 4 ++-- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 2 +- drivers/xen/events.c | 4 ++-- mm/quicklist.c | 2 +- mm/slub.c | 4 ++-- net/ipv4/syncookies.c | 2 +- net/ipv6/syncookies.c | 2 +- 12 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index f0c521b0ba4..94cf78ba28f 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -58,7 +58,7 @@ static struct local_tlb_flush_counts { unsigned int count; } __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS]; -static DEFINE_PER_CPU(unsigned short, shadow_flush_counts[NR_CPUS]) ____cacheline_aligned; +static DEFINE_PER_CPU(unsigned short [NR_CPUS], shadow_flush_counts) ____cacheline_aligned; #define IPI_CALL_FUNC 0 #define IPI_CPU_STOP 1 diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index e456f062f24..ece1bf99449 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second); DEFINE_PER_CPU(struct sn_hub_info_s, __sn_hub_info); EXPORT_PER_CPU_SYMBOL(__sn_hub_info); -DEFINE_PER_CPU(short, __sn_cnodeid_to_nasid[MAX_COMPACT_NODES]); +DEFINE_PER_CPU(short [MAX_COMPACT_NODES], __sn_cnodeid_to_nasid); EXPORT_PER_CPU_SYMBOL(__sn_cnodeid_to_nasid); DEFINE_PER_CPU(struct nodepda_s *, __sn_nodepda); diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c index 98cd1dc2ae7..6e9b69c9985 100644 --- a/arch/powerpc/mm/stab.c +++ b/arch/powerpc/mm/stab.c @@ -31,7 +31,7 @@ struct stab_entry { #define NR_STAB_CACHE_ENTRIES 8 static DEFINE_PER_CPU(long, stab_cache_ptr); -static DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]); +static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache); /* * Create a segment table entry for the given esid/vsid pair. 
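(Aside, not part of the patch: the stab.c hunk above is representative. In form #1 the element type and the array length travel together in the first macro argument, and the name argument stays a bare identifier, matching how the macro treats it as the variable name. A hypothetical before/after:

	#define MY_NR_SLOTS 8	/* hypothetical array length */

	/* before, form #2: the length is attached to the name argument */
	static DEFINE_PER_CPU(long, my_cache[MY_NR_SLOTS]);

	/* after, form #1: type[len] first, bare identifier second */
	static DEFINE_PER_CPU(long [MY_NR_SLOTS], my_cache);

The hunks below apply the same transformation to the remaining definitions.)
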
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c index f6e04bcc70e..51ffde40af2 100644 --- a/arch/powerpc/platforms/ps3/smp.c +++ b/arch/powerpc/platforms/ps3/smp.c @@ -37,7 +37,7 @@ */ #define MSG_COUNT 4 -static DEFINE_PER_CPU(unsigned int, ps3_ipi_virqs[MSG_COUNT]); +static DEFINE_PER_CPU(unsigned int [MSG_COUNT], ps3_ipi_virqs); static void do_message_pass(int target, int msg) { diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 6b2a52dd040..dca325c0399 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -30,8 +30,8 @@ #include #include -static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); -static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); +static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); +static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); static DEFINE_PER_CPU(int, cpu_priv_count); static DEFINE_MUTEX(cpu_debug_lock); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ddae21620bd..bd2a2fa8462 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -69,7 +69,7 @@ struct threshold_bank { struct threshold_block *blocks; cpumask_var_t cpus; }; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); +static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); #ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 76dfef23f78..4946288d683 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -862,7 +862,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) x86_pmu_disable_counter(hwc, idx); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 891d2e90753..ab581fa6268 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -47,10 +47,10 @@ static DEFINE_SPINLOCK(irq_mapping_update_lock); /* IRQ <-> VIRQ mapping. */ -static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; +static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ -static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; +static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; /* Interrupt types. 
*/ enum xen_irq_type { diff --git a/mm/quicklist.c b/mm/quicklist.c index e66d07d1b4f..6eedf7e473d 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -19,7 +19,7 @@ #include #include -DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; +DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); #define FRACTION_OF_NODE_MEM 16 diff --git a/mm/slub.c b/mm/slub.c index ce62b770e2f..23bb79acc4b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2086,8 +2086,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) */ #define NR_KMEM_CACHE_CPU 100 -static DEFINE_PER_CPU(struct kmem_cache_cpu, - kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; +static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], + kmem_cache_cpu); static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index cd2b97f1b6e..84d90f2799b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -37,7 +37,7 @@ __initcall(init_syncookies); #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 8c2513982b6..23d0d6db046 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -74,7 +74,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, return child; } -static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) -- cgit v1.2.3 From 245b2e70eabd797932adb263a65da0bab3711753 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:48 +0900 Subject: percpu: clean up percpu variable definitions Percpu variable definition is about to be updated such that all percpu symbols including the static ones must be unique. Update percpu variable definitions accordingly. * as,cfq: rename ioc_count uniquely * cpufreq: rename cpu_dbs_info uniquely * xen: move nesting_count out of xen_evtchn_do_upcall() and rename it * mm: move ratelimits out of balance_dirty_pages_ratelimited_nr() and rename it * ipv4,6: rename cookie_scratch uniquely * x86 perf_counter: rename prev_left to pmc_prev_left, irq_entry to pmc_irq_entry and nmi_entry to pmc_nmi_entry * perf_counter: rename disable_count to perf_disable_count * ftrace: rename test_event_disable to ftrace_test_event_disable * kmemleak: rename test_pointer to kmemleak_test_pointer * mce: rename next_interval to mce_next_interval [ Impact: percpu usage cleanups, no duplicate static percpu var names ] Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Cc: Ivan Kokshaysky Cc: Jens Axboe Cc: Dave Jones Cc: Jeremy Fitzhardinge Cc: linux-mm Cc: David S. 
Miller Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Li Zefan Cc: Catalin Marinas Cc: Andi Kleen --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- block/as-iosched.c | 10 +++++----- block/cfq-iosched.c | 10 +++++----- drivers/cpufreq/cpufreq_conservative.c | 12 ++++++------ drivers/cpufreq/cpufreq_ondemand.c | 15 ++++++++------- drivers/xen/events.c | 9 +++++---- kernel/perf_counter.c | 6 +++--- kernel/trace/trace_events.c | 6 +++--- mm/kmemleak-test.c | 6 +++--- mm/page-writeback.c | 5 +++-- net/ipv4/syncookies.c | 5 +++-- net/ipv6/syncookies.c | 5 +++-- 13 files changed, 58 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968b..cba8cd3e957 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1091,7 +1091,7 @@ void mce_log_therm_throt_event(__u64 status) */ static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); static void mcheck_timer(unsigned long data) @@ -1110,7 +1110,7 @@ static void mcheck_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(next_interval); + n = &__get_cpu_var(mce_next_interval); if (mce_notify_irq()) *n = max(*n/2, HZ/100); else @@ -1311,7 +1311,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) static void mce_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); + int *n = &__get_cpu_var(mce_next_interval); if (mce_ignore_ce) return; @@ -1914,7 +1914,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); + __get_cpu_var(mce_next_interval)); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 4946288d683..5fdf63aaaba 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -862,7 +862,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) x86_pmu_disable_counter(hwc, idx); } -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], prev_left); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. 
@@ -901,7 +901,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (left > x86_pmu.max_period) left = x86_pmu.max_period; - per_cpu(prev_left[idx], smp_processor_id()) = left; + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* * The hw counter starts counting from this counter offset, @@ -1089,7 +1089,7 @@ void perf_counter_print_debug(void) rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); - prev_left = per_cpu(prev_left[idx], cpu); + prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); @@ -1561,8 +1561,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) entry->ip[entry->nr++] = ip; } -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); static void @@ -1709,9 +1709,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_callchain_entry *entry; if (in_nmi()) - entry = &__get_cpu_var(nmi_entry); + entry = &__get_cpu_var(pmc_nmi_entry); else - entry = &__get_cpu_var(irq_entry); + entry = &__get_cpu_var(pmc_irq_entry); entry->nr = 0; diff --git a/block/as-iosched.c b/block/as-iosched.c index 7a12cf6ee1d..ce8ba57c655 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -146,7 +146,7 @@ enum arq_state { #define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) #define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, as_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -161,7 +161,7 @@ static void as_antic_stop(struct as_data *ad); static void free_as_io_context(struct as_io_context *aic) { kfree(aic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(as_ioc_count); if (ioc_gone) { /* * AS scheduler is exiting, grab exit lock and check @@ -169,7 +169,7 @@ static void free_as_io_context(struct as_io_context *aic) * complete ioc_gone and set it back to NULL. 
*/ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -211,7 +211,7 @@ static struct as_io_context *alloc_as_io_context(void) ret->seek_total = 0; ret->seek_samples = 0; ret->seek_mean = 0; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(as_ioc_count); } return ret; @@ -1507,7 +1507,7 @@ static void __exit as_exit(void) ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(as_ioc_count)) wait_for_completion(&all_gone); synchronize_rcu(); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 833ec18eaa6..0f1cc7d3855 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -48,7 +48,7 @@ static int cfq_slice_idle = HZ / 125; static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -1422,7 +1422,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) cic = container_of(head, struct cfq_io_context, rcu_head); kmem_cache_free(cfq_ioc_pool, cic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(cfq_ioc_count); if (ioc_gone) { /* @@ -1431,7 +1431,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) * complete ioc_gone and set it back to NULL */ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -1557,7 +1557,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) INIT_HLIST_NODE(&cic->cic_list); cic->dtor = cfq_free_io_context; cic->exit = cfq_exit_io_context; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(cfq_ioc_count); } return cic; @@ -2658,7 +2658,7 @@ static void __exit cfq_exit(void) * this also protects us from entering cfq_slab_kill() with * pending RCU callbacks */ - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(cfq_ioc_count)) wait_for_completion(&all_gone); cfq_slab_kill(); } diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 7fc58af748b..a7ef465c83b 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -65,7 +65,7 @@ struct cpu_dbs_info_s { int cpu; unsigned int enable:1; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@ -138,7 +138,7 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info, + struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info, freq->cpu); struct cpufreq_policy *policy; @@ -298,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info = &per_cpu(cs_cpu_dbs_info, j); dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) @@ -388,7 +388,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cputime64_t cur_wall_time, cur_idle_time; unsigned int idle_time, 
wall_time; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -528,7 +528,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int j; int rc; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); + this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: @@ -548,7 +548,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 1911d172935..36f292a7bd0 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -73,7 +73,7 @@ struct cpu_dbs_info_s { unsigned int enable:1, sample_type:1; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@ -151,7 +151,8 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy, unsigned int freq_hi, freq_lo; unsigned int index = 0; unsigned int jiffies_total, jiffies_hi, jiffies_lo; - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu); + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, + policy->cpu); if (!dbs_info->freq_table) { dbs_info->freq_lo = 0; @@ -196,7 +197,7 @@ static void ondemand_powersave_bias_init(void) { int i; for_each_online_cpu(i) { - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, i); + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, i); dbs_info->freq_table = cpufreq_frequency_get_table(i); dbs_info->freq_lo = 0; } @@ -297,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info = &per_cpu(od_cpu_dbs_info, j); dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) @@ -391,7 +392,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) unsigned int load, load_freq; int freq_avg; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -548,7 +549,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int j; int rc; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); + this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: @@ -570,7 +571,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --git a/drivers/xen/events.c b/drivers/xen/events.c index ab581fa6268..7d2987e9b1b 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -602,6 +602,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static DEFINE_PER_CPU(unsigned, xed_nesting_count); + /* * Search the CPUs pending events bitmasks. 
For each one found, map * the event number to an irq, and feed it into do_IRQ() for @@ -617,7 +619,6 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - static DEFINE_PER_CPU(unsigned, nesting_count); unsigned count; exit_idle(); @@ -628,7 +629,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) vcpu_info->evtchn_upcall_pending = 0; - if (__get_cpu_var(nesting_count)++) + if (__get_cpu_var(xed_nesting_count)++) goto out; #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ @@ -653,8 +654,8 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) BUG_ON(!irqs_disabled()); - count = __get_cpu_var(nesting_count); - __get_cpu_var(nesting_count) = 0; + count = __get_cpu_var(xed_nesting_count); + __get_cpu_var(xed_nesting_count) = 0; } while(count != 1); out: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1a933a221ea..1fd7a2e7575 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -98,16 +98,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader, void __weak perf_counter_print_debug(void) { } -static DEFINE_PER_CPU(int, disable_count); +static DEFINE_PER_CPU(int, perf_disable_count); void __perf_disable(void) { - __get_cpu_var(disable_count)++; + __get_cpu_var(perf_disable_count)++; } bool __perf_enable(void) { - return !--__get_cpu_var(disable_count); + return !--__get_cpu_var(perf_disable_count); } void perf_disable(void) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index aa08be69a1b..54b1de5074b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1318,7 +1318,7 @@ static __init void event_trace_self_tests(void) #ifdef CONFIG_FUNCTION_TRACER -static DEFINE_PER_CPU(atomic_t, test_event_disable); +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void function_test_events_call(unsigned long ip, unsigned long parent_ip) @@ -1334,7 +1334,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) pc = preempt_count(); resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; @@ -1352,7 +1352,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) trace_nowake_buffer_unlock_commit(event, flags, pc); out: - atomic_dec(&per_cpu(test_event_disable, cpu)); + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); ftrace_preempt_enable(resched); } diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index d5292fc6f52..177a5169bbd 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -36,7 +36,7 @@ struct test_node { }; static LIST_HEAD(test_list); -static DEFINE_PER_CPU(void *, test_pointer); +static DEFINE_PER_CPU(void *, kmemleak_test_pointer); /* * Some very simple testing. 
This function needs to be extended for @@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void) } for_each_possible_cpu(i) { - per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); + per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); pr_info("kmemleak: kmalloc(129) = %p\n", - per_cpu(test_pointer, i)); + per_cpu(kmemleak_test_pointer, i)); } return 0; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7b0dcea4935..2c075dcf03d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -607,6 +607,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) } } +static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -624,7 +626,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; unsigned long ratelimit; unsigned long *p; @@ -637,7 +638,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, * tasks in balance_dirty_pages(). Period. */ preempt_disable(); - p = &__get_cpu_var(ratelimits); + p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { *p = 0; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 84d90f2799b..a6e0e077ac3 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -37,12 +37,13 @@ __initcall(init_syncookies); #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv4_cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch); memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); tmp[0] = (__force u32)saddr; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 23d0d6db046..6b6ae913b5d 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -74,12 +74,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, return child; } -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv6_cookie_scratch); static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv6_cookie_scratch); /* * we have 320 bits of information to hash, copy in the remaining -- cgit v1.2.3 From a4a874a906ae69c35df4b712fadbc35b15665355 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Thu, 18 Jun 2009 07:05:46 +0800 Subject: kmemcheck: remove duplicated #include Remove duplicated #include in arch/x86/mm/kmemcheck/shadow.c. 
Signed-off-by: Huang Weiyi Acked-by: Pekka Enberg Signed-off-by: Vegard Nossum --- arch/x86/mm/kmemcheck/shadow.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c index e773b6bd007..3f66b82076a 100644 --- a/arch/x86/mm/kmemcheck/shadow.c +++ b/arch/x86/mm/kmemcheck/shadow.c @@ -1,7 +1,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From 414f3251aa1b4cbd1e070866971eabc004a7dc20 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 22 Jun 2009 14:31:53 +0200 Subject: kmemcheck: remove useless check This check is a left-over from ancient times. We now have the equivalent check much earlier in both the page fault handler and the debug trap handler (the calls to kmemcheck_active()). Signed-off-by: Vegard Nossum --- arch/x86/mm/kmemcheck/kmemcheck.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index 2c55ed09865..5b99004fb4b 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -225,9 +225,6 @@ void kmemcheck_hide(struct pt_regs *regs) BUG_ON(!irqs_disabled()); - if (data->balance == 0) - return; - if (unlikely(data->balance != 1)) { kmemcheck_show_all(); kmemcheck_error_save_bug(regs); -- cgit v1.2.3 From 788e5abc5441e9046dd91c995c6f1f75bbd144bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:58 +0900 Subject: percpu: drop @unit_size from embed first chunk allocator The only extra feature @unit_size provides is making dead space at the end of the first chunk which doesn't have any valid usecase. Drop the parameter. This will increase consistency with generalized 4k allocator. James Bottomley spotted missing conversion for the default setup_per_cpu_areas() which caused build breakage on all arcsh which use it. 
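The resulting call-site change, sketched with abbreviated arguments (-1 previously selected the automatic unit size):

	/* before: callers could force a unit size or pass -1 for auto */
	ret = pcpu_embed_first_chunk(static_size, reserved_size,
				     dyn_size, -1);

	/* after: unit size is always max(needed size, PCPU_MIN_UNIT_SIZE) */
	ret = pcpu_embed_first_chunk(static_size, reserved_size, dyn_size);
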
[ Impact: drop unused code path ] Signed-off-by: Tejun Heo Cc: James Bottomley Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 2 +- include/linux/percpu.h | 2 +- mm/percpu.c | 18 ++++++------------ 3 files changed, 8 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 29a3eef7cf4..14728206fb5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -342,7 +342,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) return -EINVAL; return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); + reserve - PERCPU_FIRST_CHUNK_RESERVE); } /* diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e5000343dd6..83bff053bd1 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -69,7 +69,7 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size); + ssize_t dyn_size); /* * Use this to get to a cpu's version of the per-cpu object diff --git a/mm/percpu.c b/mm/percpu.c index 19dd83b5cbd..fc6babe6e55 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1207,7 +1207,6 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. @@ -1219,9 +1218,9 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * page size. * * When @dyn_size is positive, dynamic area might be larger than - * specified to fill page alignment. Also, when @dyn_size is auto, - * @dyn_size does not fill the whole first chunk but only what's - * necessary for page alignment after static and reserved areas. + * specified to fill page alignment. When @dyn_size is auto, + * @dyn_size is just big enough to fill page alignment after static + * and reserved areas. * * If the needed size is smaller than the minimum or specified unit * size, the leftover is returned to the bootmem allocator. @@ -1231,7 +1230,7 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * percpu access on success, -errno on failure. */ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size) + ssize_t dyn_size) { size_t chunk_size; unsigned int cpu; @@ -1242,12 +1241,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, if (dyn_size != 0) dyn_size = pcpue_size - static_size - reserved_size; - if (unit_size >= 0) { - BUG_ON(unit_size < pcpue_size); - pcpue_unit_size = unit_size; - } else - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); chunk_size = pcpue_unit_size * num_possible_cpus(); pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, @@ -1304,7 +1298,7 @@ void __init setup_per_cpu_areas(void) * what the legacy allocator did. 
*/ unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE, -1); + PERCPU_DYNAMIC_RESERVE); if (unit_size < 0) panic("Failed to initialized percpu areas."); -- cgit v1.2.3 From d4b95f80399471e4bce5e992700ff7f06ef91f6a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: x86,percpu: generalize 4k first chunk allocator Generalize and move x86 setup_pcpu_4k() into pcpu_4k_first_chunk(). setup_pcpu_4k() now is a simple wrapper around the generalized version. Other than taking size parameters and using arch supplied callbacks to allocate/free memory, pcpu_4k_first_chunk() is identical to the original implementation. This simplifies arch code and will help converting more archs to dynamic percpu allocator. While at it, s/pcpu_populate_pte_fn_t/pcpu_fc_populate_pte_fn_t/ for consistency. [ Impact: code reorganization and generalization ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 78 ++++++++++---------------------------- include/linux/percpu.h | 12 +++++- mm/percpu.c | 85 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 113 insertions(+), 62 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 14728206fb5..ab896b31e80 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -123,6 +123,19 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } +/* + * Helpers for first chunk memory allocation + */ +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size) +{ + return pcpu_alloc_bootmem(cpu, size, size); +} + +static void __init pcpu_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + /* * Large page remap allocator * @@ -346,22 +359,11 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) } /* - * 4k page allocator + * 4k allocator * - * This is the basic allocator. Static percpu area is allocated - * page-by-page and most of initialization is done by the generic - * setup function. + * Boring fallback 4k allocator. This allocator puts more pressure on + * PTE TLBs but other than that behaves nicely on both UMA and NUMA. 
*/ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; - return NULL; -} - static void __init pcpu4k_populate_pte(unsigned long addr) { populate_extra_pte(addr); @@ -369,51 +371,9 @@ static void __init pcpu4k_populate_pte(unsigned long addr) static ssize_t __init setup_pcpu_4k(size_t static_size) { - size_t pages_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - pcpu4k_nr_static_pages = PFN_UP(static_size); - - /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() - * sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); - - /* allocate and copy */ - j = 0; - for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { - void *ptr; - - ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); - goto enomem; - } - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pcpu4k_pages[j++] = virt_to_page(ptr); - } - - /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); - - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, - -1, NULL, pcpu4k_populate_pte); - goto out_free_ar; - -enomem: - while (--j >= 0) - free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); - return ret; + return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpu4k_populate_pte); } /* for explicit first chunk allocator selection */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 83bff053bd1..41b5bfab419 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,18 +59,26 @@ extern void *pcpu_base_addr; typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); -typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); +typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); +typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn); + pcpu_fc_populate_pte_fn_t populate_pte_fn); extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); +extern ssize_t __init pcpu_4k_first_chunk( + size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn); + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. 
Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index fc6babe6e55..27b0f40a3ea 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1037,7 +1037,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn) + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; static int smap[2], dmap[2]; @@ -1270,6 +1270,89 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, pcpue_unit_size, pcpue_ptr, NULL); } +/* + * 4k page first chunk setup helper. + */ +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; + +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +{ + if (pageno < pcpu4k_nr_static_pages) + return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + return NULL; +} + +/** + * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE + * @free_fn: funtion to free percpu page, always called with PAGE_SIZE + * @populate_pte_fn: function to populate pte + * + * This is a helper to ease setting up embedded first percpu chunk and + * can be called where pcpu_setup_first_chunk() is expected. + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page into vmalloc area. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) +{ + size_t pages_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + pcpu4k_nr_static_pages = PFN_UP(static_size); + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() * + sizeof(pcpu4k_pages[0])); + pcpu4k_pages = alloc_bootmem(pages_size); + + /* allocate and copy */ + j = 0; + for_each_possible_cpu(cpu) + for (i = 0; i < pcpu4k_nr_static_pages; i++) { + void *ptr; + + ptr = alloc_fn(cpu, PAGE_SIZE); + if (!ptr) { + pr_warning("PERCPU: failed to allocate " + "4k page for cpu%u\n", cpu); + goto enomem; + } + + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pcpu4k_pages[j++] = virt_to_page(ptr); + } + + /* we're ready, commit */ + pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", + pcpu4k_nr_static_pages, static_size); + + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, + reserved_size, -1, + -1, NULL, populate_pte_fn); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpu4k_pages), pages_size); + return ret; +} + /* * Generic percpu area setup. * -- cgit v1.2.3 From 8c4bfc6e8801616ab2e01c38140b2159b388d2ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: x86,percpu: generalize lpage first chunk allocator Generalize and move x86 setup_pcpu_lpage() into pcpu_lpage_first_chunk(). setup_pcpu_lpage() now is a simple wrapper around the generalized version. 
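As a sketch of the arch-side code this enables (hypothetical: only pcpu_lpage_first_chunk() and the callback signatures come from this patch; MY_LPAGE_SIZE and the my_* names are invented, and the mapping body is elided as arch specific):

	#include <linux/bootmem.h>
	#include <linux/percpu.h>

	static void * __init my_alloc(unsigned int cpu, size_t size)
	{
		/* size is the large page size; align to it so a single
		 * large page mapping of the unit is possible */
		return __alloc_bootmem_nopanic(size, size,
					       __pa(MAX_DMA_ADDRESS));
	}

	static void __init my_free(void *ptr, size_t size)
	{
		free_bootmem(__pa(ptr), size);
	}

	static void __init my_map(void *ptr, size_t size, void *addr)
	{
		/* arch specific: map size bytes at ptr to the vmalloc
		 * address addr with one large page */
	}

	static ssize_t __init my_setup_pcpu_lpage(size_t static_size)
	{
		return pcpu_lpage_first_chunk(static_size,
					      PERCPU_MODULE_RESERVE,
					      -1, MY_LPAGE_SIZE,
					      my_alloc, my_free, my_map);
	}
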
Other than taking size parameters and using arch supplied callbacks to allocate/free/map memory, pcpu_lpage_first_chunk() is identical to the original implementation. This simplifies arch code and will help converting more archs to dynamic percpu allocator. While at it, factor out pcpu_calc_fc_sizes() which is common to pcpu_embed_first_chunk() and pcpu_lpage_first_chunk(). [ Impact: code reorganization and generalization ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/include/asm/percpu.h | 9 -- arch/x86/kernel/setup_percpu.c | 169 +++------------------------------ arch/x86/mm/pageattr.c | 1 + include/linux/percpu.h | 27 ++++++ mm/percpu.c | 209 ++++++++++++++++++++++++++++++++++++++++- 5 files changed, 244 insertions(+), 171 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 103f1ddb0d8..a18c038a307 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -156,15 +156,6 @@ do { \ /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); -#ifdef CONFIG_NEED_MULTIPLE_NODES -void *pcpu_lpage_remapped(void *kaddr); -#else -static inline void *pcpu_lpage_remapped(void *kaddr) -{ - return NULL; -} -#endif - #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ab896b31e80..4f2e0ac9130 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -137,44 +137,21 @@ static void __init pcpu_fc_free(void *ptr, size_t size) } /* - * Large page remap allocator - * - * This allocator uses PMD page as unit. A PMD page is allocated for - * each cpu and each is remapped into vmalloc area using PMD mapping. - * As PMD page is quite large, only part of it is used for the first - * chunk. Unused part is returned to the bootmem allocator. - * - * So, the PMD pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more PMD TLB entry pressure but still is much - * better than only using 4k mappings while still being NUMA friendly. + * Large page remapping allocator */ #ifdef CONFIG_NEED_MULTIPLE_NODES -struct pcpul_ent { - unsigned int cpu; - void *ptr; -}; - -static size_t pcpul_size; -static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; - -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +static void __init pcpul_map(void *ptr, size_t size, void *addr) { - size_t off = (size_t)pageno << PAGE_SHIFT; + pmd_t *pmd, pmd_v; - if (off >= pcpul_size) - return NULL; - - return virt_to_page(pcpul_map[cpu].ptr + off); + pmd = populate_extra_pmd((unsigned long)addr); + pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE); + set_pmd(pmd, pmd_v); } static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { - size_t map_size, dyn_size; - unsigned int cpu; - int i, j; - ssize_t ret; + size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; @@ -198,134 +175,10 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) return -EINVAL; } - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. 
- */ - pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - if (pcpul_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); - pcpul_map = alloc_bootmem(map_size); - - for_each_possible_cpu(cpu) { - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, - PMD_SIZE); - if (!pcpul_map[cpu].ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), - PMD_SIZE - pcpul_size); - - memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); - } - - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&pcpul_vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + - cpu * PMD_SIZE); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), - PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); - } - - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < num_possible_cpus() - 1; i++) - for (j = i + 1; j < num_possible_cpus(); j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - return ret; - -enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; -} - -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu PMD - * mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used PMD - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. - */ -void *pcpu_lpage_remapped(void *kaddr) -{ - void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); - unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; - int left = 0, right = num_possible_cpus() - 1; - int pos; - - /* pcpul in use at all? 
*/ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < pmd_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > pmd_addr) - right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * PMD_SIZE + offset; - } - } - - return NULL; + return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + reserve - PERCPU_FIRST_CHUNK_RESERVE, + PMD_SIZE, + pcpu_fc_alloc, pcpu_fc_free, pcpul_map); } #else static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1b734d7a896..c106f785242 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 41b5bfab419..9f6bfd7d4b9 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -62,6 +62,7 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); +typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, @@ -79,6 +80,32 @@ extern ssize_t __init pcpu_4k_first_chunk( pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); +#ifdef CONFIG_NEED_MULTIPLE_NODES +extern ssize_t __init pcpu_lpage_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn); + +extern void *pcpu_lpage_remapped(void *kaddr); +#else +static inline ssize_t __init pcpu_lpage_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) +{ + return -EINVAL; +} + +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} +#endif + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index f3fe7bc7378..17db527ee2e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1190,6 +1190,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, return pcpu_unit_size; } +static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep) +{ + size_t size_sum; + + size_sum = PFN_ALIGN(static_size + reserved_size + + (*dyn_sizep >= 0 ? *dyn_sizep : 0)); + if (*dyn_sizep != 0) + *dyn_sizep = size_sum - static_size - reserved_size; + + return size_sum; +} + /* * Embedding first chunk setup helper. */ @@ -1241,10 +1254,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, unsigned int cpu; /* determine parameters and allocate */ - pcpue_size = PFN_ALIGN(static_size + reserved_size + - (dyn_size >= 0 ? 
dyn_size : 0)); - if (dyn_size != 0) - dyn_size = pcpue_size - static_size - reserved_size; + pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); chunk_size = pcpue_unit_size * num_possible_cpus(); @@ -1390,6 +1400,197 @@ out_free_ar: return ret; } +/* + * Large page remapping first chunk setup helper + */ +#ifdef CONFIG_NEED_MULTIPLE_NODES +struct pcpul_ent { + unsigned int cpu; + void *ptr; +}; + +static size_t pcpul_size; +static size_t pcpul_unit_size; +static struct pcpul_ent *pcpul_map; +static struct vm_struct pcpul_vm; + +static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpul_size) + return NULL; + + return virt_to_page(pcpul_map[cpu].ptr + off); +} + +/** + * pcpu_lpage_first_chunk - remap the first percpu chunk using large page + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @lpage_size: the size of a large page + * @alloc_fn: function to allocate percpu lpage, always called with lpage_size + * @free_fn: function to free percpu memory, @size <= lpage_size + * @map_fn: function to map percpu lpage, always called with lpage_size + * + * This allocator uses large page as unit. A large page is allocated + * for each cpu and each is remapped into vmalloc area using large + * page mapping. As large page can be quite large, only part of it is + * used for the first chunk. Unused part is returned to the bootmem + * allocator. + * + * So, the large pages are mapped twice - once to the physical mapping + * and to the vmalloc area for the first percpu chunk. The double + * mapping does add one more large TLB entry pressure but still is + * much better than only using 4k mappings while still being NUMA + * friendly. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) +{ + size_t size_sum; + size_t map_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + /* + * Currently supports only single page. Supporting multiple + * pages won't be too difficult if it ever becomes necessary. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + + pcpul_unit_size = lpage_size; + pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + if (pcpul_size > pcpul_unit_size) { + pr_warning("PERCPU: static data is larger than large page, " + "can't use large page\n"); + return -EINVAL; + } + + /* allocate pointer array and alloc large pages */ + map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + pcpul_map = alloc_bootmem(map_size); + + for_each_possible_cpu(cpu) { + void *ptr; + + ptr = alloc_fn(cpu, lpage_size); + if (!ptr) { + pr_warning("PERCPU: failed to allocate large page " + "for cpu%u\n", cpu); + goto enomem; + } + + /* + * Only use pcpul_size bytes and give back the rest. + * + * Ingo: The lpage_size up-rounding bootmem is needed + * to make sure the partial lpage is still fully RAM - + * it's not well-specified to have a incompatible area + * (unmapped RAM, device memory, etc.) in that hole. 
+ */ + free_fn(ptr + pcpul_size, lpage_size - pcpul_size); + + pcpul_map[cpu].cpu = cpu; + pcpul_map[cpu].ptr = ptr; + + memcpy(ptr, __per_cpu_load, static_size); + } + + /* allocate address and map */ + pcpul_vm.flags = VM_ALLOC; + pcpul_vm.size = num_possible_cpus() * pcpul_unit_size; + vm_area_register_early(&pcpul_vm, pcpul_unit_size); + + for_each_possible_cpu(cpu) + map_fn(pcpul_map[cpu].ptr, pcpul_unit_size, + pcpul_vm.addr + cpu * pcpul_unit_size); + + /* we're ready, commit */ + pr_info("PERCPU: Remapped at %p with large pages, static data " + "%zu bytes\n", pcpul_vm.addr, static_size); + + ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, + reserved_size, dyn_size, pcpul_unit_size, + pcpul_vm.addr, NULL); + + /* sort pcpul_map array for pcpu_lpage_remapped() */ + for (i = 0; i < num_possible_cpus() - 1; i++) + for (j = i + 1; j < num_possible_cpus(); j++) + if (pcpul_map[i].ptr > pcpul_map[j].ptr) { + struct pcpul_ent tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + return ret; + +enomem: + for_each_possible_cpu(cpu) + if (pcpul_map[cpu].ptr) + free_fn(pcpul_map[cpu].ptr, pcpul_size); + free_bootmem(__pa(pcpul_map), map_size); + return -ENOMEM; +} + +/** + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area + * @kaddr: the kernel address in question + * + * Determine whether @kaddr falls in the pcpul recycled area. This is + * used by pageattr to detect VM aliases and break up the pcpu large + * page mapping such that the same physical page is not mapped under + * different attributes. + * + * The recycled area is always at the tail of a partially used large + * page. + * + * RETURNS: + * Address of corresponding remapped pcpu address if match is found; + * otherwise, NULL. + */ +void *pcpu_lpage_remapped(void *kaddr) +{ + unsigned long unit_mask = pcpul_unit_size - 1; + void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask); + unsigned long offset = (unsigned long)kaddr & unit_mask; + int left = 0, right = num_possible_cpus() - 1; + int pos; + + /* pcpul in use at all? */ + if (!pcpul_map) + return NULL; + + /* okay, perform binary search */ + while (left <= right) { + pos = (left + right) / 2; + + if (pcpul_map[pos].ptr < lpage_addr) + left = pos + 1; + else if (pcpul_map[pos].ptr > lpage_addr) + right = pos - 1; + else { + /* it shouldn't be in the area for the first chunk */ + WARN_ON(offset < pcpul_size); + + return pcpul_vm.addr + + pcpul_map[pos].cpu * pcpul_unit_size + offset; + } + } + + return NULL; +} +#endif + /* * Generic percpu area setup. * -- cgit v1.2.3 From a530b7958612bafe2027e21359083dba84f0b3b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:11:00 +0900 Subject: percpu: teach large page allocator about NUMA The large page first chunk allocator is primarily used for NUMA machines; however, its NUMA handling is extremely simplistic. Regardless of their proximity, each cpu is put into a separate large page just to return most of the allocated space back, wasting a large amount of vmalloc space and increasing cache footprint. This patch teaches NUMA details to the large page allocator. Given processor proximity information, pcpu_lpage_build_unit_map() will find a fitting cpu -> unit mapping in which cpus within LOCAL_DISTANCE share the same large page and not too much virtual address space is wasted. This greatly reduces the unit and thus chunk size and wastes much less address space for the first chunk.
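The proximity information comes from a small arch callback. A minimal sketch (hypothetical name; the x86 callback added in the diff below is essentially this, built on early_cpu_to_node()):

	static int my_cpu_distance(unsigned int from, unsigned int to)
	{
		/* cpus on the same node are close enough to share a large page */
		return early_cpu_to_node(from) == early_cpu_to_node(to)
			? LOCAL_DISTANCE : REMOTE_DISTANCE;
	}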
For example, on 4/4 NUMA machine, the original code occupied 16MB of virtual space for the first chunk while the new code only uses 4MB - one 2MB page for each node. [ Impact: much better space efficiency on NUMA machines ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Jan Beulich Cc: Andi Kleen Cc: David Miller --- arch/x86/kernel/setup_percpu.c | 72 +++++++-- include/linux/percpu.h | 24 ++- mm/percpu.c | 358 ++++++++++++++++++++++++++++++++--------- 3 files changed, 359 insertions(+), 95 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4f2e0ac9130..7501bb14bd5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -149,36 +149,73 @@ static void __init pcpul_map(void *ptr, size_t size, void *addr) set_pmd(pmd, pmd_v); } +static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) +{ + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; +} + static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; + size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; + size_t unit_map_size, unit_size; + int *unit_map; + int nr_units; + ssize_t ret; + + /* on non-NUMA, embedding is better */ + if (!chosen && !pcpu_need_numa()) + return -EINVAL; + + /* need PSE */ + if (!cpu_has_pse) { + pr_warning("PERCPU: lpage allocator requires PSE\n"); + return -EINVAL; + } + /* allocate and build unit_map */ + unit_map_size = num_possible_cpus() * sizeof(int); + unit_map = alloc_bootmem_nopanic(unit_map_size); + if (!unit_map) { + pr_warning("PERCPU: failed to allocate unit_map\n"); + return -ENOMEM; + } + + ret = pcpu_lpage_build_unit_map(static_size, + PERCPU_FIRST_CHUNK_RESERVE, + &dyn_size, &unit_size, PMD_SIZE, + unit_map, pcpu_lpage_cpu_distance); + if (ret < 0) { + pr_warning("PERCPU: failed to build unit_map\n"); + goto out_free; + } + nr_units = ret; + + /* do the parameters look okay? 
*/ if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = num_possible_cpus() * PMD_SIZE; - - /* on non-NUMA, embedding is better */ - if (!pcpu_need_numa()) - return -EINVAL; + size_t tot_size = nr_units * unit_size; /* don't consume more than 20% of vmalloc area */ if (tot_size > vm_size / 5) { pr_info("PERCPU: too large chunk size %zuMB for " "large page remap\n", tot_size >> 20); - return -EINVAL; + ret = -EINVAL; + goto out_free; } } - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, - PMD_SIZE, - pcpu_fc_alloc, pcpu_fc_free, pcpul_map); + ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, unit_size, PMD_SIZE, + unit_map, nr_units, + pcpu_fc_alloc, pcpu_fc_free, pcpul_map); +out_free: + if (ret < 0) + free_bootmem(__pa(unit_map), unit_map_size); + return ret; } #else static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) @@ -299,7 +336,8 @@ void __init setup_per_cpu_areas(void) /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; + per_cpu_offset(cpu) = + delta + pcpu_unit_map[cpu] * pcpu_unit_size; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 1e0e8878dc2..8ce91af4aa1 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -62,6 +62,7 @@ extern const int *pcpu_unit_map; typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); +typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk( @@ -80,18 +81,37 @@ extern ssize_t __init pcpu_4k_first_chunk( pcpu_fc_populate_pte_fn_t populate_pte_fn); #ifdef CONFIG_NEED_MULTIPLE_NODES +extern int __init pcpu_lpage_build_unit_map( + size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn); + extern ssize_t __init pcpu_lpage_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); extern void *pcpu_lpage_remapped(void *kaddr); #else +static inline int pcpu_lpage_build_unit_map( + size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + return -EINVAL; +} + static inline ssize_t __init pcpu_lpage_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) diff --git a/mm/percpu.c b/mm/percpu.c index 2196fae24f0..b3d0bcff8c7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -59,6 +59,7 @@ #include #include #include +#include 
#include #include #include @@ -1594,75 +1595,259 @@ out_free_ar: * Large page remapping first chunk setup helper */ #ifdef CONFIG_NEED_MULTIPLE_NODES + +/** + * pcpu_lpage_build_unit_map - build unit_map for large page remapping + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_sizep: in/out parameter for dynamic size, -1 for auto + * @unit_sizep: out parameter for unit size + * @lpage_size: the size of a large page + * @unit_map: unit_map to be filled + * @cpu_distance_fn: callback to determine distance between cpus + * + * This function builds the cpu -> unit map and determines other parameters + * considering needed percpu size, large page size and distances + * between CPUs in NUMA. + * + * CPUs which are of LOCAL_DISTANCE both ways are grouped together and + * may share units in the same large page. The returned configuration + * is guaranteed to have CPUs on different nodes on different large + * pages and >=75% usage of allocated virtual address space. + * + * RETURNS: + * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and + * returns the number of units to be allocated. -errno on failure. + */ +int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + int group_cnt_max = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs; + unsigned int cpu, tcpu; + int group, unit; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is a multiple of lpage_size and is the smallest + * which can accommodate 4k aligned segments that are equal to + * or larger than min_unit_size. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, lpage_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + group_cnt_max = max(group_cnt_max, group_cnt[group]); + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group_cnt[group]; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 25%. The + * greater-than comparison ensures upa==1 always + * passes the following check.
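+ * (With u units in use, one per possible cpu, the wastage ratio wasted / (u + wasted) exceeds 25% exactly when wasted > u / 3, hence the num_possible_cpus() / 3 test below.)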
+ */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + *unit_sizep = alloc_size / best_upa; + + /* assign units to cpus accordingly */ + unit = 0; + for (group = 0; group_cnt[group]; group++) { + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + unit_map[cpu] = unit++; + unit = roundup(unit, best_upa); + } + + return unit; /* unit contains aligned number of units */ +} + struct pcpul_ent { - unsigned int cpu; void *ptr; + void *map_addr; }; static size_t pcpul_size; -static size_t pcpul_unit_size; +static size_t pcpul_lpage_size; +static int pcpul_nr_lpages; static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; + +static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, + unsigned int *cpup) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + if (unit_map[cpu] == unit) { + if (cpup) + *cpup = cpu; + return true; + } + + return false; +} + +static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, + size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units) +{ + int width = 1, v = nr_units; + char empty_str[] = "--------"; + int upl, lpl; /* units per lpage, lpage per line */ + unsigned int cpu; + int lpage, unit; + + while (v /= 10) + width++; + empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; + + upl = max_t(int, lpage_size / unit_size, 1); + lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); + + printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, + static_size, reserved_size, dyn_size, unit_size, lpage_size); + + for (lpage = 0, unit = 0; unit < nr_units; unit++) { + if (!(unit % upl)) { + if (!(lpage++ % lpl)) { + printk("\n"); + printk("%spcpu-lpage: ", lvl); + } else + printk("| "); + } + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + printk("%0*d ", width, cpu); + else + printk("%s ", empty_str); + } + printk("\n"); +} /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @dyn_size: free size for dynamic allocation in bytes + * @unit_size: unit size in bytes * @lpage_size: the size of a large page + * @unit_map: cpu -> unit mapping + * @nr_units: the number of units * @alloc_fn: function to allocate percpu lpage, always called with lpage_size * @free_fn: function to free percpu memory, @size <= lpage_size * @map_fn: function to map percpu lpage, always called with lpage_size * - * This allocator uses large page as unit. A large page is allocated - * for each cpu and each is remapped into vmalloc area using large - * page mapping. As large page can be quite large, only part of it is - * used for the first chunk. Unused part is returned to the bootmem - * allocator. - * - * So, the large pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more large TLB entry pressure but still is - * much better than only using 4k mappings while still being NUMA - * friendly. + * This allocator uses large page to build and map the first chunk. + * Unlike other helpers, the caller should always specify @dyn_size + * and @unit_size. 
These parameters along with @unit_map and + * @nr_units can be determined using pcpu_lpage_build_unit_map(). + * This two stage initialization is to allow arch code to evaluate the + * parameters before committing to it. + * + * Large pages are allocated as directed by @unit_map and other + * parameters and mapped to vmalloc space. Unused holes are returned + * to the page allocator. Note that these holes end up being actively + * mapped twice - once to the physical mapping and to the vmalloc area + * for the first percpu chunk. Depending on architecture, this might + * cause problem when changing page attributes of the returned area. + * These double mapped areas can be detected using + * pcpu_lpage_remapped(). * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { - size_t size_sum; + static struct vm_struct vm; + size_t chunk_size = unit_size * nr_units; size_t map_size; unsigned int cpu; - int i, j; ssize_t ret; + int i, j, unit; - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. - */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size, + unit_size, lpage_size, unit_map, nr_units); - pcpul_unit_size = lpage_size; - pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - if (pcpul_size > pcpul_unit_size) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } + BUG_ON(chunk_size % lpage_size); + + pcpul_size = static_size + reserved_size + dyn_size; + pcpul_lpage_size = lpage_size; + pcpul_nr_lpages = chunk_size / lpage_size; /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]); pcpul_map = alloc_bootmem(map_size); - for_each_possible_cpu(cpu) { + /* allocate all pages */ + for (i = 0; i < pcpul_nr_lpages; i++) { + size_t offset = i * lpage_size; + int first_unit = offset / unit_size; + int last_unit = (offset + lpage_size - 1) / unit_size; void *ptr; + /* find out which cpu is mapped to this unit */ + for (unit = first_unit; unit <= last_unit; unit++) + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + goto found; + continue; + found: ptr = alloc_fn(cpu, lpage_size); if (!ptr) { pr_warning("PERCPU: failed to allocate large page " @@ -1670,53 +1855,79 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, goto enomem; } - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The lpage_size up-rounding bootmem is needed - * to make sure the partial lpage is still fully RAM - - * it's not well-specified to have a incompatible area - * (unmapped RAM, device memory, etc.) in that hole. 
- */ - free_fn(ptr + pcpul_size, lpage_size - pcpul_size); - - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = ptr; + pcpul_map[i].ptr = ptr; + } - memcpy(ptr, __per_cpu_load, static_size); + /* return unused holes */ + for (unit = 0; unit < nr_units; unit++) { + size_t start = unit * unit_size; + size_t end = start + unit_size; + size_t off, next; + + /* don't free used part of occupied unit */ + if (pcpul_unit_to_cpu(unit, unit_map, NULL)) + start += pcpul_size; + + /* unit can span more than one page, punch the holes */ + for (off = start; off < end; off = next) { + void *ptr = pcpul_map[off / lpage_size].ptr; + next = min(roundup(off + 1, lpage_size), end); + if (ptr) + free_fn(ptr + off % lpage_size, next - off); + } } - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = num_possible_cpus() * pcpul_unit_size; - vm_area_register_early(&pcpul_vm, pcpul_unit_size); + /* allocate address, map and copy */ + vm.flags = VM_ALLOC; + vm.size = chunk_size; + vm_area_register_early(&vm, unit_size); + + for (i = 0; i < pcpul_nr_lpages; i++) { + if (!pcpul_map[i].ptr) + continue; + pcpul_map[i].map_addr = vm.addr + i * lpage_size; + map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr); + } for_each_possible_cpu(cpu) - map_fn(pcpul_map[cpu].ptr, pcpul_unit_size, - pcpul_vm.addr + cpu * pcpul_unit_size); + memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load, + static_size); /* we're ready, commit */ pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); + "%zu bytes\n", vm.addr, static_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - pcpul_unit_size, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < num_possible_cpus() - 1; i++) - for (j = i + 1; j < num_possible_cpus(); j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } + unit_size, vm.addr, unit_map); + + /* + * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped + * lpages are pushed to the end and trimmed. + */ + for (i = 0; i < pcpul_nr_lpages - 1; i++) + for (j = i + 1; j < pcpul_nr_lpages; j++) { + struct pcpul_ent tmp; + + if (!pcpul_map[j].ptr) + continue; + if (pcpul_map[i].ptr && + pcpul_map[i].ptr < pcpul_map[j].ptr) + continue; + + tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) + pcpul_nr_lpages--; return ret; enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_fn(pcpul_map[cpu].ptr, pcpul_size); + for (i = 0; i < pcpul_nr_lpages; i++) + if (pcpul_map[i].ptr) + free_fn(pcpul_map[i].ptr, lpage_size); free_bootmem(__pa(pcpul_map), map_size); return -ENOMEM; } @@ -1739,10 +1950,10 @@ enomem: */ void *pcpu_lpage_remapped(void *kaddr) { - unsigned long unit_mask = pcpul_unit_size - 1; - void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask); - unsigned long offset = (unsigned long)kaddr & unit_mask; - int left = 0, right = num_possible_cpus() - 1; + unsigned long lpage_mask = pcpul_lpage_size - 1; + void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask); + unsigned long offset = (unsigned long)kaddr & lpage_mask; + int left = 0, right = pcpul_nr_lpages - 1; int pos; /* pcpul in use at all? 
*/ @@ -1757,13 +1968,8 @@ void *pcpu_lpage_remapped(void *kaddr) left = pos + 1; else if (pcpul_map[pos].ptr > lpage_addr) right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * pcpul_unit_size + offset; - } + else + return pcpul_map[pos].map_addr + offset; } return NULL; -- cgit v1.2.3 From 023bf6f1b8bf58dc4da7f0dc1cf4787b0d5297c1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jul 2009 11:27:40 +0900 Subject: linker script: unify usage of discard definition Discarded sections in different archs share some commonality but have considerable differences. This led to the linker script for each arch implementing its own /DISCARD/ definition, which makes maintenance tedious and adding new entries error-prone. This patch makes all linker scripts move discard definitions to the end of the linker script and use the common DISCARDS macro. As ld uses the first matching section definition, archs can include default discarded sections by including them earlier in the linker script. ia64 is notable because it first throws away some ia64-specific subsections and then includes the rest of the sections into the final image, so those sections must be discarded before the inclusion. defconfig compile tested for x86, x86-64, powerpc, powerpc64, ia64, alpha, sparc, sparc64 and s390. Michal Simek tested microblaze. Signed-off-by: Tejun Heo Acked-by: Paul Mundt Acked-by: Mike Frysinger Tested-by: Michal Simek Cc: linux-arch@vger.kernel.org Cc: Michal Simek Cc: microblaze-uclinux@itee.uq.edu.au Cc: Sam Ravnborg Cc: Tony Luck --- arch/alpha/kernel/vmlinux.lds.S | 10 ++-------- arch/avr32/kernel/vmlinux.lds.S | 10 +++------- arch/blackfin/kernel/vmlinux.lds.S | 6 +----- arch/cris/kernel/vmlinux.lds.S | 10 ++-------- arch/frv/kernel/vmlinux.lds.S | 2 +- arch/h8300/kernel/vmlinux.lds.S | 6 ++---- arch/ia64/kernel/vmlinux.lds.S | 17 ++++++++--------- arch/m32r/kernel/vmlinux.lds.S | 11 +++-------- arch/m68k/kernel/vmlinux-std.lds | 11 +++-------- arch/m68k/kernel/vmlinux-sun3.lds | 10 ++-------- arch/m68knommu/kernel/vmlinux.lds.S | 8 +------- arch/microblaze/kernel/vmlinux.lds.S | 2 +- arch/mips/kernel/vmlinux.lds.S | 22 ++++++++++------------ arch/mn10300/kernel/vmlinux.lds.S | 9 +++------ arch/parisc/kernel/vmlinux.lds.S | 9 ++++----- arch/powerpc/kernel/vmlinux.lds.S | 10 +++------- arch/s390/kernel/vmlinux.lds.S | 10 +++------- arch/sh/kernel/vmlinux.lds.S | 11 ++++------- arch/sparc/kernel/vmlinux.lds.S | 9 ++------- arch/um/include/asm/common.lds.S | 5 ----- arch/um/kernel/dyn.lds.S | 2 +- arch/um/kernel/uml.lds.S | 2 +- arch/x86/kernel/vmlinux.lds.S | 11 ++++------- arch/xtensa/kernel/vmlinux.lds.S | 14 ++++---------- include/asm-generic/vmlinux.lds.h | 18 ++++++++++++------ 25 files changed, 80 insertions(+), 155 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 75fe1d6877e..6dc03c35caa 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -134,14 +134,6 @@ SECTIONS __bss_stop = .; _end = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .mdebug 0 : { *(.mdebug) } @@ -151,4 +143,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + + DISCARDS } diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S index b8324608ec0..c4b56654349 100644 --- a/arch/avr32/kernel/vmlinux.lds.S +++ b/arch/avr32/kernel/vmlinux.lds.S @@ 
-124,15 +124,11 @@ SECTIONS _end = .; } + DWARF_DEBUG + /* When something in the kernel is NOT compiled as a module, the module * cleanup code and data are put into these segments. Both can then be * thrown away, as cleanup code is never called unless it's a module. */ - /DISCARD/ : { - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - - DWARF_DEBUG + DISCARDS } diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index 6e8eabd8f0a..d7ffe299b97 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -277,9 +277,5 @@ SECTIONS DWARF_DEBUG - /DISCARD/ : - { - *(.exitcall.exit) - *(.discard) - } + DISCARDS } diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S index a3175ebb38c..6c81836b922 100644 --- a/arch/cris/kernel/vmlinux.lds.S +++ b/arch/cris/kernel/vmlinux.lds.S @@ -140,13 +140,7 @@ SECTIONS _end = .; __end = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - dram_end = dram_start + (CONFIG_ETRAX_DRAM_SIZE - __CONFIG_ETRAX_VMEM_SIZE)*1024*1024; + + DISCARDS } diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index 64b5a5e4d35..7dbf41f68b5 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -178,7 +178,7 @@ SECTIONS .comment 0 : { *(.comment) } - /DISCARD/ : { *(.discard) } + DISCARDS } __kernel_image_size_no_bss = __bss_start - __kernel_image_start; diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S index 03d6c0df33d..662b02ecb86 100644 --- a/arch/h8300/kernel/vmlinux.lds.S +++ b/arch/h8300/kernel/vmlinux.lds.S @@ -152,10 +152,6 @@ SECTIONS __end = . ; __ramstart = .; } - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } .romfs : { *(.romfs*) @@ -166,4 +162,6 @@ SECTIONS COMMAND_START = . - 0x200 ; __ramend = . ; } + + DISCARDS } diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 13d95897587..eb4214d1c5a 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -24,15 +24,14 @@ PHDRS { } SECTIONS { - /* Sections to be discarded */ + /* unwind exit sections must be discarded before the rest of the + sections get included. */ /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) *(.IA_64.unwind.exit.text) *(.IA_64.unwind_info.exit.text) - } + *(.comment) + *(.note) + } v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */ phys_start = _start - LOAD_OFFSET; @@ -317,7 +316,7 @@ SECTIONS .debug_funcnames 0 : { *(.debug_funcnames) } .debug_typenames 0 : { *(.debug_typenames) } .debug_varnames 0 : { *(.debug_varnames) } - /* These must appear regardless of . */ - /DISCARD/ : { *(.comment) } - /DISCARD/ : { *(.note) } + + /* Default discards */ + DISCARDS } diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index 480a49944cf..de5e21cca6a 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -120,14 +120,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - /* Stabs debugging sections. 
*/ .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } @@ -136,4 +128,7 @@ SECTIONS .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds index 905a797ada9..47eac19e8f6 100644 --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -82,14 +82,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - /* Stabs debugging sections. */ .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } @@ -98,4 +90,7 @@ SECTIONS .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds index 47d04be322a..03efaf04d7d 100644 --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -77,14 +77,6 @@ __init_begin = .; _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .crap : { /* Stabs debugging sections. */ *(.stab) @@ -97,4 +89,6 @@ __init_begin = .; *(.note) } + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S index 68111a61a77..2736a5e309c 100644 --- a/arch/m68knommu/kernel/vmlinux.lds.S +++ b/arch/m68knommu/kernel/vmlinux.lds.S @@ -184,13 +184,6 @@ SECTIONS { __init_end = .; } > INIT - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .bss : { . = ALIGN(4); _sbss = . ; @@ -201,5 +194,6 @@ SECTIONS { _end = . ; } > BSS + DISCARDS } diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S index 81bebdcb18f..ec5fa91a48d 100644 --- a/arch/microblaze/kernel/vmlinux.lds.S +++ b/arch/microblaze/kernel/vmlinux.lds.S @@ -163,5 +163,5 @@ SECTIONS { . = ALIGN(4096); _end = .; - /DISCARD/ : { *(.discard) } + DISCARDS } diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 45901609b74..1474c18fb77 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -176,18 +176,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - - /* ABI crap starts here */ - *(.MIPS.options) - *(.options) - *(.pdr) - *(.reginfo) - } - /* These mark the ABI of the kernel for debuggers. */ .mdebug.abi32 : { KEEP(*(.mdebug.abi32)) @@ -213,4 +201,14 @@ SECTIONS *(.gptab.bss) *(.gptab.sbss) } + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { + /* ABI crap starts here */ + *(.MIPS.options) + *(.options) + *(.pdr) + *(.reginfo) + } } diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S index 5609d4962a5..8fcd0f1e21d 100644 --- a/arch/mn10300/kernel/vmlinux.lds.S +++ b/arch/mn10300/kernel/vmlinux.lds.S @@ -115,13 +115,10 @@ SECTIONS . = ALIGN(PAGE_SIZE); pg0 = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_CALL - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index ccf58341845..aea1784edbd 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -237,10 +237,12 @@ SECTIONS /* freed after init ends here */ _end = . 
; + STABS_DEBUG + .note 0 : { *(.note) } + /* Sections to be discarded */ + DISCARDS /DISCARD/ : { - *(.exitcall.exit) - *(.discard) #ifdef CONFIG_64BIT /* temporary hack until binutils is fixed to not emit these * for static binaries @@ -253,7 +255,4 @@ SECTIONS *(.gnu.hash) #endif } - - STABS_DEBUG - .note 0 : { *(.note) } } diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 7fca9355fd3..244e3658983 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -37,13 +37,6 @@ jiffies = jiffies_64 + 4; #endif SECTIONS { - /* Sections to be discarded. */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - EXIT_DATA - } - . = KERNELBASE; /* @@ -299,4 +292,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); _end = . ; PROVIDE32 (end = .); + + /* Sections to be discarded. */ + DISCARDS } diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 98867dfea46..82415c75b99 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -157,14 +157,10 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - /* Debugging sections. */ STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 766976d27b2..0ce254bca92 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -163,17 +163,14 @@ SECTIONS _end = . ; } + STABS_DEBUG + DWARF_DEBUG + /* * When something in the kernel is NOT compiled as a module, the * module cleanup code and data are put into these segments. Both * can then be thrown away, as cleanup code is never called unless * it's a module. */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } - - STABS_DEBUG - DWARF_DEBUG + DISCARDS } diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index d63cf914667..866390feb68 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -171,13 +171,8 @@ SECTIONS } _end = . 
; - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + DISCARDS } diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S index cb0248616d4..37ecc5577a9 100644 --- a/arch/um/include/asm/common.lds.S +++ b/arch/um/include/asm/common.lds.S @@ -123,8 +123,3 @@ __initramfs_end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - } - diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index 2916d6eadff..715a188c047 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -157,5 +157,5 @@ SECTIONS DWARF_DEBUG - /DISCARD/ : { *(.discard) } + DISCARDS } diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 1f8a622cabe..2ebd39765db 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -101,5 +101,5 @@ SECTIONS DWARF_DEBUG - /DISCARD/ : { *(.discard) } + DISCARDS } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 367e8788204..b600c843710 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -387,15 +387,12 @@ SECTIONS _end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.eh_frame) } } diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index b1e24638acd..921b6ff3b64 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -280,16 +280,6 @@ SECTIONS *(.ResetVector.text) } - /* Sections to be discarded */ - /DISCARD/ : - { - *(.exit.literal) - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .xt.lit : { *(.xt.lit) } .xt.prop : { *(.xt.prop) } @@ -322,4 +312,8 @@ SECTIONS *(.xt.lit) *(.gnu.linkonce.p*) } + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.exit.literal) } } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c5c18ac878a..ab8ea9b7741 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -35,13 +35,10 @@ * __bss_stop = .; * _end = .; * - * /DISCARD/ : { - * EXIT_TEXT - * EXIT_DATA - * EXIT_CALL - * } * STABS_DEBUG * DWARF_DEBUG + + * DISCARDS // must be the last * } * * [__init_begin, __init_end] is the init section that may be freed after init @@ -629,11 +626,20 @@ #define INIT_RAM_FS #endif +/* + * Default discarded sections. + * + * Some archs want to discard exit text/data at runtime rather than + * link time due to cross-section references such as alt instructions, + * bug table, eh_frame, etc. DISCARDS must be the last of output + * section definitions so that such archs put those in earlier section + * definitions. + */ #define DISCARDS \ /DISCARD/ : { \ EXIT_TEXT \ EXIT_DATA \ - *(.exitcall.exit) \ + EXIT_CALL \ *(.discard) \ } -- cgit v1.2.3 From c31d96338a6041520ba5f1b6a4a5012ef00686b3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:37 +0200 Subject: x86: mce: Make CONFIG_X86_ANCIENT_MCE dependent on CONFIG_X86_MCE Add a missing dependency for ANCIENT_MCE. It didn't matter in practice because the ANCIENT code wasn't compiled without X86_MCE, but it's better to express that clearly in Kconfig. Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 356d2ec8e2f..5962b872a7a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -823,7 +823,7 @@ config X86_MCE_AMD config X86_ANCIENT_MCE def_bool n - depends on X86_32 + depends on X86_32 && X86_MCE prompt "Support for old Pentium 5 / WinChip machine checks" ---help--- Include support for machine check handling on old Pentium 5 or WinChip -- cgit v1.2.3 From bab9bc6583fe6c1660d6ed36dd14bbb4edfaf393 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:38 +0200 Subject: x86: mce: Update X86_MCE description in x86/Kconfig - Clarify that this config controls thermal throttling reporting too - Clarify the types of errors reported by machine checks - Drop references to ancient CPUs. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5962b872a7a..134a8c0d80d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -774,20 +774,12 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS increased on these systems. config X86_MCE - bool "Machine Check Exception" + bool "Machine Check / overheating reporting" ---help--- - Machine Check Exception support allows the processor to notify the - kernel if it detects a problem (e.g. overheating, component failure). + Machine Check support allows the processor to notify the + kernel if it detects a problem (e.g. overheating, data corruption). The action the kernel takes depends on the severity of the problem, - ranging from a warning message on the console, to halting the machine. - Your processor must be a Pentium or newer to support this - check the - flags in /proc/cpuinfo for mce. Note that some older Pentium systems - have a design flaw which leads to false MCE events - hence MCE is - disabled on all P5 processors, unless explicitly enabled with "mce" - as a boot argument. Similarly, if MCE is built in and creates a - problem on some new non-standard machine, you can boot with "nomce" - to disable it. MCE support simply ignores non-MCE processors like - the 386 and 486, so nearly everyone can say Y here. + ranging from warning messages to halting the machine. config X86_OLD_MCE depends on X86_32 && X86_MCE -- cgit v1.2.3 From 5bb38adcb54cf7192b154368ad62982caa11ca0b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:39 +0200 Subject: x86: mce: Remove old i386 machine check code As announced in feature-removal-schedule.txt, remove CONFIG_X86_OLD_MCE. This patch only removes code. The ancient machine check code for very old systems that are not supported by CONFIG_X86_NEW_MCE is still kept. Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- Documentation/feature-removal-schedule.txt | 10 -- arch/x86/Kconfig | 35 +------ arch/x86/include/asm/mce.h | 11 -- arch/x86/kernel/cpu/mcheck/Makefile | 2 - arch/x86/kernel/cpu/mcheck/k7.c | 116 -------------------- arch/x86/kernel/cpu/mcheck/mce.c | 47 --------- arch/x86/kernel/cpu/mcheck/non-fatal.c | 94 ----------------- arch/x86/kernel/cpu/mcheck/p4.c | 163 ----------------------------- arch/x86/kernel/cpu/mcheck/p6.c | 127 ---------------------- 9 files changed, 2 insertions(+), 603 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mcheck/k7.c delete mode 100644 arch/x86/kernel/cpu/mcheck/non-fatal.c delete mode 100644 arch/x86/kernel/cpu/mcheck/p4.c delete mode 100644 arch/x86/kernel/cpu/mcheck/p6.c (limited to 'arch/x86') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 7129846a278..edb2f0b0761 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -444,13 +444,3 @@ What: CONFIG_RFKILL_INPUT When: 2.6.33 Why: Should be implemented in userspace, policy daemon. Who: Johannes Berg - ----------------------------- - -What: CONFIG_X86_OLD_MCE -When: 2.6.32 -Why: Remove the old legacy 32bit machine check code. This has been - superseded by the newer machine check code from the 64bit port, - but the old version has been kept around for easier testing. Note this - doesn't impact the old P5 and WinChip machine check handlers. -Who: Andi Kleen diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 134a8c0d80d..d986769a7d9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -781,21 +781,10 @@ config X86_MCE The action the kernel takes depends on the severity of the problem, ranging from warning messages to halting the machine. -config X86_OLD_MCE - depends on X86_32 && X86_MCE - bool "Use legacy machine check code (will go away)" - default n - select X86_ANCIENT_MCE - ---help--- - Use the old i386 machine check code. This is merely intended for - testing in a transition period. Try this if you run into any machine - check related software problems, but report the problem to - linux-kernel. When in doubt say no. - config X86_NEW_MCE depends on X86_MCE bool - default y if (!X86_OLD_MCE && X86_32) || X86_64 + default y config X86_MCE_INTEL def_bool y @@ -835,29 +824,9 @@ config X86_MCE_INJECT If you don't know what a machine check is and you don't do kernel QA it is safe to say n. -config X86_MCE_NONFATAL - tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" - depends on X86_OLD_MCE - ---help--- - Enabling this feature starts a timer that triggers every 5 seconds which - will look at the machine check registers to see if anything happened. - Non-fatal problems automatically get corrected (but still logged). - Disable this if you don't want to see these messages. - Seeing the messages this option prints out may be indicative of dying - or out-of-spec (ie, overclocked) hardware. - This option only does something on certain CPUs. - (AMD Athlon/Duron and Intel Pentium 4) - -config X86_MCE_P4THERMAL - bool "check for P4 thermal throttling interrupt." - depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP) - ---help--- - Enabling this feature will cause a message to be printed when the P4 - enters thermal throttling. 
- config X86_THERMAL_VECTOR def_bool y - depends on X86_MCE_P4THERMAL || X86_MCE_INTEL + depends on X86_MCE_INTEL config VM86 bool "Enable VM86 support" if EMBEDDED diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b50b9e9042c..6b8a974e127 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -115,13 +115,6 @@ void mcheck_init(struct cpuinfo_x86 *c); static inline void mcheck_init(struct cpuinfo_x86 *c) {} #endif -#ifdef CONFIG_X86_OLD_MCE -extern int nr_mce_banks; -void amd_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -#endif - #ifdef CONFIG_X86_ANCIENT_MCE void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); @@ -208,11 +201,7 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); void intel_init_thermal(struct cpuinfo_x86 *c); -#ifdef CONFIG_X86_NEW_MCE void mce_log_therm_throt_event(__u64 status); -#else -static inline void mce_log_therm_throt_event(__u64 status) {} -#endif #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 188a1ca5ad2..022a036ce21 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,11 +1,9 @@ obj-y = mce.o obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o -obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o -obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c deleted file mode 100644 index b945d5dbc60..00000000000 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Athlon specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Machine Check Handler For AMD Athlon/Duron: */ -static void k7_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? 
*/ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 1; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - - /* Clear it: */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - - -/* AMD K7 machine check is Intel like: */ -void amd_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - machine_check_vector = k7_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Clear status for MC index 0 separately, we don't touch CTL, - * as some K7 Athlons cause spurious MCEs when its enabled: - */ - if (boot_cpu_data.x86 == 6) { - wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); - i = 1; - } else - i = 0; - - for (; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7da8fec9ca8..5ff6362ecb1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -58,8 +58,6 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = int mce_disabled __read_mostly; -#ifdef CONFIG_X86_NEW_MCE - #define MISC_MCELOG_MINOR 227 #define SPINUNIT 100 /* 100ns */ @@ -1993,51 +1991,6 @@ static __init int mce_init_device(void) device_initcall(mce_init_device); -#else /* CONFIG_X86_OLD_MCE: */ - -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ - -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) -{ - if (mce_disabled) - return; - - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; - - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; - - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); - break; - - default: - break; - } - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); -} - -static int __init mcheck_enable(char *str) -{ - mce_p5_enabled = 1; - return 1; -} -__setup("mce", mcheck_enable); - -#endif /* CONFIG_X86_OLD_MCE */ - /* * Old style boot options parsing. Only for compatibility. 
*/ diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c deleted file mode 100644 index f5f2d6f71fb..00000000000 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Non Fatal Machine Check Exception Reporting - * - * (C) Copyright 2002 Dave Jones. - * - * This file contains routines to check for non-fatal MCEs every 15s - * - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -static int firstbank; - -#define MCE_RATE (15*HZ) /* timer rate is 15s */ - -static void mce_checkregs(void *info) -{ - u32 low, high; - int i; - - for (i = firstbank; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - - if (!(high & (1<<31))) - continue; - - printk(KERN_INFO "MCE: The hardware reports a non fatal, " - "correctable incident occurred on CPU %d.\n", - smp_processor_id()); - - printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - - /* - * Scrub the error so we don't pick it up in MCE_RATE - * seconds time: - */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } -} - -static void mce_work_fn(struct work_struct *work); -static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); - -static void mce_work_fn(struct work_struct *work) -{ - on_each_cpu(mce_checkregs, NULL, 1); - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); -} - -static int __init init_nonfatal_mce_checker(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return -ENODEV; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return -ENODEV; - - /* Some Athlons misbehave when we frob bank 0 */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - firstbank = 1; - else - firstbank = 0; - - /* - * Check for non-fatal errors every MCE_RATE s - */ - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); - printk(KERN_INFO "Machine check exception polling timer started.\n"); - - return 0; -} -module_init(init_nonfatal_mce_checker); - -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c deleted file mode 100644 index 4482aea9aa2..00000000000 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * P4 specific Machine Check Exception Reporting - */ -#include -#include -#include -#include - -#include -#include -#include - -/* as supported by the P4/Xeon family */ -struct intel_mce_extended_msrs { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - u32 edi; - u32 ebp; - u32 esp; - u32 eflags; - u32 eip; - /* u32 *reserved[]; */ -}; - -static int mce_num_extended_msrs; - -/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) -{ - u32 h; - - rdmsr(MSR_IA32_MCG_EAX, r->eax, h); - rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); - rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); - rdmsr(MSR_IA32_MCG_EDX, r->edx, h); - rdmsr(MSR_IA32_MCG_ESI, r->esi, h); - rdmsr(MSR_IA32_MCG_EDI, r->edi, h); - rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); - rdmsr(MSR_IA32_MCG_ESP, r->esp, h); - rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); - rdmsr(MSR_IA32_MCG_EIP, r->eip, h); -} - -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if 
(mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - if (mce_num_extended_msrs > 0) { - struct intel_mce_extended_msrs dbg; - - intel_get_extended_msrs(&dbg); - - printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" - "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" - "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags, - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx, - dbg.esi, dbg.edi, dbg.ebp, dbg.esp); - } - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = addr[0] = '\0'; - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. - */ - for (i = 0; i < nr_mce_banks; i++) { - u32 msr; - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high&(1<<31)) { - /* Clear it */ - wrmsr(msr, 0UL, 0UL); - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -void intel_p4_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - machine_check_vector = intel_machine_check; - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - for (i = 0; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); - - /* Check for P4/Xeon extended MCE MSRs */ - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<9)) {/* MCG_EXT_P */ - mce_num_extended_msrs = (l >> 16) & 0xff; - printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" - " available\n", - smp_processor_id(), mce_num_extended_msrs); - -#ifdef CONFIG_X86_MCE_P4THERMAL - /* Check for P4/Xeon Thermal monitor */ - intel_init_thermal(c); -#endif - } -} diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c deleted file mode 100644 index 01e4f817818..00000000000 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * P6 specific Machine Check Exception Reporting - * (C) Copyright 2002 Alan Cox - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Machine Check Handler For PII/PIII */ -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? 
*/ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error: - */ - for (i = 0; i < nr_mce_banks; i++) { - unsigned int msr; - - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high & (1<<31)) { - /* Clear it: */ - wrmsr(msr, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -/* Set up machine check reporting for processors with Intel style MCE: */ -void intel_p6_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return; - - /* Ok machine check is available */ - machine_check_vector = intel_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Following the example in IA-32 SDM Vol 3: - * - MC0_CTL should not be written - * - Status registers on all banks should be cleared on reset - */ - for (i = 1; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - - for (i = 0; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} -- cgit v1.2.3 From c1ebf835617035b1f08f734247dcb981e17aac6b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:41 +0200 Subject: x86: mce: Rename CONFIG_X86_NEW_MCE to CONFIG_X86_MCE Drop the CONFIG_X86_NEW_MCE symbol and change all references to it to check for CONFIG_X86_MCE directly. No code changes Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 11 +++-------- arch/x86/include/asm/entry_arch.h | 2 +- arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/cpu/mcheck/Makefile | 3 +-- arch/x86/kernel/irq.c | 4 ++-- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/signal.c | 2 +- 7 files changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d986769a7d9..06880ca677f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -781,15 +781,10 @@ config X86_MCE The action the kernel takes depends on the severity of the problem, ranging from warning messages to halting the machine. -config X86_NEW_MCE - depends on X86_MCE - bool - default y - config X86_MCE_INTEL def_bool y prompt "Intel MCE features" - depends on X86_NEW_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC ---help--- Additional support for intel specific MCE features such as the thermal monitor. @@ -797,7 +792,7 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_NEW_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. @@ -817,7 +812,7 @@ config X86_MCE_THRESHOLD default y config X86_MCE_INJECT - depends on X86_NEW_MCE + depends on X86_MCE tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index ff8cbfa0785..5e3f2044f0d 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -61,7 +61,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b3025b43b63..f4227289caf 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) static inline int mce_in_progress(void) { -#if defined(CONFIG_X86_NEW_MCE) +#if defined(CONFIG_X86_MCE) return atomic_read(&mce_entry) > 0; #endif return 0; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 022a036ce21..4ac6d48fe11 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,6 +1,5 @@ -obj-y = mce.o +obj-y = mce.o mce-severity.o -obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b0cdde6932f..74656d1d4e3 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -104,7 +104,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -200,7 +200,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_threshold_count; # endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); sum += per_cpu(mce_poll_count, cpu); #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 696f0e475c2..8a194ad357e 
100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -190,7 +190,7 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif -#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC) alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4c578751e94..cc26ad4c307 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -856,7 +856,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_process(); -- cgit v1.2.3 From 9eda8cb3ac235217e4ffa01cb9cedee1c1550599 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:42 +0200 Subject: x86: mce: Move code in mce.c Now that the X86_OLD_MCE ifdefs are gone, move some code that used to be outside the big ifdef to a more natural place near its user. No code change. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5ff6362ecb1..e16271f01ac 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -45,17 +45,6 @@ #include "mce-internal.h" -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - int mce_disabled __read_mostly; #define MISC_MCELOG_MINOR 227 @@ -1322,6 +1311,17 @@ static void mce_init_timer(void) add_timer(t); } +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off: -- cgit v1.2.3 From cebe182033f156b430952370fb0f9dbe6e89b081 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:43 +0200 Subject: x86: mce: Move per-bank data into a single data structure This addresses one of the leftover review comments. Move the per-bank data into a single structure. This avoids several separate variables and also separate allocation of sysfs objects. I didn't move the CMCI ownership information so far because that would have needed some non-trivial changes in the algorithms. Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-internal.h | 14 ++++ arch/x86/kernel/cpu/mcheck/mce.c | 109 +++++++++++++++--------------- 2 files changed, 67 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 54dcb8ff12e..6bd51e7ba87 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,3 +1,4 @@ +#include #include enum severity_level { @@ -10,6 +11,19 @@ enum severity_level { MCE_PANIC_SEVERITY, }; +#define ATTR_LEN 16 + +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank { + u64 ctl; /* subevents to enable */ + unsigned char init; /* initialise bank? */ + struct sysdev_attribute attr; /* sysdev attribute */ + char attrname[ATTR_LEN]; /* attribute name */ +}; + int mce_severity(struct mce *a, int tolerant, char **msg); extern int mce_ser; + +extern struct mce_bank *mce_banks; + diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e16271f01ac..a04806e01a8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -64,7 +64,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); */ static int tolerant __read_mostly = 1; static int banks __read_mostly; -static u64 *bank __read_mostly; static int rip_msr __read_mostly; static int mce_bootlog __read_mostly = -1; static int monarch_timeout __read_mostly = -1; @@ -74,13 +73,13 @@ int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; +struct mce_bank *mce_banks __read_mostly; + /* User mode helper program triggered by machine check event */ static unsigned long mce_need_notify; static char mce_helper[128]; static char *mce_helper_argv[2] = { mce_helper, NULL }; -static unsigned long dont_init_banks; - static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; @@ -91,11 +90,6 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; -static inline int skip_bank_init(int i) -{ - return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); -} - static DEFINE_PER_CPU(struct work_struct, mce_work); /* Do initial initialization of a struct mce */ @@ -482,7 +476,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); for (i = 0; i < banks; i++) { - if (!bank[i] || !test_bit(i, *b)) + if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; m.misc = 0; @@ -903,7 +897,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); - if (!bank[i]) + if (!mce_banks[i].ctl) continue; m.misc = 0; @@ -1146,6 +1140,21 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); +static int mce_banks_init(void) +{ + int i; + + mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); + if (!mce_banks) + return -ENOMEM; + for (i = 0; i < banks; i++) { + struct mce_bank *b = &mce_banks[i]; + b->ctl = -1ULL; + b->init = 1; + } + return 0; +} + /* * Initialize Machine Checks for a CPU. 
*/ @@ -1169,11 +1178,10 @@ static int mce_cap_init(void) /* Don't support asymmetric configurations today */ WARN_ON(banks != 0 && b != banks); banks = b; - if (!bank) { - bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); - if (!bank) - return -ENOMEM; - memset(bank, 0xff, banks * sizeof(u64)); + if (!mce_banks) { + int err = mce_banks_init(); + if (err) + return err; } /* Use accurate RIP reporting if available. */ @@ -1205,9 +1213,10 @@ static void mce_init(void) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (!b->init) continue; - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_CTL+4*i, b->ctl); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } @@ -1223,7 +1232,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * trips off incorrectly with the IOMMU & 3ware * & Cerberus: */ - clear_bit(10, (unsigned long *)&bank[4]); + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } if (c->x86 <= 17 && mce_bootlog < 0) { /* @@ -1237,7 +1246,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * by default. */ if (c->x86 == 6 && banks > 0) - bank[0] = 0; + mce_banks[0].ctl = 0; } if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1250,8 +1259,8 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A) - __set_bit(0, &dont_init_banks); + if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) + mce_banks[0].init = 0; /* * All newer Intel systems support MCE broadcasting. Enable @@ -1578,7 +1587,8 @@ static int mce_disable(void) int i; for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } return 0; @@ -1654,14 +1664,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static struct sysdev_attribute *bank_attrs; +static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +{ + return container_of(attr, struct mce_bank, attr); +} static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { - u64 b = bank[attr - bank_attrs]; - - return sprintf(buf, "%llx\n", b); + return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, @@ -1672,7 +1683,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - bank[attr - bank_attrs] = new; + attr_to_bank(attr)->ctl = new; mce_restart(); return size; @@ -1816,7 +1827,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) } for (j = 0; j < banks; j++) { err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &bank_attrs[j]); + &mce_banks[j].attr); if (err) goto error2; } @@ -1825,10 +1836,10 @@ static __cpuinit int mce_create_device(unsigned int cpu) return 0; error2: while (--j >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); @@ -1846,7 +1857,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), 
&bank_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); cpumask_clear_cpu(cpu, mce_dev_initialized); @@ -1863,7 +1874,8 @@ static void mce_disable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } } @@ -1879,8 +1891,9 @@ static void mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + struct mce_bank *b = &mce_banks[i]; + if (b->init) + wrmsrl(MSR_IA32_MC0_CTL + i*4, b->ctl); } } @@ -1928,35 +1941,21 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; -static __init int mce_init_banks(void) +static __init void mce_init_banks(void) { int i; - bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, - GFP_KERNEL); - if (!bank_attrs) - return -ENOMEM; - for (i = 0; i < banks; i++) { - struct sysdev_attribute *a = &bank_attrs[i]; + struct mce_bank *b = &mce_banks[i]; + struct sysdev_attribute *a = &b->attr; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); - if (!a->attr.name) - goto nomem; + a->attr.name = b->attrname; + snprintf(b->attrname, ATTR_LEN, "bank%d", i); a->attr.mode = 0644; a->show = show_bank; a->store = set_bank; } - return 0; - -nomem: - while (--i >= 0) - kfree(bank_attrs[i].attr.name); - kfree(bank_attrs); - bank_attrs = NULL; - - return -ENOMEM; } static __init int mce_init_device(void) @@ -1969,9 +1968,7 @@ static __init int mce_init_device(void) zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); - err = mce_init_banks(); - if (err) - return err; + mce_init_banks(); err = sysdev_class_register(&mce_sysclass); if (err) -- cgit v1.2.3 From a2d32bcbc008aa0f9c301a7c6f3494cb23e6af54 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:44 +0200 Subject: x86: mce: macros to compute bank MSRs Instead of open-coded calculations for bank MSRs, hide the indexing of the higher banks' MCE register MSRs in new macros. No semantic changes. Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/msr-index.h | 7 +++++++ arch/x86/kernel/cpu/mcheck/mce.c | 34 +++++++++++++++++----------------- arch/x86/kernel/cpu/mcheck/mce_intel.c | 10 +++++----- 3 files changed, 29 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1692fb5050e..3d1ce094586 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -81,8 +81,15 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) +#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) +#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) +#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) + /* These are consecutive and not in the normal 4er MCE bank block */ #define MSR_IA32_MC0_CTL2 0x00000280 +#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) + #define CMCI_EN (1ULL << 30) #define CMCI_THRESHOLD_MASK 0xffffULL diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a04806e01a8..07139a0578e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -267,11 +267,11 @@ static int msr_to_offset(u32 msr) unsigned bank = __get_cpu_var(injectm.bank); if (msr == rip_msr) return offsetof(struct mce, ip); - if (msr == MSR_IA32_MC0_STATUS + bank*4) + if (msr == MSR_IA32_MCx_STATUS(bank)) return offsetof(struct mce, status); - if (msr == MSR_IA32_MC0_ADDR + bank*4) + if (msr == MSR_IA32_MCx_ADDR(bank)) return offsetof(struct mce, addr); - if (msr == MSR_IA32_MC0_MISC + bank*4) + if (msr == MSR_IA32_MCx_MISC(bank)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); @@ -485,7 +485,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -500,9 +500,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -518,7 +518,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. 
*/ - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } /* @@ -539,7 +539,7 @@ static int mce_no_way_out(struct mce *m, char **msg) int i; for (i = 0; i < banks; i++) { - m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) return 1; } @@ -823,7 +823,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } @@ -904,7 +904,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) m.addr = 0; m.bank = i; - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -945,9 +945,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); /* * Action optional error. Queue address for later processing. @@ -1216,8 +1216,8 @@ static void mce_init(void) struct mce_bank *b = &mce_banks[i]; if (!b->init) continue; - wrmsrl(MSR_IA32_MC0_CTL+4*i, b->ctl); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); + wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } @@ -1589,7 +1589,7 @@ static int mce_disable(void) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } return 0; } @@ -1876,7 +1876,7 @@ static void mce_disable_cpu(void *h) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } } @@ -1893,7 +1893,7 @@ static void mce_reenable_cpu(void *h) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, b->ctl); + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index e1acec0f7a3..889f665fe93 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -90,7 +90,7 @@ static void cmci_discover(int banks, int boot) if (test_bit(i, owned)) continue; - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ if (val & CMCI_EN) { @@ -101,8 +101,8 @@ static void cmci_discover(int banks, int boot) } val |= CMCI_EN | CMCI_THRESHOLD; - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? 
-- the bank supports CMCI */ if (val & CMCI_EN) { @@ -152,9 +152,9 @@ void cmci_clear(void) if (!test_bit(i, __get_cpu_var(mce_banks_owned))) continue; /* Disable CMCI */ - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } spin_unlock_irqrestore(&cmci_discover_lock, flags); -- cgit v1.2.3 From 3ccdccfadbd2548abe38682b587f4ba27eac2fc9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:45 +0200 Subject: x86: mce: Lower maximum number of banks to architecture limit The Intel x86 architecture right now only supports 32 machine check banks; more would bump into other MSRs. So lower the max define to 32. This only affects a few bitmaps; most data structures are dynamically sized anyway. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6b8a974e127..ad753537291 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -130,10 +130,11 @@ void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); /* - * To support more than 128 would need to escape the predefined - * Linux defined extended banks first. + * Maximum banks number. + * This is the limit of the current register layout on + * Intel CPUs. */ -#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) +#define MAX_NR_BANKS 32 #ifdef CONFIG_X86_MCE_INTEL extern int mce_cmci_disabled; -- cgit v1.2.3 From 3162534069597e34dd0ac9eb711be8dc23835ae7 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:30:59 -0700 Subject: x86, intel_txt: Intel TXT boot support This patch adds kernel configuration and boot support for Intel Trusted Execution Technology (Intel TXT). Intel's technology for safer computing, Intel Trusted Execution Technology (Intel TXT), defines platform-level enhancements that provide the building blocks for creating trusted platforms. Intel TXT was formerly known by the code name LaGrande Technology (LT). Intel TXT in Brief: o Provides dynamic root of trust for measurement (DRTM) o Data protection in case of improper shutdown o Measurement and verification of launched environment Intel TXT is part of the vPro(TM) brand and is also available on some non-vPro systems. It is currently available on desktop systems based on the Q35, X38, Q45, and Q43 Express chipsets (e.g. Dell Optiplex 755, HP dc7800, etc.) and mobile systems based on the GM45, PM45, and GS45 Express chipsets. For more information, see http://www.intel.com/technology/security/. This site also has a link to the Intel TXT MLE Developers Manual, which has been updated for the newly released platforms. A much more complete description of how these patches support TXT, how to configure a system for it, etc. is in the Documentation/intel_txt.txt file in this patch. This patch provides the TXT support routines for complete functionality, documentation for TXT support and for the changes to the boot_params structure, and boot detection of a TXT launch. Attempts to shut down (reboot, Sx) the system will result in platform resets; subsequent patches will support these shutdown modes properly. 
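At runtime the whole feature hinges on a single global: the tboot shared-page pointer is non-NULL only when a measured launch was detected at boot. A minimal sketch of how callers are expected to use the helpers this patch adds (tboot_enabled(), tboot_shutdown() and the TB_SHUTDOWN_* codes are all defined in the diff below; the wrapper function itself is illustrative, not part of the patch):

    #include <asm/tboot.h>	/* tboot_enabled(), tboot_shutdown() */

    /* Hypothetical call site: only take the tboot path on a measured launch. */
    static void example_txt_poweroff(void)
    {
    	if (tboot_enabled())			/* true iff the shared page was found */
    		tboot_shutdown(TB_SHUTDOWN_S5);	/* hand S5 shutdown off to tboot */
    }

Because every helper compiles to an empty inline stub when CONFIG_INTEL_TXT is not set (see the #else branch of tboot.h below), such call sites need no #ifdef guards of their own.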
Documentation/intel_txt.txt | 210 +++++++++++++++++++++ Documentation/x86/zero-page.txt | 1 arch/x86/include/asm/bootparam.h | 3 arch/x86/include/asm/fixmap.h | 3 arch/x86/include/asm/tboot.h | 197 ++++++++++++++++++++ arch/x86/kernel/Makefile | 1 arch/x86/kernel/setup.c | 4 arch/x86/kernel/tboot.c | 379 +++++++++++++++++++++++++++++++++++++++ security/Kconfig | 30 +++ 9 files changed, 827 insertions(+), 1 deletion(-) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: Gang Wei Signed-off-by: H. Peter Anvin --- Documentation/intel_txt.txt | 210 ++++++++++++++++++++++ Documentation/x86/zero-page.txt | 1 + arch/x86/include/asm/bootparam.h | 3 +- arch/x86/include/asm/fixmap.h | 3 + arch/x86/include/asm/tboot.h | 197 ++++++++++++++++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/setup.c | 4 + arch/x86/kernel/tboot.c | 379 +++++++++++++++++++++++++++++++++++++++ security/Kconfig | 30 ++++ 9 files changed, 827 insertions(+), 1 deletion(-) create mode 100644 Documentation/intel_txt.txt create mode 100644 arch/x86/include/asm/tboot.h create mode 100644 arch/x86/kernel/tboot.c (limited to 'arch/x86') diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt new file mode 100644 index 00000000000..f40a1f03001 --- /dev/null +++ b/Documentation/intel_txt.txt @@ -0,0 +1,210 @@ +Intel(R) TXT Overview: +===================== + +Intel's technology for safer computing, Intel(R) Trusted Execution +Technology (Intel(R) TXT), defines platform-level enhancements that +provide the building blocks for creating trusted platforms. + +Intel TXT was formerly known by the code name LaGrande Technology (LT). + +Intel TXT in Brief: +o Provides dynamic root of trust for measurement (DRTM) +o Data protection in case of improper shutdown +o Measurement and verification of launched environment + +Intel TXT is part of the vPro(TM) brand and is also available on some +non-vPro systems. It is currently available on desktop systems +based on the Q35, X38, Q45, and Q43 Express chipsets (e.g. Dell +Optiplex 755, HP dc7800, etc.) and mobile systems based on the GM45, +PM45, and GS45 Express chipsets. + +For more information, see http://www.intel.com/technology/security/. +This site also has a link to the Intel TXT MLE Developers Manual, +which has been updated for the newly released platforms. + +Intel TXT has been presented at various events over the past few +years, some of which are: + LinuxTAG 2008: + http://www.linuxtag.org/2008/en/conf/events/vp-donnerstag/ + details.html?talkid=110 + TRUST2008: + http://www.trust2008.eu/downloads/Keynote-Speakers/ + 3_David-Grawrock_The-Front-Door-of-Trusted-Computing.pdf + IDF 2008, Shanghai: + http://inteldeveloperforum.com.edgesuite.net/shanghai_2008/ + aep/PROS003/index.html + IDFs 2006, 2007 (I'm not sure if/where they are online) + +Trusted Boot Project Overview: +============================= + +Trusted Boot (tboot) is an open source, pre-kernel/VMM module that +uses Intel TXT to perform a measured and verified launch of an OS +kernel/VMM. + +It is hosted on SourceForge at http://sourceforge.net/projects/tboot. +The mercurial source repo is available at http://www.bughost.org/ +repos.hg/tboot.hg. + +Tboot currently supports launching Xen (open source VMM/hypervisor +w/ TXT support since v3.2), and now Linux kernels. + + +Value Proposition for Linux or "Why should you care?" 
+===================================================== + +While there are many products and technologies that attempt to +measure or protect the integrity of a running kernel, they all +assume the kernel is "good" to begin with. The Integrity +Measurement Architecture (IMA) and Linux Integrity Module interface +are examples of such solutions. + +To get trust in the initial kernel without using Intel TXT, a +static root of trust must be used. This bases trust in BIOS +starting at system reset and requires measurement of all code +executed between system reset through the completion of the kernel +boot as well as data objects used by that code. In the case of a +Linux kernel, this means all of BIOS, any option ROMs, the +bootloader and the boot config. In practice, this is a lot of +code/data, much of which is subject to change from boot to boot +(e.g. changing NICs may change option ROMs). Without reference +hashes, these measurement changes are difficult to assess or +confirm as benign. This process also does not provide DMA +protection, memory configuration/alias checks and locks, crash +protection, or policy support. + +By using the hardware-based root of trust that Intel TXT provides, +many of these issues can be mitigated. Specifically: many +pre-launch components can be removed from the trust chain, DMA +protection is provided to all launched components, a large number +of platform configuration checks are performed and values locked, +protection is provided for any data in the event of an improper +shutdown, and there is support for policy-based execution/verification. +This provides a more stable measurement and a higher assurance of +system configuration and initial state than would be otherwise +possible. Since the tboot project is open source, source code for +almost all parts of the trust chain is available (excepting SMM and +Intel-provided firmware). + +How Does it Work? +================= + +o Tboot is an executable that is launched by the bootloader as + the "kernel" (the binary the bootloader executes). +o It performs all of the work necessary to determine if the + platform supports Intel TXT and, if so, executes the GETSEC[SENTER] + processor instruction that initiates the dynamic root of trust. + - If tboot determines that the system does not support Intel TXT + or is not configured correctly (e.g. the SINIT AC Module was + incorrect), it will directly launch the kernel with no changes + to any state. + - Tboot will output various information about its progress to the + terminal, serial port, and/or an in-memory log; the output + locations can be configured with a command line switch. +o The GETSEC[SENTER] instruction will return control to tboot and + tboot then verifies certain aspects of the environment (e.g. TPM NV + lock, e820 table does not have invalid entries, etc.). +o It will wake the APs from the special sleep state the GETSEC[SENTER] + instruction had put them in and place them into a wait-for-SIPI + state. + - Because the processors will not respond to an INIT or SIPI when + in the TXT environment, it is necessary to create a small VT-x + guest for the APs. When they run in this guest, they will + simply wait for the INIT-SIPI-SIPI sequence, which will cause + VMEXITs, and then disable VT and jump to the SIPI vector. This + approach seemed like a better choice than having to insert + special code into the kernel's MP wakeup sequence. +o Tboot then applies an (optional) user-defined launch policy to + verify the kernel and initrd. 
+ - This policy is rooted in TPM NV and is described in the tboot + project. The tboot project also contains code for tools to + create and provision the policy. + - Policies are completely under user control and if not present + then any kernel will be launched. + - Policy action is flexible and can include halting on failures + or simply logging them and continuing. +o Tboot adjusts the e820 table provided by the bootloader to reserve + its own location in memory as well as to reserve certain other + TXT-related regions. +o As part of its launch, tboot DMA protects all of RAM (using the + VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on' + in order to remove this blanket protection and use VT-d's + page-level protection. +o Tboot will populate a shared page with some data about itself and + pass this to the Linux kernel as it transfers control. + - The location of the shared page is passed via the boot_params + struct as a physical address. +o The kernel will look for the tboot shared page address and, if it + exists, map it. +o As one of the checks/protections provided by TXT, it makes a copy + of the VT-d DMARs in a DMA-protected region of memory and verifies + them for correctness. The VT-d code will detect if the kernel was + launched with tboot and use this copy instead of the one in the + ACPI table. +o At this point, tboot and TXT are out of the picture until a + shutdown (S<x>) +o In order to put a system into any of the sleep states after a TXT + launch, TXT must first be exited. This is to prevent attacks that + attempt to crash the system to gain control on reboot and steal + data left in memory. + - The kernel will perform all of its sleep preparation and + populate the shared page with the ACPI data needed to put the + platform in the desired sleep state. + - Then the kernel jumps into tboot via the vector specified in the + shared page. + - Tboot will clean up the environment and disable TXT, then use the + kernel-provided ACPI information to actually place the platform + into the desired sleep state. + - In the case of S3, tboot will also register itself as the resume + vector. This is necessary because it must re-establish the + measured environment upon resume. Once the TXT environment + has been restored, it will restore the TPM PCRs and then + transfer control back to the kernel's S3 resume vector. + In order to preserve system integrity across S3, the kernel + provides tboot with a set of memory ranges (kernel + code/data/bss, S3 resume code, and AP trampoline) that tboot + will calculate a MAC (message authentication code) over and then + seal with the TPM. On resume and once the measured environment + has been re-established, tboot will re-calculate the MAC and + verify it against the sealed value. Tboot's policy determines + what happens if the verification fails. + +That's pretty much it for TXT support. + + +Configuring the System: +====================== + +This code works with 32bit, 32bit PAE, and 64bit (x86_64) kernels. + +In BIOS, the user must enable: TPM, TXT, VT-x, VT-d. Not all BIOSes +allow these to be individually enabled/disabled and the screens in +which to find them are BIOS-specific. 
+ +grub.conf needs to be modified as follows: + title Linux 2.6.29-tip w/ tboot + root (hd0,0) + kernel /tboot.gz logging=serial,vga,memory + module /vmlinuz-2.6.29-tip intel_iommu=on ro + root=LABEL=/ rhgb console=ttyS0,115200 3 + module /initrd-2.6.29-tip.img + module /Q35_SINIT_17.BIN + +The kernel option for enabling Intel TXT support is found under the +Security top-level menu and is called "Enable Intel(R) Trusted +Execution Technology (TXT)". It is marked as EXPERIMENTAL and +depends on the generic x86 support (to allow maximum flexibility in +kernel build options), since the tboot code will detect whether the +platform actually supports Intel TXT and thus whether any of the +kernel code is executed. + +The Q35_SINIT_17.BIN file is what Intel TXT refers to as an +Authenticated Code Module. It is specific to the chipset in the +system and can also be found on the Trusted Boot site. It is an +(unencrypted) module signed by Intel that is used as part of the +DRTM process to verify and configure the system. It is signed +because it operates at a higher privilege level in the system than +any other macrocode and its correct operation is critical to the +establishment of the DRTM. The process for determining the correct +SINIT ACM for a system is documented in the SINIT-guide.txt file +that is on the tboot SourceForge site under the SINIT ACM downloads. diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt index 4f913857b8a..feb37e17701 100644 --- a/Documentation/x86/zero-page.txt +++ b/Documentation/x86/zero-page.txt @@ -12,6 +12,7 @@ Offset Proto Name Meaning 000/040 ALL screen_info Text mode or frame buffer information (struct screen_info) 040/014 ALL apm_bios_info APM BIOS information (struct apm_bios_info) +058/008 ALL tboot_addr Physical address of tboot shared page 060/010 ALL ist_info Intel SpeedStep (IST) BIOS support information (struct ist_info) 080/010 ALL hd0_info hd0 disk parameter, OBSOLETE!! diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 1724e8de317..6ca20218dd7 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -85,7 +85,8 @@ struct efi_info { struct boot_params { struct screen_info screen_info; /* 0x000 */ struct apm_bios_info apm_bios_info; /* 0x040 */ - __u8 _pad2[12]; /* 0x054 */ + __u8 _pad2[4]; /* 0x054 */ + __u64 tboot_addr; /* 0x058 */ struct ist_info ist_info; /* 0x060 */ __u8 _pad3[16]; /* 0x070 */ __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 7b2d71df39a..14f9890eb49 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -131,6 +131,9 @@ enum fixed_addresses { #endif #ifdef CONFIG_X86_32 FIX_WP_TEST, +#endif +#ifdef CONFIG_INTEL_TXT + FIX_TBOOT_BASE, #endif __end_of_fixed_addresses }; diff --git a/arch/x86/include/asm/tboot.h b/arch/x86/include/asm/tboot.h new file mode 100644 index 00000000000..b13929d4e5f --- /dev/null +++ b/arch/x86/include/asm/tboot.h @@ -0,0 +1,197 @@ +/* + * tboot.h: shared data structure with tboot and kernel and functions + * used by kernel for runtime support of Intel(R) Trusted + * Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#ifndef _ASM_TBOOT_H +#define _ASM_TBOOT_H + +#include + +/* these must have the values from 0-5 in this order */ +enum { + TB_SHUTDOWN_REBOOT = 0, + TB_SHUTDOWN_S5, + TB_SHUTDOWN_S4, + TB_SHUTDOWN_S3, + TB_SHUTDOWN_HALT, + TB_SHUTDOWN_WFS +}; + +#ifdef CONFIG_INTEL_TXT + +/* used to communicate between tboot and the launched kernel */ + +#define TB_KEY_SIZE 64 /* 512 bits */ + +#define MAX_TB_MAC_REGIONS 32 + +struct tboot_mac_region { + u64 start; /* must be 64 byte -aligned */ + u32 size; /* must be 64 byte -granular */ +} __packed; + +/* GAS - Generic Address Structure (ACPI 2.0+) */ +struct tboot_acpi_generic_address { + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_width; + u64 address; +} __packed; + +/* + * combines Sx info from FADT and FACS tables per ACPI 2.0+ spec + * (http://www.acpi.info/) + */ +struct tboot_acpi_sleep_info { + struct tboot_acpi_generic_address pm1a_cnt_blk; + struct tboot_acpi_generic_address pm1b_cnt_blk; + struct tboot_acpi_generic_address pm1a_evt_blk; + struct tboot_acpi_generic_address pm1b_evt_blk; + u16 pm1a_cnt_val; + u16 pm1b_cnt_val; + u64 wakeup_vector; + u32 vector_width; + u64 kernel_s3_resume_vector; +} __packed; + +/* + * shared memory page used for communication between tboot and kernel + */ +struct tboot { + /* + * version 3+ fields: + */ + + /* TBOOT_UUID */ + u8 uuid[16]; + + /* version number: 5 is current */ + u32 version; + + /* physical addr of tb_log_t log */ + u32 log_addr; + + /* + * physical addr of entry point for tboot shutdown and + * type of shutdown (TB_SHUTDOWN_*) being requested + */ + u32 shutdown_entry; + u32 shutdown_type; + + /* kernel-specified ACPI info for Sx shutdown */ + struct tboot_acpi_sleep_info acpi_sinfo; + + /* tboot location in memory (physical) */ + u32 tboot_base; + u32 tboot_size; + + /* memory regions (phys addrs) for tboot to MAC on S3 */ + u8 num_mac_regions; + struct tboot_mac_region mac_regions[MAX_TB_MAC_REGIONS]; + + + /* + * version 4+ fields: + */ + + /* symmetric key for use by kernel; will be encrypted on S3 */ + u8 s3_key[TB_KEY_SIZE]; + + + /* + * version 5+ fields: + */ + + /* used to 4byte-align num_in_wfs */ + u8 reserved_align[3]; + + /* number of processors in wait-for-SIPI */ + u32 num_in_wfs; +} __packed; + +/* + * UUID for tboot data struct to facilitate matching + * defined as {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} by tboot, which is + * represented as {} in the char array used here + */ +#define TBOOT_UUID {0xff, 0x8d, 0x3c, 0x66, 0xb3, 0xe8, 0x82, 0x4b, 0xbf,\ + 0xaa, 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8} + +extern struct tboot *tboot; + +static inline int tboot_enabled(void) +{ + return tboot != NULL; +} + +extern void tboot_probe(void); +extern void tboot_create_trampoline(void); +extern void tboot_shutdown(u32 shutdown_type); +extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); +extern int tboot_wait_for_aps(int num_aps); +extern struct acpi_table_header *tboot_get_dmar_table( + struct acpi_table_header *dmar_tbl); +extern int tboot_force_iommu(void); + +#else /* 
CONFIG_INTEL_TXT */ + +static inline int tboot_enabled(void) +{ + return 0; +} + +static inline void tboot_probe(void) +{ +} + +static inline void tboot_create_trampoline(void) +{ +} + +static inline void tboot_shutdown(u32 shutdown_type) +{ +} + +static inline void tboot_sleep(u8 sleep_state, u32 pm1a_control, + u32 pm1b_control) +{ +} + +static inline int tboot_wait_for_aps(int num_aps) +{ + return 0; +} + +static inline struct acpi_table_header *tboot_get_dmar_table( + struct acpi_table_header *dmar_tbl) +{ + return dmar_tbl; +} + +static inline int tboot_force_iommu(void) +{ + return 0; +} + +#endif /* !CONFIG_INTEL_TXT */ + +#endif /* _ASM_TBOOT_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7..832cb838cb4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o +obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index de2cab13284..80d6e9e3248 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -145,6 +145,8 @@ struct boot_params __initdata boot_params; struct boot_params boot_params; #endif +#include + /* * Machine setup.. */ @@ -964,6 +966,8 @@ void __init setup_arch(char **cmdline_p) paravirt_pagetable_setup_done(swapper_pg_dir); paravirt_post_allocator_init(); + tboot_probe(); + #ifdef CONFIG_X86_64 map_vsyscall(); #endif diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c new file mode 100644 index 00000000000..263591afd29 --- /dev/null +++ b/arch/x86/kernel/tboot.c @@ -0,0 +1,379 @@ +/* + * tboot.c: main implementation of helper functions used by kernel for + * runtime support of Intel(R) Trusted Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acpi/realmode/wakeup.h" + +/* Global pointer to shared data; NULL means no measured launch. */ +struct tboot *tboot __read_mostly; + +/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ +#define AP_WAIT_TIMEOUT 1 + +#undef pr_fmt +#define pr_fmt(fmt) "tboot: " fmt + +static u8 tboot_uuid[16] __initdata = TBOOT_UUID; + +void __init tboot_probe(void) +{ + /* Look for valid page-aligned address for shared page. 
*/ + if (!boot_params.tboot_addr) + return; + /* + * also verify that it is mapped as we expect it before calling + * set_fixmap(), to reduce chance of garbage value causing crash + */ + if (!e820_any_mapped(boot_params.tboot_addr, + boot_params.tboot_addr, E820_RESERVED)) { + pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n"); + return; + } + + /* only a natively booted kernel should be using TXT */ + if (paravirt_enabled()) { + pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); + return; + } + + /* Map and check for tboot UUID. */ + set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); + tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warning("tboot at 0x%llx is invalid\n", + boot_params.tboot_addr); + tboot = NULL; + return; + } + if (tboot->version < 5) { + pr_warning("tboot version is invalid: %u\n", tboot->version); + tboot = NULL; + return; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); +} + +static pgd_t *tboot_pg_dir; +static struct mm_struct tboot_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .cpu_vm_mask = CPU_MASK_ALL, +}; + +static inline void switch_to_tboot_pt(void) +{ + write_cr3(virt_to_phys(tboot_pg_dir)); +} + +static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(&tboot_mm, vaddr); + pud = pud_alloc(&tboot_mm, pgd, vaddr); + if (!pud) + return -1; + pmd = pmd_alloc(&tboot_mm, pud, vaddr); + if (!pmd) + return -1; + pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + if (!pte) + return -1; + set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); + pte_unmap(pte); + return 0; +} + +static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn, + unsigned long nr) +{ + /* Reuse the original kernel mapping */ + tboot_pg_dir = pgd_alloc(&tboot_mm); + if (!tboot_pg_dir) + return -1; + + for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) { + if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC)) + return -1; + } + + return 0; +} + +void tboot_create_trampoline(void) +{ + u32 map_base, map_size; + + if (!tboot_enabled()) + return; + + /* Create identity map for tboot shutdown code. 
*/ + map_base = PFN_DOWN(tboot->tboot_base); + map_size = PFN_UP(tboot->tboot_size); + if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", map_base, map_size); +} + +static void set_mac_regions(void) +{ + tboot->num_mac_regions = 3; + /* S3 resume code */ + tboot->mac_regions[0].start = PFN_PHYS(PFN_DOWN(acpi_wakeup_address)); + tboot->mac_regions[0].size = PFN_UP(WAKEUP_SIZE) << PAGE_SHIFT; + /* AP trampoline code */ + tboot->mac_regions[1].start = + PFN_PHYS(PFN_DOWN(virt_to_phys(trampoline_base))); + tboot->mac_regions[1].size = PFN_UP(TRAMPOLINE_SIZE) << PAGE_SHIFT; + /* kernel code + data + bss */ + tboot->mac_regions[2].start = PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); + tboot->mac_regions[2].size = PFN_PHYS(PFN_UP(virt_to_phys(&_end))) - + PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); +} + +void tboot_shutdown(u32 shutdown_type) +{ + void (*shutdown)(void); + + if (!tboot_enabled()) + return; + + /* + * if we're being called before the 1:1 mapping is set up then just + * return and let the normal shutdown happen; this should only be + * due to very early panic() + */ + if (!tboot_pg_dir) + return; + + /* if this is S3 then set regions to MAC */ + if (shutdown_type == TB_SHUTDOWN_S3) + set_mac_regions(); + + tboot->shutdown_type = shutdown_type; + + switch_to_tboot_pt(); + + shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry; + shutdown(); + + /* should not reach here */ + while (1) + halt(); +} + +static void tboot_copy_fadt(const struct acpi_table_fadt *fadt) +{ +#define TB_COPY_GAS(tbg, g) \ + tbg.space_id = g.space_id; \ + tbg.bit_width = g.bit_width; \ + tbg.bit_offset = g.bit_offset; \ + tbg.access_width = g.access_width; \ + tbg.address = g.address; + + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block); + + /* + * We need phys addr of waking vector, but can't use virt_to_phys() on + * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys + * addr. + */ + tboot->acpi_sinfo.wakeup_vector = fadt->facs + + offsetof(struct acpi_table_facs, firmware_waking_vector); +} + +void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) +{ + static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = { + /* S0,1,2: */ -1, -1, -1, + /* S3: */ TB_SHUTDOWN_S3, + /* S4: */ TB_SHUTDOWN_S4, + /* S5: */ TB_SHUTDOWN_S5 }; + + if (!tboot_enabled()) + return; + + tboot_copy_fadt(&acpi_gbl_FADT); + tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control; + tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; + /* we always use the 32b wakeup vector */ + tboot->acpi_sinfo.vector_width = 32; + tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + + if (sleep_state >= ACPI_S_STATE_COUNT || + acpi_shutdown_map[sleep_state] == -1) { + pr_warning("unsupported sleep state 0x%x\n", sleep_state); + return; + } + + tboot_shutdown(acpi_shutdown_map[sleep_state]); +} + +int tboot_wait_for_aps(int num_aps) +{ + unsigned long timeout; + + if (!tboot_enabled()) + return 0; + + timeout = jiffies + AP_WAIT_TIMEOUT*HZ; + while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps && + time_before(jiffies, timeout)) + cpu_relax(); + + return time_before(jiffies, timeout) ? 
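/* 0: all APs reached wait-for-SIPI in time, 1: timed out */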
0 : 1; +} + +/* + * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE) + */ + +#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000 +#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000 + +/* # pages for each config regs space - used by fixmap */ +#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \ + TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT) + +/* offsets from pub/priv config space */ +#define TXTCR_HEAP_BASE 0x0300 +#define TXTCR_HEAP_SIZE 0x0308 + +#define SHA1_SIZE 20 + +struct sha1_hash { + u8 hash[SHA1_SIZE]; +}; + +struct sinit_mle_data { + u32 version; /* currently 6 */ + struct sha1_hash bios_acm_id; + u32 edx_senter_flags; + u64 mseg_valid; + struct sha1_hash sinit_hash; + struct sha1_hash mle_hash; + struct sha1_hash stm_hash; + struct sha1_hash lcp_policy_hash; + u32 lcp_policy_control; + u32 rlp_wakeup_addr; + u32 reserved; + u32 num_mdrs; + u32 mdrs_off; + u32 num_vtd_dmars; + u32 vtd_dmars_off; +} __packed; + +struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl) +{ + void *heap_base, *heap_ptr, *config; + + if (!tboot_enabled()) + return dmar_tbl; + + /* + * ACPI tables may not be DMA protected by tboot, so use DMAR copy + * SINIT saved in SinitMleData in TXT heap (which is DMA protected) + */ + + /* map config space in order to get heap addr */ + config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES * + PAGE_SIZE); + if (!config) + return NULL; + + /* now map TXT heap */ + heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE), + *(u64 *)(config + TXTCR_HEAP_SIZE)); + iounmap(config); + if (!heap_base) + return NULL; + + /* walk heap to SinitMleData */ + /* skip BiosData */ + heap_ptr = heap_base + *(u64 *)heap_base; + /* skip OsMleData */ + heap_ptr += *(u64 *)heap_ptr; + /* skip OsSinitData */ + heap_ptr += *(u64 *)heap_ptr; + /* now points to SinitMleDataSize; set to SinitMleData */ + heap_ptr += sizeof(u64); + /* get addr of DMAR table */ + dmar_tbl = (struct acpi_table_header *)(heap_ptr + + ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off - + sizeof(u64)); + + /* don't unmap heap because dmar.c needs access to this */ + + return dmar_tbl; +} + +int tboot_force_iommu(void) +{ + if (!tboot_enabled()) + return 0; + + if (no_iommu || swiotlb || dmar_disabled) + pr_warning("Forcing Intel-IOMMU to enabled\n"); + + dmar_disabled = 0; +#ifdef CONFIG_SWIOTLB + swiotlb = 0; +#endif + no_iommu = 0; + + return 1; +} diff --git a/security/Kconfig b/security/Kconfig index d23c839038f..edc7cbdc012 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -113,6 +113,36 @@ config SECURITY_ROOTPLUG If you are unsure how to answer this question, answer N. +config INTEL_TXT + bool "Enable Intel(R) Trusted Execution Technology (Intel(R) TXT)" + depends on EXPERIMENTAL && X86 && DMAR && ACPI + help + This option enables support for booting the kernel with the + Trusted Boot (tboot) module. This will utilize + Intel(R) Trusted Execution Technology to perform a measured launch + of the kernel. If the system does not support Intel(R) TXT, this + will have no effect. + + Intel TXT will provide higher assurance of system configuration and + initial state as well as data reset protection. This is used to + create a robust initial kernel measurement and verification, which + helps to ensure that kernel security mechanisms are functioning + correctly. This level of protection requires a root of trust outside + of the kernel itself.
+ + Intel TXT also helps solve real end user concerns about having + confidence that their hardware is running the VMM or kernel that + it was configured with, especially since they may be responsible for + providing such assurances to VMs and services running on it. + + See for more information + about Intel(R) TXT. + See for more information about tboot. + See Documentation/intel_txt.txt for a description of how to enable + Intel TXT support in a kernel boot. + + If you are unsure as to whether this is required, answer N. + source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig -- cgit v1.2.3 From 840c2baf2d4cdf35ecc3b7fcbba7740f97de30a4 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:31:02 -0700 Subject: x86, intel_txt: Intel TXT reboot/halt shutdown support Support for graceful handling of kernel reboots after an Intel(R) TXT launch. Without this patch, attempting to reboot or halt the system will cause the TXT hardware to lock memory upon system restart because the secrets-in-memory flag that was set on launch was never cleared. This will in turn cause BIOS to execute a TXT Authenticated Code Module (ACM) that will scrub all of memory and then unlock it. Depending on the amount of memory in the system and its type, this may take some time. This patch creates a 1:1 address mapping to the tboot module and then calls back into tboot so that it may properly and securely clean up system state and clear the secrets-in-memory flag. When it has completed these steps, the tboot module will reboot or halt the system. arch/x86/kernel/reboot.c | 8 ++++++++ init/main.c | 3 +++ 2 files changed, 11 insertions(+) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: H. Peter Anvin --- arch/x86/kernel/reboot.c | 8 ++++++++ init/main.c | 3 +++ 2 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d2d1ce8170f..9de01c5d979 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -24,6 +24,8 @@ # include #endif +#include + /* * Power off function, if any */ @@ -460,6 +462,8 @@ static void native_machine_emergency_restart(void) if (reboot_emergency) emergency_vmx_disable_all(); + tboot_shutdown(TB_SHUTDOWN_REBOOT); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -586,6 +590,8 @@ static void native_machine_halt(void) /* stop other cpus and apics */ machine_shutdown(); + tboot_shutdown(TB_SHUTDOWN_HALT); + /* stop this cpu */ stop_this_cpu(NULL); } @@ -597,6 +603,8 @@ static void native_machine_power_off(void) machine_shutdown(); pm_power_off(); } + /* a fallback in case there is no PM info available */ + tboot_shutdown(TB_SHUTDOWN_HALT); } struct machine_ops machine_ops = { diff --git a/init/main.c b/init/main.c index 2c5ade79eb8..56ada27c4f4 100644 --- a/init/main.c +++ b/init/main.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include @@ -715,6 +716,8 @@ asmlinkage void __init start_kernel(void) ftrace_init(); + tboot_create_trampoline(); + /* Do the rest non-__init'ed, we're now alive */ rest_init(); } -- cgit v1.2.3 From 86886e55b273f565935491816c7c96b82469d4f8 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:31:07 -0700 Subject: x86, intel_txt: Intel TXT Sx shutdown support Support for graceful handling of sleep states (S3/S4/S5) after an Intel(R) TXT launch.
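For reference, tboot_sleep() reduces the handoff to a bounds-checked table lookup from ACPI sleep state to tboot shutdown type. A stand-alone sketch of that mapping (user-space C; the TB_SHUTDOWN_* values here are illustrative placeholders, not the real tboot ABI constants):

#include <stdio.h>

#define ACPI_S_STATE_COUNT 6

/* Placeholder values for illustration only; the real constants come
 * from the tboot shared-page ABI in asm/tboot.h. */
enum { TB_SHUTDOWN_S3 = 1, TB_SHUTDOWN_S4 = 2, TB_SHUTDOWN_S5 = 3 };

/* Mirrors the acpi_shutdown_map lookup in tboot_sleep(): S0-S2 have no
 * tboot shutdown type, and out-of-range states are rejected. */
static int sleep_to_shutdown_type(unsigned int sleep_state)
{
	static const int map[ACPI_S_STATE_COUNT] = {
		-1, -1, -1,			/* S0, S1, S2 */
		TB_SHUTDOWN_S3, TB_SHUTDOWN_S4, TB_SHUTDOWN_S5,
	};

	if (sleep_state >= ACPI_S_STATE_COUNT || map[sleep_state] == -1)
		return -1;	/* unsupported: caller warns and bails out */
	return map[sleep_state];
}

int main(void)
{
	for (unsigned int s = 0; s <= 6; s++)
		printf("S%u -> shutdown type %d\n", s,
		       sleep_to_shutdown_type(s));
	return 0;
}

The same guard is what keeps a bogus sleep_state from indexing past the table in the kernel version.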
Without this patch, attempting to place the system in one of the ACPI sleep states (S3/S4/S5) will cause the TXT hardware to treat this as an attack and will cause a system reset, with memory locked. Not only may the subsequent memory scrub take some time, but the platform will be unable to enter the requested power state. This patch calls back into the tboot so that it may properly and securely clean up system state and clear the secrets-in-memory flag, after which it will place the system into the requested sleep state using ACPI information passed by the kernel. arch/x86/kernel/smpboot.c | 2 ++ drivers/acpi/acpica/hwsleep.c | 3 +++ kernel/cpu.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: H. Peter Anvin --- arch/x86/kernel/smpboot.c | 2 ++ drivers/acpi/acpica/hwsleep.c | 3 +++ kernel/cpu.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..61cc40887c4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -1317,6 +1318,7 @@ void play_dead_common(void) void native_play_dead(void) { play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); wbinvd_halt(); } diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index db307a356f0..8c01dd3724e 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -45,6 +45,7 @@ #include #include "accommon.h" #include "actables.h" +#include #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwsleep") @@ -342,6 +343,8 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) ACPI_FLUSH_CPU_CACHE(); + tboot_sleep(sleep_state, pm1a_control, pm1b_control); + /* Write #2: Write both SLP_TYP + SLP_EN */ status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control); diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4a..ff071e022a8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ @@ -376,7 +377,7 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error; + int cpu, first_cpu, error, num_cpus = 0; error = stop_machine_create(); if (error) @@ -391,6 +392,7 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; + num_cpus++; error = _cpu_down(cpu, 1); if (!error) { cpumask_set_cpu(cpu, frozen_cpus); @@ -401,6 +403,9 @@ int disable_nonboot_cpus(void) break; } } + /* ensure all CPUs have gone into wait-for-SIPI */ + error |= tboot_wait_for_aps(num_cpus); + if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ -- cgit v1.2.3 From 94699b04eddd4b247d871930431d6fa1a46c175e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:52:54 +0200 Subject: x86, mce: don't log boot MCEs on Pentium M (model == 13) CPUs On my legacy Pentium M laptop (Acer Extensa 2900) I get bogus MCE on a cold boot with CONFIG_X86_NEW_MCE enabled, i.e. (after decoding it with mcelog): MCE 0 HARDWARE ERROR. This is *NOT* a software problem! 
Please contact your hardware vendor CPU 0 BANK 1 MCG status: MCi status: Error overflow Uncorrected error Error enabled Processor context corrupt MCA: Data CACHE Level-1 UNKNOWN Error STATUS f200000000000195 MCGSTATUS 0 [ The other STATUS values observed: f2000000000001b5 (... UNKNOWN error) and f200000000000115 (... READ Error). To verify that this is not a CONFIG_X86_NEW_MCE bug I also modified the CONFIG_X86_OLD_MCE code (which doesn't log any MCEs) to dump content of STATUS MSR before it is cleared during initialization. ] Since the bogus MCE results in a kernel taint (which in turn disables lockdep support) don't log boot MCEs on Pentium M (model == 13) CPUs by default ("mce=bootlog" boot parameter can be used to get the old behavior). Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 07139a0578e..7bd19c7f531 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1269,6 +1269,10 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && monarch_timeout < 0) monarch_timeout = USEC_PER_SEC; + + /* There are also broken BIOSes on some Pentium M systems. */ + if (c->x86 == 6 && c->x86_model == 13 && mce_bootlog < 0) + mce_bootlog = 0; } if (monarch_timeout < 0) monarch_timeout = 0; -- cgit v1.2.3 From e3346fc48204d780f92527d06df8bf6f28d603ec Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:55:09 +0200 Subject: x86, mce: fix "mce" boot option handling for CONFIG_X86_NEW_MCE "mce argument mce ignored. Please use /sys" message shouldn't be printed when using "mce" boot option. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7bd19c7f531..75919440a18 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1549,8 +1549,10 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_enable(char *str) { - if (*str == 0) + if (*str == 0) { enable_p5_mce(); + return 1; + } if (*str == '=') str++; if (!strcmp(str, "off")) -- cgit v1.2.3 From 419d6162c0c0103fa2f44f6691dff9cac14c650d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:56:00 +0200 Subject: x86, mce: add missing __cpuinit tags mce_cap_init() and mce_cpu_quirks() can be tagged with __cpuinit. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 75919440a18..1ce6db1f878 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1158,7 +1158,7 @@ static int mce_banks_init(void) /* * Initialize Machine Checks for a CPU.
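* (mce_cap_init() reads MSR_IA32_MCG_CAP to size the per-CPU bank array, so it only runs on the CPU bring-up path.)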
*/ -static int mce_cap_init(void) +static int __cpuinit mce_cap_init(void) { unsigned b; u64 cap; @@ -1222,7 +1222,7 @@ static void mce_init(void) } /* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { -- cgit v1.2.3 From d0c87d1f61704ed589fc0788bedd753632340e98 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:56:37 +0200 Subject: x86, mce: remove never executed code fseverities_coverage is never NULL in err_out code path. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index ff0807f9705..51f7c725dab 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -209,8 +209,6 @@ static int __init severities_debugfs_init(void) return 0; err_out: - if (fseverities_coverage) - debugfs_remove(fseverities_coverage); if (dmce) debugfs_remove(dmce); return -ENOMEM; -- cgit v1.2.3 From f3a0867b12e0cf1512c0bd0665f2339fc75ed2a8 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 29 Jul 2009 00:04:59 +0200 Subject: x86, mce: fix reporting of Thermal Monitoring mechanism enabled Early Pentium M models use different method for enabling TM2 (per paragraph 13.5.2.3 of the "Intel 64 and IA-32 Architectures Software Developer's Manual Volume 3A: System Programming Guide, Part 1"). Tested on the affected Pentium M variant (model == 13). Signed-off-by: Bartlomiej Zolnierkiewicz Cc: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/msr-index.h | 4 ++++ arch/x86/kernel/cpu/mcheck/therm_throt.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3d1ce094586..cbec06deb68 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -222,6 +222,10 @@ #define THERM_STATUS_PROCHOT (1 << 0) +#define MSR_THERM2_CTL 0x0000019d + +#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16) + #define MSR_IA32_MISC_ENABLE 0x000001a0 /* MISC_ENABLE bits: architectural */ diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index bff8dd191dd..15f2bc07bb6 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -253,9 +253,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG @@ -264,6 +261,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } + /* early Pentium M models use different method for enabling TM2 */ + if (cpu_has(c, X86_FEATURE_TM2)) { + if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { + rdmsr(MSR_THERM2_CTL, l, h); + if (l & MSR_THERM2_CTL_TM_SELECT) + tm2 = 1; + } else if (l & MSR_IA32_MISC_ENABLE_TM2) + tm2 = 1; + } + /* We'll mask the thermal vector in the lapic till we're ready: */ h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); -- cgit v1.2.3 From 6a12235c7d2d75c7d94b9afcaaecd422ff845ce0 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 29 Jul 2009 10:25:58 +0100 Subject: agp: kill phys_to_gart() and gart_to_phys() There seems to be no reason for these -- they're a 1:1 mapping on all platforms. Signed-off-by: David Woodhouse --- arch/alpha/include/asm/agp.h | 4 ---- arch/ia64/include/asm/agp.h | 4 ---- arch/parisc/include/asm/agp.h | 4 ---- arch/powerpc/include/asm/agp.h | 4 ---- arch/sparc/include/asm/agp.h | 4 ---- arch/x86/include/asm/agp.h | 4 ---- drivers/char/agp/agp.h | 3 --- drivers/char/agp/ali-agp.c | 4 ++-- drivers/char/agp/amd-k7-agp.c | 8 ++++---- drivers/char/agp/amd64-agp.c | 6 +++--- drivers/char/agp/ati-agp.c | 6 +++--- drivers/char/agp/backend.c | 2 +- drivers/char/agp/efficeon-agp.c | 4 ++-- drivers/char/agp/generic.c | 6 +++--- drivers/char/agp/hp-agp.c | 4 ++-- drivers/char/agp/i460-agp.c | 4 ++-- drivers/char/agp/intel-agp.c | 7 +++---- drivers/char/agp/nvidia-agp.c | 2 +- drivers/char/agp/sgi-agp.c | 2 +- drivers/char/agp/sworks-agp.c | 8 ++++---- drivers/char/agp/uninorth-agp.c | 2 +- 21 files changed, 32 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/agp.h b/arch/alpha/include/asm/agp.h index 26c17913529..a94d48b8677 100644 --- a/arch/alpha/include/asm/agp.h +++ b/arch/alpha/include/asm/agp.h @@ -9,10 +9,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. 
*/ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/ia64/include/asm/agp.h b/arch/ia64/include/asm/agp.h index c11fdd8ab4d..01d09c401c5 100644 --- a/arch/ia64/include/asm/agp.h +++ b/arch/ia64/include/asm/agp.h @@ -17,10 +17,6 @@ #define unmap_page_from_agp(page) /* nothing */ #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/parisc/include/asm/agp.h b/arch/parisc/include/asm/agp.h index 9651660da63..d226ffa8fc1 100644 --- a/arch/parisc/include/asm/agp.h +++ b/arch/parisc/include/asm/agp.h @@ -11,10 +11,6 @@ #define unmap_page_from_agp(page) /* nothing */ #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/powerpc/include/asm/agp.h b/arch/powerpc/include/asm/agp.h index 86455c4c31e..416e12c2d50 100644 --- a/arch/powerpc/include/asm/agp.h +++ b/arch/powerpc/include/asm/agp.h @@ -8,10 +8,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/sparc/include/asm/agp.h b/arch/sparc/include/asm/agp.h index c2456870b05..70f52c1661b 100644 --- a/arch/sparc/include/asm/agp.h +++ b/arch/sparc/include/asm/agp.h @@ -7,10 +7,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index 9825cd64c9b..eec2a70d437 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -22,10 +22,6 @@ */ #define flush_agp_cache() wbinvd() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. 
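The table itself is ordinary kernel memory, which is why the phys_to_gart()/gart_to_phys() wrappers removed above could go: the conversion was the identity mapping on every architecture.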
*/ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h index 4c6e5079d87..d6f36c004d9 100644 --- a/drivers/char/agp/agp.h +++ b/drivers/char/agp/agp.h @@ -318,9 +318,6 @@ void agp3_generic_cleanup(void); #define AGP_GENERIC_SIZES_ENTRIES 11 extern const struct aper_size_info_16 agp3_generic_sizes[]; -#define virt_to_gart(x) (phys_to_gart(virt_to_phys(x))) -#define gart_to_virt(x) (phys_to_virt(gart_to_phys(x))) - extern int agp_off; extern int agp_try_unsupported_boot; diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c index 201ef3ffd48..d2ce68f27e4 100644 --- a/drivers/char/agp/ali-agp.c +++ b/drivers/char/agp/ali-agp.c @@ -152,7 +152,7 @@ static struct page *m1541_alloc_page(struct agp_bridge_data *bridge) pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | - phys_to_gart(page_to_phys(page))) | ALI_CACHE_FLUSH_EN )); + page_to_phys(page)) | ALI_CACHE_FLUSH_EN )); return page; } @@ -180,7 +180,7 @@ static void m1541_destroy_page(struct page *page, int flags) pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | - phys_to_gart(page_to_phys(page))) | ALI_CACHE_FLUSH_EN)); + page_to_phys(page)) | ALI_CACHE_FLUSH_EN)); } agp_generic_destroy_page(page, flags); } diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c index 542a87895ae..73dbf40c874 100644 --- a/drivers/char/agp/amd-k7-agp.c +++ b/drivers/char/agp/amd-k7-agp.c @@ -44,7 +44,7 @@ static int amd_create_page_map(struct amd_page_map *page_map) #ifndef CONFIG_X86 SetPageReserved(virt_to_page(page_map->real)); global_cache_flush(); - page_map->remapped = ioremap_nocache(virt_to_gart(page_map->real), + page_map->remapped = ioremap_nocache(virt_to_phys(page_map->real), PAGE_SIZE); if (page_map->remapped == NULL) { ClearPageReserved(virt_to_page(page_map->real)); @@ -160,7 +160,7 @@ static int amd_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); /* Get the address for the gart region. * This is a bus address even on the alpha, b/c its @@ -173,7 +173,7 @@ static int amd_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) { - writel(virt_to_gart(amd_irongate_private.gatt_pages[i]->real) | 1, + writel(virt_to_phys(amd_irongate_private.gatt_pages[i]->real) | 1, page_dir.remapped+GET_PAGE_DIR_OFF(addr)); readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr)); /* PCI Posting. */ } @@ -325,7 +325,7 @@ static int amd_insert_memory(struct agp_memory *mem, off_t pg_start, int type) addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = GET_GATT(addr); writel(agp_generic_mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); readl(cur_gatt+GET_GATT_OFF(addr)); /* PCI Posting. 
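The dummy readl() flushes the GATT update out of the chipset's posted-write buffers before the next entry is written.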
*/ diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index e85a5b3e952..2fb2e6cc322 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -79,7 +79,7 @@ static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { tmp = agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mask_type); BUG_ON(tmp & 0xffffff0000000ffcULL); @@ -178,7 +178,7 @@ static const struct aper_size_info_32 amd_8151_sizes[7] = static int amd_8151_configure(void) { - unsigned long gatt_bus = virt_to_gart(agp_bridge->gatt_table_real); + unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); int i; /* Configure AGP regs in each x86-64 host bridge. */ @@ -558,7 +558,7 @@ static void __devexit agp_amd64_remove(struct pci_dev *pdev) { struct agp_bridge_data *bridge = pci_get_drvdata(pdev); - release_mem_region(virt_to_gart(bridge->gatt_table_real), + release_mem_region(virt_to_phys(bridge->gatt_table_real), amd64_aperture_sizes[bridge->aperture_size_idx].size); agp_remove_bridge(bridge); agp_put_bridge(bridge); diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c index 59ebd60c1b6..3b2ecbe86eb 100644 --- a/drivers/char/agp/ati-agp.c +++ b/drivers/char/agp/ati-agp.c @@ -302,7 +302,7 @@ static int ati_insert_memory(struct agp_memory * mem, addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = GET_GATT(addr); writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); } @@ -360,7 +360,7 @@ static int ati_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *) page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); /* Write out the size register */ current_size = A_SIZE_LVL2(agp_bridge->current_size); @@ -390,7 +390,7 @@ static int ati_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) { - writel(virt_to_gart(ati_generic_private.gatt_pages[i]->real) | 1, + writel(virt_to_phys(ati_generic_private.gatt_pages[i]->real) | 1, page_dir.remapped+GET_PAGE_DIR_OFF(addr)); readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr)); /* PCI Posting. */ } diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 343f102090a..ad87753f6de 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -159,7 +159,7 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) goto err_out_nounmap; } } else { - bridge->scratch_page_dma = phys_to_gart(page_to_phys(page)); + bridge->scratch_page_dma = page_to_phys(page); } bridge->scratch_page = bridge->driver->mask_memory(bridge, diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c index 35d50f2861b..793f39ea961 100644 --- a/drivers/char/agp/efficeon-agp.c +++ b/drivers/char/agp/efficeon-agp.c @@ -67,7 +67,7 @@ static const struct gatt_mask efficeon_generic_masks[] = /* This function does the same thing as mask_memory() for this chipset... 
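(it simply folds the GATT present bit, bit 0, into the page's physical address)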
*/ static inline unsigned long efficeon_mask_memory(struct page *page) { - unsigned long addr = phys_to_gart(page_to_phys(page)); + unsigned long addr = page_to_phys(page); return addr | 0x00000001; } @@ -226,7 +226,7 @@ static int efficeon_create_gatt_table(struct agp_bridge_data *bridge) efficeon_private.l1_table[index] = page; - value = virt_to_gart((unsigned long *)page) | pati | present | index; + value = virt_to_phys((unsigned long *)page) | pati | present | index; pci_write_config_dword(agp_bridge->dev, EFFICEON_ATTPAGE, value); diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index 28f0208c66a..c50543966eb 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -988,7 +988,7 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge) set_memory_uc((unsigned long)table, 1 << page_order); bridge->gatt_table = (void *)table; #else - bridge->gatt_table = ioremap_nocache(virt_to_gart(table), + bridge->gatt_table = ioremap_nocache(virt_to_phys(table), (PAGE_SIZE * (1 << page_order))); bridge->driver->cache_flush(); #endif @@ -1001,7 +1001,7 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge) return -ENOMEM; } - bridge->gatt_bus_addr = virt_to_gart(bridge->gatt_table_real); + bridge->gatt_bus_addr = virt_to_phys(bridge->gatt_table_real); /* AK: bogus, should encode addresses > 4GB */ for (i = 0; i < num_entries; i++) { @@ -1142,7 +1142,7 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type) for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(bridge->driver->mask_memory(bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mask_type), bridge->gatt_table+j); } diff --git a/drivers/char/agp/hp-agp.c b/drivers/char/agp/hp-agp.c index 64dbf4b1cf2..501e293e5ad 100644 --- a/drivers/char/agp/hp-agp.c +++ b/drivers/char/agp/hp-agp.c @@ -107,7 +107,7 @@ static int __init hp_zx1_ioc_shared(void) hp->gart_size = HP_ZX1_GART_SIZE; hp->gatt_entries = hp->gart_size / hp->io_page_size; - hp->io_pdir = gart_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE)); + hp->io_pdir = phys_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE)); hp->gatt = &hp->io_pdir[HP_ZX1_IOVA_TO_PDIR(hp->gart_base)]; if (hp->gatt[0] != HP_ZX1_SBA_IOMMU_COOKIE) { @@ -246,7 +246,7 @@ hp_zx1_configure (void) agp_bridge->mode = readl(hp->lba_regs+hp->lba_cap_offset+PCI_AGP_STATUS); if (hp->io_pdir_owner) { - writel(virt_to_gart(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE); + writel(virt_to_phys(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE); readl(hp->ioc_regs+HP_ZX1_PDIR_BASE); writel(hp->io_tlb_ps, hp->ioc_regs+HP_ZX1_TCNFG); readl(hp->ioc_regs+HP_ZX1_TCNFG); diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c index 54191f86053..e763d3312ce 100644 --- a/drivers/char/agp/i460-agp.c +++ b/drivers/char/agp/i460-agp.c @@ -325,7 +325,7 @@ static int i460_insert_memory_small_io_page (struct agp_memory *mem, io_page_size = 1UL << I460_IO_PAGE_SHIFT; for (i = 0, j = io_pg_start; i < mem->page_count; i++) { - paddr = phys_to_gart(page_to_phys(mem->pages[i])); + paddr = page_to_phys(mem->pages[i]); for (k = 0; k < I460_IOPAGES_PER_KPAGE; k++, j++, paddr += io_page_size) WR_GATT(j, i460_mask_memory(agp_bridge, paddr, mem->type)); } @@ -382,7 +382,7 @@ static int i460_alloc_large_page (struct lp_desc *lp) return -ENOMEM; } - lp->paddr = phys_to_gart(page_to_phys(lp->page)); + lp->paddr = page_to_phys(lp->page); lp->refcount = 0; atomic_add(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp); 
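/* one large I/O page is charged as I460_KPAGES_PER_IOPAGE kernel pages */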
return 0; diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index d8c80d8be5e..aa8889e8afc 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -288,7 +288,7 @@ static void intel_agp_insert_sg_entries(struct agp_memory *mem, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + page_to_phys(mem->pages[i]), mask_type), intel_private.gtt+j); } @@ -470,8 +470,7 @@ static int intel_i810_insert_entries(struct agp_memory *mem, off_t pg_start, global_cache_flush(); for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), - mask_type), + page_to_phys(mem->pages[i]), mask_type), intel_private.registers+I810_PTE_BASE+(j*4)); } readl(intel_private.registers+I810_PTE_BASE+((j-1)*4)); @@ -977,7 +976,7 @@ static int intel_i830_insert_entries(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + page_to_phys(mem->pages[i]), mask_type), intel_private.registers+I810_PTE_BASE+(j*4)); } readl(intel_private.registers+I810_PTE_BASE+((j-1)*4)); diff --git a/drivers/char/agp/nvidia-agp.c b/drivers/char/agp/nvidia-agp.c index cedacee30ec..7e36d2b4f9d 100644 --- a/drivers/char/agp/nvidia-agp.c +++ b/drivers/char/agp/nvidia-agp.c @@ -225,7 +225,7 @@ static int nvidia_insert_memory(struct agp_memory *mem, off_t pg_start, int type } for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + page_to_phys(mem->pages[i]), mask_type), agp_bridge->gatt_table+nvidia_private.pg_offset+j); } diff --git a/drivers/char/agp/sgi-agp.c b/drivers/char/agp/sgi-agp.c index 0d47fa84740..0d426ae39c8 100644 --- a/drivers/char/agp/sgi-agp.c +++ b/drivers/char/agp/sgi-agp.c @@ -190,7 +190,7 @@ static int sgi_tioca_insert_memory(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { table[j] = bridge->driver->mask_memory(bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type); } diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c index 07259952fc3..13acaaf64ed 100644 --- a/drivers/char/agp/sworks-agp.c +++ b/drivers/char/agp/sworks-agp.c @@ -155,7 +155,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) /* Create a fake scratch directory */ for (i = 0; i < 1024; i++) { writel(agp_bridge->scratch_page, serverworks_private.scratch_dir.remapped+i); - writel(virt_to_gart(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); + writel(virt_to_phys(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); } retval = serverworks_create_gatt_pages(value->num_entries / 1024); @@ -167,7 +167,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); /* Get the address for the gart region. 
* This is a bus address even on the alpha, b/c its @@ -179,7 +179,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++) - writel(virt_to_gart(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); + writel(virt_to_phys(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); return 0; } @@ -350,7 +350,7 @@ static int serverworks_insert_memory(struct agp_memory *mem, addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = SVRWRKS_GET_GATT(addr); writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mem->type), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); } serverworks_tlbflush(mem); diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c index f192c3b9ad4..2e993112ab8 100644 --- a/drivers/char/agp/uninorth-agp.c +++ b/drivers/char/agp/uninorth-agp.c @@ -431,7 +431,7 @@ static int uninorth_create_gatt_table(struct agp_bridge_data *bridge) bridge->gatt_table_real = (u32 *) table; bridge->gatt_table = (u32 *)table; - bridge->gatt_bus_addr = virt_to_gart(table); + bridge->gatt_bus_addr = virt_to_phys(table); for (i = 0; i < num_entries; i++) bridge->gatt_table[i] = 0; -- cgit v1.2.3 From cfc65dd57967f2e0c7b3a8b73e6d12470b1cf1c1 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 30 Jul 2009 16:15:18 -0600 Subject: iommu=pt is a valid early param This avoids a "Malformed early option 'iommu'" warning on boot when trying to use pass-through mode. Signed-off-by: Alex Williamson Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-dma.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1a041bcf506..ae13e34f724 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -212,10 +212,8 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "soft", 4)) swiotlb = 1; #endif - if (!strncmp(p, "pt", 2)) { + if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; - return 1; - } gart_parse_options(p); -- cgit v1.2.3 From 19943b0e30b05d42e494ae6fef78156ebc8c637e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 4 Aug 2009 16:19:20 +0100 Subject: intel-iommu: Unify hardware and software passthrough support This makes the hardware passthrough mode work a lot more like the software version, so that the behaviour of a kernel with 'iommu=pt' is the same whether the hardware supports passthrough or not. In particular: - We use a single si_domain for the pass-through devices. - 32-bit devices can be taken out of the pass-through domain so that they don't have to use swiotlb. - Devices will work again after being removed from a KVM guest. - A potential oops on OOM (in init_context_pass_through()) is fixed. 
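The per-device consequence is easiest to see in isolation: with iommu=pt every identity-mapped device now lands in si_domain, and the only thing hardware pass-through support changes is the context-entry type. A minimal sketch of that decision (simplified user-space model using the driver's CONTEXT_TT_* names; not the actual driver API):

#include <stdio.h>

enum context_tt {
	CONTEXT_TT_MULTI_LEVEL,		/* normal page-table walk */
	CONTEXT_TT_PASS_THROUGH,	/* hardware pass-through */
};

/* Models the choice made in iommu_prepare_static_identity_mapping():
 * hardware that advertises ecap pass-through skips translation
 * entirely; everything else gets a software identity map through
 * ordinary multi-level page tables. */
static enum context_tt identity_context(int hw_pass_through)
{
	return hw_pass_through ? CONTEXT_TT_PASS_THROUGH
			       : CONTEXT_TT_MULTI_LEVEL;
}

int main(void)
{
	printf("ecap PT present -> %s\n",
	       identity_context(1) == CONTEXT_TT_PASS_THROUGH ?
	       "hardware pass-through context" : "software identity map");
	printf("ecap PT absent  -> %s\n",
	       identity_context(0) == CONTEXT_TT_PASS_THROUGH ?
	       "hardware pass-through context" : "software identity map");
	return 0;
}

Either way the device can later be pulled out of si_domain, for example for a 32-bit DMA mask or KVM assignment, without special-casing the hardware mode.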
Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-swiotlb.c | 5 +- drivers/pci/intel-iommu.c | 174 ++++++++++++++++++------------------------ 2 files changed, 76 insertions(+), 103 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 6af96ee4420..1e66b18f45c 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -71,9 +71,8 @@ void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || - iommu_pass_through) - swiotlb = 1; + if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) + swiotlb = 1; #endif if (swiotlb_force) swiotlb = 1; diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 097d5da2fae..147b3b960b6 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -251,7 +251,8 @@ static inline int first_pte_in_page(struct dma_pte *pte) * 2. It maps to each iommu if successful. * 3. Each iommu mapps to this domain if successful. */ -struct dmar_domain *si_domain; +static struct dmar_domain *si_domain; +static int hw_pass_through = 1; /* devices under the same p2p bridge are owned in one domain */ #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0) @@ -1948,14 +1949,24 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev, struct dmar_domain *domain; int ret; - printk(KERN_INFO - "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n", - pci_name(pdev), start, end); - domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); if (!domain) return -ENOMEM; + /* For _hardware_ passthrough, don't bother. But for software + passthrough, we do it anyway -- it may indicate a memory + range which is reserved in E820, so which didn't get set + up to start with in si_domain */ + if (domain == si_domain && hw_pass_through) { + printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n", + pci_name(pdev), start, end); + return 0; + } + + printk(KERN_INFO + "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n", + pci_name(pdev), start, end); + ret = iommu_domain_identity_map(domain, start, end); if (ret) goto error; @@ -2006,23 +2017,6 @@ static inline void iommu_prepare_isa(void) } #endif /* !CONFIG_DMAR_FLPY_WA */ -/* Initialize each context entry as pass through.*/ -static int __init init_context_pass_through(void) -{ - struct pci_dev *pdev = NULL; - struct dmar_domain *domain; - int ret; - - for_each_pci_dev(pdev) { - domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); - ret = domain_context_mapping(domain, pdev, - CONTEXT_TT_PASS_THROUGH); - if (ret) - return ret; - } - return 0; -} - static int md_domain_init(struct dmar_domain *domain, int guest_width); static int __init si_domain_work_fn(unsigned long start_pfn, @@ -2037,7 +2031,7 @@ static int __init si_domain_work_fn(unsigned long start_pfn, } -static int si_domain_init(void) +static int si_domain_init(int hw) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; @@ -2064,6 +2058,9 @@ static int si_domain_init(void) si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY; + if (hw) + return 0; + for_each_online_node(nid) { work_with_active_regions(nid, si_domain_work_fn, &ret); if (ret) @@ -2155,24 +2152,26 @@ static int iommu_should_identity_map(struct pci_dev *pdev, int startup) return 1; } -static int iommu_prepare_static_identity_mapping(void) +static int iommu_prepare_static_identity_mapping(int hw) { struct pci_dev *pdev = NULL; 
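/* hw selects real pass-through context entries; without it si_domain provides a software identity map */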
int ret; - ret = si_domain_init(); + ret = si_domain_init(hw); if (ret) return -EFAULT; for_each_pci_dev(pdev) { if (iommu_should_identity_map(pdev, 1)) { - printk(KERN_INFO "IOMMU: identity mapping for device %s\n", - pci_name(pdev)); + printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n", + hw ? "hardware" : "software", pci_name(pdev)); ret = domain_context_mapping(si_domain, pdev, + hw ? CONTEXT_TT_PASS_THROUGH : CONTEXT_TT_MULTI_LEVEL); if (ret) return ret; + ret = domain_add_dev_info(si_domain, pdev); if (ret) return ret; @@ -2189,14 +2188,6 @@ int __init init_dmars(void) struct pci_dev *pdev; struct intel_iommu *iommu; int i, ret; - int pass_through = 1; - - /* - * In case pass through can not be enabled, iommu tries to use identity - * mapping. - */ - if (iommu_pass_through) - iommu_identity_mapping = 1; /* * for each drhd @@ -2250,14 +2241,8 @@ int __init init_dmars(void) goto error; } if (!ecap_pass_through(iommu->ecap)) - pass_through = 0; + hw_pass_through = 0; } - if (iommu_pass_through) - if (!pass_through) { - printk(KERN_INFO - "Pass Through is not supported by hardware.\n"); - iommu_pass_through = 0; - } /* * Start from the sane iommu hardware state. @@ -2312,64 +2297,57 @@ int __init init_dmars(void) } } + if (iommu_pass_through) + iommu_identity_mapping = 1; +#ifdef CONFIG_DMAR_BROKEN_GFX_WA + else + iommu_identity_mapping = 2; +#endif /* - * If pass through is set and enabled, context entries of all pci - * devices are intialized by pass through translation type. + * If pass through is not set or not enabled, setup context entries for + * identity mappings for rmrr, gfx, and isa and may fall back to static + * identity mapping if iommu_identity_mapping is set. */ - if (iommu_pass_through) { - ret = init_context_pass_through(); + if (iommu_identity_mapping) { + ret = iommu_prepare_static_identity_mapping(hw_pass_through); if (ret) { - printk(KERN_ERR "IOMMU: Pass through init failed.\n"); - iommu_pass_through = 0; + printk(KERN_CRIT "Failed to setup IOMMU pass-through\n"); + goto error; } } - /* - * If pass through is not set or not enabled, setup context entries for - * identity mappings for rmrr, gfx, and isa and may fall back to static - * identity mapping if iommu_identity_mapping is set. + * For each rmrr + * for each dev attached to rmrr + * do + * locate drhd for dev, alloc domain for dev + * allocate free domain + * allocate page table entries for rmrr + * if context not allocated for bus + * allocate and init context + * set present in root table for this bus + * init context with domain, translation etc + * endfor + * endfor */ - if (!iommu_pass_through) { -#ifdef CONFIG_DMAR_BROKEN_GFX_WA - if (!iommu_identity_mapping) - iommu_identity_mapping = 2; -#endif - if (iommu_identity_mapping) - iommu_prepare_static_identity_mapping(); - /* - * For each rmrr - * for each dev attached to rmrr - * do - * locate drhd for dev, alloc domain for dev - * allocate free domain - * allocate page table entries for rmrr - * if context not allocated for bus - * allocate and init context - * set present in root table for this bus - * init context with domain, translation etc - * endfor - * endfor - */ - printk(KERN_INFO "IOMMU: Setting RMRR:\n"); - for_each_rmrr_units(rmrr) { - for (i = 0; i < rmrr->devices_cnt; i++) { - pdev = rmrr->devices[i]; - /* - * some BIOS lists non-exist devices in DMAR - * table. 
- */ - if (!pdev) - continue; - ret = iommu_prepare_rmrr_dev(rmrr, pdev); - if (ret) - printk(KERN_ERR - "IOMMU: mapping reserved region failed\n"); - } + printk(KERN_INFO "IOMMU: Setting RMRR:\n"); + for_each_rmrr_units(rmrr) { + for (i = 0; i < rmrr->devices_cnt; i++) { + pdev = rmrr->devices[i]; + /* + * some BIOS lists non-exist devices in DMAR + * table. + */ + if (!pdev) + continue; + ret = iommu_prepare_rmrr_dev(rmrr, pdev); + if (ret) + printk(KERN_ERR + "IOMMU: mapping reserved region failed\n"); } - - iommu_prepare_isa(); } + iommu_prepare_isa(); + /* * for each drhd * enable fault log @@ -2536,7 +2514,10 @@ static int iommu_no_mapping(struct device *dev) ret = domain_add_dev_info(si_domain, pdev); if (ret) return 0; - ret = domain_context_mapping(si_domain, pdev, CONTEXT_TT_MULTI_LEVEL); + ret = domain_context_mapping(si_domain, pdev, + hw_pass_through ? + CONTEXT_TT_PASS_THROUGH : + CONTEXT_TT_MULTI_LEVEL); if (!ret) { printk(KERN_INFO "64bit %s uses identity mapping\n", pci_name(pdev)); @@ -3202,7 +3183,7 @@ int __init intel_iommu_init(void) * Check the need for DMA-remapping initialization now. * Above initialization will also be used by Interrupt-remapping. */ - if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled) + if (no_iommu || swiotlb || dmar_disabled) return -ENODEV; iommu_init_mempool(); @@ -3222,14 +3203,7 @@ int __init intel_iommu_init(void) init_timer(&unmap_timer); force_iommu = 1; - - if (!iommu_pass_through) { - printk(KERN_INFO - "Multi-level page-table translation for DMAR.\n"); - dma_ops = &intel_dma_ops; - } else - printk(KERN_INFO - "DMAR: Pass through translation for DMAR.\n"); + dma_ops = &intel_dma_ops; init_iommu_sysfs(); -- cgit v1.2.3 From 5b7e88edc6193f36941bccbfd5ed9ed5fe27d2e1 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:40 +0800 Subject: x86, mce: Support specifying context for software mce injection The cpu context is specified via the new mce.inject_flags fields. This allows more realistic machine check testing in different situations. "RANDOM" context is implemented via NMI broadcasting to add randomization to testing. AK: Fix NMI broadcasting check. Fix 32-bit building. Some race fixes. Move to module. Various changes ChangeLog: v3: - Re-based on latest x86-tip.git/mce4 - Fix 32-bit building v2: - Re-base on latest x86-tip.git/mce3 Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/mce.h | 11 ++- arch/x86/kernel/cpu/mcheck/mce-inject.c | 156 ++++++++++++++++++++++++++------ 2 files changed, 135 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ad753537291..8945be9ad2b 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -38,6 +38,13 @@ #define MCM_ADDR_MEM 3 /* memory address */ #define MCM_ADDR_GENERIC 7 /* generic */ +#define MCJ_CTX_MASK 3 +#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) +#define MCJ_CTX_RANDOM 0 /* inject context: random */ +#define MCJ_CTX_PROCESS 1 /* inject context: process */ +#define MCJ_CTX_IRQ 2 /* inject context: IRQ */ +#define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ + /* Fields are zero when not available */ struct mce { __u64 status; @@ -48,8 +55,8 @@ struct mce { __u64 tsc; /* cpu time stamp counter */ __u64 time; /* wall time_t when error was detected */ __u8 cpuvendor; /* cpu vendor as encoded in system.h */ - __u8 pad1; - __u16 pad2; + __u8 inject_flags; /* software inject flags */ + __u16 pad; __u32 cpuid; /* CPUID 1 EAX */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index a3a235a53f0..ad5d92790eb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -18,7 +18,12 @@ #include #include #include +#include +#include +#include +#include #include +#include /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) @@ -39,44 +44,141 @@ static void inject_mce(struct mce *m) i->finished = 1; } -struct delayed_mce { - struct timer_list timer; - struct mce m; +static void raise_corrected(struct mce *m) +{ + unsigned long flags; + mce_banks_t b; + + memset(&b, 0xff, sizeof(mce_banks_t)); + local_irq_save(flags); + machine_check_poll(0, &b); + local_irq_restore(flags); + m->finished = 0; +} + +static void raise_uncorrected(struct mce *m, struct pt_regs *pregs) +{ + struct pt_regs regs; + unsigned long flags; + + if (!pregs) { + memset(®s, 0, sizeof(struct pt_regs)); + regs.ip = m->ip; + regs.cs = m->cs; + pregs = ®s; + } + /* in mcheck exeception handler, irq will be disabled */ + local_irq_save(flags); + do_machine_check(pregs, 0); + local_irq_restore(flags); + m->finished = 0; +} + +static cpumask_t mce_inject_cpumask; + +static int mce_raise_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) + return NOTIFY_DONE; + cpu_clear(cpu, mce_inject_cpumask); + if (m->status & MCI_STATUS_UC) + raise_uncorrected(m, args->regs); + else if (m->status) + raise_corrected(m); + return NOTIFY_STOP; +} + +static struct notifier_block mce_raise_nb = { + .notifier_call = mce_raise_notify, + .priority = 1000, }; /* Inject mce on current CPU */ -static void raise_mce(unsigned long data) +static int raise_local(struct mce *m) { - struct delayed_mce *dm = (struct delayed_mce *)data; - struct mce *m = &dm->m; + int context = MCJ_CTX(m->inject_flags); + int ret = 0; int cpu = m->extcpu; - inject_mce(m); if (m->status & MCI_STATUS_UC) { - struct pt_regs regs; - memset(®s, 0, sizeof(struct pt_regs)); - regs.ip = m->ip; - regs.cs = m->cs; printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); - do_machine_check(®s, 0); + 
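/* deliver the fake #MC in whatever context user space requested via MCJ_CTX_* */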
switch (context) { + case MCJ_CTX_IRQ: + /* + * Could do more to fake interrupts like + * calling irq_enter, but the necessary + * machinery isn't exported currently. + */ + /*FALL THROUGH*/ + case MCJ_CTX_PROCESS: + raise_uncorrected(m, NULL); + break; + default: + printk(KERN_INFO "Invalid MCE context\n"); + ret = -EINVAL; + } printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); - } else { - mce_banks_t b; - memset(&b, 0xff, sizeof(mce_banks_t)); + } else if (m->status) { printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); - machine_check_poll(0, &b); + raise_corrected(m); mce_notify_irq(); - printk(KERN_INFO "Finished machine check poll on CPU %d\n", - cpu); - } - kfree(dm); + printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); + } else + m->finished = 0; + + return ret; +} + +static void raise_mce(struct mce *m) +{ + int context = MCJ_CTX(m->inject_flags); + + inject_mce(m); + + if (context == MCJ_CTX_RANDOM) + return; + +#ifdef CONFIG_X86_LOCAL_APIC + if (m->inject_flags & MCJ_NMI_BROADCAST) { + unsigned long start; + int cpu; + get_online_cpus(); + mce_inject_cpumask = cpu_online_map; + cpu_clear(get_cpu(), mce_inject_cpumask); + for_each_online_cpu(cpu) { + struct mce *mcpu = &per_cpu(injectm, cpu); + if (!mcpu->finished || + MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) + cpu_clear(cpu, mce_inject_cpumask); + } + if (!cpus_empty(mce_inject_cpumask)) + apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); + start = jiffies; + while (!cpus_empty(mce_inject_cpumask)) { + if (!time_before(jiffies, start + 2*HZ)) { + printk(KERN_ERR + "Timeout waiting for mce inject NMI %lx\n", + *cpus_addr(mce_inject_cpumask)); + break; + } + cpu_relax(); + } + raise_local(m); + put_cpu(); + put_online_cpus(); + } else +#endif + raise_local(m); } /* Error injection interface */ static ssize_t mce_write(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off) { - struct delayed_mce *dm; struct mce m; if (!capable(CAP_SYS_ADMIN)) @@ -96,19 +198,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) return -EINVAL; - dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); - if (!dm) - return -ENOMEM; - /* * Need to give user space some time to set everything up, * so do it a jiffie or two later everywhere. - * Should we use a hrtimer here for better synchronization? */ - memcpy(&dm->m, &m, sizeof(struct mce)); - setup_timer(&dm->timer, raise_mce, (unsigned long)dm); - dm->timer.expires = jiffies + 2; - add_timer_on(&dm->timer, m.extcpu); + schedule_timeout(2); + raise_mce(&m); return usize; } @@ -116,6 +211,7 @@ static int inject_init(void) { printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; + register_die_notifier(&mce_raise_nb); return 0; } -- cgit v1.2.3 From 0dcc66851f1091af421416c28a9458836885f522 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:41 +0800 Subject: x86, mce: Support specifying raise mode for software MCE injection Raise mode include raising as exception or raising as poll, it is specified via the mce.inject_flags field. This can be used to specify raise mode of UCNA, which is UC error but raised not as exception. And this can be used to test the filter code of poll handler or exception handler too. For example, enforce a poll raise mode for a fatal MCE. ChangeLog: v2: - Re-base on latest x86-tip.git/mce3 Signed-off-by: Huang Ying Signed-off-by: H. 
ChangeLog: v2: - Re-base on latest x86-tip.git/mce3 Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce-inject.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 8945be9ad2b..b608a64c581 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -44,6 +44,7 @@ #define MCJ_CTX_PROCESS 1 /* inject context: process */ #define MCJ_CTX_IRQ 2 /* inject context: IRQ */ #define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ +#define MCJ_EXCEPTION 8 /* raise as exception */ /* Fields are zero when not available */ struct mce { diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index ad5d92790eb..7029f0e2aca 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -44,7 +44,7 @@ static void inject_mce(struct mce *m) i->finished = 1; } -static void raise_corrected(struct mce *m) +static void raise_poll(struct mce *m) { unsigned long flags; mce_banks_t b; @@ -56,7 +56,7 @@ static void raise_corrected(struct mce *m) m->finished = 0; } -static void raise_uncorrected(struct mce *m, struct pt_regs *pregs) +static void raise_exception(struct mce *m, struct pt_regs *pregs) { struct pt_regs regs; unsigned long flags; @@ -85,10 +85,10 @@ static int mce_raise_notify(struct notifier_block *self, if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) return NOTIFY_DONE; cpu_clear(cpu, mce_inject_cpumask); - if (m->status & MCI_STATUS_UC) - raise_uncorrected(m, args->regs); + if (m->inject_flags & MCJ_EXCEPTION) + raise_exception(m, args->regs); else if (m->status) - raise_corrected(m); + raise_poll(m); return NOTIFY_STOP; } @@ -104,7 +104,7 @@ static int raise_local(struct mce *m) int ret = 0; int cpu = m->extcpu; - if (m->status & MCI_STATUS_UC) { + if (m->inject_flags & MCJ_EXCEPTION) { printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); switch (context) { case MCJ_CTX_IRQ: @@ -115,7 +115,7 @@ static int raise_local(struct mce *m) */ /*FALL THROUGH*/ case MCJ_CTX_PROCESS: - raise_uncorrected(m, NULL); + raise_exception(m, NULL); break; default: printk(KERN_INFO "Invalid MCE context\n"); @@ -124,7 +124,7 @@ static int raise_local(struct mce *m) printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); } else if (m->status) { printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); - raise_corrected(m); + raise_poll(m); mce_notify_irq(); printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); } else -- cgit v1.2.3 From 5be9ed251f58881dfc3dd6742a81ff9ad1a7bb04 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:42 +0800 Subject: x86, mce: Move debugfs mce dir creation to mce.c More debugfs files under the mce dir will be created in mce.c. ChangeLog: v5: - Rebased on x86-tip.git/mce Signed-off-by: Huang Ying Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-internal.h | 1 + arch/x86/kernel/cpu/mcheck/mce-severity.c | 4 +--- arch/x86/kernel/cpu/mcheck/mce.c | 13 +++++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 6bd51e7ba87..32996f9fab6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -22,6 +22,7 @@ struct mce_bank { }; int mce_severity(struct mce *a, int tolerant, char **msg); +struct dentry *mce_get_debugfs_dir(void); extern int mce_ser; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 51f7c725dab..bc35a073d15 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -197,7 +197,7 @@ static int __init severities_debugfs_init(void) { struct dentry *dmce = NULL, *fseverities_coverage = NULL; - dmce = debugfs_create_dir("mce", NULL); + dmce = mce_get_debugfs_dir(); if (dmce == NULL) goto err_out; fseverities_coverage = debugfs_create_file("severities-coverage", @@ -209,8 +209,6 @@ static int __init severities_debugfs_init(void) return 0; err_out: - if (dmce) - debugfs_remove(dmce); return -ENOMEM; } late_initcall(severities_debugfs_init); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1ce6db1f878..9c7419e459d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -2003,3 +2004,15 @@ static int __init mcheck_disable(char *str) return 1; } __setup("nomce", mcheck_disable); + +#ifdef CONFIG_DEBUG_FS +struct dentry *mce_get_debugfs_dir(void) +{ + static struct dentry *dmce; + + if (!dmce) + dmce = debugfs_create_dir("mce", NULL); + + return dmce; +} +#endif -- cgit v1.2.3 From bf783f9f7d33576815bc89f9f1856a7309ea2f17 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:43 +0800 Subject: x86, mce: Fake panic support for MCE testing If "fake panic" mode is turned on, just log the panic message instead of actually panicking. This is used for testing only, so that the test suite can check for the correct panic message and do regression testing for cases where MCE would panic. This patch is based on x86-tip.git/mce. ChangeLog: v5: - Rebased on x86-tip.git/mce v4: - Move config file from sysfs to debugfs Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 75 ++++++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9c7419e459d..54bd1b2fb4c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -204,6 +204,9 @@ static void print_mce_tail(void) static atomic_t mce_paniced; +static int fake_panic; +static atomic_t mce_fake_paniced; + /* Panic in progress.
Enable interrupts and wait for final IPI */ static void wait_for_panic(void) { @@ -221,15 +224,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp) { int i; - /* - * Make sure only one CPU runs in machine check panic - */ - if (atomic_inc_return(&mce_paniced) > 1) - wait_for_panic(); - barrier(); + if (!fake_panic) { + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_inc_return(&mce_paniced) > 1) + wait_for_panic(); + barrier(); - bust_spinlocks(1); - console_verbose(); + bust_spinlocks(1); + console_verbose(); + } else { + /* Don't log too much for fake panic */ + if (atomic_inc_return(&mce_fake_paniced) > 1) + return; + } print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -256,9 +265,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp) print_mce_tail(); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); - if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; - panic(msg); + if (!fake_panic) { + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; + panic(msg); + } else + printk(KERN_EMERG "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -2015,4 +2027,45 @@ struct dentry *mce_get_debugfs_dir(void) return dmce; } + +static void mce_reset(void) +{ + cpu_missing = 0; + atomic_set(&mce_fake_paniced, 0); + atomic_set(&mce_executing, 0); + atomic_set(&mce_callin, 0); + atomic_set(&global_nwo, 0); +} + +static int fake_panic_get(void *data, u64 *val) +{ + *val = fake_panic; + return 0; +} + +static int fake_panic_set(void *data, u64 val) +{ + mce_reset(); + fake_panic = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, + fake_panic_set, "%llu\n"); + +static int __init mce_debugfs_init(void) +{ + struct dentry *dmce, *ffake_panic; + + dmce = mce_get_debugfs_dir(); + if (!dmce) + return -ENOMEM; + ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, + &fake_panic_fops); + if (!ffake_panic) + return -ENOMEM; + + return 0; +} +late_initcall(mce_debugfs_init); #endif -- cgit v1.2.3 From 81e2d7b30d718824434725a4a24d5864a71b1d30 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 12 Aug 2009 05:45:34 -0700 Subject: x86, intel_txt: tboot.c needs arch/x86/kernel/tboot.c needs . In most configurations that ends up getting implicitly included, but not in all, causing build failures in some configurations. Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: Joseph Cihula Cc: Shane Wang --- arch/x86/kernel/tboot.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 263591afd29..1ab80120894 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 00ae4064b1445524752575dd84df227c0687c99d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:49 +0900 Subject: percpu: rename 4k first chunk allocator to page Page size isn't always 4k depending on arch and configuration. Rename 4k first chunk allocator to page. 
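With the rename, a command line that used to select the fallback allocator explicitly changes accordingly (illustrative boot parameter usage; the parameter itself is documented in the kernel-parameters.txt hunk below):

	percpu_alloc=4k		(before)
	percpu_alloc=page	(after)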
Signed-off-by: Tejun Heo Cc: David Howells --- Documentation/kernel-parameters.txt | 2 +- arch/x86/kernel/setup_percpu.c | 23 ++++++++++++----------- include/linux/percpu.h | 2 +- mm/percpu.c | 25 ++++++++++++++----------- 4 files changed, 28 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7936b801fe6..12e9eb77ee0 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1920,7 +1920,7 @@ and is between 256 and 4096 characters. It is defined in the file See arch/parisc/kernel/pdc_chassis.c percpu_alloc= [X86] Select which percpu first chunk allocator to use. - Allowed values are one of "lpage", "embed" and "4k". + Allowed values are one of "lpage", "embed" and "page". See comments in arch/x86/kernel/setup_percpu.c for details on each allocator. This parameter is primarily for debugging and performance comparison. diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index a26ff61e2fb..1e17711c29d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -249,21 +249,22 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) } /* - * 4k allocator + * Page allocator * - * Boring fallback 4k allocator. This allocator puts more pressure on - * PTE TLBs but other than that behaves nicely on both UMA and NUMA. + * Boring fallback 4k page allocator. This allocator puts more + * pressure on PTE TLBs but other than that behaves nicely on both UMA + * and NUMA. */ -static void __init pcpu4k_populate_pte(unsigned long addr) +static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_4k(size_t static_size) +static ssize_t __init setup_pcpu_page(size_t static_size) { - return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - pcpu_fc_alloc, pcpu_fc_free, - pcpu4k_populate_pte); + return pcpu_page_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); } /* for explicit first chunk allocator selection */ @@ -307,7 +308,7 @@ void __init setup_per_cpu_areas(void) */ ret = -EINVAL; if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "4k")) { + if (strcmp(pcpu_chosen_alloc, "page")) { if (!strcmp(pcpu_chosen_alloc, "lpage")) ret = setup_pcpu_lpage(static_size, true); else if (!strcmp(pcpu_chosen_alloc, "embed")) @@ -317,7 +318,7 @@ void __init setup_per_cpu_areas(void) "specified\n", pcpu_chosen_alloc); if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " - "falling back to 4k\n", + "falling back to page size\n", pcpu_chosen_alloc, ret); } } else { @@ -326,7 +327,7 @@ void __init setup_per_cpu_areas(void) ret = setup_pcpu_embed(static_size, false); } if (ret < 0) - ret = setup_pcpu_4k(static_size); + ret = setup_pcpu_page(static_size); if (ret < 0) panic("cannot allocate static percpu area (%zu bytes, err=%zd)", static_size, ret); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e134c822963..7989f61b03f 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -74,7 +74,7 @@ extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); -extern ssize_t __init pcpu_4k_first_chunk( +extern ssize_t __init pcpu_page_first_chunk( size_t static_size, size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, diff --git a/mm/percpu.c b/mm/percpu.c index cbddcbdab68..6feac793490 
100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1497,15 +1497,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } /** - * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages + * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE * @free_fn: funtion to free percpu page, always called with PAGE_SIZE * @populate_pte_fn: function to populate pte * - * This is a helper to ease setting up embedded first percpu chunk and - * can be called where pcpu_setup_first_chunk() is expected. + * This is a helper to ease setting up page-remapped first percpu + * chunk and can be called where pcpu_setup_first_chunk() is expected. * * This is the basic allocator. Static percpu area is allocated * page-by-page into vmalloc area. @@ -1514,12 +1514,13 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn) +ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; + char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; @@ -1527,6 +1528,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, int i, j; ssize_t ret; + snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); + unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, PCPU_MIN_UNIT_SIZE)); @@ -1542,8 +1545,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, ptr = alloc_fn(cpu, PAGE_SIZE); if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); + pr_warning("PERCPU: failed to allocate %s page " + "for cpu%u\n", psize_str, cpu); goto enomem; } pages[j++] = virt_to_page(ptr); @@ -1580,8 +1583,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, } /* we're ready, commit */ - pr_info("PERCPU: %d 4k pages/cpu @%p s%zu r%zu\n", - unit_pages, vm.addr, static_size, reserved_size); + pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu\n", + unit_pages, psize_str, vm.addr, static_size, reserved_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, unit_pages << PAGE_SHIFT, vm.addr, NULL); -- cgit v1.2.3 From 08fc45806103e59a37418e84719b878f9bb32540 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:49 +0900 Subject: percpu: build first chunk allocators selectively There's no need to build unused first chunk allocators in. Define CONFIG_NEED_PER_CPU_*_FIRST_CHUNK and let archs enable them selectively. 
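For example, an arch whose setup_per_cpu_areas() can use the embed and page allocators would enable just those from its Kconfig, along the lines of the x86 hunk below (a sketch; which symbols an arch actually selects depends on its first chunk setup):

config NEED_PER_CPU_EMBED_FIRST_CHUNK
	def_bool y

config NEED_PER_CPU_PAGE_FIRST_CHUNK
	def_bool y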
Signed-off-by: Tejun Heo --- arch/x86/Kconfig | 10 ++++++++++ include/linux/percpu.h | 27 +++++---------------------- mm/percpu.c | 19 +++++++++++-------- 3 files changed, 26 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e06b2eeff9f..f7ac2721551 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -150,6 +150,16 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y +config NEED_PER_CPU_EMBED_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_PAGE_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_LPAGE_FIRST_CHUNK + def_bool y + depends on NEED_MULTIPLE_NODES + config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 7989f61b03f..e26788e0da4 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -70,17 +70,21 @@ extern size_t __init pcpu_setup_first_chunk( ssize_t dyn_size, size_t unit_size, void *base_addr, const int *unit_map); +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern ssize_t __init pcpu_page_first_chunk( size_t static_size, size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); +#endif -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern int __init pcpu_lpage_build_unit_map( size_t static_size, size_t reserved_size, ssize_t *dyn_sizep, size_t *unit_sizep, @@ -98,27 +102,6 @@ extern ssize_t __init pcpu_lpage_first_chunk( extern void *pcpu_lpage_remapped(void *kaddr); #else -static inline int pcpu_lpage_build_unit_map( - size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) -{ - return -EINVAL; -} - -static inline ssize_t __init pcpu_lpage_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) -{ - return -EINVAL; -} - static inline void *pcpu_lpage_remapped(void *kaddr) { return NULL; diff --git a/mm/percpu.c b/mm/percpu.c index 6feac793490..7971997de31 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1414,8 +1414,9 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, return pcpu_unit_size; } -static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep) +static inline size_t pcpu_calc_fc_sizes(size_t static_size, + size_t reserved_size, + ssize_t *dyn_sizep) { size_t size_sum; @@ -1427,6 +1428,8 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, return size_sum; } +#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ + !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @static_size: the size of static percpu area in bytes @@ -1495,7 +1498,10 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, unit_size, base, NULL); } +#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || + !CONFIG_HAVE_SETUP_PER_CPU_AREA */ +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * 
@static_size: the size of static percpu area in bytes @@ -1598,12 +1604,9 @@ out_free_ar: free_bootmem(__pa(pages), pages_size); return ret; } +#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ -/* - * Large page remapping first chunk setup helper - */ -#ifdef CONFIG_NEED_MULTIPLE_NODES - +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** * pcpu_lpage_build_unit_map - build unit_map for large page remapping * @static_size: the size of static percpu area in bytes @@ -1982,7 +1985,7 @@ void *pcpu_lpage_remapped(void *kaddr) return NULL; } -#endif +#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */ /* * Generic percpu area setup. -- cgit v1.2.3 From f58dc01ba2ca9fe3ab2ba4ca43d9c8a735cf62d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: percpu: generalize first chunk allocator selection Now that all first chunk allocators are in mm/percpu.c, it makes sense to generalize the percpu_alloc kernel parameter. Define PCPU_FC_* and set pcpu_chosen_fc using early_param() in mm/percpu.c. Arch code can use the set value to determine which first chunk allocator to use. Signed-off-by: Tejun Heo --- Documentation/kernel-parameters.txt | 11 ++++++----- arch/x86/kernel/setup_percpu.c | 24 ++++++------------------ include/linux/percpu.h | 12 ++++++++++++ mm/percpu.c | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 12e9eb77ee0..dee9ce2e6cf 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1919,11 +1919,12 @@ and is between 256 and 4096 characters. It is defined in the file Format: { 0 | 1 } See arch/parisc/kernel/pdc_chassis.c - percpu_alloc= [X86] Select which percpu first chunk allocator to use. - Allowed values are one of "lpage", "embed" and "page". - See comments in arch/x86/kernel/setup_percpu.c for - details on each allocator. This parameter is primarily - for debugging and performance comparison. + percpu_alloc= Select which percpu first chunk allocator to use. + Currently supported values are "embed", "page" and + "lpage". Archs may support subset or none of the + selections. See comments in mm/percpu.c for details + on each allocator. This parameter is primarily for + debugging and performance comparison. pf. [PARIDE] See Documentation/blockdev/paride.txt. diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 1e17711c29d..b961d99e641 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -267,16 +267,6 @@ static ssize_t __init setup_pcpu_page(size_t static_size) pcpup_populate_pte); } -/* for explicit first chunk allocator selection */ -static char pcpu_chosen_alloc[16] __initdata; - -static int __init percpu_alloc_setup(char *str) -{ - strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); - return 0; -} -early_param("percpu_alloc", percpu_alloc_setup); - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -307,19 +297,17 @@ void __init setup_per_cpu_areas(void) * each allocator for details.
*/ ret = -EINVAL; - if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "page")) { - if (!strcmp(pcpu_chosen_alloc, "lpage")) + if (pcpu_chosen_fc != PCPU_FC_AUTO) { + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + if (pcpu_chosen_fc == PCPU_FC_LPAGE) ret = setup_pcpu_lpage(static_size, true); - else if (!strcmp(pcpu_chosen_alloc, "embed")) - ret = setup_pcpu_embed(static_size, true); else - pr_warning("PERCPU: unknown allocator %s " - "specified\n", pcpu_chosen_alloc); + ret = setup_pcpu_embed(static_size, true); + if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " "falling back to page size\n", - pcpu_chosen_alloc, ret); + pcpu_fc_names[pcpu_chosen_fc], ret); } } else { ret = setup_pcpu_lpage(static_size, false); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e26788e0da4..9be05cbe5ee 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,6 +59,18 @@ extern void *pcpu_base_addr; extern const int *pcpu_unit_map; +enum pcpu_fc { + PCPU_FC_AUTO, + PCPU_FC_EMBED, + PCPU_FC_PAGE, + PCPU_FC_LPAGE, + + PCPU_FC_NR, +}; +extern const char *pcpu_fc_names[PCPU_FC_NR]; + +extern enum pcpu_fc pcpu_chosen_fc; + typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); diff --git a/mm/percpu.c b/mm/percpu.c index 7971997de31..7fb40bb1555 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1414,6 +1414,38 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, return pcpu_unit_size; } +const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { + [PCPU_FC_AUTO] = "auto", + [PCPU_FC_EMBED] = "embed", + [PCPU_FC_PAGE] = "page", + [PCPU_FC_LPAGE] = "lpage", +}; + +enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; + +static int __init percpu_alloc_setup(char *str) +{ + if (0) + /* nada */; +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK + else if (!strcmp(str, "embed")) + pcpu_chosen_fc = PCPU_FC_EMBED; +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + else if (!strcmp(str, "page")) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK + else if (!strcmp(str, "lpage")) + pcpu_chosen_fc = PCPU_FC_LPAGE; +#endif + else + pr_warning("PERCPU: unknown allocator %s specified\n", str); + + return 0; +} +early_param("percpu_alloc", percpu_alloc_setup); + static inline size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, ssize_t *dyn_sizep) -- cgit v1.2.3 From 9a7737691e90d3cce0e5248f91826c50e5aa3fcf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: percpu: drop @static_size from first chunk allocators First chunk allocators assume percpu areas have been linked using one of PERCPU_*() macros and depend on __per_cpu_load symbol defined by those macros, so there isn't much point in passing in static area size explicitly when it can be easily calculated from __per_cpu_start and __per_cpu_end. Drop @static_size from all percpu first chunk allocators and helpers. 
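Each allocator now derives the value internally instead, as in the hunks below:

	const size_t static_size = __per_cpu_end - __per_cpu_start;

so arch callers such as setup_pcpu_embed() no longer need to compute it or pass it around.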
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 34 +++++++++++++++------------------- include/linux/percpu.h | 18 ++++++++---------- mm/percpu.c | 29 +++++++++++++---------------- 3 files changed, 36 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index b961d99e641..8aad486c688 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -157,7 +157,7 @@ static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; @@ -184,8 +184,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) return -ENOMEM; } - ret = pcpu_lpage_build_unit_map(static_size, - PERCPU_FIRST_CHUNK_RESERVE, + ret = pcpu_lpage_build_unit_map(PERCPU_FIRST_CHUNK_RESERVE, &dyn_size, &unit_size, PMD_SIZE, unit_map, pcpu_lpage_cpu_distance); if (ret < 0) { @@ -208,9 +207,8 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) } } - ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - dyn_size, unit_size, PMD_SIZE, - unit_map, nr_units, + ret = pcpu_lpage_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + unit_size, PMD_SIZE, unit_map, nr_units, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); out_free: if (ret < 0) @@ -218,7 +216,7 @@ out_free: return ret; } #else -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_lpage(bool chosen) { return -EINVAL; } @@ -232,7 +230,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) * mapping so that it can use PMD mapping without additional TLB * pressure. 
*/ -static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_embed(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -244,7 +242,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) if (!chosen && (!cpu_has_pse || pcpu_need_numa())) return -EINVAL; - return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, reserve - PERCPU_FIRST_CHUNK_RESERVE); } @@ -260,9 +258,9 @@ static void __init pcpup_populate_pte(unsigned long addr) populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_page(size_t static_size) +static ssize_t __init setup_pcpu_page(void) { - return pcpu_page_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, pcpu_fc_alloc, pcpu_fc_free, pcpup_populate_pte); } @@ -282,7 +280,6 @@ static inline void setup_percpu_segment(int cpu) void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; unsigned int cpu; unsigned long delta; size_t pcpu_unit_size; @@ -300,9 +297,9 @@ void __init setup_per_cpu_areas(void) if (pcpu_chosen_fc != PCPU_FC_AUTO) { if (pcpu_chosen_fc != PCPU_FC_PAGE) { if (pcpu_chosen_fc == PCPU_FC_LPAGE) - ret = setup_pcpu_lpage(static_size, true); + ret = setup_pcpu_lpage(true); else - ret = setup_pcpu_embed(static_size, true); + ret = setup_pcpu_embed(true); if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " @@ -310,15 +307,14 @@ void __init setup_per_cpu_areas(void) pcpu_fc_names[pcpu_chosen_fc], ret); } } else { - ret = setup_pcpu_lpage(static_size, false); + ret = setup_pcpu_lpage(false); if (ret < 0) - ret = setup_pcpu_embed(static_size, false); + ret = setup_pcpu_embed(false); } if (ret < 0) - ret = setup_pcpu_page(static_size); + ret = setup_pcpu_page(); if (ret < 0) - panic("cannot allocate static percpu area (%zu bytes, err=%zd)", - static_size, ret); + panic("cannot initialize percpu area (err=%zd)", ret); pcpu_unit_size = ret; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 9be05cbe5ee..be2fc8fb9b6 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -84,13 +84,12 @@ extern size_t __init pcpu_setup_first_chunk( #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( - size_t static_size, size_t reserved_size, - ssize_t dyn_size); + size_t reserved_size, ssize_t dyn_size); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern ssize_t __init pcpu_page_first_chunk( - size_t static_size, size_t reserved_size, + size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); @@ -98,16 +97,15 @@ extern ssize_t __init pcpu_page_first_chunk( #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern int __init pcpu_lpage_build_unit_map( - size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, + size_t reserved_size, ssize_t *dyn_sizep, + size_t *unit_sizep, size_t lpage_size, + int *unit_map, pcpu_fc_cpu_distance_fn_t cpu_distance_fn); extern ssize_t __init pcpu_lpage_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, + size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, 
pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 7fb40bb1555..e2ac58a39bb 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1464,7 +1464,6 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @@ -1489,9 +1488,9 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size) +ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { + const size_t static_size = __per_cpu_end - __per_cpu_start; size_t size_sum, unit_size, chunk_size; void *base; unsigned int cpu; @@ -1536,7 +1535,6 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE * @free_fn: funtion to free percpu page, always called with PAGE_SIZE @@ -1552,12 +1550,13 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size, +ssize_t __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; + const size_t static_size = __per_cpu_end - __per_cpu_start; char psize_str[16]; int unit_pages; size_t pages_size; @@ -1641,7 +1640,6 @@ out_free_ar: #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** * pcpu_lpage_build_unit_map - build unit_map for large page remapping - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_sizep: in/out parameter for dynamic size, -1 for auto * @unit_sizep: out parameter for unit size @@ -1661,13 +1659,14 @@ out_free_ar: * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and * returns the number of units to be allocated. -errno on failure. 
*/ -int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, +int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, + size_t *unit_sizep, size_t lpage_size, + int *unit_map, pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; int group_cnt_max = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ @@ -1819,7 +1818,6 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes * @unit_size: unit size in bytes @@ -1850,15 +1848,15 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, +ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; + const size_t static_size = __per_cpu_end - __per_cpu_start; size_t chunk_size = unit_size * nr_units; size_t map_size; unsigned int cpu; @@ -2037,7 +2035,6 @@ EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; ssize_t unit_size; unsigned long delta; unsigned int cpu; @@ -2046,7 +2043,7 @@ void __init setup_per_cpu_areas(void) * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ - unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, + unit_size = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE); if (unit_size < 0) panic("Failed to initialized percpu areas."); -- cgit v1.2.3 From 3cbc85652767c38b252c8de55f9fd180b29e4c0d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: percpu: add @align to pcpu_fc_alloc_fn_t pcpu_fc_alloc_fn_t is about to see more interesting usage, add @align parameter. 
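Concretely, the callback type changes from

	typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);

to

	typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size,
					     size_t align);

and the existing call sites pass their previously implicit alignment (PAGE_SIZE or lpage_size) explicitly, as the hunks below show.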
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 4 ++-- include/linux/percpu.h | 3 ++- mm/percpu.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8aad486c688..660cde13314 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -126,9 +126,9 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, /* * Helpers for first chunk memory allocation */ -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return pcpu_alloc_bootmem(cpu, size, size); + return pcpu_alloc_bootmem(cpu, size, align); } static void __init pcpu_fc_free(void *ptr, size_t size) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 0cfdd14d096..d385dbcf190 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -71,7 +71,8 @@ extern const char *pcpu_fc_names[PCPU_FC_NR]; extern enum pcpu_fc pcpu_chosen_fc; -typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, + size_t align); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); diff --git a/mm/percpu.c b/mm/percpu.c index 287f59cc5fb..3316e3aac7e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1578,7 +1578,7 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, for (i = 0; i < unit_pages; i++) { void *ptr; - ptr = alloc_fn(cpu, PAGE_SIZE); + ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { pr_warning("PERCPU: failed to allocate %s page " "for cpu%u\n", psize_str, cpu); @@ -1888,7 +1888,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, goto found; continue; found: - ptr = alloc_fn(cpu, lpage_size); + ptr = alloc_fn(cpu, lpage_size, lpage_size); if (!ptr) { pr_warning("PERCPU: failed to allocate large page " "for cpu%u\n", cpu); -- cgit v1.2.3 From fd1e8a1fe2b54df6c185b4fa65f181f50b9c4d4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:51 +0900 Subject: percpu: introduce pcpu_alloc_info and pcpu_group_info Till now, the non-linear cpu->unit map was expressed using an integer array which maps each cpu to a unit and was used only by the lpage allocator. Although how many units have been placed in a single contiguous area (group) is known while building the unit_map, the information is lost when the result is recorded into the unit_map array. For the lpage allocator, as all allocations are done by lpages and whether two adjacent lpages are in the same group or not is irrelevant, this didn't cause any problem. Non-linear cpu->unit mapping will be used for sparse embedding, and this grouping information is necessary for that. This patch introduces pcpu_alloc_info, which contains all the information necessary for initializing the percpu allocator. pcpu_alloc_info contains an array of pcpu_group_info which describes how units are grouped and mapped to cpus. pcpu_group_info also has a base_offset field to specify its offset from the chunk's base address. pcpu_build_alloc_info() initializes this field as if all groups are allocated back-to-back, as is currently done, but it will be used to sparsely place groups.
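In outline, a first chunk setup built on the new interfaces looks like this (a sketch using only functions introduced or updated by this patch; the variable names and error handling are illustrative):

	struct pcpu_alloc_info *ai;
	size_t unit_size;

	/* group cpus by distance and size the units */
	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
				   cpu_distance_fn);
	if (IS_ERR(ai))
		return PTR_ERR(ai);

	/* ... allocate and map each group's units at base_addr ... */

	/* hand the described, mapped first chunk to the allocator */
	unit_size = pcpu_setup_first_chunk(ai, base_addr);
	pcpu_free_alloc_info(ai);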
pcpu_alloc_info is a rather complex data structure which contains a flexible array which in turn points to nested cpu_map arrays. * pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to help dealing with pcpu_alloc_info. * pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info, generalized and renamed to pcpu_build_alloc_info(). @cpu_distance_fn may be NULL indicating that all cpus are of LOCAL_DISTANCE. * pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info, generalized and renamed to pcpu_dump_alloc_info(). It now also prints which group each alloc unit belongs to. * pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the separate parameters. All first chunk allocators are updated to use pcpu_build_alloc_info() to build alloc_info and call pcpu_setup_first_chunk() with it. This has the side effect of packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4. * x86 setup_pcpu_lpage() is updated to deal with alloc_info. * sparc64 setup_per_cpu_areas() is updated to build alloc_info. Although the changes made by this patch are pretty pervasive, it doesn't cause any behavior difference other than packing of sparse cpus. It mostly changes how information is passed among initialization functions and makes room for more flexibility. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: David Miller --- arch/sparc/kernel/smp_64.c | 24 +- arch/x86/kernel/setup_percpu.c | 38 ++- include/linux/percpu.h | 42 +++- mm/percpu.c | 529 +++++++++++++++++++++++++---------------- 4 files changed, 389 insertions(+), 244 deletions(-) (limited to 'arch/x86') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 9856d866b77..a42a4a744d1 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1475,17 +1475,29 @@ static void __init pcpu_map_range(unsigned long start, unsigned long end, void __init setup_per_cpu_areas(void) { - size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start; static struct vm_struct vm; + struct pcpu_alloc_info *ai; unsigned long delta, cpu; size_t size_sum, pcpu_unit_size; size_t ptrs_size; void **ptrs; - size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + ai = pcpu_alloc_alloc_info(1, nr_cpu_ids); + + ai->static_size = __per_cpu_end - __per_cpu_start; + ai->reserved_size = PERCPU_MODULE_RESERVE; + + size_sum = PFN_ALIGN(ai->static_size + ai->reserved_size + PERCPU_DYNAMIC_RESERVE); - dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE; + ai->dyn_size = size_sum - ai->static_size - ai->reserved_size; + ai->unit_size = PCPU_CHUNK_SIZE; + ai->atom_size = PCPU_CHUNK_SIZE; + ai->alloc_size = PCPU_CHUNK_SIZE; + ai->groups[0].nr_units = nr_cpu_ids; + + for_each_possible_cpu(cpu) + ai->groups[0].cpu_map[cpu] = cpu; ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0])); ptrs = alloc_bootmem(ptrs_size); @@ -1497,7 +1509,7 @@ void __init setup_per_cpu_areas(void) free_bootmem(__pa(ptrs[cpu] + size_sum), PCPU_CHUNK_SIZE - size_sum); - memcpy(ptrs[cpu], __per_cpu_load, static_size); + memcpy(ptrs[cpu], __per_cpu_load, ai->static_size); } /* allocate address and map */ @@ -1514,9 +1526,7 @@ void __init setup_per_cpu_areas(void) pcpu_map_range(start, end, virt_to_page(ptrs[cpu])); } - pcpu_unit_size = pcpu_setup_first_chunk(static_size, - PERCPU_MODULE_RESERVE, dyn_size, - PCPU_CHUNK_SIZE, vm.addr, NULL); + pcpu_unit_size = pcpu_setup_first_chunk(ai, vm.addr); free_bootmem(__pa(ptrs), ptrs_size); diff --git 
a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 660cde13314..db5f9c49fec 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -161,9 +161,7 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; - size_t unit_map_size, unit_size; - int *unit_map; - int nr_units; + struct pcpu_alloc_info *ai; ssize_t ret; /* on non-NUMA, embedding is better */ @@ -177,26 +175,22 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) } /* allocate and build unit_map */ - unit_map_size = nr_cpu_ids * sizeof(int); - unit_map = alloc_bootmem_nopanic(unit_map_size); - if (!unit_map) { - pr_warning("PERCPU: failed to allocate unit_map\n"); - return -ENOMEM; + ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + PMD_SIZE, pcpu_lpage_cpu_distance); + if (IS_ERR(ai)) { + pr_warning("PERCPU: failed to build unit_map (%ld)\n", + PTR_ERR(ai)); + return PTR_ERR(ai); } - ret = pcpu_lpage_build_unit_map(PERCPU_FIRST_CHUNK_RESERVE, - &dyn_size, &unit_size, PMD_SIZE, - unit_map, pcpu_lpage_cpu_distance); - if (ret < 0) { - pr_warning("PERCPU: failed to build unit_map\n"); - goto out_free; - } - nr_units = ret; - /* do the parameters look okay? */ if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = nr_units * unit_size; + size_t tot_size = 0; + int group; + + for (group = 0; group < ai->nr_groups; group++) + tot_size += ai->unit_size * ai->groups[group].nr_units; /* don't consume more than 20% of vmalloc area */ if (tot_size > vm_size / 5) { @@ -207,12 +201,10 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) } } - ret = pcpu_lpage_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - unit_size, PMD_SIZE, unit_map, nr_units, - pcpu_fc_alloc, pcpu_fc_free, pcpul_map); + ret = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, + pcpul_map); out_free: - if (ret < 0) - free_bootmem(__pa(unit_map), unit_map_size); + pcpu_free_alloc_info(ai); return ret; } #else diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 570fb18de2b..77b86be8ce4 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,6 +59,25 @@ extern void *pcpu_base_addr; extern const int *pcpu_unit_map; +struct pcpu_group_info { + int nr_units; /* aligned # of units */ + unsigned long base_offset; /* base address offset */ + unsigned int *cpu_map; /* unit->cpu map, empty + * entries contain NR_CPUS */ +}; + +struct pcpu_alloc_info { + size_t static_size; + size_t reserved_size; + size_t dyn_size; + size_t unit_size; + size_t atom_size; + size_t alloc_size; + size_t __ai_size; /* internal, don't use */ + int nr_groups; /* 0 if grouping unnecessary */ + struct pcpu_group_info groups[]; +}; + enum pcpu_fc { PCPU_FC_AUTO, PCPU_FC_EMBED, @@ -78,18 +97,17 @@ typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern int __init pcpu_lpage_build_unit_map( - size_t reserved_size, ssize_t *dyn_sizep, - size_t *unit_sizep, size_t lpage_size, - int *unit_map, +extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units); +extern void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai); + +extern struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, ssize_t dyn_size, 
+ size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn); -#endif -extern size_t __init pcpu_setup_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - void *base_addr, const int *unit_map); +extern size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( @@ -106,9 +124,7 @@ extern ssize_t __init pcpu_page_first_chunk( #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern ssize_t __init pcpu_lpage_first_chunk( - size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units, + const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 2b9c4b2a2fc..99f7fa68272 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -58,6 +58,7 @@ #include #include +#include #include #include #include @@ -1245,53 +1246,108 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, return size_sum; } -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** - * pcpu_lpage_build_unit_map - build unit_map for large page remapping + * pcpu_alloc_alloc_info - allocate percpu allocation info + * @nr_groups: the number of groups + * @nr_units: the number of units + * + * Allocate ai which is large enough for @nr_groups groups containing + * @nr_units units. The returned ai's groups[0].cpu_map points to the + * cpu_map array which is long enough for @nr_units and filled with + * NR_CPUS. It's the caller's responsibility to initialize cpu_map + * pointer of other groups. + * + * RETURNS: + * Pointer to the allocated pcpu_alloc_info on success, NULL on + * failure. + */ +struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units) +{ + struct pcpu_alloc_info *ai; + size_t base_size, ai_size; + void *ptr; + int unit; + + base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]), + __alignof__(ai->groups[0].cpu_map[0])); + ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); + + ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); + if (!ptr) + return NULL; + ai = ptr; + ptr += base_size; + + ai->groups[0].cpu_map = ptr; + + for (unit = 0; unit < nr_units; unit++) + ai->groups[0].cpu_map[unit] = NR_CPUS; + + ai->nr_groups = nr_groups; + ai->__ai_size = PFN_ALIGN(ai_size); + + return ai; +} + +/** + * pcpu_free_alloc_info - free percpu allocation info + * @ai: pcpu_alloc_info to free + * + * Free @ai which was allocated by pcpu_alloc_alloc_info(). + */ +void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) +{ + free_bootmem(__pa(ai), ai->__ai_size); +} + +/** + * pcpu_build_alloc_info - build alloc_info considering distances between CPUs * @reserved_size: the size of reserved percpu area in bytes - * @dyn_sizep: in/out parameter for dynamic size, -1 for auto - * @unit_sizep: out parameter for unit size - * @unit_map: unit_map to be filled - * @cpu_distance_fn: callback to determine distance between cpus + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional * - * This function builds cpu -> unit map and determine other parameters - * considering needed percpu size, large page size and distances - * between CPUs in NUMA. 
+ * This function determines grouping of units, their mappings to cpus + * and other parameters considering needed percpu size, allocation + * atom size and distances between CPUs. * - * CPUs which are of LOCAL_DISTANCE both ways are grouped together and - * may share units in the same large page. The returned configuration - * is guaranteed to have CPUs on different nodes on different large - * pages and >=75% usage of allocated virtual address space. + * Groups are always mutliples of atom size and CPUs which are of + * LOCAL_DISTANCE both ways are grouped together and share space for + * units in the same group. The returned configuration is guaranteed + * to have CPUs on different nodes on different groups and >=75% usage + * of allocated virtual address space. * * RETURNS: - * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and - * returns the number of units to be allocated. -errno on failure. + * On success, pointer to the new allocation_info is returned. On + * failure, ERR_PTR value is returned. */ -int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, - size_t *unit_sizep, size_t lpage_size, - int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; const size_t static_size = __per_cpu_end - __per_cpu_start; - int group_cnt_max = 0; + int group_cnt_max = 0, nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ - int last_allocs; + int last_allocs, group, unit; unsigned int cpu, tcpu; - int group, unit; + struct pcpu_alloc_info *ai; + unsigned int *cpu_map; /* * Determine min_unit_size, alloc_size and max_upa such that - * alloc_size is multiple of lpage_size and is the smallest + * alloc_size is multiple of atom_size and is the smallest * which can accomodate 4k aligned segments which are equal to * or larger than min_unit_size. 
*/ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - alloc_size = roundup(min_unit_size, lpage_size); + alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) upa--; @@ -1304,10 +1360,11 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, for_each_possible_cpu(tcpu) { if (cpu == tcpu) break; - if (group_map[tcpu] == group && + if (group_map[tcpu] == group && cpu_distance_fn && (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { group++; + nr_groups = max(nr_groups, group + 1); goto next_group; } } @@ -1328,7 +1385,7 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) continue; - for (group = 0; group_cnt[group]; group++) { + for (group = 0; group < nr_groups; group++) { int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); allocs += this_allocs; wasted += this_allocs * upa - group_cnt[group]; @@ -1348,75 +1405,122 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, last_allocs = allocs; best_upa = upa; } - *unit_sizep = alloc_size / best_upa; + upa = best_upa; + + /* allocate and fill alloc_info */ + for (group = 0; group < nr_groups; group++) + nr_units += roundup(group_cnt[group], upa); + + ai = pcpu_alloc_alloc_info(nr_groups, nr_units); + if (!ai) + return ERR_PTR(-ENOMEM); + cpu_map = ai->groups[0].cpu_map; + + for (group = 0; group < nr_groups; group++) { + ai->groups[group].cpu_map = cpu_map; + cpu_map += roundup(group_cnt[group], upa); + } + + ai->static_size = static_size; + ai->reserved_size = reserved_size; + ai->dyn_size = dyn_size; + ai->unit_size = alloc_size / upa; + ai->atom_size = atom_size; + ai->alloc_size = alloc_size; + + for (group = 0, unit = 0; group_cnt[group]; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + + /* + * Initialize base_offset as if all groups are located + * back-to-back. The caller should update this to + * reflect actual allocation. + */ + gi->base_offset = unit * ai->unit_size; - /* assign units to cpus accordingly */ - unit = 0; - for (group = 0; group_cnt[group]; group++) { for_each_possible_cpu(cpu) if (group_map[cpu] == group) - unit_map[cpu] = unit++; - unit = roundup(unit, best_upa); + gi->cpu_map[gi->nr_units++] = cpu; + gi->nr_units = roundup(gi->nr_units, upa); + unit += gi->nr_units; } + BUG_ON(unit != nr_units); - return unit; /* unit contains aligned number of units */ + return ai; } -static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, - unsigned int *cpup); - -static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, - size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units) +/** + * pcpu_dump_alloc_info - print out information about pcpu_alloc_info + * @lvl: loglevel + * @ai: allocation info to dump + * + * Print out information about @ai using loglevel @lvl. 
+ */ +static void pcpu_dump_alloc_info(const char *lvl, + const struct pcpu_alloc_info *ai) { - int width = 1, v = nr_units; + int group_width = 1, cpu_width = 1, width; char empty_str[] = "--------"; - int upl, lpl; /* units per lpage, lpage per line */ - unsigned int cpu; - int lpage, unit; + int alloc = 0, alloc_end = 0; + int group, v; + int upa, apl; /* units per alloc, allocs per line */ + + v = ai->nr_groups; + while (v /= 10) + group_width++; + v = num_possible_cpus(); while (v /= 10) - width++; - empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; + cpu_width++; + empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; - upl = max_t(int, lpage_size / unit_size, 1); - lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); + upa = ai->alloc_size / ai->unit_size; + width = upa * (cpu_width + 1) + group_width + 3; + apl = rounddown_pow_of_two(max(60 / width, 1)); - printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, - static_size, reserved_size, dyn_size, unit_size, lpage_size); + printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", + lvl, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); - for (lpage = 0, unit = 0; unit < nr_units; unit++) { - if (!(unit % upl)) { - if (!(lpage++ % lpl)) { + for (group = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + int unit = 0, unit_end = 0; + + BUG_ON(gi->nr_units % upa); + for (alloc_end += gi->nr_units / upa; + alloc < alloc_end; alloc++) { + if (!(alloc % apl)) { printk("\n"); - printk("%spcpu-lpage: ", lvl); - } else - printk("| "); + printk("%spcpu-alloc: ", lvl); + } + printk("[%0*d] ", group_width, group); + + for (unit_end += upa; unit < unit_end; unit++) + if (gi->cpu_map[unit] != NR_CPUS) + printk("%0*d ", cpu_width, + gi->cpu_map[unit]); + else + printk("%s ", empty_str); } - if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) - printk("%0*d ", width, cpu); - else - printk("%s ", empty_str); } printk("\n"); } -#endif /** * pcpu_setup_first_chunk - initialize the first percpu chunk - * @static_size: the size of static percpu area in bytes - * @reserved_size: the size of reserved percpu area in bytes, 0 for none - * @dyn_size: free size for dynamic allocation in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE + * @ai: pcpu_alloc_info describing how to percpu area is shaped * @base_addr: mapped address - * @unit_map: cpu -> unit map, NULL for sequential mapping * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area * setup path. * - * @reserved_size, if non-zero, specifies the amount of bytes to + * @ai contains all information necessary to initialize the first + * chunk and prime the dynamic percpu allocator. + * + * @ai->static_size is the size of static percpu area. + * + * @ai->reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved * percpu allocation. This is primarily used to serve module percpu @@ -1424,13 +1528,26 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * - * @dyn_size determines the number of bytes available for dynamic - * allocation in the first chunk. 
The area between @static_size + - * @reserved_size + @dyn_size and @unit_size is unused. + * @ai->dyn_size determines the number of bytes available for dynamic + * allocation in the first chunk. The area between @ai->static_size + + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. * - * @unit_size specifies unit size and must be aligned to PAGE_SIZE and - * equal to or larger than @static_size + @reserved_size + if - * non-negative, @dyn_size. + * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE + * and equal to or larger than @ai->static_size + @ai->reserved_size + + * @ai->dyn_size. + * + * @ai->atom_size is the allocation atom size and used as alignment + * for vm areas. + * + * @ai->alloc_size is the allocation size and always multiple of + * @ai->atom_size. This is larger than @ai->atom_size if + * @ai->unit_size is larger than @ai->atom_size. + * + * @ai->nr_groups and @ai->groups describe virtual memory layout of + * percpu areas. Units which should be colocated are put into the + * same group. Dynamic VM areas will be allocated according to these + * groupings. If @ai->nr_groups is zero, a single group containing + * all units is assumed. * * The caller should have mapped the first chunk at @base_addr and * copied static data to each unit. @@ -1446,70 +1563,63 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access. */ -size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - void *base_addr, const int *unit_map) +size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { static struct vm_struct first_vm; static int smap[2], dmap[2]; - size_t size_sum = static_size + reserved_size + dyn_size; + size_t dyn_size = ai->dyn_size; + size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; - unsigned int cpu, tcpu; - int i; + unsigned int cpu; + int *unit_map; + int group, unit, i; /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); - BUG_ON(!static_size); + BUG_ON(ai->nr_groups <= 0); + BUG_ON(!ai->static_size); BUG_ON(!base_addr); - BUG_ON(unit_size < size_sum); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + BUG_ON(ai->unit_size < size_sum); + BUG_ON(ai->unit_size & ~PAGE_MASK); + BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + + pcpu_dump_alloc_info(KERN_DEBUG, ai); /* determine number of units and verify and initialize pcpu_unit_map */ - if (unit_map) { - int first_unit = INT_MAX, last_unit = INT_MIN; - - for_each_possible_cpu(cpu) { - int unit = unit_map[cpu]; - - BUG_ON(unit < 0); - for_each_possible_cpu(tcpu) { - if (tcpu == cpu) - break; - /* the mapping should be one-to-one */ - BUG_ON(unit_map[tcpu] == unit); - } + unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); - if (unit < first_unit) { - pcpu_first_unit_cpu = cpu; - first_unit = unit; - } - if (unit > last_unit) { - pcpu_last_unit_cpu = cpu; - last_unit = unit; - } - } - pcpu_nr_units = last_unit + 1; - pcpu_unit_map = unit_map; - } else { - int *identity_map; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + unit_map[cpu] = NR_CPUS; + pcpu_first_unit_cpu = NR_CPUS; - /* #units == #cpus, identity mapped */ - identity_map = alloc_bootmem(nr_cpu_ids * - sizeof(identity_map[0])); + for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) 
{ + const struct pcpu_group_info *gi = &ai->groups[group]; - for_each_possible_cpu(cpu) - identity_map[cpu] = cpu; + for (i = 0; i < gi->nr_units; i++) { + cpu = gi->cpu_map[i]; + if (cpu == NR_CPUS) + continue; - pcpu_first_unit_cpu = 0; - pcpu_last_unit_cpu = pcpu_nr_units - 1; - pcpu_nr_units = nr_cpu_ids; - pcpu_unit_map = identity_map; + BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu)); + BUG_ON(unit_map[cpu] != NR_CPUS); + + unit_map[cpu] = unit + i; + if (pcpu_first_unit_cpu == NR_CPUS) + pcpu_first_unit_cpu = cpu; + } } + pcpu_last_unit_cpu = cpu; + pcpu_nr_units = unit; + + for_each_possible_cpu(cpu) + BUG_ON(unit_map[cpu] == NR_CPUS); + + pcpu_unit_map = unit_map; /* determine basic parameters */ - pcpu_unit_pages = unit_size >> PAGE_SHIFT; + pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + @@ -1543,17 +1653,17 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, schunk->immutable = true; bitmap_fill(schunk->populated, pcpu_unit_pages); - if (reserved_size) { - schunk->free_size = reserved_size; + if (ai->reserved_size) { + schunk->free_size = ai->reserved_size; pcpu_reserved_chunk = schunk; - pcpu_reserved_chunk_limit = static_size + reserved_size; + pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ } schunk->contig_hint = schunk->free_size; - schunk->map[schunk->map_used++] = -static_size; + schunk->map[schunk->map_used++] = -ai->static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; @@ -1643,44 +1753,47 @@ early_param("percpu_alloc", percpu_alloc_setup); */ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { - const size_t static_size = __per_cpu_end - __per_cpu_start; - size_t size_sum, unit_size, chunk_size; + struct pcpu_alloc_info *ai; + size_t size_sum, chunk_size; void *base; - unsigned int cpu; + int unit; + ssize_t ret; - /* determine parameters and allocate */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); - unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - chunk_size = unit_size * nr_cpu_ids; + size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + chunk_size = ai->unit_size * num_possible_cpus(); base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!base) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); - return -ENOMEM; + ret = -ENOMEM; + goto out_free_ai; } /* return the leftover and copy */ - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - void *ptr = base + cpu * unit_size; - - if (cpu_possible(cpu)) { - free_bootmem(__pa(ptr + size_sum), - unit_size - size_sum); - memcpy(ptr, __per_cpu_load, static_size); - } else - free_bootmem(__pa(ptr), unit_size); + for (unit = 0; unit < num_possible_cpus(); unit++) { + void *ptr = base + unit * ai->unit_size; + + free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum); + memcpy(ptr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", - PFN_DOWN(size_sum), base, static_size, reserved_size, dyn_size, - unit_size); + 
PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, + ai->dyn_size, ai->unit_size); - return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, base, NULL); + ret = pcpu_setup_first_chunk(ai, base); +out_free_ai: + pcpu_free_alloc_info(ai); + return ret; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || !CONFIG_HAVE_SETUP_PER_CPU_AREA */ @@ -1709,31 +1822,34 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; - const size_t static_size = __per_cpu_end - __per_cpu_start; - ssize_t dyn_size = -1; - size_t size_sum, unit_size; + struct pcpu_alloc_info *ai; char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; - unsigned int cpu; - int i, j; + int unit, i, j; ssize_t ret; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); - unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - unit_pages = unit_size >> PAGE_SHIFT; + ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); + + unit_pages = ai->unit_size >> PAGE_SHIFT; /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0])); + pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * + sizeof(pages[0])); pages = alloc_bootmem(pages_size); /* allocate pages */ j = 0; - for_each_possible_cpu(cpu) + for (unit = 0; unit < num_possible_cpus(); unit++) for (i = 0; i < unit_pages; i++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); @@ -1747,18 +1863,18 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; - vm.size = nr_cpu_ids * unit_size; + vm.size = num_possible_cpus() * ai->unit_size; vm_area_register_early(&vm, PAGE_SIZE); - for_each_possible_cpu(cpu) { + for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned long unit_addr = - (unsigned long)vm.addr + cpu * unit_size; + (unsigned long)vm.addr + unit * ai->unit_size; for (i = 0; i < unit_pages; i++) populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ - ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages], + ret = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], unit_pages); if (ret < 0) panic("failed to map percpu area, err=%zd\n", ret); @@ -1772,16 +1888,15 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, */ /* copy static data */ - memcpy((void *)unit_addr, __per_cpu_load, static_size); + memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", - unit_pages, psize_str, vm.addr, static_size, reserved_size, - dyn_size); + unit_pages, psize_str, vm.addr, ai->static_size, + ai->reserved_size, ai->dyn_size); - ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, vm.addr, NULL); + ret = pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: @@ -1790,6 +1905,7 @@ enomem: ret = -ENOMEM; out_free_ar: free_bootmem(__pa(pages), pages_size); + pcpu_free_alloc_info(ai); return ret; } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ @@ -1805,38 +1921,50 @@ static size_t pcpul_lpage_size; static int pcpul_nr_lpages; static 
struct pcpul_ent *pcpul_map; -static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, +static bool __init pcpul_unit_to_cpu(int unit, const struct pcpu_alloc_info *ai, unsigned int *cpup) { - unsigned int cpu; + int group, cunit; - for_each_possible_cpu(cpu) - if (unit_map[cpu] == unit) { + for (group = 0, cunit = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + + if (unit < cunit + gi->nr_units) { if (cpup) - *cpup = cpu; + *cpup = gi->cpu_map[unit - cunit]; return true; } + cunit += gi->nr_units; + } return false; } +static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) +{ + int group, unit, i; + + for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { + const struct pcpu_group_info *gi = &ai->groups[group]; + + for (i = 0; i < gi->nr_units; i++) + if (gi->cpu_map[i] == cpu) + return unit + i; + } + BUG(); +} + /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes - * @unit_size: unit size in bytes - * @lpage_size: the size of a large page - * @unit_map: cpu -> unit mapping - * @nr_units: the number of units + * @ai: pcpu_alloc_info * @alloc_fn: function to allocate percpu lpage, always called with lpage_size * @free_fn: function to free percpu memory, @size <= lpage_size * @map_fn: function to map percpu lpage, always called with lpage_size * * This allocator uses large page to build and map the first chunk. - * Unlike other helpers, the caller should always specify @dyn_size - * and @unit_size. These parameters along with @unit_map and - * @nr_units can be determined using pcpu_lpage_build_unit_map(). - * This two stage initialization is to allow arch code to evaluate the + * Unlike other helpers, the caller should provide fully initialized + * @ai. This can be done using pcpu_build_alloc_info(). This two + * stage initialization is to allow arch code to evaluate the * parameters before committing to it. * * Large pages are allocated as directed by @unit_map and other @@ -1852,27 +1980,26 @@ static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. 
*/ -ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units, +ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; - const size_t static_size = __per_cpu_end - __per_cpu_start; - size_t chunk_size = unit_size * nr_units; - size_t map_size; + const size_t lpage_size = ai->atom_size; + size_t chunk_size, map_size; unsigned int cpu; ssize_t ret; - int i, j, unit; + int i, j, unit, nr_units; - pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size, - unit_size, lpage_size, unit_map, nr_units); + nr_units = 0; + for (i = 0; i < ai->nr_groups; i++) + nr_units += ai->groups[i].nr_units; + chunk_size = ai->unit_size * nr_units; BUG_ON(chunk_size % lpage_size); - pcpul_size = static_size + reserved_size + dyn_size; + pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size; pcpul_lpage_size = lpage_size; pcpul_nr_lpages = chunk_size / lpage_size; @@ -1883,13 +2010,13 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* allocate all pages */ for (i = 0; i < pcpul_nr_lpages; i++) { size_t offset = i * lpage_size; - int first_unit = offset / unit_size; - int last_unit = (offset + lpage_size - 1) / unit_size; + int first_unit = offset / ai->unit_size; + int last_unit = (offset + lpage_size - 1) / ai->unit_size; void *ptr; /* find out which cpu is mapped to this unit */ for (unit = first_unit; unit <= last_unit; unit++) - if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + if (pcpul_unit_to_cpu(unit, ai, &cpu)) goto found; continue; found: @@ -1905,12 +2032,12 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* return unused holes */ for (unit = 0; unit < nr_units; unit++) { - size_t start = unit * unit_size; - size_t end = start + unit_size; + size_t start = unit * ai->unit_size; + size_t end = start + ai->unit_size; size_t off, next; /* don't free used part of occupied unit */ - if (pcpul_unit_to_cpu(unit, unit_map, NULL)) + if (pcpul_unit_to_cpu(unit, ai, NULL)) start += pcpul_size; /* unit can span more than one page, punch the holes */ @@ -1925,7 +2052,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* allocate address, map and copy */ vm.flags = VM_ALLOC; vm.size = chunk_size; - vm_area_register_early(&vm, unit_size); + vm_area_register_early(&vm, ai->unit_size); for (i = 0; i < pcpul_nr_lpages; i++) { if (!pcpul_map[i].ptr) @@ -1935,15 +2062,15 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, } for_each_possible_cpu(cpu) - memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load, - static_size); + memcpy(vm.addr + pcpul_cpu_to_unit(cpu, ai) * ai->unit_size, + __per_cpu_load, ai->static_size); /* we're ready, commit */ pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n", - vm.addr, static_size, reserved_size, dyn_size, unit_size); + vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size); - ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, vm.addr, unit_map); + ret = pcpu_setup_first_chunk(ai, vm.addr); /* * Sort pcpul_map array for pcpu_lpage_remapped(). 
Unmapped -- cgit v1.2.3 From fb435d5233f8b6f9b93c11d6304d8e98fed03234 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:51 +0900 Subject: percpu: add pcpu_unit_offsets[] Currently units are mapped sequentially into address space. This patch adds pcpu_unit_offsets[] which allows units to be mapped to arbitrary offsets from the chunk base address. This is necessary to allow sparse embedding, which may need to allocate address ranges and memory areas which aren't aligned to unit size but to allocation atom size (page or large page size). This also simplifies things a bit by removing the need to calculate offset from unit number. With this change, there's no need for the arch code to know pcpu_unit_size. Update pcpu_setup_first_chunk() and first chunk allocators to return a regular 0 or -errno return code instead of unit size or -errno. Signed-off-by: Tejun Heo Cc: David S. Miller --- arch/sparc/kernel/smp_64.c | 12 +++--- arch/x86/kernel/setup_percpu.c | 51 ++++++++++------------- include/linux/percpu.h | 16 ++++--- mm/percpu.c | 95 +++++++++++++++++++++--------------------- 4 files changed, 84 insertions(+), 90 deletions(-) (limited to 'arch/x86') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index a42a4a744d1..b03fd362c62 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1478,9 +1478,10 @@ void __init setup_per_cpu_areas(void) static struct vm_struct vm; struct pcpu_alloc_info *ai; unsigned long delta, cpu; - size_t size_sum, pcpu_unit_size; + size_t size_sum; size_t ptrs_size; void **ptrs; + int rc; ai = pcpu_alloc_alloc_info(1, nr_cpu_ids); @@ -1526,14 +1527,15 @@ void __init setup_per_cpu_areas(void) pcpu_map_range(start, end, virt_to_page(ptrs[cpu])); } - pcpu_unit_size = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); + if (rc) + panic("failed to setup percpu first chunk (%d)", rc); free_bootmem(__pa(ptrs), ptrs_size); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; - for_each_possible_cpu(cpu) { - __per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; - } + for_each_possible_cpu(cpu) + __per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; /* Setup %g5 for the boot cpu.
*/ __local_per_cpu_offset = __per_cpu_offset(smp_processor_id()); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index db5f9c49fec..9becc5d4b51 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -157,12 +157,12 @@ static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } -static ssize_t __init setup_pcpu_lpage(bool chosen) +static int __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; struct pcpu_alloc_info *ai; - ssize_t ret; + int rc; /* on non-NUMA, embedding is better */ if (!chosen && !pcpu_need_numa()) @@ -196,19 +196,18 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) if (tot_size > vm_size / 5) { pr_info("PERCPU: too large chunk size %zuMB for " "large page remap\n", tot_size >> 20); - ret = -EINVAL; + rc = -EINVAL; goto out_free; } } - ret = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, - pcpul_map); + rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); out_free: pcpu_free_alloc_info(ai); - return ret; + return rc; } #else -static ssize_t __init setup_pcpu_lpage(bool chosen) +static int __init setup_pcpu_lpage(bool chosen) { return -EINVAL; } @@ -222,7 +221,7 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) * mapping so that it can use PMD mapping without additional TLB * pressure. */ -static ssize_t __init setup_pcpu_embed(bool chosen) +static int __init setup_pcpu_embed(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -250,7 +249,7 @@ static void __init pcpup_populate_pte(unsigned long addr) populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_page(void) +static int __init setup_pcpu_page(void) { return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, pcpu_fc_alloc, pcpu_fc_free, @@ -274,8 +273,7 @@ void __init setup_per_cpu_areas(void) { unsigned int cpu; unsigned long delta; - size_t pcpu_unit_size; - ssize_t ret; + int rc; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); @@ -285,36 +283,33 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. 
*/ - ret = -EINVAL; + rc = -EINVAL; if (pcpu_chosen_fc != PCPU_FC_AUTO) { if (pcpu_chosen_fc != PCPU_FC_PAGE) { if (pcpu_chosen_fc == PCPU_FC_LPAGE) - ret = setup_pcpu_lpage(true); + rc = setup_pcpu_lpage(true); else - ret = setup_pcpu_embed(true); + rc = setup_pcpu_embed(true); - if (ret < 0) - pr_warning("PERCPU: %s allocator failed (%zd), " + if (rc < 0) + pr_warning("PERCPU: %s allocator failed (%d), " "falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], ret); + pcpu_fc_names[pcpu_chosen_fc], rc); } } else { - ret = setup_pcpu_lpage(false); - if (ret < 0) - ret = setup_pcpu_embed(false); + rc = setup_pcpu_lpage(false); + if (rc < 0) + rc = setup_pcpu_embed(false); } - if (ret < 0) - ret = setup_pcpu_page(); - if (ret < 0) - panic("cannot initialize percpu area (err=%zd)", ret); - - pcpu_unit_size = ret; + if (rc < 0) + rc = setup_pcpu_page(); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = - delta + pcpu_unit_map[cpu] * pcpu_unit_size; + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 77b86be8ce4..a7ec840f596 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -57,7 +57,7 @@ #endif extern void *pcpu_base_addr; -extern const int *pcpu_unit_map; +extern const unsigned long *pcpu_unit_offsets; struct pcpu_group_info { int nr_units; /* aligned # of units */ @@ -106,25 +106,23 @@ extern struct pcpu_alloc_info * __init pcpu_build_alloc_info( size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn); -extern size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, - void *base_addr); +extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK -extern ssize_t __init pcpu_embed_first_chunk( - size_t reserved_size, ssize_t dyn_size); +extern int __init pcpu_embed_first_chunk(size_t reserved_size, + ssize_t dyn_size); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK -extern ssize_t __init pcpu_page_first_chunk( - size_t reserved_size, +extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern ssize_t __init pcpu_lpage_first_chunk( - const struct pcpu_alloc_info *ai, +extern int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 99f7fa68272..653b02c4020 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -117,8 +117,8 @@ static unsigned int pcpu_last_unit_cpu __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); -/* cpu -> unit map */ -const int *pcpu_unit_map __read_mostly; +static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */ +const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */ /* * The first chunk which always exists. 
Note that unlike other @@ -196,8 +196,8 @@ static int pcpu_page_idx(unsigned int cpu, int page_idx) static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { - return (unsigned long)chunk->vm->addr + - (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); + return (unsigned long)chunk->vm->addr + pcpu_unit_offsets[cpu] + + (page_idx << PAGE_SHIFT); } static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, @@ -341,7 +341,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * space. Note that any possible cpu id can be used here, so * there's no need to worry about preemption or cpu hotplug. */ - addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size; + addr += pcpu_unit_offsets[smp_processor_id()]; return pcpu_get_page_chunk(vmalloc_to_page(addr)); } @@ -1560,17 +1560,17 @@ static void pcpu_dump_alloc_info(const char *lvl, * and available for dynamic allocation like any other chunks. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access. + * 0 on success, -errno on failure. */ -size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, - void *base_addr) +int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { static struct vm_struct first_vm; static int smap[2], dmap[2]; size_t dyn_size = ai->dyn_size; size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; + unsigned long *unit_off; unsigned int cpu; int *unit_map; int group, unit, i; @@ -1587,8 +1587,9 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_dump_alloc_info(KERN_DEBUG, ai); - /* determine number of units and verify and initialize pcpu_unit_map */ + /* determine number of units and initialize unit_map and base */ unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); + unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = NR_CPUS; @@ -1606,6 +1607,8 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, BUG_ON(unit_map[cpu] != NR_CPUS); unit_map[cpu] = unit + i; + unit_off[cpu] = gi->base_offset + i * ai->unit_size; + if (pcpu_first_unit_cpu == NR_CPUS) pcpu_first_unit_cpu = cpu; } @@ -1617,6 +1620,7 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, BUG_ON(unit_map[cpu] == NR_CPUS); pcpu_unit_map = unit_map; + pcpu_unit_offsets = unit_off; /* determine basic parameters */ pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; @@ -1688,7 +1692,7 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* we're done */ pcpu_base_addr = schunk->vm->addr; - return pcpu_unit_size; + return 0; } const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { @@ -1748,16 +1752,15 @@ early_param("percpu_alloc", percpu_alloc_setup); * size, the leftover is returned to the bootmem allocator. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. 
*/ -ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) +int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { struct pcpu_alloc_info *ai; size_t size_sum, chunk_size; void *base; int unit; - ssize_t ret; + int rc; ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); if (IS_ERR(ai)) @@ -1773,7 +1776,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) if (!base) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); - ret = -ENOMEM; + rc = -ENOMEM; goto out_free_ai; } @@ -1790,10 +1793,10 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); - ret = pcpu_setup_first_chunk(ai, base); + rc = pcpu_setup_first_chunk(ai, base); out_free_ai: pcpu_free_alloc_info(ai); - return ret; + return rc; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || !CONFIG_HAVE_SETUP_PER_CPU_AREA */ @@ -1813,13 +1816,12 @@ out_free_ai: * page-by-page into vmalloc area. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. */ -ssize_t __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn) +int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; struct pcpu_alloc_info *ai; @@ -1827,8 +1829,7 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, int unit_pages; size_t pages_size; struct page **pages; - int unit, i, j; - ssize_t ret; + int unit, i, j, rc; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); @@ -1874,10 +1875,10 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ - ret = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], - unit_pages); - if (ret < 0) - panic("failed to map percpu area, err=%zd\n", ret); + rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], + unit_pages); + if (rc < 0) + panic("failed to map percpu area, err=%d\n", rc); /* * FIXME: Archs with virtual cache should flush local @@ -1896,17 +1897,17 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, unit_pages, psize_str, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size); - ret = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: while (--j >= 0) free_fn(page_address(pages[j]), PAGE_SIZE); - ret = -ENOMEM; + rc = -ENOMEM; out_free_ar: free_bootmem(__pa(pages), pages_size); pcpu_free_alloc_info(ai); - return ret; + return rc; } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ @@ -1977,20 +1978,18 @@ static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) * pcpu_lpage_remapped(). * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. 
*/ -ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) +int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; const size_t lpage_size = ai->atom_size; size_t chunk_size, map_size; unsigned int cpu; - ssize_t ret; - int i, j, unit, nr_units; + int i, j, unit, nr_units, rc; nr_units = 0; for (i = 0; i < ai->nr_groups; i++) @@ -2070,7 +2069,7 @@ ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); - ret = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); /* * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped @@ -2094,7 +2093,7 @@ ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) pcpul_nr_lpages--; - return ret; + return rc; enomem: for (i = 0; i < pcpul_nr_lpages; i++) @@ -2166,21 +2165,21 @@ EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { - ssize_t unit_size; unsigned long delta; unsigned int cpu; + int rc; /* * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ - unit_size = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE); - if (unit_size < 0) + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE); + if (rc < 0) panic("Failed to initialized percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) - __per_cpu_offset[cpu] = delta + cpu * unit_size; + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ -- cgit v1.2.3 From c8826dd538602d730ed2c18c6753f1bbfa6c4933 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:52 +0900 Subject: percpu: update embedding first chunk allocator to handle sparse units Now that percpu core can handle very sparse units, given that vmalloc space is large enough, embedding first chunk allocator can use any memory to build the first chunk. This patch teaches pcpu_embed_first_chunk() about distances between cpus and to use alloc/free callbacks to allocate node-specific areas for each group and use them for the first chunk. This brings the benefits of embedding allocator to NUMA configurations - no extra TLB pressure with the flexibility of unified dynamic allocator and no need to restructure arch code to build memory layout suitable for percpu. With units put into atom_size aligned groups according to cpu distances, using large page for dynamic chunks is also easily possible with falling back to regular pages if large allocation fails. Embedding allocator users are converted to specify NULL cpu_distance_fn, so this patch doesn't cause any visible behavior difference. Following patches will convert them.
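For illustration, a NUMA-aware caller of the extended interface might look roughly like this (a sketch only; the my_* helpers are hypothetical stand-ins for arch code, while pcpu_embed_first_chunk() and the callback typedefs are the ones introduced by this series):

static int __init my_cpu_distance(unsigned int from, unsigned int to)
{
	/* CPUs on the same node are close; everything else is remote */
	return cpu_to_node(from) == cpu_to_node(to) ?
		LOCAL_DISTANCE : REMOTE_DISTANCE;
}

static void * __init my_pcpu_alloc(unsigned int cpu, size_t size,
				   size_t align)
{
	/* node-local bootmem for the group containing @cpu (sketch) */
	return __alloc_bootmem_node_nopanic(NODE_DATA(cpu_to_node(cpu)),
					    size, align,
					    __pa(MAX_DMA_ADDRESS));
}

static void __init my_pcpu_free(void *ptr, size_t size)
{
	free_bootmem(__pa(ptr), size);
}

void __init my_setup_per_cpu_areas(void)
{
	int rc;

	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
				    my_cpu_distance, my_pcpu_alloc,
				    my_pcpu_free);
	if (rc < 0)
		panic("percpu: embed first chunk failed (%d)", rc);
}
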
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 4 +- include/linux/percpu.h | 7 ++- mm/percpu.c | 113 +++++++++++++++++++++++++++++++---------- 3 files changed, 93 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9becc5d4b51..67f6314de9f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -234,7 +234,9 @@ static int __init setup_pcpu_embed(bool chosen) return -EINVAL; return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE); + reserve - PERCPU_FIRST_CHUNK_RESERVE, + PAGE_SIZE, NULL, pcpu_fc_alloc, + pcpu_fc_free); } /* diff --git a/include/linux/percpu.h b/include/linux/percpu.h index a7ec840f596..25359932740 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -110,8 +110,11 @@ extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK -extern int __init pcpu_embed_first_chunk(size_t reserved_size, - ssize_t dyn_size); +extern int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK diff --git a/mm/percpu.c b/mm/percpu.c index cc9c4c64606..c2826d05505 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1747,15 +1747,25 @@ early_param("percpu_alloc", percpu_alloc_setup); * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * @alloc_fn: function to allocate percpu page + * @free_fn: funtion to free percpu page * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated - * as a contiguous area using bootmem allocator and used as-is without - * being mapped into vmalloc area. This enables the first chunk to - * piggy back on the linear physical mapping which often uses larger - * page size. + * by calling @alloc_fn and used as-is without being mapped into + * vmalloc area. Allocations are always whole multiples of @atom_size + * aligned to @atom_size. + * + * This enables the first chunk to piggy back on the linear physical + * mapping which often uses larger page size. Please note that this + * can result in very sparse cpu->unit mapping on NUMA machines thus + * requiring large vmalloc address space. Don't use this allocator if + * vmalloc space is not orders of magnitude larger than distances + * between node memory addresses (ie. 32bit NUMA machines). * * When @dyn_size is positive, dynamic area might be larger than * specified to fill page alignment. When @dyn_size is auto, @@ -1763,53 +1773,88 @@ early_param("percpu_alloc", percpu_alloc_setup); * and reserved areas. * * If the needed size is smaller than the minimum or specified unit - * size, the leftover is returned to the bootmem allocator. + * size, the leftover is returned using @free_fn. * * RETURNS: * 0 on success, -errno on failure. 
*/ -int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) +int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn) { + void *base = (void *)ULONG_MAX; + void **areas = NULL; struct pcpu_alloc_info *ai; - size_t size_sum, chunk_size; - void *base; - int unit; - int rc; + size_t size_sum, areas_size; + int group, i, rc; - ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); + ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, + cpu_distance_fn); if (IS_ERR(ai)) return PTR_ERR(ai); - BUG_ON(ai->nr_groups != 1); - BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; - chunk_size = ai->unit_size * num_possible_cpus(); + areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); - if (!base) { - pr_warning("PERCPU: failed to allocate %zu bytes for " - "embedding\n", chunk_size); + areas = alloc_bootmem_nopanic(areas_size); + if (!areas) { rc = -ENOMEM; - goto out_free_ai; + goto out_free; } - /* return the leftover and copy */ - for (unit = 0; unit < num_possible_cpus(); unit++) { - void *ptr = base + unit * ai->unit_size; + /* allocate, copy and determine base address */ + for (group = 0; group < ai->nr_groups; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + unsigned int cpu = NR_CPUS; + void *ptr; + + for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) + cpu = gi->cpu_map[i]; + BUG_ON(cpu == NR_CPUS); + + /* allocate space for the whole group */ + ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); + if (!ptr) { + rc = -ENOMEM; + goto out_free_areas; + } + areas[group] = ptr; - free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum); - memcpy(ptr, __per_cpu_load, ai->static_size); + base = min(ptr, base); + + for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { + if (gi->cpu_map[i] == NR_CPUS) { + /* unused unit, free whole */ + free_fn(ptr, ai->unit_size); + continue; + } + /* copy and return the unused part */ + memcpy(ptr, __per_cpu_load, ai->static_size); + free_fn(ptr + size_sum, ai->unit_size - size_sum); + } } - /* we're ready, commit */ + /* base address is now known, determine group base offsets */ + for (group = 0; group < ai->nr_groups; group++) + ai->groups[group].base_offset = areas[group] - base; + pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); rc = pcpu_setup_first_chunk(ai, base); -out_free_ai: + goto out_free; + +out_free_areas: + for (group = 0; group < ai->nr_groups; group++) + free_fn(areas[group], + ai->groups[group].nr_units * ai->unit_size); +out_free: pcpu_free_alloc_info(ai); + if (areas) + free_bootmem(__pa(areas), areas_size); return rc; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || @@ -2177,6 +2222,17 @@ void *pcpu_lpage_remapped(void *kaddr) unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); +static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, + size_t align) +{ + return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); +} + +static void __init pcpu_dfl_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + void __init setup_per_cpu_areas(void) { unsigned long delta; @@ -2188,7 +2244,8 @@ void 
__init setup_per_cpu_areas(void) * what the legacy allocator did. */ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE); + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, + pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); if (rc < 0) panic("Failed to initialized percpu areas."); -- cgit v1.2.3 From 4518e6a0c038b98be4c480e6f4481e8676bd15dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:52 +0900 Subject: x86,percpu: use embedding for 64bit NUMA and page for 32bit NUMA Embedding percpu first chunk allocator can now handle very sparse unit mapping. Use embedding allocator instead of lpage for 64bit NUMA. This removes extra TLB pressure and the need to do complex and fragile dancing when changing page attributes. For 32bit, using very sparse unit mapping isn't a good idea because the vmalloc space is very constrained. 32bit NUMA machines aren't exactly the focus of optimization and it isn't very clear whether lpage performs better than page. Use page first chunk allocator for 32bit NUMAs. As this leaves setup_pcpu_*() functions pretty much empty, fold them into setup_per_cpu_areas(). Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Andi Kleen --- arch/x86/Kconfig | 4 -- arch/x86/kernel/setup_percpu.c | 155 ++++++++--------------------------------- 2 files changed, 28 insertions(+), 131 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f7ac2721551..869d7d30144 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -156,10 +156,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK config NEED_PER_CPU_PAGE_FIRST_CHUNK def_bool y -config NEED_PER_CPU_LPAGE_FIRST_CHUNK - def_bool y - depends on NEED_MULTIPLE_NODES - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 67f6314de9f..d559af913e1 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset); #define PERCPU_FIRST_CHUNK_RESERVE 0 #endif +#ifdef CONFIG_X86_32 /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void) #endif return false; } +#endif /** * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu @@ -136,128 +138,23 @@ static void __init pcpu_fc_free(void *ptr, size_t size) free_bootmem(__pa(ptr), size); } -/* - * Large page remapping allocator - */ -#ifdef CONFIG_NEED_MULTIPLE_NODES -static void __init pcpul_map(void *ptr, size_t size, void *addr) -{ - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)addr); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); -} - -static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { +#ifdef CONFIG_NEED_MULTIPLE_NODES if (early_cpu_to_node(from) == early_cpu_to_node(to)) return LOCAL_DISTANCE; else return REMOTE_DISTANCE; -} - -static int __init setup_pcpu_lpage(bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; - size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; - struct pcpu_alloc_info *ai; - int rc; - - /* on non-NUMA, embedding is better */ - if (!chosen && !pcpu_need_numa()) - return -EINVAL; - - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - /* allocate and build unit_map */ - ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - 
PMD_SIZE, pcpu_lpage_cpu_distance); - if (IS_ERR(ai)) { - pr_warning("PERCPU: failed to build unit_map (%ld)\n", - PTR_ERR(ai)); - return PTR_ERR(ai); - } - - /* do the parameters look okay? */ - if (!chosen) { - size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = 0; - int group; - - for (group = 0; group < ai->nr_groups; group++) - tot_size += ai->unit_size * ai->groups[group].nr_units; - - /* don't consume more than 20% of vmalloc area */ - if (tot_size > vm_size / 5) { - pr_info("PERCPU: too large chunk size %zuMB for " - "large page remap\n", tot_size >> 20); - rc = -EINVAL; - goto out_free; - } - } - - rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); -out_free: - pcpu_free_alloc_info(ai); - return rc; -} #else -static int __init setup_pcpu_lpage(bool chosen) -{ - return -EINVAL; -} + return LOCAL_DISTANCE; #endif - -/* - * Embedding allocator - * - * The first chunk is sized to just contain the static area plus - * module and dynamic reserves and embedded into linear physical - * mapping so that it can use PMD mapping without additional TLB - * pressure. - */ -static int __init setup_pcpu_embed(bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, embedding allocation doesn't play well with - * NUMA. - */ - if (!chosen && (!cpu_has_pse || pcpu_need_numa())) - return -EINVAL; - - return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, - PAGE_SIZE, NULL, pcpu_fc_alloc, - pcpu_fc_free); } -/* - * Page allocator - * - * Boring fallback 4k page allocator. This allocator puts more - * pressure on PTE TLBs but other than that behaves nicely on both UMA - * and NUMA. - */ static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static int __init setup_pcpu_page(void) -{ - return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - pcpu_fc_alloc, pcpu_fc_free, - pcpup_populate_pte); -} - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -281,30 +178,34 @@ void __init setup_per_cpu_areas(void) NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* - * Allocate percpu area. If PSE is supported, try to make use - * of large page mappings. Please read comments on top of - * each allocator for details. + * Allocate percpu area. Embedding allocator is our favorite; + * however, on NUMA configurations, it can result in very + * sparse unit mapping and vmalloc area isn't spacious enough + * on 32bit. Use page in that case. */ +#ifdef CONFIG_X86_32 + if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif rc = -EINVAL; - if (pcpu_chosen_fc != PCPU_FC_AUTO) { - if (pcpu_chosen_fc != PCPU_FC_PAGE) { - if (pcpu_chosen_fc == PCPU_FC_LPAGE) - rc = setup_pcpu_lpage(true); - else - rc = setup_pcpu_embed(true); - - if (rc < 0) - pr_warning("PERCPU: %s allocator failed (%d), " - "falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], rc); - } - } else { - rc = setup_pcpu_lpage(false); + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + const size_t atom_size = cpu_has_pse ? 
PMD_SIZE : PAGE_SIZE; + const size_t dyn_size = PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + + rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, atom_size, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - rc = setup_pcpu_embed(false); + pr_warning("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) - rc = setup_pcpu_page(); + rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); -- cgit v1.2.3 From e933a73f48e3b2d40cfa56d81e2646f194b5a66a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:53 +0900 Subject: percpu: kill lpage first chunk allocator With x86 converted to embedding allocator, lpage doesn't have any user left. Kill it along with cpa handling code. Signed-off-by: Tejun Heo Cc: Jan Beulich --- Documentation/kernel-parameters.txt | 10 +- arch/x86/mm/pageattr.c | 20 +-- include/linux/percpu.h | 16 --- mm/percpu.c | 241 ------------------------------------ 4 files changed, 6 insertions(+), 281 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index dee9ce2e6cf..e710093e3d3 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1920,11 +1920,11 @@ and is between 256 and 4096 characters. It is defined in the file See arch/parisc/kernel/pdc_chassis.c percpu_alloc= Select which percpu first chunk allocator to use. - Currently supported values are "embed", "page" and - "lpage". Archs may support subset or none of the - selections. See comments in mm/percpu.c for details - on each allocator. This parameter is primarily for - debugging and performance comparison. + Currently supported values are "embed" and "page". + Archs may support subset or none of the selections. + See comments in mm/percpu.c for details on each + allocator. This parameter is primarily for debugging + and performance comparison. pf. [PARIDE] See Documentation/blockdev/paride.txt. diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index dce282f6570..f53cfc7f963 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -687,7 +687,7 @@ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); - unsigned long vaddr, remapped; + unsigned long vaddr; int ret; if (cpa->pfn >= max_pfn_mapped) @@ -745,24 +745,6 @@ static int cpa_process_alias(struct cpa_data *cpa) } #endif - /* - * If the PMD page was partially used for per-cpu remapping, - * the recycled area needs to be split and modified. Because - * the area is always proper subset of a PMD page - * cpa->numpages is guaranteed to be 1 for these areas, so - * there's no need to loop over and check for further remaps. 
- */ - remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); - if (remapped) { - WARN_ON(cpa->numpages > 1); - alias_cpa = *cpa; - alias_cpa.vaddr = &remapped; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); - if (ret) - return ret; - } - return 0; } diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 25359932740..878836ca999 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -82,7 +82,6 @@ enum pcpu_fc { PCPU_FC_AUTO, PCPU_FC_EMBED, PCPU_FC_PAGE, - PCPU_FC_LPAGE, PCPU_FC_NR, }; @@ -95,7 +94,6 @@ typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); -typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, int nr_units); @@ -124,20 +122,6 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn); - -extern void *pcpu_lpage_remapped(void *kaddr); -#else -static inline void *pcpu_lpage_remapped(void *kaddr) -{ - return NULL; -} -#endif - /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index c2826d05505..77933928107 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1713,7 +1713,6 @@ const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", [PCPU_FC_PAGE] = "page", - [PCPU_FC_LPAGE] = "lpage", }; enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; @@ -1729,10 +1728,6 @@ static int __init percpu_alloc_setup(char *str) #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK else if (!strcmp(str, "page")) pcpu_chosen_fc = PCPU_FC_PAGE; -#endif -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK - else if (!strcmp(str, "lpage")) - pcpu_chosen_fc = PCPU_FC_LPAGE; #endif else pr_warning("PERCPU: unknown allocator %s specified\n", str); @@ -1970,242 +1965,6 @@ out_free_ar: } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -struct pcpul_ent { - void *ptr; - void *map_addr; -}; - -static size_t pcpul_size; -static size_t pcpul_lpage_size; -static int pcpul_nr_lpages; -static struct pcpul_ent *pcpul_map; - -static bool __init pcpul_unit_to_cpu(int unit, const struct pcpu_alloc_info *ai, - unsigned int *cpup) -{ - int group, cunit; - - for (group = 0, cunit = 0; group < ai->nr_groups; group++) { - const struct pcpu_group_info *gi = &ai->groups[group]; - - if (unit < cunit + gi->nr_units) { - if (cpup) - *cpup = gi->cpu_map[unit - cunit]; - return true; - } - cunit += gi->nr_units; - } - - return false; -} - -static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) -{ - int group, unit, i; - - for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { - const struct pcpu_group_info *gi = &ai->groups[group]; - - for (i = 0; i < gi->nr_units; i++) - if (gi->cpu_map[i] == cpu) - return unit + i; - } - BUG(); -} - -/** - * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @ai: pcpu_alloc_info - * @alloc_fn: 
function to allocate percpu lpage, always called with lpage_size - * @free_fn: function to free percpu memory, @size <= lpage_size - * @map_fn: function to map percpu lpage, always called with lpage_size - * - * This allocator uses large page to build and map the first chunk. - * Unlike other helpers, the caller should provide fully initialized - * @ai. This can be done using pcpu_build_alloc_info(). This two - * stage initialization is to allow arch code to evaluate the - * parameters before committing to it. - * - * Large pages are allocated as directed by @unit_map and other - * parameters and mapped to vmalloc space. Unused holes are returned - * to the page allocator. Note that these holes end up being actively - * mapped twice - once to the physical mapping and to the vmalloc area - * for the first percpu chunk. Depending on architecture, this might - * cause problem when changing page attributes of the returned area. - * These double mapped areas can be detected using - * pcpu_lpage_remapped(). - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) -{ - static struct vm_struct vm; - const size_t lpage_size = ai->atom_size; - size_t chunk_size, map_size; - unsigned int cpu; - int i, j, unit, nr_units, rc; - - nr_units = 0; - for (i = 0; i < ai->nr_groups; i++) - nr_units += ai->groups[i].nr_units; - - chunk_size = ai->unit_size * nr_units; - BUG_ON(chunk_size % lpage_size); - - pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size; - pcpul_lpage_size = lpage_size; - pcpul_nr_lpages = chunk_size / lpage_size; - - /* allocate pointer array and alloc large pages */ - map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]); - pcpul_map = alloc_bootmem(map_size); - - /* allocate all pages */ - for (i = 0; i < pcpul_nr_lpages; i++) { - size_t offset = i * lpage_size; - int first_unit = offset / ai->unit_size; - int last_unit = (offset + lpage_size - 1) / ai->unit_size; - void *ptr; - - /* find out which cpu is mapped to this unit */ - for (unit = first_unit; unit <= last_unit; unit++) - if (pcpul_unit_to_cpu(unit, ai, &cpu)) - goto found; - continue; - found: - ptr = alloc_fn(cpu, lpage_size, lpage_size); - if (!ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - pcpul_map[i].ptr = ptr; - } - - /* return unused holes */ - for (unit = 0; unit < nr_units; unit++) { - size_t start = unit * ai->unit_size; - size_t end = start + ai->unit_size; - size_t off, next; - - /* don't free used part of occupied unit */ - if (pcpul_unit_to_cpu(unit, ai, NULL)) - start += pcpul_size; - - /* unit can span more than one page, punch the holes */ - for (off = start; off < end; off = next) { - void *ptr = pcpul_map[off / lpage_size].ptr; - next = min(roundup(off + 1, lpage_size), end); - if (ptr) - free_fn(ptr + off % lpage_size, next - off); - } - } - - /* allocate address, map and copy */ - vm.flags = VM_ALLOC; - vm.size = chunk_size; - vm_area_register_early(&vm, ai->unit_size); - - for (i = 0; i < pcpul_nr_lpages; i++) { - if (!pcpul_map[i].ptr) - continue; - pcpul_map[i].map_addr = vm.addr + i * lpage_size; - map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr); - } - - for_each_possible_cpu(cpu) - memcpy(vm.addr + pcpul_cpu_to_unit(cpu, ai) * ai->unit_size, - __per_cpu_load, ai->static_size); - - /* we're ready, commit */ - pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n", - 
vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, - ai->unit_size); - - rc = pcpu_setup_first_chunk(ai, vm.addr); - - /* - * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped - * lpages are pushed to the end and trimmed. - */ - for (i = 0; i < pcpul_nr_lpages - 1; i++) - for (j = i + 1; j < pcpul_nr_lpages; j++) { - struct pcpul_ent tmp; - - if (!pcpul_map[j].ptr) - continue; - if (pcpul_map[i].ptr && - pcpul_map[i].ptr < pcpul_map[j].ptr) - continue; - - tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) - pcpul_nr_lpages--; - - return rc; - -enomem: - for (i = 0; i < pcpul_nr_lpages; i++) - if (pcpul_map[i].ptr) - free_fn(pcpul_map[i].ptr, lpage_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; -} - -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu large - * page mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used large - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. - */ -void *pcpu_lpage_remapped(void *kaddr) -{ - unsigned long lpage_mask = pcpul_lpage_size - 1; - void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask); - unsigned long offset = (unsigned long)kaddr & lpage_mask; - int left = 0, right = pcpul_nr_lpages - 1; - int pos; - - /* pcpul in use at all? */ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < lpage_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > lpage_addr) - right = pos - 1; - else - return pcpul_map[pos].map_addr + offset; - } - - return NULL; -} -#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */ - /* * Generic percpu area setup. * -- cgit v1.2.3 From 58c41d28259c246dbc11358d85d332dc20ccd57b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 14 Aug 2009 12:14:19 -0700 Subject: x86, intel_txt: Factor out the code for S3 setup S3 sleep requires special setup in tboot. However, the data structures needed to do such setup are only available if CONFIG_ACPI_SLEEP is enabled. Abstract them out as much as possible, so we can have a single tboot_setup_sleep() which either is a proper implementation or a stub which simply calls BUG(). Signed-off-by: H. 
Peter Anvin Acked-by: Shane Wang Cc: Joseph Cihula --- arch/x86/kernel/tboot.c | 53 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 1ab80120894..a183beffe39 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -164,25 +165,51 @@ void tboot_create_trampoline(void) map_base = PFN_DOWN(tboot->tboot_base); map_size = PFN_UP(tboot->tboot_size); if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) - panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", map_base, map_size); + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", + map_base, map_size); } -static void set_mac_regions(void) +#ifdef CONFIG_ACPI_SLEEP + +static void add_mac_region(phys_addr_t start, unsigned long size) { - tboot->num_mac_regions = 3; + struct tboot_mac_region *mr; + phys_addr_t end = start + size; + + if (start && size) { + mr = &tboot->mac_regions[tboot->num_mac_regions++]; + mr->start = round_down(start, PAGE_SIZE); + mr->size = round_up(end, PAGE_SIZE) - mr->start; + } +} + +static int tboot_setup_sleep(void) +{ + tboot->num_mac_regions = 0; + /* S3 resume code */ - tboot->mac_regions[0].start = PFN_PHYS(PFN_DOWN(acpi_wakeup_address)); - tboot->mac_regions[0].size = PFN_UP(WAKEUP_SIZE) << PAGE_SHIFT; + add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); /* AP trampoline code */ - tboot->mac_regions[1].start = - PFN_PHYS(PFN_DOWN(virt_to_phys(trampoline_base))); - tboot->mac_regions[1].size = PFN_UP(TRAMPOLINE_SIZE) << PAGE_SHIFT; + add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); /* kernel code + data + bss */ - tboot->mac_regions[2].start = PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); - tboot->mac_regions[2].size = PFN_PHYS(PFN_UP(virt_to_phys(&_end))) - - PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); + add_mac_region(virt_to_phys(_text), _end - _text); + + tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + + return 0; } +#else /* no CONFIG_ACPI_SLEEP */ + +static int tboot_setup_sleep(void) +{ + /* S3 shutdown requested, but S3 not supported by the kernel... */ + BUG(); + return -1; +} + +#endif + void tboot_shutdown(u32 shutdown_type) { void (*shutdown)(void); @@ -200,7 +227,8 @@ void tboot_shutdown(u32 shutdown_type) /* if this is S3 then set regions to MAC */ if (shutdown_type == TB_SHUTDOWN_S3) - set_mac_regions(); + if (tboot_setup_sleep()) + return; tboot->shutdown_type = shutdown_type; @@ -253,7 +281,6 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; /* we always use the 32b wakeup vector */ tboot->acpi_sinfo.vector_width = 32; - tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; if (sleep_state >= ACPI_S_STATE_COUNT || acpi_shutdown_map[sleep_state] == -1) { -- cgit v1.2.3 From 1be396794897f80bfc8774719ba60309a9e3d374 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:20 +0200 Subject: timekeeping: Move reset of cycle_last for tsc clocksource to tsc change_clocksource resets the cycle_last value to zero then sets it to a value read from the clocksource. The reset to zero is required only for the TSC clocksource to make the read_tsc function work after a resume. The reason is that the TSC read function uses cycle_last to detect backwards going TSCs. 
In the resume case, cycle_last contains the TSC value from the last update before the suspend. On resume the TSC starts counting from 0 again and would trip over the cycle_last comparison. This is subtle and surprising. Move the reset to a resume function in the tsc code. Signed-off-by: Martin Schwidefsky Acked-by: Thomas Gleixner Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134808.142191175@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 6 ++++++ kernel/time/timekeeping.c | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 71f4368b357..968425422c4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -744,10 +744,16 @@ static cycle_t __vsyscall_fn vread_tsc(void) } #endif +static void resume_tsc(void) +{ + clocksource_tsc.cycle_last = 0; +} + static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, + .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS | diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 016a2591d71..b5673016089 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -295,7 +295,6 @@ static void change_clocksource(void) if (old->disable) old->disable(old); - clock->cycle_last = 0; clock->cycle_last = clock->read(clock); clock->error = 0; clock->xtime_nsec = 0; -- cgit v1.2.3 From d4f587c67fc39e0030ddd718675e252e208da4d7 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:31 +0200 Subject: timekeeping: Increase granularity of read_persistent_clock() The persistent clock of some architectures (e.g. s390) has a finer granularity than seconds. To reduce the delta between the host clock and the guest clock in a virtualized system, change the read_persistent_clock function to return a struct timespec.
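As an illustration (a sketch, not part of the patch), an architecture whose persistent clock only has one-second granularity converts to the new interface by filling in the nanoseconds with zero; read_my_rtc_seconds() below is a hypothetical stand-in for the arch-specific RTC read:

void read_persistent_clock(struct timespec *ts)
{
	/* read_my_rtc_seconds() is hypothetical; a real port reads its RTC here */
	ts->tv_sec = read_my_rtc_seconds();
	ts->tv_nsec = 0;	/* no sub-second resolution available */
}

Architectures with a finer persistent clock, such as s390 below, can fill in tv_nsec as well.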
Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134811.013873340@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/m68knommu/kernel/time.c | 5 ++-- arch/mips/dec/time.c | 5 ++-- arch/mips/lasat/ds1603.c | 5 ++-- arch/mips/lasat/sysctl.c | 8 ++++-- arch/mips/lemote/lm2e/setup.c | 5 ++-- arch/mips/mti-malta/malta-time.c | 5 ++-- arch/mips/pmc-sierra/yosemite/setup.c | 5 ++-- arch/mips/sibyte/swarm/setup.c | 15 +++++++--- arch/mips/sni/time.c | 5 ++-- arch/powerpc/kernel/time.c | 7 +++-- arch/s390/kernel/time.c | 22 +++------------ arch/sh/kernel/time.c | 6 ++-- arch/x86/kernel/rtc.c | 5 ++-- arch/xtensa/kernel/time.c | 5 ++-- include/linux/time.h | 2 +- kernel/time/timekeeping.c | 52 +++++++++++++++++++---------------- 16 files changed, 83 insertions(+), 74 deletions(-) (limited to 'arch/x86') diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c index d182b2f7221..68432248515 100644 --- a/arch/m68knommu/kernel/time.c +++ b/arch/m68knommu/kernel/time.c @@ -72,9 +72,10 @@ static unsigned long read_rtc_mmss(void) return mktime(year, mon, day, hour, min, sec);; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return read_rtc_mmss(); + ts->tv_sec = read_rtc_mmss(); + ts->tv_nsec = 0; } int update_persistent_clock(struct timespec now) diff --git a/arch/mips/dec/time.c b/arch/mips/dec/time.c index 463136e6685..02f505f23c3 100644 --- a/arch/mips/dec/time.c +++ b/arch/mips/dec/time.c @@ -18,7 +18,7 @@ #include #include -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned int year, mon, day, hour, min, sec, real_year; unsigned long flags; @@ -53,7 +53,8 @@ unsigned long read_persistent_clock(void) year += real_year - 72 + 2000; - return mktime(year, mon, day, hour, min, sec); + ts->tv_sec = mktime(year, mon, day, hour, min, sec); + ts->tv_nsec = 0; } /* diff --git a/arch/mips/lasat/ds1603.c b/arch/mips/lasat/ds1603.c index 52cb1436a12..c6fd96ff118 100644 --- a/arch/mips/lasat/ds1603.c +++ b/arch/mips/lasat/ds1603.c @@ -135,7 +135,7 @@ static void rtc_end_op(void) lasat_ndelay(1000); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned long word; unsigned long flags; @@ -147,7 +147,8 @@ unsigned long read_persistent_clock(void) rtc_end_op(); spin_unlock_irqrestore(&rtc_lock, flags); - return word; + ts->tv_sec = word; + ts->tv_nsec = 0; } int rtc_mips_set_mmss(unsigned long time) diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index 8f88886feb1..3f04d4c406b 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -92,10 +92,12 @@ static int rtctmp; int proc_dolasatrtc(ctl_table *table, int write, struct file *filp, void *buffer, size_t *lenp, loff_t *ppos) { + struct timespec ts; int r; if (!write) { - rtctmp = read_persistent_clock(); + read_persistent_clock(&ts); + rtctmp = ts.tv_sec; /* check for time < 0 and set to 0 */ if (rtctmp < 0) rtctmp = 0; @@ -134,9 +136,11 @@ int sysctl_lasat_rtc(ctl_table *table, void *oldval, size_t *oldlenp, void *newval, size_t newlen) { + struct timespec ts; int r; - rtctmp = read_persistent_clock(); + read_persistent_clock(&ts); + rtctmp = ts.tv_sec; if (rtctmp < 0) rtctmp = 0; r = sysctl_intvec(table, oldval, oldlenp, newval, newlen); diff --git a/arch/mips/lemote/lm2e/setup.c b/arch/mips/lemote/lm2e/setup.c index ebd6ceaef2f..24b355df612 100644 --- a/arch/mips/lemote/lm2e/setup.c +++ 
b/arch/mips/lemote/lm2e/setup.c @@ -54,9 +54,10 @@ void __init plat_time_init(void) mips_hpt_frequency = cpu_clock_freq / 2; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return mc146818_get_cmos_time(); + ts->tv_sec = mc146818_get_cmos_time(); + ts->tv_nsec = 0; } void (*__wbflush)(void); diff --git a/arch/mips/mti-malta/malta-time.c b/arch/mips/mti-malta/malta-time.c index 0b97d47691f..3c6f190aa61 100644 --- a/arch/mips/mti-malta/malta-time.c +++ b/arch/mips/mti-malta/malta-time.c @@ -100,9 +100,10 @@ static unsigned int __init estimate_cpu_frequency(void) return count; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return mc146818_get_cmos_time(); + ts->tv_sec = mc146818_get_cmos_time(); + ts->tv_nsec = 0; } static void __init plat_perf_setup(void) diff --git a/arch/mips/pmc-sierra/yosemite/setup.c b/arch/mips/pmc-sierra/yosemite/setup.c index 2d3c0dca275..3498ac9c35a 100644 --- a/arch/mips/pmc-sierra/yosemite/setup.c +++ b/arch/mips/pmc-sierra/yosemite/setup.c @@ -70,7 +70,7 @@ void __init bus_error_init(void) } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned int year, month, day, hour, min, sec; unsigned long flags; @@ -92,7 +92,8 @@ unsigned long read_persistent_clock(void) m48t37_base->control = 0x00; spin_unlock_irqrestore(&rtc_lock, flags); - return mktime(year, month, day, hour, min, sec); + ts->tv_sec = mktime(year, month, day, hour, min, sec); + ts->tv_nsec = 0; } int rtc_mips_set_time(unsigned long tim) diff --git a/arch/mips/sibyte/swarm/setup.c b/arch/mips/sibyte/swarm/setup.c index 672e45d495a..623ffc933c4 100644 --- a/arch/mips/sibyte/swarm/setup.c +++ b/arch/mips/sibyte/swarm/setup.c @@ -87,19 +87,26 @@ enum swarm_rtc_type { enum swarm_rtc_type swarm_rtc_type; -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { + unsigned long sec; + switch (swarm_rtc_type) { case RTC_XICOR: - return xicor_get_time(); + sec = xicor_get_time(); + break; case RTC_M4LT81: - return m41t81_get_time(); + sec = m41t81_get_time(); + break; case RTC_NONE: default: - return mktime(2000, 1, 1, 0, 0, 0); + sec = mktime(2000, 1, 1, 0, 0, 0); + break; } + ts->tv_sec = sec; + ts->tv_nsec = 0; } int rtc_mips_set_time(unsigned long sec) diff --git a/arch/mips/sni/time.c b/arch/mips/sni/time.c index 0d9ec1a5c24..62df6a598e0 100644 --- a/arch/mips/sni/time.c +++ b/arch/mips/sni/time.c @@ -182,7 +182,8 @@ void __init plat_time_init(void) setup_pit_timer(); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return -1; + ts->tv_sec = -1; + ts->tv_nsec = 0; } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index eae4511ceea..ad63f30fe3d 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -769,7 +769,7 @@ int update_persistent_clock(struct timespec now) return ppc_md.set_rtc_time(&tm); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { struct rtc_time tm; static int first = 1; @@ -787,8 +787,9 @@ unsigned long read_persistent_clock(void) if (!ppc_md.get_rtc_time) return 0; ppc_md.get_rtc_time(&tm); - return mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec); + ts->tv_sec = mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec); + ts->tv_nsec = 0; } /* clocksource code */ diff --git a/arch/s390/kernel/time.c
b/arch/s390/kernel/time.c index e76c2e7a8b9..a94ec48587b 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -182,12 +182,9 @@ static void timing_alert_interrupt(__u16 code) static void etr_reset(void); static void stp_reset(void); -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - struct timespec ts; - - tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, &ts); - return ts.tv_sec; + tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, ts); } static cycle_t read_tod_clock(struct clocksource *cs) @@ -248,7 +245,6 @@ void __init time_init(void) { struct timespec ts; unsigned long flags; - cycle_t now; /* Reset time synchronization interfaces. */ etr_reset(); @@ -266,20 +262,10 @@ void __init time_init(void) panic("Could not register TOD clock source"); /* - * The TOD clock is an accurate clock. The xtime should be - * initialized in a way that the difference between TOD and - * xtime is reasonably small. Too bad that timekeeping_init - * sets xtime.tv_nsec to zero. In addition the clock source - * change from the jiffies clock source to the TOD clock - * source add another error of up to 1/HZ second. The same - * function sets wall_to_monotonic to a value that is too - * small for /proc/uptime to be accurate. - * Reset xtime and wall_to_monotonic to sane values. + * Reset wall_to_monotonic to the initial timestamp created + * in head.S to get a precise value in /proc/uptime. */ write_seqlock_irqsave(&xtime_lock, flags); - now = get_clock(); - tod_to_timeval(now - TOD_UNIX_EPOCH, &xtime); - clocksource_tod.cycle_last = now; tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, &ts); set_normalized_timespec(&wall_to_monotonic, -ts.tv_sec, -ts.tv_nsec); write_sequnlock_irqrestore(&xtime_lock, flags); diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index 9b352a1e3fb..3f4706aa975 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -39,11 +39,9 @@ void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time; int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time; #ifdef CONFIG_GENERIC_CMOS_UPDATE -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - struct timespec tv; - rtc_sh_get_time(&tv); - return tv.tv_sec; + rtc_sh_get_time(ts); } int update_persistent_clock(struct timespec now) diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 5d465b207e7..bf67dcb4a44 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -178,7 +178,7 @@ static int set_rtc_mmss(unsigned long nowtime) } /* not static: needed by APM */ -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned long retval, flags; @@ -186,7 +186,8 @@ unsigned long read_persistent_clock(void) retval = get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); - return retval; + ts->tv_sec = retval; + ts->tv_nsec = 0; } int update_persistent_clock(struct timespec now) diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index 8848120d291..19085ff0484 100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -59,9 +59,8 @@ static struct irqaction timer_irqaction = { void __init time_init(void) { - xtime.tv_nsec = 0; - xtime.tv_sec = read_persistent_clock(); - + /* FIXME: xtime&wall_to_monotonic are set in timekeeping_init.
*/ + read_persistent_clock(&xtime); set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); diff --git a/include/linux/time.h b/include/linux/time.h index e7c84455888..53a3216f0d1 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -101,7 +101,7 @@ extern struct timespec xtime; extern struct timespec wall_to_monotonic; extern seqlock_t xtime_lock; -extern unsigned long read_persistent_clock(void); +extern void read_persistent_clock(struct timespec *ts); extern int update_persistent_clock(struct timespec now); extern int no_sync_cmos_clock __read_mostly; void timekeeping_init(void); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 41579e7fcf9..f1a21ce491e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -154,7 +154,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); */ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static unsigned long total_sleep_time; /* seconds */ +static struct timespec total_sleep_time; /* * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. @@ -487,17 +487,18 @@ int timekeeping_valid_for_hres(void) } /** - * read_persistent_clock - Return time in seconds from the persistent clock. + * read_persistent_clock - Return time from the persistent clock. * * Weak dummy function for arches that do not yet support it. - * Returns seconds from epoch using the battery backed persistent clock. - * Returns zero if unsupported. + * Reads the time from the battery backed persistent clock. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ -unsigned long __attribute__((weak)) read_persistent_clock(void) +void __attribute__((weak)) read_persistent_clock(struct timespec *ts) { - return 0; + ts->tv_sec = 0; + ts->tv_nsec = 0; } /* @@ -507,7 +508,9 @@ void __init timekeeping_init(void) { struct clocksource *clock; unsigned long flags; - unsigned long sec = read_persistent_clock(); + struct timespec now; + + read_persistent_clock(&now); write_seqlock_irqsave(&xtime_lock, flags); @@ -518,19 +521,20 @@ void __init timekeeping_init(void) clock->enable(clock); timekeeper_setup_internals(clock); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; + xtime.tv_sec = now.tv_sec; + xtime.tv_nsec = now.tv_nsec; raw_time.tv_sec = 0; raw_time.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); update_xtime_cache(0); - total_sleep_time = 0; + total_sleep_time.tv_sec = 0; + total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); } /* time in seconds when suspend began */ -static unsigned long timekeeping_suspend_time; +static struct timespec timekeeping_suspend_time; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. 
@@ -543,18 +547,19 @@ static int timekeeping_resume(struct sys_device *dev) { unsigned long flags; - unsigned long now = read_persistent_clock(); + struct timespec ts; + + read_persistent_clock(&ts); clocksource_resume(); write_seqlock_irqsave(&xtime_lock, flags); - if (now && (now > timekeeping_suspend_time)) { - unsigned long sleep_length = now - timekeeping_suspend_time; - - xtime.tv_sec += sleep_length; - wall_to_monotonic.tv_sec -= sleep_length; - total_sleep_time += sleep_length; + if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { + ts = timespec_sub(ts, timekeeping_suspend_time); + xtime = timespec_add_safe(xtime, ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); + total_sleep_time = timespec_add_safe(total_sleep_time, ts); } update_xtime_cache(0); /* re-base the last cycle value */ @@ -577,7 +582,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; - timekeeping_suspend_time = read_persistent_clock(); + read_persistent_clock(&timekeeping_suspend_time); write_seqlock_irqsave(&xtime_lock, flags); timekeeping_forward_now(); @@ -801,9 +806,10 @@ void update_wall_time(void) */ void getboottime(struct timespec *ts) { - set_normalized_timespec(ts, - - (wall_to_monotonic.tv_sec + total_sleep_time), - - wall_to_monotonic.tv_nsec); + struct timespec boottime; + + boottime = timespec_add_safe(wall_to_monotonic, total_sleep_time); + set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } /** @@ -812,7 +818,7 @@ void getboottime(struct timespec *ts) */ void monotonic_to_bootbased(struct timespec *ts) { - ts->tv_sec += total_sleep_time; + *ts = timespec_add_safe(*ts, total_sleep_time); } unsigned long get_seconds(void) -- cgit v1.2.3 From 62a3207b8cf3de35368cdc3822b30b82d59eea95 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 17 Aug 2009 11:16:16 -0700 Subject: x86, intel_txt: Handle ACPI_SLEEP without X86_TRAMPOLINE On 32 bits, we can have CONFIG_ACPI_SLEEP set without implying CONFIG_X86_TRAMPOLINE. In that case, we simply do not need to mark the trampoline as a MAC region. Signed-off-by: H. Peter Anvin Cc: Shane Wang Cc: Joseph Cihula --- arch/x86/kernel/tboot.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a183beffe39..c2e760ca7b0 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -189,8 +189,12 @@ static int tboot_setup_sleep(void) /* S3 resume code */ add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); + +#ifdef CONFIG_X86_TRAMPOLINE /* AP trampoline code */ add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); +#endif + /* kernel code + data + bss */ add_mac_region(virt_to_phys(_text), _end - _text); -- cgit v1.2.3 From 8126dec32738421afa362114337331337b4be17f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 20 Aug 2009 20:23:11 +0800 Subject: x86: Fix system crash when loading with "reservetop" parameter The system dies if the kernel is booted with the "reservetop" parameter: in the present code, the "reservetop" parameter is parsed after early_ioremap_init(), and some functions still use early_ioremap() after that point. The problem is that the "reservetop" parameter can modify 'FIXADDR_TOP', so a virtual address obtained from early_ioremap() is based on the old 'FIXADDR_TOP' while the page mapping is based on the new 'FIXADDR_TOP'. This causes a page fault, and since the IDT is not prepared yet, the system is dead.
So this patch puts parse_early_param() in front of early_ioremap_init(). Signed-off-by: Xiao Guangrong Cc: yinghai@kernel.org Cc: Andrew Morton LKML-Reference: <4A8D402F.4080805@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63f32d220ef..02643cc3bf2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -711,6 +711,11 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + parse_early_param(); + /* VMI may relocate the fixmap; do this before touching ioremap area */ vmi_init(); @@ -793,11 +798,6 @@ void __init setup_arch(char **cmdline_p) #endif #endif - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - - parse_early_param(); - #ifdef CONFIG_X86_64 check_efer(); #endif -- cgit v1.2.3 From 3e0e1e9c5a327d4dba8490d83ef55c0564e6e8a7 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 21 Aug 2009 04:34:45 -0400 Subject: x86: Fix an incorrect argument of reserve_bootmem() This line looks suspicious, because if this is true, then the 'flags' parameter of function reserve_bootmem_generic() will be unused when !CONFIG_NUMA. I don't think this is what we want. Signed-off-by: WANG Cong Cc: Yinghai Lu Cc: akpm@linux-foundation.org LKML-Reference: <20090821083709.5098.52505.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6176fe8f29e..ea56b8cbb6a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -796,7 +796,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, return ret; #else - reserve_bootmem(phys, len, BOOTMEM_DEFAULT); + reserve_bootmem(phys, len, flags); #endif if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { -- cgit v1.2.3 From 8cab02dc3c58a12235c6d463ce684dded9696848 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 18:19:45 +0200 Subject: x86: Do not unregister PIT clocksource on PIT oneshot setup/shutdown This basically reverts commit 1a0c009ac (x86: unregister PIT clocksource when PIT is disabled) because the problem that patch tried to address has been solved by commit 3f68535ada (clocksource: sanity check sysfs clocksource changes). The problem addressed by the original patch is that the PIT could be selected as clocksource after the system switched the PIT off or set the PIT into one shot mode, which would result in complete timekeeping wreckage. Now with the sysfs sanity check in place, the PIT cannot be selected again when the system is in oneshot mode. The system will not switch to one shot mode as long as the PIT is installed, because the PIT is not suitable for one shot. The shutdown case, which happens when the lapic timer is installed, is covered by the fact that init_pit_clocksource() is called after the lapic timer takes over and then does not install the PIT clocksource at all. We should have done the sanity checks back then, but ... This also solves the locking problem which was reported vs. the clocksource rework.
LKML-Reference: Cc: Martin Schwidefsky Cc: john stultz Signed-off-by: Thomas Gleixner --- arch/x86/kernel/i8253.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 5cf36c053ac..23c167925a5 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -19,12 +19,6 @@ DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -#ifdef CONFIG_X86_32 -static void pit_disable_clocksource(void); -#else -static inline void pit_disable_clocksource(void) { } -#endif - /* * HPET replaces the PIT, when enabled. So we need to know, which of * the two timers is used @@ -57,12 +51,10 @@ static void init_pit_timer(enum clock_event_mode mode, outb_pit(0, PIT_CH0); outb_pit(0, PIT_CH0); } - pit_disable_clocksource(); break; case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ - pit_disable_clocksource(); outb_pit(0x38, PIT_MODE); break; @@ -200,17 +192,6 @@ static struct clocksource pit_cs = { .shift = 20, }; -static void pit_disable_clocksource(void) -{ - /* - * Use mult to check whether it is registered or not - */ - if (pit_cs.mult) { - clocksource_unregister(&pit_cs); - pit_cs.mult = 0; - } -} - static int __init init_pit_clocksource(void) { /* -- cgit v1.2.3 From da15cfdae03351c689736f8d142618592e3cebc3 Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 19 Aug 2009 19:13:34 -0700 Subject: time: Introduce CLOCK_REALTIME_COARSE After talking with some application writers who want very fast, but not fine-grained timestamps, I decided to try to implement new clock_ids for clock_gettime(): CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE, which return the time at the last tick. This is very fast as we don't have to access any hardware (which can be very painful if you're using something like the acpi_pm clocksource), and we can even use the vdso clock_gettime() method to avoid the syscall. The only trade-off is that you only get low-res, tick-grained time resolution. This isn't a new idea; I know Ingo has a patch in the -rt tree that made the vsyscall gettimeofday() return coarse-grained time when the vsyscall64 sysctl was set to 2. However, that affects all applications on a system. With this method, applications can choose the proper speed/granularity trade-off for themselves.
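To make the new clock ids concrete, here is a minimal userspace sketch (an illustration assuming a kernel with this patch and headers exporting CLOCK_REALTIME_COARSE, not code from the patch itself); clock_gettime() and clock_getres() are the standard POSIX calls, and older glibc needs -lrt at link time:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts, res;

	/* Tick-granularity wall time; no clocksource hardware is touched. */
	clock_gettime(CLOCK_REALTIME_COARSE, &ts);
	/* Resolution is reported as KTIME_LOW_RES, i.e. one tick (1/HZ). */
	clock_getres(CLOCK_REALTIME_COARSE, &res);

	printf("coarse time %ld.%09ld, resolution %ld ns\n",
	       (long)ts.tv_sec, ts.tv_nsec, res.tv_nsec);
	return 0;
}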
Signed-off-by: John Stultz Cc: Andi Kleen Cc: nikolag@ca.ibm.com Cc: Darren Hart Cc: arjan@infradead.org Cc: jonathan@jonmasters.org LKML-Reference: <1250734414.6897.5.camel@localhost.localdomain> Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/vgtod.h | 1 + arch/x86/kernel/vsyscall_64.c | 1 + arch/x86/vdso/vclock_gettime.c | 39 ++++++++++++++++++++++++++++++++++++--- include/linux/time.h | 4 ++++ kernel/posix-timers.c | 35 +++++++++++++++++++++++++++++++++++ kernel/time/timekeeping.c | 21 +++++++++++++++++++++ 6 files changed, 98 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index dc27a69e5d2..3d61e204826 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -21,6 +21,7 @@ struct vsyscall_gtod_data { u32 shift; } clock; struct timespec wall_to_monotonic; + struct timespec wall_time_coarse; }; extern struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 25ee06a80aa..cf53a78e2dc 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 6a40b78b46a..ee55754cc3c 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -86,14 +86,47 @@ notrace static noinline int do_monotonic(struct timespec *ts) return 0; } +notrace static noinline int do_realtime_coarse(struct timespec *ts) +{ + unsigned long seq; + do { + seq = read_seqbegin(>od->lock); + ts->tv_sec = gtod->wall_time_coarse.tv_sec; + ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; + } while (unlikely(read_seqretry(>od->lock, seq))); + return 0; +} + +notrace static noinline int do_monotonic_coarse(struct timespec *ts) +{ + unsigned long seq, ns, secs; + do { + seq = read_seqbegin(>od->lock); + secs = gtod->wall_time_coarse.tv_sec; + ns = gtod->wall_time_coarse.tv_nsec; + secs += gtod->wall_to_monotonic.tv_sec; + ns += gtod->wall_to_monotonic.tv_nsec; + } while (unlikely(read_seqretry(>od->lock, seq))); + vset_normalized_timespec(ts, secs, ns); + return 0; +} + notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { - if (likely(gtod->sysctl_enabled && gtod->clock.vread)) + if (likely(gtod->sysctl_enabled)) switch (clock) { case CLOCK_REALTIME: - return do_realtime(ts); + if (likely(gtod->clock.vread)) + return do_realtime(ts); + break; case CLOCK_MONOTONIC: - return do_monotonic(ts); + if (likely(gtod->clock.vread)) + return do_monotonic(ts); + break; + case CLOCK_REALTIME_COARSE: + return do_realtime_coarse(ts); + case CLOCK_MONOTONIC_COARSE: + return do_monotonic_coarse(ts); } return vdso_fallback_gettime(clock, ts); } diff --git a/include/linux/time.h b/include/linux/time.h index f505988398e..256232f7e5e 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -110,6 +110,8 @@ extern int timekeeping_suspended; unsigned long get_seconds(void); struct timespec current_kernel_time(void); +struct timespec __current_kernel_time(void); /* does not hold xtime_lock */ +struct timespec 
get_monotonic_coarse(void); #define CURRENT_TIME (current_kernel_time()) #define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 }) @@ -243,6 +245,8 @@ struct itimerval { #define CLOCK_PROCESS_CPUTIME_ID 2 #define CLOCK_THREAD_CPUTIME_ID 3 #define CLOCK_MONOTONIC_RAW 4 +#define CLOCK_REALTIME_COARSE 5 +#define CLOCK_MONOTONIC_COARSE 6 /* * The IDs of various hardware clocks: diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d089d052c4a..495440779ce 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) return 0; } + +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +{ + *tp = current_kernel_time(); + return 0; +} + +static int posix_get_monotonic_coarse(clockid_t which_clock, + struct timespec *tp) +{ + *tp = get_monotonic_coarse(); + return 0; +} + +int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +{ + *tp = ktime_to_timespec(KTIME_LOW_RES); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -262,10 +281,26 @@ static __init int init_posix_timers(void) .timer_create = no_timer_create, .nsleep = no_nsleep, }; + struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; + struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 15e06defca5..03cbeb34d14 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -847,6 +847,10 @@ unsigned long get_seconds(void) } EXPORT_SYMBOL(get_seconds); +struct timespec __current_kernel_time(void) +{ + return xtime_cache; +} struct timespec current_kernel_time(void) { @@ -862,3 +866,20 @@ struct timespec current_kernel_time(void) return now; } EXPORT_SYMBOL(current_kernel_time); + +struct timespec get_monotonic_coarse(void) +{ + struct timespec now, mono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + + now = xtime_cache; + mono = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, + now.tv_nsec + mono.tv_nsec); + return now; +} -- cgit v1.2.3 From d0af9eed5aa91b6b7b5049cae69e5ea956fd85c3 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 19 Aug 2009 18:05:36 -0700 Subject: x86, pat/mtrr: Rendezvous all the cpus for MTRR/PAT init SDM Vol 3a section titled "MTRR considerations in MP systems" specifies the need for synchronizing the logical cpu's while initializing/updating MTRR. Currently Linux kernel does the synchronization of all cpu's only when a single MTRR register is programmed/updated. 
During an AP online (during boot/cpu-online/resume), where we initialize all the MTRR/PAT registers, we don't follow this synchronization algorithm. This can lead to scenarios where, during a dynamic cpu online, that logical cpu is initializing MTRR/PAT with cache disabled (cr0.cd=1) etc. while the other logical HT sibling continues to run (also with cache disabled because of cr0.cd=1 on its sibling). Starting from Westmere, VMX transitions with cr0.cd=1 don't work properly (because of some VMX performance optimizations) and the above scenario (with one logical cpu doing VMX activity and another logical cpu coming online) can result in a system crash. Fix the MTRR initialization by doing a rendezvous of all the cpus. During boot and resume, we delay the MTRR/PAT init for APs till all the logical cpus come online, and the rendezvous process at the end of AP bringup will initialize the MTRR/PAT for all APs. For dynamic single cpu online, we synchronize all the logical cpus and do the MTRR/PAT init on the AP that is coming online. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mtrr.h | 7 +++++++ arch/x86/kernel/cpu/mtrr/main.c | 46 +++++++++++++++++++++++++++++++++-------- arch/x86/kernel/smpboot.c | 14 +++++++++++++ arch/x86/power/cpu.c | 2 +- kernel/cpu.c | 14 +++++++++++++ 5 files changed, 73 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index a51ada8467d..d5366ec5cb8 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -121,8 +121,12 @@ extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); +extern void set_mtrr_aps_delayed_init(void); +extern void mtrr_aps_init(void); +extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); +extern u32 mtrr_aps_delayed_init; # else static inline u8 mtrr_type_lookup(u64 addr, u64 end) { @@ -161,6 +165,9 @@ static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) +#define set_mtrr_aps_delayed_init() do {} while (0) +#define mtrr_aps_init() do {} while (0) +#define mtrr_bp_restore() do {} while (0) # endif #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7af0f88a416..7339be0aa58 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -58,6 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; +u32 mtrr_aps_delayed_init; static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; @@ -163,7 +164,10 @@ static void ipi_handler(void *info) if (data->smp_reg != ~0U) { mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - } else { + } else if (mtrr_aps_delayed_init) { + /* + * Initialize the MTRRs in addition to the synchronisation.
+ */ mtrr_if->set_all(); } @@ -265,6 +269,8 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ */ if (reg != ~0U) mtrr_if->set(reg, base, size, type); + else if (!mtrr_aps_delayed_init) + mtrr_if->set_all(); /* Wait for the others */ while (atomic_read(&data.count)) @@ -721,9 +727,7 @@ void __init mtrr_bp_init(void) void mtrr_ap_init(void) { - unsigned long flags; - - if (!mtrr_if || !use_intel()) + if (!use_intel() || mtrr_aps_delayed_init) return; /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries @@ -738,11 +742,7 @@ void mtrr_ap_init(void) * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug * lock to prevent mtrr entry changes */ - local_irq_save(flags); - - mtrr_if->set_all(); - - local_irq_restore(flags); + set_mtrr(~0U, 0, 0, 0); } /** @@ -753,6 +753,34 @@ void mtrr_save_state(void) smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); } +void set_mtrr_aps_delayed_init(void) +{ + if (!use_intel()) + return; + + mtrr_aps_delayed_init = 1; +} + +/* + * MTRR initialization for all AP's + */ +void mtrr_aps_init(void) +{ + if (!use_intel()) + return; + + set_mtrr(~0U, 0, 0, 0); + mtrr_aps_delayed_init = 0; +} + +void mtrr_bp_restore(void) +{ + if (!use_intel()) + return; + + mtrr_if->set_all(); +} + static int __init mtrr_init_finialize(void) { if (!mtrr_if) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..d720b7e0cf3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1116,9 +1116,22 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (is_uv_system()) uv_system_init(); + + set_mtrr_aps_delayed_init(); out: preempt_enable(); } + +void arch_enable_nonboot_cpus_begin(void) +{ + set_mtrr_aps_delayed_init(); +} + +void arch_enable_nonboot_cpus_end(void) +{ + mtrr_aps_init(); +} + /* * Early setup to make printk work. */ @@ -1140,6 +1153,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif check_nmi_watchdog(); + mtrr_aps_init(); } static int __initdata setup_possible_cpus = -1; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index b3d20b9cac6..417c9f5b4af 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -242,7 +242,7 @@ static void __restore_processor_state(struct saved_context *ctxt) fix_processor_context(); do_fpu_end(); - mtrr_ap_init(); + mtrr_bp_restore(); #ifdef CONFIG_X86_OLD_MCE mcheck_init(&boot_cpu_data); diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4a..f5f9485b8c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -413,6 +413,14 @@ int disable_nonboot_cpus(void) return error; } +void __weak arch_enable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_enable_nonboot_cpus_end(void) +{ +} + void __ref enable_nonboot_cpus(void) { int cpu, error; @@ -424,6 +432,9 @@ void __ref enable_nonboot_cpus(void) goto out; printk("Enabling non-boot CPUs ...\n"); + + arch_enable_nonboot_cpus_begin(); + for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { @@ -432,6 +443,9 @@ void __ref enable_nonboot_cpus(void) } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } + + arch_enable_nonboot_cpus_end(); + cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); -- cgit v1.2.3 From 5400743db5a06a4e6e298725a2044c40edcb27b9 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Fri, 21 Aug 2009 17:00:02 -0700 Subject: x86, mtrr: make mtrr_aps_delayed_init static bool mtr_aps_delayed_init was declared u32 and made global, but it only ever takes boolean values and is only ever used in arch/x86/kernel/cpu/mtrr/main.c. Declare it "static bool" and remove external references. Signed-off-by: H. Peter Anvin Cc: Suresh Siddha --- arch/x86/include/asm/mtrr.h | 1 - arch/x86/kernel/cpu/mtrr/main.c | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index d5366ec5cb8..4365ffdb461 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -126,7 +126,6 @@ extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); -extern u32 mtrr_aps_delayed_init; # else static inline u8 mtrr_type_lookup(u64 addr, u64 end) { diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7339be0aa58..84e83de5457 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -58,7 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; -u32 mtrr_aps_delayed_init; +static bool mtrr_aps_delayed_init; static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; @@ -758,7 +758,7 @@ void set_mtrr_aps_delayed_init(void) if (!use_intel()) return; - mtrr_aps_delayed_init = 1; + mtrr_aps_delayed_init = true; } /* @@ -770,7 +770,7 @@ void mtrr_aps_init(void) return; set_mtrr(~0U, 0, 0, 0); - mtrr_aps_delayed_init = 0; + mtrr_aps_delayed_init = false; } void mtrr_bp_restore(void) -- cgit v1.2.3 From 680b6cfd3cee30a7d997d49430fb73af84523853 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 26 Aug 2009 16:20:36 +0900 Subject: x86, mce: CE in last bank prevents panic by unknown MCE If MCE handler is called but none of mces_seen have machine check event which might signal the MCE (i.e. event higher than MCE_KEEP_SEVERITY), panic with "Machine check from unknown source" will be taken since the MCE is assumed to be signaled from external agent or so. Usually mces_seen never point MCE_KEEP_SEVERITY event such as CE. But it can happen because initial value of mces_seen is accidentally modified by mce_no_way_out() - in case if mce_no_way_out() run through all banks and the last bank has the CE, mces_seen points the CE and the "panic by unknown" will not be taken. This patch fixes this undesired behavior, and clarifies the logic. Signed-off-by: Hidetoshi Seto Cc: H. Peter Anvin Cc: Andi Kleen Cc: Jin Dongming LKML-Reference: <4A94E244.3020301@jp.fujitsu.com> Signed-off-by: Ingo Molnar Reported-by: Jin Dongming --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 54bd1b2fb4c..325559d1aa5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -612,7 +612,7 @@ out: * This way we prevent any potential data corruption in a unrecoverable case * and also makes sure always all CPU's errors are examined. * - * Also this detects the case of an machine check event coming from outer + * Also this detects the case of a machine check event coming from outer * space (not detected by any CPUs) In this case some external agent wants * us to shut down, so panic too. 
* @@ -665,7 +665,7 @@ static void mce_reign(void) * No machine check event found. Must be some external * source or one CPU is hung. Panic. */ - if (!m && tolerant < 3) + if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) mce_panic("Machine check from unknown source", NULL, NULL); /* @@ -889,11 +889,11 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - no_way_out = mce_no_way_out(&m, &msg); - final = &__get_cpu_var(mces_seen); *final = m; + no_way_out = mce_no_way_out(&m, &msg); + barrier(); /* -- cgit v1.2.3 From 5fc517466dd3d0fc6d2a5180ca6792e60344d8be Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:32 -0700 Subject: x86, pat: Keep identity maps consistent with mmaps even when pat_disabled Make reserve_memtype internally take care of pat disabled case and fallback to default return values. Remove the specific pat_disabled checks in track_* routines. Change kernel_map_sync_memtype to sync identity map even when pat_disabled. This change ensures that, even for pat_disabled case, we take care of keeping identity map in sync. Before this patch, in pat disabled case, ioremap() keeps the identity maps in sync and other APIs like pci and /dev/mem mmap don't, which is not a very consistent behavior. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e6718bb2806..d5af2792d2f 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -339,6 +339,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (new_type) { if (req_type == -1) *new_type = _PAGE_CACHE_WB; + else if (req_type == _PAGE_CACHE_WC) + *new_type = _PAGE_CACHE_UC_MINUS; else *new_type = req_type & _PAGE_CACHE_MASK; } @@ -577,7 +579,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) { unsigned long id_sz; - if (!pat_enabled || base >= __pa(high_memory)) + if (base >= __pa(high_memory)) return 0; id_sz = (__pa(high_memory) < base + size) ? @@ -677,9 +679,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - if (!pat_enabled) - return 0; - /* * For now, only handle remap_pfn_range() vmas where * is_linear_pfn_mapping() == TRUE. Handling of @@ -715,9 +714,6 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - if (!pat_enabled) - return 0; - /* * For now, only handle remap_pfn_range() vmas where * is_linear_pfn_mapping() == TRUE. Handling of @@ -743,9 +739,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - if (!pat_enabled) - return; - /* * For now, only handle remap_pfn_range() vmas where * is_linear_pfn_mapping() == TRUE. Handling of -- cgit v1.2.3 From 279e669b3fc0068cc3509e8e53036999e1e86588 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:33 -0700 Subject: x86, pat: ioremap to follow same PAT restrictions as other PAT users ioremap has this hard-coded check for new type and requested type. That check differs from other PAT users like /dev/mem mmap, remap_pfn_range in only one condition where requested type is UC_MINUS and new type is WC. 
Under that condition, ioremap fails. But other PAT interfaces succeed with a WC mapping. Change to make ioremap be in sync with other PAT APIs and use the same macro as others. Also changes the error print to KERN_ERR instead of pr_debug. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 8a450930834..aeaea8c5b2f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -228,24 +228,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, retval = reserve_memtype(phys_addr, (u64)phys_addr + size, prot_val, &new_prot_val); if (retval) { - pr_debug("Warning: reserve_memtype returned %d\n", retval); + printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); return NULL; } if (prot_val != new_prot_val) { - /* - * Do not fallback to certain memory types with certain - * requested type: - * - request is uc-, return cannot be write-back - * - request is uc-, return cannot be write-combine - * - request is write-combine, return cannot be write-back - */ - if ((prot_val == _PAGE_CACHE_UC_MINUS && - (new_prot_val == _PAGE_CACHE_WB || - new_prot_val == _PAGE_CACHE_WC)) || - (prot_val == _PAGE_CACHE_WC && - new_prot_val == _PAGE_CACHE_WB)) { - pr_debug( + if (!is_new_memtype_allowed(prot_val, new_prot_val)) { + printk(KERN_ERR "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", (unsigned long long)phys_addr, (unsigned long long)(phys_addr + size), -- cgit v1.2.3 From 9fd126bc742f74a95d2ba610247712ff05da02fe Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:34 -0700 Subject: x86, pat: New i/f for driver to request memtype for IO regions Add new routines to request memtype for IO regions. This will currently be a backend for io_mapping_* routines. But, it can also be made available to drivers directly in future, in case it is needed. reserve interface reserves the memory, makes sure we have a compatible memory type available and keeps the identity map in sync when needed. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pat.h | 5 +++++ arch/x86/mm/pat.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 7af14e512f9..e2c1668dde7 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -19,4 +19,9 @@ extern int free_memtype(u64 start, u64 end); extern int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flag); +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type); + +void io_free_memtype(resource_size_t start, resource_size_t end); + #endif /* _ASM_X86_PAT_H */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index d5af2792d2f..82d097ce309 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -498,6 +498,55 @@ int free_memtype(u64 start, u64 end) } +/** + * io_reserve_memtype - Request a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + * @type: A pointer to memtype, with requested type. 
On success, requested + * or any other compatible type that was available for the region is returned + * + * On success, returns 0 + * On failure, returns non-zero + */ +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type) +{ + unsigned long req_type = *type; + unsigned long new_type; + int ret; + + WARN_ON_ONCE(iomem_map_sanity_check(start, end - start)); + + ret = reserve_memtype(start, end, req_type, &new_type); + if (ret) + goto out_err; + + if (!is_new_memtype_allowed(req_type, new_type)) + goto out_free; + + if (kernel_map_sync_memtype(start, end - start, new_type) < 0) + goto out_free; + + *type = new_type; + return 0; + +out_free: + free_memtype(start, end); + ret = -EBUSY; +out_err: + return ret; +} + +/** + * io_free_memtype - Release a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + */ +void io_free_memtype(resource_size_t start, resource_size_t end) +{ + free_memtype(start, end); +} + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { -- cgit v1.2.3 From 9e36fda0b359d2a6ae039c3d7e71a04502a77898 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:35 -0700 Subject: x86, pat: Add PAT reserve free to io_mapping* APIs io_mapping_* interfaces were added, mainly for graphics drivers. Make this interface go through the PAT reserve/free, instead of hardcoding WC mapping. This makes sure that there are no aliases due to unconditional WC setting. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/iomap.h | 9 ++++++--- arch/x86/mm/iomap_32.c | 27 +++++++++++++++++++++++++-- include/linux/io-mapping.h | 17 ++++++++++++----- 3 files changed, 43 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 0e9fe1d9d97..f35eb45d657 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -26,13 +26,16 @@ #include #include -int -is_io_mapping_possible(resource_size_t base, unsigned long size); - void * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); void iounmap_atomic(void *kvaddr, enum km_type type); +int +iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); + +void +iomap_free(resource_size_t base, unsigned long size); + #endif /* _ASM_X86_IOMAP_H */ diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index fe6f84ca121..84e236ce76b 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -21,7 +21,7 @@ #include #include -int is_io_mapping_possible(resource_size_t base, unsigned long size) +static int is_io_mapping_possible(resource_size_t base, unsigned long size) { #if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) /* There is no way to map greater than 1 << 32 address without PAE */ @@ -30,7 +30,30 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size) #endif return 1; } -EXPORT_SYMBOL_GPL(is_io_mapping_possible); + +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) +{ + unsigned long flag = _PAGE_CACHE_WC; + int ret; + + if (!is_io_mapping_possible(base, size)) + return -EINVAL; + + ret = io_reserve_memtype(base, base + size, &flag); + if (ret) + return ret; + + *prot = __pgprot(__PAGE_KERNEL | flag); + return 0; +} +EXPORT_SYMBOL_GPL(iomap_create_wc); + +void +iomap_free(resource_size_t 
base, unsigned long size) +{ + io_free_memtype(base, base + size); +} +EXPORT_SYMBOL_GPL(iomap_free); void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) { diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 0adb0f91568..97eb928b492 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -49,23 +49,30 @@ static inline struct io_mapping * io_mapping_create_wc(resource_size_t base, unsigned long size) { struct io_mapping *iomap; - - if (!is_io_mapping_possible(base, size)) - return NULL; + pgprot_t prot; iomap = kmalloc(sizeof(*iomap), GFP_KERNEL); if (!iomap) - return NULL; + goto out_err; + + if (iomap_create_wc(base, size, &prot)) + goto out_free; iomap->base = base; iomap->size = size; - iomap->prot = pgprot_writecombine(__pgprot(__PAGE_KERNEL)); + iomap->prot = prot; return iomap; + +out_free: + kfree(iomap); +out_err: + return NULL; } static inline void io_mapping_free(struct io_mapping *mapping) { + iomap_free(mapping->base, mapping->size); kfree(mapping); } -- cgit v1.2.3 From 335ef896d4c6639849d79367f0fef9abc06d121b Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:36 -0700 Subject: x86, pat: Add rbtree to do quick lookup in memtype tracking PAT memtype tracking uses a linear linked list to keep track of IO (non-RAM) regions and their memtypes. The code used a last_accessed pointer as a cache to speed up the lookup. As per discussions with H. Peter Anvin a while back, having an rbtree here will avoid bad performance in pathological cases where we may end up with a huge linked list. This may not add any noticeable performance speedup in the normal case as the number of entries in the PAT memtype list tends to be in the ~20-30 range. The patch removes the "cached_entry" logic as with the rbtree we have a more generic way of speeding up the lookup. With this patch, we use the rbtree to do the quick lookup. We still use a linked list as the memtype ranges tracked can be of different sizes and can overlap in different ways. We also keep track of usage counts with the linked list. Example: Multiple ioremaps with different sizes uncached-minus @ 0xfffff00000-0xfffff04000 uncached-minus @ 0xfffff02000-0xfffff03000 And one userlevel mmap and the thread forks a new process uncached-minus @ 0xbf453000-0xbf454000 uncached-minus @ 0xbf453000-0xbf454000 Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 106 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 82d097ce309..c90f2420f56 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -148,11 +149,10 @@ static char *cattr_name(unsigned long flags) * areas). All the aliases have the same cache attributes of course. * Zero attributes are represented as holes. * - * Currently the data structure is a list because the number of mappings - * are expected to be relatively small. If this should be a problem - * it could be changed to a rbtree or similar. + * The data structure is a list that is also organized as an rbtree + * sorted on the start address of memtype range. * - * memtype_lock protects the whole list. + * memtype_lock protects both the linear list and rbtree. 
*/ struct memtype { @@ -160,11 +160,53 @@ struct memtype { u64 end; unsigned long type; struct list_head nd; + struct rb_node rb; }; +static struct rb_root memtype_rbroot = RB_ROOT; static LIST_HEAD(memtype_list); static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ +static struct memtype *memtype_rb_search(struct rb_root *root, u64 start) +{ + struct rb_node *node = root->rb_node; + struct memtype *last_lower = NULL; + + while (node) { + struct memtype *data = container_of(node, struct memtype, rb); + + if (data->start < start) { + last_lower = data; + node = node->rb_right; + } else if (data->start > start) { + node = node->rb_left; + } else + return data; + } + + /* Will return NULL if there is no entry with its start <= start */ + return last_lower; +} + +static void memtype_rb_insert(struct rb_root *root, struct memtype *data) +{ + struct rb_node **new = &(root->rb_node); + struct rb_node *parent = NULL; + + while (*new) { + struct memtype *this = container_of(*new, struct memtype, rb); + + parent = *new; + if (data->start <= this->start) + new = &((*new)->rb_left); + else if (data->start > this->start) + new = &((*new)->rb_right); + } + + rb_link_node(&data->rb, parent, new); + rb_insert_color(&data->rb, root); +} + /* * Does intersection of PAT memory type and MTRR memory type and returns * the resulting memory type as PAT understands it. @@ -218,9 +260,6 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) return -EBUSY; } -static struct memtype *cached_entry; -static u64 cached_start; - static int pat_pagerange_is_ram(unsigned long start, unsigned long end) { int ram_page = 0, not_rampage = 0; @@ -382,17 +421,19 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, spin_lock(&memtype_lock); - if (cached_entry && start >= cached_start) - entry = cached_entry; - else + entry = memtype_rb_search(&memtype_rbroot, new->start); + if (likely(entry != NULL)) { + /* To work correctly with list_for_each_entry_continue */ + entry = list_entry(entry->nd.prev, struct memtype, nd); + } else { entry = list_entry(&memtype_list, struct memtype, nd); + } /* Search for existing mapping that overlaps the current range */ where = NULL; list_for_each_entry_continue(entry, &memtype_list, nd) { if (end <= entry->start) { where = entry->nd.prev; - cached_entry = list_entry(where, struct memtype, nd); break; } else if (start <= entry->start) { /* end > entry->start */ err = chk_conflict(new, entry, new_type); @@ -400,8 +441,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); where = entry->nd.prev; - cached_entry = list_entry(where, - struct memtype, nd); } break; } else if (start < entry->end) { /* start > entry->start */ @@ -409,8 +448,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!err) { dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); - cached_entry = list_entry(entry->nd.prev, - struct memtype, nd); /* * Move to right position in the linked @@ -438,13 +475,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, return err; } - cached_start = start; - if (where) list_add(&new->nd, where); else list_add_tail(&new->nd, &memtype_list); + memtype_rb_insert(&memtype_rbroot, new); + spin_unlock(&memtype_lock); dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", @@ -456,7 +493,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, int free_memtype(u64 start, u64 end) { - struct memtype 
*entry; + struct memtype *entry, *saved_entry; int err = -EINVAL; int is_range_ram; @@ -474,17 +511,46 @@ int free_memtype(u64 start, u64 end) return -EINVAL; spin_lock(&memtype_lock); + + entry = memtype_rb_search(&memtype_rbroot, start); + if (unlikely(entry == NULL)) + goto unlock_ret; + + /* + * Saved entry points to an entry with start same or less than what + * we searched for. Now go through the list in both directions to look + * for the entry that matches with both start and end, with list stored + * in sorted start address + */ + saved_entry = entry; list_for_each_entry(entry, &memtype_list, nd) { if (entry->start == start && entry->end == end) { - if (cached_entry == entry || cached_start == start) - cached_entry = NULL; + rb_erase(&entry->rb, &memtype_rbroot); + list_del(&entry->nd); + kfree(entry); + err = 0; + break; + } else if (entry->start > start) { + break; + } + } + + if (!err) + goto unlock_ret; + entry = saved_entry; + list_for_each_entry_reverse(entry, &memtype_list, nd) { + if (entry->start == start && entry->end == end) { + rb_erase(&entry->rb, &memtype_rbroot); list_del(&entry->nd); kfree(entry); err = 0; break; + } else if (entry->start < start) { + break; } } +unlock_ret: spin_unlock(&memtype_lock); if (err) { -- cgit v1.2.3 From 46cf98cdaef5471926010b5bddf84c44ec177fdd Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:37 -0700 Subject: x86, pat: Generalize the use of page flag PG_uncached Only IA64 was using PG_uncached as of now. We now intend to use this bit in x86 as well, to keep track of memory type of those addresses that have page struct for them. So, generalize the use of that bit across ia64 and x86. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/ia64/Kconfig | 4 ++++ arch/x86/Kconfig | 4 ++++ include/linux/page-flags.h | 4 ++-- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 170042b420d..e6246119932 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -112,6 +112,10 @@ config IA64_UNCACHED_ALLOCATOR bool select GENERIC_ALLOCATOR +config ARCH_USES_PG_UNCACHED + def_bool y + depends on IA64_UNCACHED_ALLOCATOR + config AUDIT_ARCH bool default y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c07f7220590..8e159538219 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1414,6 +1414,10 @@ config X86_PAT If unsure, say Y. 
+config ARCH_USES_PG_UNCACHED + def_bool y + depends on X86_PAT + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e2e5ce54359..2b87acfc5f8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -99,7 +99,7 @@ enum pageflags { #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT PG_mlocked, /* Page is vma mlocked */ #endif -#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR +#ifdef CONFIG_ARCH_USES_PG_UNCACHED PG_uncached, /* Page has been mapped as uncached */ #endif __NR_PAGEFLAGS, @@ -257,7 +257,7 @@ PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked) #endif -#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR +#ifdef CONFIG_ARCH_USES_PG_UNCACHED PAGEFLAG(Uncached, uncached) #else PAGEFLAG_FALSE(Uncached) -- cgit v1.2.3 From f58417409603d62f2eb23db4d2cf6853d84a1698 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:38 -0700 Subject: x86, pat: Use page flags to track memtypes of RAM pages Change reserve_ram_pages_type and free_ram_pages_type to use 2 page flags to track UC_MINUS, WC, WB and default types. Previous RAM tracking just tracked WB or NonWB, which was not complete and did not allow tracking of RAM fully and there was no way to get the actual type reserved by looking at the page flags. We use the memtype_lock spinlock for atomicity in dealing with memtype tracking in struct page. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cacheflush.h | 54 ++++++++++++++++++++++- arch/x86/mm/pat.c | 91 +++++++++++++++++++++------------------ 2 files changed, 102 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e55dfc1ad45..b54f6afe7ec 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -43,8 +43,58 @@ static inline void copy_from_user_page(struct vm_area_struct *vma, memcpy(dst, src, len); } -#define PG_non_WB PG_arch_1 -PAGEFLAG(NonWB, non_WB) +#define PG_WC PG_arch_1 +PAGEFLAG(WC, WC) + +#ifdef CONFIG_X86_PAT +/* + * X86 PAT uses page flags WC and Uncached together to keep track of + * memory type of pages that have backing page struct. X86 PAT supports 3 + * different memory types, _PAGE_CACHE_WB, _PAGE_CACHE_WC and + * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not + * been changed from its default (value of -1 used to denote this). + * Note we do not support _PAGE_CACHE_UC here. + * + * Caller must hold memtype_lock for atomicity. 
+ */ +static inline unsigned long get_page_memtype(struct page *pg) +{ + if (!PageUncached(pg) && !PageWC(pg)) + return -1; + else if (!PageUncached(pg) && PageWC(pg)) + return _PAGE_CACHE_WC; + else if (PageUncached(pg) && !PageWC(pg)) + return _PAGE_CACHE_UC_MINUS; + else + return _PAGE_CACHE_WB; +} + +static inline void set_page_memtype(struct page *pg, unsigned long memtype) +{ + switch (memtype) { + case _PAGE_CACHE_WC: + ClearPageUncached(pg); + SetPageWC(pg); + break; + case _PAGE_CACHE_UC_MINUS: + SetPageUncached(pg); + ClearPageWC(pg); + break; + case _PAGE_CACHE_WB: + SetPageUncached(pg); + SetPageWC(pg); + break; + default: + case -1: + ClearPageUncached(pg); + ClearPageWC(pg); + break; + } +} +#else +static inline unsigned long get_page_memtype(struct page *pg) { return -1; } +static inline void set_page_memtype(struct page *pg, unsigned long memtype) { } +#endif /* * The set_memory_* API can be used to change various attributes of a virtual diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index c90f2420f56..1a9d0f07593 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -288,63 +288,61 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end) } /* - * For RAM pages, mark the pages as non WB memory type using - * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or - * set_memory_wc() on a RAM page at a time before marking it as WB again. - * This is ok, because only one driver will be owning the page and - * doing set_memory_*() calls. + * For RAM pages, we use page flags to mark the pages with appropriate type. + * Here we do two pass: + * - Find the memtype of all the pages in the range, look for any conflicts + * - In case of no conflicts, set the new memtype for pages in the range * - * For now, we use PageNonWB to track that the RAM page is being mapped - * as non WB. In future, we will have to use one more flag - * (or some other mechanism in page_struct) to distinguish between - * UC and WC mapping. + * Caller must hold memtype_lock for atomicity. 
*/ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, unsigned long *new_type) { struct page *page; - u64 pfn, end_pfn; + u64 pfn; + + if (req_type == _PAGE_CACHE_UC) { + /* We do not support strong UC */ + WARN_ON_ONCE(1); + req_type = _PAGE_CACHE_UC_MINUS; + } for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { - page = pfn_to_page(pfn); - if (page_mapped(page) || PageNonWB(page)) - goto out; + unsigned long type; - SetPageNonWB(page); + page = pfn_to_page(pfn); + type = get_page_memtype(page); + if (type != -1) { + printk(KERN_INFO "reserve_ram_pages_type failed " + "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", + start, end, type, req_type); + if (new_type) + *new_type = type; + + return -EBUSY; + } } - return 0; -out: - end_pfn = pfn; - for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { + if (new_type) + *new_type = req_type; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - ClearPageNonWB(page); + set_page_memtype(page, req_type); } - - return -EINVAL; + return 0; } static int free_ram_pages_type(u64 start, u64 end) { struct page *page; - u64 pfn, end_pfn; + u64 pfn; for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - if (page_mapped(page) || !PageNonWB(page)) - goto out; - - ClearPageNonWB(page); + set_page_memtype(page, -1); } return 0; - -out: - end_pfn = pfn; - for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { - page = pfn_to_page(pfn); - SetPageNonWB(page); - } - return -EINVAL; } /* @@ -405,11 +403,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, *new_type = actual_type; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) - return reserve_ram_pages_type(start, end, req_type, - new_type); - else if (is_range_ram < 0) + if (is_range_ram == 1) { + + spin_lock(&memtype_lock); + err = reserve_ram_pages_type(start, end, req_type, new_type); + spin_unlock(&memtype_lock); + + return err; + } else if (is_range_ram < 0) { return -EINVAL; + } new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) @@ -505,10 +508,16 @@ int free_memtype(u64 start, u64 end) return 0; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) - return free_ram_pages_type(start, end); - else if (is_range_ram < 0) + if (is_range_ram == 1) { + + spin_lock(&memtype_lock); + err = free_ram_pages_type(start, end); + spin_unlock(&memtype_lock); + + return err; + } else if (is_range_ram < 0) { return -EINVAL; + } spin_lock(&memtype_lock); -- cgit v1.2.3 From 637b86e75f4c255a4446bc0b67ce9d914b9d2d42 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:39 -0700 Subject: x86, pat: Add lookup_memtype to get the current memtype of a paddr Add a new routine lookup_memtype() to get the current memtype based on the PAT reserves and frees. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/pat.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1a9d0f07593..71aa6f7246c 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -573,6 +573,51 @@ unlock_ret: } +/** + * lookup_memtype - Looksup the memory type for a physical address + * @paddr: physical address of which memory type needs to be looked up + * + * Only to be called when PAT is enabled + * + * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or + * _PAGE_CACHE_UC + */ +static unsigned long lookup_memtype(u64 paddr) +{ + int rettype = _PAGE_CACHE_WB; + struct memtype *entry; + + if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) + return rettype; + + if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { + struct page *page; + spin_lock(&memtype_lock); + page = pfn_to_page(paddr >> PAGE_SHIFT); + rettype = get_page_memtype(page); + spin_unlock(&memtype_lock); + /* + * -1 from get_page_memtype() implies RAM page is in its + * default state and not reserved, and hence of type WB + */ + if (rettype == -1) + rettype = _PAGE_CACHE_WB; + + return rettype; + } + + spin_lock(&memtype_lock); + + entry = memtype_rb_search(&memtype_rbroot, paddr); + if (entry != NULL) + rettype = entry->type; + else + rettype = _PAGE_CACHE_UC_MINUS; + + spin_unlock(&memtype_lock); + return rettype; +} + /** * io_reserve_memtype - Request a memory type mapping for a region of memory * @start: start (physical address) of the region -- cgit v1.2.3 From 1087637616dd5e96d834164ea462aed6159d039b Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:40 -0700 Subject: x86, pat: Lookup the protection from memtype list on vm_insert_pfn() Lookup the reserved memtype during vm_insert_pfn and use that memtype for the new mapping. This takes care of handling of the vm_insert_pfn() interface in track_pfn_vma*/untrack_pfn_vma. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 71aa6f7246c..b629f75f73d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -848,11 +848,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. - */ if (is_linear_pfn_mapping(vma)) { /* * reserve the whole chunk covered by vma. We need the @@ -880,20 +875,24 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long size) { + unsigned long flags; resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - /- * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. 
- */ if (is_linear_pfn_mapping(vma)) { /* reserve the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; return reserve_pfn_range(paddr, vma_size, prot, 0); } + if (!pat_enabled) + return 0; + + /* for vm_insert_pfn and friends, we set prot based on lookup */ + flags = lookup_memtype(pfn << PAGE_SHIFT); + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + return 0; } @@ -908,11 +907,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. - */ if (is_linear_pfn_mapping(vma)) { /* free the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; -- cgit v1.2.3 From d886c73cd4cf02a71e1650cbcb6176799d78aac1 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:41 -0700 Subject: x86, pat: Sanity check remap_pfn_range for RAM region Add sanity check for remap_pfn_range of RAM regions using lookup_memtype(). Previously, we did not have anyway to get the type of RAM memory regions as they were tracked using a single bit in page_struct (WB, nonWB). Now we can get the actual type from page struct (WB, WC, UC_MINUS) and make sure the requester gets that type. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index b629f75f73d..a6cace0694a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -783,11 +783,29 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, is_ram = pat_pagerange_is_ram(paddr, paddr + size); /* - * reserve_pfn_range() doesn't support RAM pages. Maintain the current - * behavior with RAM pages by returning success. + * reserve_pfn_range() for RAM pages. We do not refcount to keep + * track of number of mappings of RAM pages. We can assert that + * the type requested matches the type of first page in the range. */ - if (is_ram != 0) + if (is_ram) { + if (!pat_enabled) + return 0; + + flags = lookup_memtype(paddr); + if (want_flags != flags) { + printk(KERN_WARNING + "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } return 0; + } ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); if (ret) -- cgit v1.2.3 From 57844a8f8e29802f37ad9a0f94eb11d6ae358603 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 14:48:38 +0200 Subject: x86: Add x86_init infrastructure The upcoming Moorestown support brings the embedded world to x86. The setup code of x86 has already a couple of hooks which are either x86_quirks or paravirt ops. Some of those setup hooks are pretty convoluted like the timer setup and the tsc calibration code. But there are other places which could do with a cleanup. 
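The direction is the plain C ops-table pattern: a struct of function pointers preloaded with harmless defaults, which early platform code may overwrite. A minimal standalone sketch of the idea, with hypothetical names (not the kernel's actual interfaces):

	#include <stdio.h>

	struct init_ops {
		void (*probe_roms)(void);	/* default: do nothing */
	};

	static void noop(void) { }

	static void board_probe_roms(void) { printf("probing ROMs\n"); }

	static struct init_ops init_sketch = { .probe_roms = noop };

	int main(void)
	{
		/* early platform code overrides the default... */
		init_sketch.probe_roms = board_probe_roms;
		/* ...and generic code always calls through the table, no #ifdef */
		init_sketch.probe_roms();
		return 0;
	}
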
Instead of having inline functions/macros which are modified at compile time I decided to introduce x86_init ops which are unconditional in the code and make it clear that they can be changed either during compile time or in the early boot process. The function pointers are initialized by default functions which can be noops so that the pointer can be called unconditionally in the most cases. This also allows us to remove 32bit/64bit, paravirt and other #ifdeffery. paravirt guests are just a hardware platform in the setup code, so we should treat them as such and not hide all behind multiple layers of indirection and compile time dependencies. It's more obvious that x86_init.timers.timer_init() is a function pointer than the late_time_init = choose_time_init() obscurity. It's also way simpler to grep for x86_init.timers.timer_init and find all the places which modify that function pointer instead of analyzing weak functions, macros and paravirt indirections. Note. This is not a general paravirt_ops replacement. It just will move setup related hooks which are potentially useful for other platform setup purposes as well out of the paravirt domain. Add the base infrastructure without any functionality. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 2 ++ arch/x86/include/asm/x86_init.h | 15 +++++++++++++++ arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/x86_init.c | 17 +++++++++++++++++ 4 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/x86_init.h create mode 100644 arch/x86/kernel/x86_init.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 4093d1ed6db..741e2956f3c 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -7,6 +7,8 @@ #ifndef __ASSEMBLY__ +#include + /* * Any setup quirks to be performed? */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h new file mode 100644 index 00000000000..14d11071675 --- /dev/null +++ b/arch/x86/include/asm/x86_init.h @@ -0,0 +1,15 @@ +#ifndef _ASM_X86_PLATFORM_H +#define _ASM_X86_PLATFORM_H + +/** + * struct x86_init_ops - functions for platform specific setup + * + */ +struct x86_init_ops { +}; + +extern struct x86_init_ops x86_init; + +extern void x86_init_noop(void); + +#endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7..313ed6fca9b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -32,7 +32,7 @@ GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y += setup.o i8259.o irqinit.o +obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c new file mode 100644 index 00000000000..82d510c9c99 --- /dev/null +++ b/arch/x86/kernel/x86_init.c @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2009 Thomas Gleixner + * + * For licencing details see kernel-base/COPYING + */ +#include + +#include + +void __cpuinit x86_init_noop(void) { } + +/* + * The platform setup functions are preset with the default functions + * for standard PC hardware. 
+ */ +struct __initdata x86_init_ops x86_init = { +}; -- cgit v1.2.3 From f7cf5a5b8c0e59eac8d30b62271cb0fa52e53ebc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 14:43:56 +0200 Subject: x86: Add probe_roms to x86_init probe_roms is only used on 32bit. Add it to the x86_init ops and remove the #ifdefs. Default initializer is x86_init_noop() which is overridden in the 32bit boot code. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/x86_init.h | 10 ++++++++++ arch/x86/kernel/head32.c | 3 +++ arch/x86/kernel/setup.c | 4 +--- arch/x86/kernel/x86_init.c | 4 ++++ 4 files changed, 18 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 14d11071675..75e9e68d635 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,11 +1,21 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +/** + * struct x86_init_resources - platform specific resource related ops + * @probe_roms: probe BIOS roms + * + */ +struct x86_init_resources { + void (*probe_roms)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * */ struct x86_init_ops { + struct x86_init_resources resources; }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3f8579f8d42..4049353152c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,6 +29,9 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif + /* Initilize 32bit specific setup functions */ + x86_init.resources.probe_roms = probe_roms; + reserve_ebda_region(); /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63f32d220ef..5796eb158d4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -835,9 +835,7 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor(&boot_cpu_data); -#ifdef CONFIG_X86_32 - probe_roms(); -#endif + x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ insert_resource(&iomem_resource, &code_resource); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 82d510c9c99..88883f8006c 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,4 +14,8 @@ void __cpuinit x86_init_noop(void) { } * for standard PC hardware. */ struct __initdata x86_init_ops x86_init = { + + .resources = { + .probe_roms = x86_init_noop, + }, }; -- cgit v1.2.3 From 8fee697d990c54976c8dc167270633299e2515d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 14:55:50 +0200 Subject: x86: Add request_standard_resources to x86_init The 32bit and the 64bit code are slightly different in the reservation of standard resources. Also the upcoming Moorestown support needs its own version of that. Add it to x86_init_ops and initialize it with the 64bit default. 32bit overrides it in early boot. 
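A board with nonstandard fixed resources would then supply its own variant along these lines; a hypothetical sketch (the board name, the flash window address and the myboard_* helpers are invented for illustration), mirroring the i386_reserve_resources() override in the diff:

	static struct resource myboard_flash_resource = {
		.name	= "Board flash",
		.start	= 0xfff00000,	/* assumed window, illustration only */
		.end	= 0xffffffff,
		.flags	= IORESOURCE_BUSY | IORESOURCE_MEM,
	};

	static void __init myboard_reserve_resources(void)
	{
		request_resource(&iomem_resource, &myboard_flash_resource);
		reserve_standard_io_resources();
	}

	static void __init myboard_early_setup(void)
	{
		x86_init.resources.reserve_resources = myboard_reserve_resources;
	}
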
Now moorestown can add its own override w/o sprinkling the code with more #ifdefs Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 3 +++ arch/x86/include/asm/x86_init.h | 3 +++ arch/x86/kernel/head32.c | 1 + arch/x86/kernel/setup.c | 28 ++++++++++++++++------------ arch/x86/kernel/x86_init.c | 3 ++- 5 files changed, 25 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 741e2956f3c..19769ac6061 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -88,6 +88,9 @@ extern unsigned long saved_video_mode; #define paravirt_post_allocator_init() do {} while (0) #endif +extern void reserve_standard_io_resources(void); +extern void i386_reserve_resources(void); + #ifndef _SETUP /* diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 75e9e68d635..d0d9be25ed9 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -4,10 +4,13 @@ /** * struct x86_init_resources - platform specific resource related ops * @probe_roms: probe BIOS roms + * @reserve_resources: reserve the standard resources for the + * platform * */ struct x86_init_resources { void (*probe_roms)(void); + void (*reserve_resources)(void); }; /** diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 4049353152c..d91c37c0206 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,6 +31,7 @@ void __init i386_start_kernel(void) #endif /* Initilize 32bit specific setup functions */ x86_init.resources.probe_roms = probe_roms; + x86_init.resources.reserve_resources = i386_reserve_resources; reserve_ebda_region(); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5796eb158d4..c2a8090e831 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -171,13 +171,6 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; /* common cpu data for all cpus */ @@ -605,7 +598,7 @@ static struct resource standard_io_resources[] = { .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -static void __init reserve_standard_io_resources(void) +void __init reserve_standard_io_resources(void) { int i; @@ -1013,10 +1006,7 @@ void __init setup_arch(char **cmdline_p) e820_reserve_resources(); e820_mark_nosave_regions(max_low_pfn); -#ifdef CONFIG_X86_32 - request_resource(&iomem_resource, &video_ram_resource); -#endif - reserve_standard_io_resources(); + x86_init.resources.reserve_resources(); e820_setup_gap(); @@ -1102,4 +1092,18 @@ void __init x86_quirk_time_init(void) irq0.mask = cpumask_of_cpu(0); setup_irq(0, &irq0); } + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +void __init i386_reserve_resources(void) +{ + request_resource(&iomem_resource, &video_ram_resource); + reserve_standard_io_resources(); } + #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 88883f8006c..68c093b67ec 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -5,7 +5,7 @@ */ #include -#include +#include void __cpuinit x86_init_noop(void) { } @@ -17,5 +17,6 @@ 
struct __initdata x86_init_ops x86_init = { .resources = { .probe_roms = x86_init_noop, + .reserve_resources = reserve_standard_io_resources, }, }; -- cgit v1.2.3 From 816c25e7d4fb6fd40022a376e8b7f45b1edf5a89 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 14:36:27 +0200 Subject: x86: Add reserve_ebda_region to x86_init_ops reserve_ebda_region needs to be called before start_kernel. Moorestown needs to override it. Make it a x86_init_ops function and initialize it with the default reserve_ebda_region. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/head32.c | 4 ++-- arch/x86/kernel/head64.c | 3 +-- arch/x86/kernel/x86_init.c | 2 ++ 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index d0d9be25ed9..8a971cb3dd3 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -6,11 +6,13 @@ * @probe_roms: probe BIOS roms * @reserve_resources: reserve the standard resources for the * platform + * @reserve_ebda_region: reserve the extended bios data area * */ struct x86_init_resources { void (*probe_roms)(void); void (*reserve_resources)(void); + void (*reserve_ebda_region)(void); }; /** diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d91c37c0206..921a23b6c14 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include void __init i386_start_kernel(void) @@ -33,7 +33,7 @@ void __init i386_start_kernel(void) x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; - reserve_ebda_region(); + x86_init.resources.reserve_ebda_region(); /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 70eaa852c73..cead8149c3d 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -23,7 +23,6 @@ #include #include #include -#include #include static void __init zap_identity_mappings(void) @@ -112,7 +111,7 @@ void __init x86_64_start_reservations(char *real_mode_data) } #endif - reserve_ebda_region(); + x86_init.resources.reserve_ebda_region(); /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 68c093b67ec..1fff49a6858 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -5,6 +5,7 @@ */ #include +#include #include void __cpuinit x86_init_noop(void) { } @@ -18,5 +19,6 @@ struct __initdata x86_init_ops x86_init = { .resources = { .probe_roms = x86_init_noop, .reserve_resources = reserve_standard_io_resources, + .reserve_ebda_region = reserve_ebda_region, }, }; -- cgit v1.2.3 From 6b18ae3e2ff62daa9f181401759161dd8de0aadf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 10:19:54 +0200 Subject: x86: Move memory_setup to x86_init_ops memory_setup is overridden by x86_quirks and by paravirts with weak functions and quirks. Unify the whole mess and make it an unconditional x86_init_ops function which defaults to the standard function and can be overridden by the early platform code. 
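For a platform that knows its memory layout up front, an override then boils down to feeding the generic e820 code a fixed map and returning its origin string. A hypothetical sketch (the platform name and the 128M size are assumptions; lguest and Xen below install their real counterparts the same way):

	static char * __init myplat_memory_setup(void)
	{
		/* assumed fixed map: 128M of RAM starting at 0 */
		e820_add_region(0, 128 << 20, E820_RAM);
		return "MYPLAT fixed map";
	}

	static void __init myplat_early_setup(void)
	{
		x86_init.resources.memory_setup = myplat_memory_setup;
	}
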
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/e820.h | 2 -- arch/x86/include/asm/paravirt_types.h | 1 - arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/numaq_32.c | 1 - arch/x86/kernel/e820.c | 19 +------------------ arch/x86/kernel/paravirt.c | 6 ------ arch/x86/kernel/visws_quirks.c | 3 ++- arch/x86/kernel/x86_init.c | 2 ++ arch/x86/lguest/boot.c | 3 ++- arch/x86/xen/enlighten.c | 3 ++- 11 files changed, 11 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 7ecba4d8508..40b4e614fe7 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -126,8 +126,6 @@ extern void e820_reserve_resources(void); extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); -extern char *machine_specific_memory_setup(void); -extern char *memory_setup(void); #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 2b3371bae29..6d668968b6b 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -81,7 +81,6 @@ struct pv_init_ops { /* Basic arch-specific setup */ void (*arch_setup)(void); - char *(*memory_setup)(void); void (*post_allocator_init)(void); /* Print a banner to identify the environment */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 19769ac6061..9cba9d6ca88 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -22,7 +22,6 @@ struct x86_quirks { int (*arch_pre_intr_init)(void); int (*arch_intr_init)(void); int (*arch_trap_init)(void); - char * (*arch_memory_setup)(void); int (*mach_get_smp_config)(unsigned int early); int (*mach_find_smp_config)(unsigned int reserve); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 8a971cb3dd3..6c084f2a6c3 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -7,12 +7,14 @@ * @reserve_resources: reserve the standard resources for the * platform * @reserve_ebda_region: reserve the extended bios data area + * @memory_setup: platform specific memory setup * */ struct x86_init_resources { void (*probe_roms)(void); void (*reserve_resources)(void); void (*reserve_ebda_region)(void); + char *(*memory_setup)(void); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index ca96e68f0d2..403c062f69e 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -260,7 +260,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, .arch_pre_intr_init = NULL, - .arch_memory_setup = NULL, .arch_intr_init = NULL, .arch_trap_init = NULL, .mach_get_smp_config = NULL, diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 5cb5725b2ba..0d804b907e8 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1455,28 +1455,11 @@ char *__init default_machine_specific_memory_setup(void) return who; } -char *__init __attribute__((weak)) machine_specific_memory_setup(void) -{ - if (x86_quirks->arch_memory_setup) { - char *who = x86_quirks->arch_memory_setup(); - - if (who) - return who; - } - return default_machine_specific_memory_setup(); -} - -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -char * __init __attribute__((weak)) memory_setup(void) -{ - 
return machine_specific_memory_setup(); -} - void __init setup_memory_map(void) { char *who; - who = memory_setup(); + who = x86_init.resources.memory_setup(); memcpy(&e820_saved, &e820, sizeof(struct e820map)); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map(who); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 70ec9b951d7..532c9a2626c 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -60,11 +60,6 @@ static void __init default_banner(void) pv_info.name); } -char *memory_setup(void) -{ - return pv_init_ops.memory_setup(); -} - /* Simple instruction patching code. */ #define DEF_NATIVE(ops, name, code) \ extern const char start_##ops##_##name[], end_##ops##_##name[]; \ @@ -322,7 +317,6 @@ struct pv_init_ops pv_init_ops = { .patch = native_patch, .banner = default_banner, .arch_setup = paravirt_nop, - .memory_setup = machine_specific_memory_setup, }; struct pv_time_ops pv_time_ops = { diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 31ffc24eec4..97c670df1ae 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -239,7 +239,6 @@ static int visws_trap_init(void); static struct x86_quirks visws_x86_quirks __initdata = { .arch_time_init = visws_time_init, .arch_pre_intr_init = visws_pre_intr_init, - .arch_memory_setup = visws_memory_setup, .arch_intr_init = NULL, .arch_trap_init = visws_trap_init, .mach_get_smp_config = visws_get_smp_config, @@ -263,6 +262,8 @@ void __init visws_early_detect(void) */ x86_quirks = &visws_x86_quirks; + x86_init.resources.memory_setup = visws_memory_setup; + /* * Install reboot quirks: */ diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 1fff49a6858..1965bff3489 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -7,6 +7,7 @@ #include #include +#include void __cpuinit x86_init_noop(void) { } @@ -20,5 +21,6 @@ struct __initdata x86_init_ops x86_init = { .probe_roms = x86_init_noop, .reserve_resources = reserve_standard_io_resources, .reserve_ebda_region = reserve_ebda_region, + .memory_setup = default_machine_specific_memory_setup, }, }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index d677fa9ca65..11445c176de 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1270,7 +1270,6 @@ __init void lguest_init(void) pv_irq_ops.safe_halt = lguest_safe_halt; /* Setup operations */ - pv_init_ops.memory_setup = lguest_memory_setup; pv_init_ops.patch = lguest_patch; /* Intercepts of various CPU instructions */ @@ -1325,6 +1324,8 @@ __init void lguest_init(void) pv_time_ops.time_init = lguest_time_init; pv_time_ops.get_tsc_khz = lguest_tsc_khz; + x86_init.resources.memory_setup = lguest_memory_setup; + /* * Now is a good time to look at the implementations of these functions * before returning to the rest of lguest_init(). diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e90540a46a0..50b20c64f0b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -841,7 +841,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, .banner = xen_banner, - .memory_setup = xen_memory_setup, .arch_setup = xen_arch_setup, .post_allocator_init = xen_post_allocator_init, }; @@ -982,6 +981,8 @@ asmlinkage void __init xen_start_kernel(void) pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; + x86_init.resources.memory_setup = xen_memory_setup; + #ifdef CONFIG_X86_64 /* * Setup percpu state. 
We only need to do this for 64-bit -- cgit v1.2.3 From f4848472cd99487e182b64fb2a5d0e4fedbe86ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 12:05:01 +0200 Subject: x86: Sanitize smp_record and move it to x86_init_ops The x86 quirkification introduced an extra ugly hackery with a variable pointer in the mpparse code. If the pointer is initialized then it is dereferenced and the variable set to 0 or incremented. Create a x86_init_ops function and let the affected numaq code hold the function. Default init is a setup noop. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 12 +++++++++++- arch/x86/kernel/apic/numaq_32.c | 19 ++++++++++++++++--- arch/x86/kernel/mpparse.c | 6 ++---- arch/x86/kernel/x86_init.c | 5 +++++ 5 files changed, 34 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 9cba9d6ca88..bbf2dfd59b4 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -25,7 +25,6 @@ struct x86_quirks { int (*mach_get_smp_config)(unsigned int early); int (*mach_find_smp_config)(unsigned int reserve); - int *mpc_record; int (*mpc_apic_id)(struct mpc_cpu *m); void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); void (*mpc_oem_pci_bus)(struct mpc_bus *m); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 6c084f2a6c3..10b297b1881 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,6 +1,14 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +/** + * struct x86_init_mpparse - platform specific mpparse ops + * @mpc_record: platform specific mpc record accounting + */ +struct x86_init_mpparse { + void (*mpc_record)(unsigned int mode); +}; + /** * struct x86_init_resources - platform specific resource related ops * @probe_roms: probe BIOS roms @@ -22,11 +30,13 @@ struct x86_init_resources { * */ struct x86_init_ops { - struct x86_init_resources resources; + struct x86_init_resources resources; + struct x86_init_mpparse mpparse; }; extern struct x86_init_ops x86_init; extern void x86_init_noop(void); +extern void x86_init_uint_noop(unsigned int unused); #endif diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 403c062f69e..b5f0b1dc7dd 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -66,7 +66,6 @@ struct mpc_trans { unsigned short trans_reserved; }; -/* x86_quirks member */ static int mpc_record; static struct mpc_trans *translation_table[MAX_MPC_ENTRY]; @@ -177,6 +176,19 @@ static void mpc_oem_pci_bus(struct mpc_bus *m) quad_local_to_mp_bus_id[quad][local] = m->busid; } +/* + * Called from mpparse code. 
+ * mode = 0: prescan + * mode = 1: one mpc entry scanned + */ +static void numaq_mpc_record(unsigned int mode) +{ + if (!mode) + mpc_record = 0; + else + mpc_record++; +} + static void __init MP_translation_info(struct mpc_trans *m) { printk(KERN_INFO @@ -264,7 +276,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_trap_init = NULL, .mach_get_smp_config = NULL, .mach_find_smp_config = NULL, - .mpc_record = &mpc_record, .mpc_apic_id = mpc_apic_id, .mpc_oem_bus_info = mpc_oem_bus_info, .mpc_oem_pci_bus = mpc_oem_pci_bus, @@ -285,8 +296,10 @@ static __init void early_check_numaq(void) if (smp_found_config) early_get_smp_config(); - if (found_numaq) + if (found_numaq) { x86_quirks = &numaq_x86_quirks; + x86_init.mpparse.mpc_record = numaq_mpc_record; + } } int __init get_memcfg_numaq(void) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 651c93b2886..b2179fdf0ff 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -320,8 +320,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) /* * Now process the configuration blocks. */ - if (x86_quirks->mpc_record) - *x86_quirks->mpc_record = 0; + x86_init.mpparse.mpc_record(0); while (count < mpc->length) { switch (*mpt) { @@ -353,8 +352,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) count = mpc->length; break; } - if (x86_quirks->mpc_record) - (*x86_quirks->mpc_record)++; + x86_init.mpparse.mpc_record(1); } #ifdef CONFIG_X86_BIGSMP diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 1965bff3489..83bd5db376b 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -10,6 +10,7 @@ #include void __cpuinit x86_init_noop(void) { } +void __init x86_init_uint_noop(unsigned int unused) { } /* * The platform setup functions are preset with the default functions @@ -23,4 +24,8 @@ struct __initdata x86_init_ops x86_init = { .reserve_ebda_region = reserve_ebda_region, .memory_setup = default_machine_specific_memory_setup, }, + + .mpparse = { + .mpc_record = x86_init_uint_noop, + }, }; -- cgit v1.2.3 From de93410310952fb7b705f784ef22493c8362dbe8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 09:27:29 +0200 Subject: x86: Move ioapic_ids_setup to x86_init_ops 32bit and also the numaq code have special requirements on the ioapic_id setup. 
Convert it to a x86_init_ops function and get rid of the quirks and #ifdefs Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 3 ++- arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/io_apic.c | 11 ++++------- arch/x86/kernel/apic/numaq_32.c | 8 +------- arch/x86/kernel/head32.c | 3 +++ arch/x86/kernel/x86_init.c | 1 + 7 files changed, 13 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 330ee807f89..2b8aeb89933 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -177,12 +177,13 @@ extern int setup_ioapic_entry(int apic, int irq, int polarity, int vector, int pin); extern void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e); +extern void setup_ioapic_ids_from_mpc(void); #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 +#define setup_ioapic_ids_from_mpc x86_init_noop static const int timer_through_8259 = 0; static inline void ioapic_init_mappings(void) { } static inline void ioapic_insert_resources(void) { } - static inline void probe_nr_irqs_gsi(void) { } #endif diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index bbf2dfd59b4..cc8b4b0550e 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -30,7 +30,6 @@ struct x86_quirks { void (*mpc_oem_pci_bus)(struct mpc_bus *m); void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable, unsigned short oemsize); - int (*setup_ioapic_ids)(void); }; extern void x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 10b297b1881..65985730b37 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -4,9 +4,11 @@ /** * struct x86_init_mpparse - platform specific mpparse ops * @mpc_record: platform specific mpc record accounting + * @setup_ioapic_ids: platform specific ioapic id override */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); + void (*setup_ioapic_ids)(void); }; /** diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d2ed6c5ddc8..5f4687187ce 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2014,7 +2014,7 @@ void disable_IO_APIC(void) * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 */ -static void __init setup_ioapic_ids_from_mpc(void) +void __init setup_ioapic_ids_from_mpc(void) { union IO_APIC_reg_00 reg_00; physid_mask_t phys_id_present_map; @@ -2023,9 +2023,8 @@ static void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; - if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) + if (acpi_ioapic) return; - /* * Don't check I/O APIC IDs for xAPIC systems. They have * no meaning without the serial APIC bus. @@ -3061,10 +3060,8 @@ void __init setup_IO_APIC(void) /* * Set up IO-APIC IRQ routing. 
*/ -#ifdef CONFIG_X86_32 - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); -#endif + x86_init.mpparse.setup_ioapic_ids(); + sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index b5f0b1dc7dd..f3717659265 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -262,12 +262,6 @@ static void __init } } -static int __init numaq_setup_ioapic_ids(void) -{ - /* so can skip it */ - return 1; -} - static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, @@ -280,7 +274,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mpc_oem_bus_info = mpc_oem_bus_info, .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, - .setup_ioapic_ids = numaq_setup_ioapic_ids, }; static __init void early_check_numaq(void) @@ -299,6 +292,7 @@ static __init void early_check_numaq(void) if (found_numaq) { x86_quirks = &numaq_x86_quirks; x86_init.mpparse.mpc_record = numaq_mpc_record; + x86_init.mpparse.setup_ioapic_ids = x86_init_noop; } } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 921a23b6c14..a21398fac4f 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include void __init i386_start_kernel(void) { @@ -32,6 +34,7 @@ void __init i386_start_kernel(void) /* Initilize 32bit specific setup functions */ x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; x86_init.resources.reserve_ebda_region(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 83bd5db376b..f4a32b3ab02 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -27,5 +27,6 @@ struct __initdata x86_init_ops x86_init = { .mpparse = { .mpc_record = x86_init_uint_noop, + .setup_ioapic_ids = x86_init_noop, }, }; -- cgit v1.2.3 From fd6c6661492226bb82f422157c535ac573cbecbd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 10:41:58 +0200 Subject: x86: Move mpc_apic_id to x86_init_ops The mpc_apic_id setup is handled by a x86_quirk. Make it a x86_init_ops function with a default implementation. 
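The default implementation just hands back the apicid field of the MP table entry; a platform with clustered APICs could translate it instead. A hypothetical sketch (myplat_cluster() is an invented board helper, loosely modeled on the numaq quad translation):

	static int __init myplat_mpc_apic_id(struct mpc_cpu *m)
	{
		/* fold an assumed board specific cluster id into the apic id */
		return (m->apicid & 0x0f) | (myplat_cluster(m) << 4);
	}

	static void __init myplat_early_setup(void)
	{
		x86_init.mpparse.mpc_apic_id = myplat_mpc_apic_id;
	}
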
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 2 ++ arch/x86/include/asm/setup.h | 2 -- arch/x86/include/asm/x86_init.h | 4 ++++ arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/mpparse.c | 10 ++++++---- arch/x86/kernel/x86_init.c | 2 ++ 6 files changed, 15 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index e2a1bb6d71e..03c6a92bfd4 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -62,10 +62,12 @@ extern void get_smp_config(void); extern void find_smp_config(void); extern void early_reserve_e820_mpc_new(void); extern int enable_update_mptable; +extern int default_mpc_apic_id(struct mpc_cpu *m); #else static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } #define enable_update_mptable 0 +#define default_mpc_apic_id NULL #endif void __cpuinit generic_processor_info(int apicid, int version); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index cc8b4b0550e..7c7f44f3e4d 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -12,7 +12,6 @@ /* * Any setup quirks to be performed? */ -struct mpc_cpu; struct mpc_bus; struct mpc_oemtable; @@ -25,7 +24,6 @@ struct x86_quirks { int (*mach_get_smp_config)(unsigned int early); int (*mach_find_smp_config)(unsigned int reserve); - int (*mpc_apic_id)(struct mpc_cpu *m); void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); void (*mpc_oem_pci_bus)(struct mpc_bus *m); void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable, diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 65985730b37..f2be2a78018 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,14 +1,18 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +struct mpc_cpu; + /** * struct x86_init_mpparse - platform specific mpparse ops * @mpc_record: platform specific mpc record accounting * @setup_ioapic_ids: platform specific ioapic id override + * @mpc_apic_id: platform specific mpc apic id assignment */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); void (*setup_ioapic_ids)(void); + int (*mpc_apic_id)(struct mpc_cpu *m); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index f3717659265..222413f7e79 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -270,7 +270,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_trap_init = NULL, .mach_get_smp_config = NULL, .mach_find_smp_config = NULL, - .mpc_apic_id = mpc_apic_id, .mpc_oem_bus_info = mpc_oem_bus_info, .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, @@ -293,6 +292,7 @@ static __init void early_check_numaq(void) x86_quirks = &numaq_x86_quirks; x86_init.mpparse.mpc_record = numaq_mpc_record; x86_init.mpparse.setup_ioapic_ids = x86_init_noop; + x86_init.mpparse.mpc_apic_id = mpc_apic_id; } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b2179fdf0ff..04560860a72 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -45,6 +45,11 @@ static int __init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } +int __init default_mpc_apic_id(struct mpc_cpu *m) +{ + return m->apicid; +} + static void __init MP_processor_info(struct mpc_cpu *m) { int apicid; @@ -55,10 +60,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - if 
(x86_quirks->mpc_apic_id) - apicid = x86_quirks->mpc_apic_id(m); - else - apicid = m->apicid; + apicid = x86_init.mpparse.mpc_apic_id(m); if (m->cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index f4a32b3ab02..08749f2612f 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -28,5 +29,6 @@ struct __initdata x86_init_ops x86_init = { .mpparse = { .mpc_record = x86_init_uint_noop, .setup_ioapic_ids = x86_init_noop, + .mpc_apic_id = default_mpc_apic_id, }, }; -- cgit v1.2.3 From 72302142e165313ee58af81bd76708c12b58d7ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 12:18:32 +0200 Subject: x86: Move smp_read_mpc_oem to x86_init_ops. Move smp_read_mpc_oem from quirks to x86_init. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 2 ++ arch/x86/include/asm/setup.h | 3 --- arch/x86/include/asm/x86_init.h | 3 +++ arch/x86/kernel/apic/numaq_32.c | 6 +++--- arch/x86/kernel/mpparse.c | 8 ++++---- arch/x86/kernel/x86_init.c | 1 + 6 files changed, 13 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 03c6a92bfd4..5de8e92be2d 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -63,11 +63,13 @@ extern void find_smp_config(void); extern void early_reserve_e820_mpc_new(void); extern int enable_update_mptable; extern int default_mpc_apic_id(struct mpc_cpu *m); +extern void default_smp_read_mpc_oem(struct mpc_table *mpc); #else static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } #define enable_update_mptable 0 #define default_mpc_apic_id NULL +#define default_smp_read_mpc_oem NULL #endif void __cpuinit generic_processor_info(int apicid, int version); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 7c7f44f3e4d..adb5d44d990 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -13,7 +13,6 @@ * Any setup quirks to be performed? 
*/ struct mpc_bus; -struct mpc_oemtable; struct x86_quirks { int (*arch_pre_time_init)(void); @@ -26,8 +25,6 @@ struct x86_quirks { void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); void (*mpc_oem_pci_bus)(struct mpc_bus *m); - void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable, - unsigned short oemsize); }; extern void x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f2be2a78018..fc0eef2f5fd 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -2,17 +2,20 @@ #define _ASM_X86_PLATFORM_H struct mpc_cpu; +struct mpc_table; /** * struct x86_init_mpparse - platform specific mpparse ops * @mpc_record: platform specific mpc record accounting * @setup_ioapic_ids: platform specific ioapic id override * @mpc_apic_id: platform specific mpc apic id assignment + * @smp_read_mpc_oem: platform specific oem mpc table setup */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); void (*setup_ioapic_ids)(void); int (*mpc_apic_id)(struct mpc_cpu *m); + void (*smp_read_mpc_oem)(struct mpc_table *mpc); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 222413f7e79..1bd3b0ed240 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -218,9 +218,9 @@ static int __init mpf_checksum(unsigned char *mp, int len) /* * Read/parse the MPC oem tables */ -static void __init - smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize) +static void __init smp_read_mpc_oem(struct mpc_table *mpc) { + struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr; int count = sizeof(*oemtable); /* the header size */ unsigned char *oemptr = ((unsigned char *)oemtable) + count; @@ -272,7 +272,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mach_find_smp_config = NULL, .mpc_oem_bus_info = mpc_oem_bus_info, .mpc_oem_pci_bus = mpc_oem_pci_bus, - .smp_read_mpc_oem = smp_read_mpc_oem, }; static __init void early_check_numaq(void) @@ -293,6 +292,7 @@ static __init void early_check_numaq(void) x86_init.mpparse.mpc_record = numaq_mpc_record; x86_init.mpparse.setup_ioapic_ids = x86_init_noop; x86_init.mpparse.mpc_apic_id = mpc_apic_id; + x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 04560860a72..45abdf63edc 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -293,6 +293,8 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) 1, mpc, mpc->length, 1); } +void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -314,10 +316,8 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (early) return 1; - if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) { - struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr; - x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize); - } + if (mpc->oemptr) + x86_init.mpparse.smp_read_mpc_oem(mpc); /* * Now process the configuration blocks. 
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 08749f2612f..fb5d93c077d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -30,5 +30,6 @@ struct __initdata x86_init_ops x86_init = { .mpc_record = x86_init_uint_noop, .setup_ioapic_ids = x86_init_noop, .mpc_apic_id = default_mpc_apic_id, + .smp_read_mpc_oem = default_smp_read_mpc_oem, }, }; -- cgit v1.2.3 From 52fdb5684660f9fd7129f7bbbe279a02893bacb8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 12:45:33 +0200 Subject: x86: Move mpc_oem_pci_bus to x86_init_ops Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 3 +++ arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/mpparse.c | 4 ++-- 4 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index adb5d44d990..fd2267baba1 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -24,7 +24,6 @@ struct x86_quirks { int (*mach_find_smp_config)(unsigned int reserve); void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); - void (*mpc_oem_pci_bus)(struct mpc_bus *m); }; extern void x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index fc0eef2f5fd..404e2d2b06d 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +struct mpc_bus; struct mpc_cpu; struct mpc_table; @@ -10,12 +11,14 @@ struct mpc_table; * @setup_ioapic_ids: platform specific ioapic id override * @mpc_apic_id: platform specific mpc apic id assignment * @smp_read_mpc_oem: platform specific oem mpc table setup + * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL) */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); void (*setup_ioapic_ids)(void); int (*mpc_apic_id)(struct mpc_cpu *m); void (*smp_read_mpc_oem)(struct mpc_table *mpc); + void (*mpc_oem_pci_bus)(struct mpc_bus *m); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 1bd3b0ed240..feebe8eed7d 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -271,7 +271,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mach_get_smp_config = NULL, .mach_find_smp_config = NULL, .mpc_oem_bus_info = mpc_oem_bus_info, - .mpc_oem_pci_bus = mpc_oem_pci_bus, }; static __init void early_check_numaq(void) @@ -293,6 +292,7 @@ static __init void early_check_numaq(void) x86_init.mpparse.setup_ioapic_ids = x86_init_noop; x86_init.mpparse.mpc_apic_id = mpc_apic_id; x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; + x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 45abdf63edc..72e1140723c 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -98,8 +98,8 @@ static void __init MP_bus_info(struct mpc_bus *m) mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { - if (x86_quirks->mpc_oem_pci_bus) - x86_quirks->mpc_oem_pci_bus(m); + if (x86_init.mpparse.mpc_oem_pci_bus) + x86_init.mpparse.mpc_oem_pci_bus(m); clear_bit(m->busid, mp_bus_not_pci); #if defined(CONFIG_EISA) || defined(CONFIG_MCA) -- cgit v1.2.3 From 90e1c6969d8711edb888a00ec54c74370f125c8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 12:34:47 
+0200 Subject: x86: Move oem_bus_info to x86_init_ops Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 6 ++++++ arch/x86/include/asm/setup.h | 3 --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/mpparse.c | 14 ++++++++------ arch/x86/kernel/x86_init.c | 1 + 6 files changed, 18 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 5de8e92be2d..e3c579efde7 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -64,12 +64,18 @@ extern void early_reserve_e820_mpc_new(void); extern int enable_update_mptable; extern int default_mpc_apic_id(struct mpc_cpu *m); extern void default_smp_read_mpc_oem(struct mpc_table *mpc); +# ifdef CONFIG_X86_IO_APIC +extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str); +# else +# define default_mpc_oem_bus_info NULL +# endif #else static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } #define enable_update_mptable 0 #define default_mpc_apic_id NULL #define default_smp_read_mpc_oem NULL +#define default_mpc_oem_bus_info NULL #endif void __cpuinit generic_processor_info(int apicid, int version); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index fd2267baba1..6121a8ac7b0 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -12,7 +12,6 @@ /* * Any setup quirks to be performed? */ -struct mpc_bus; struct x86_quirks { int (*arch_pre_time_init)(void); @@ -22,8 +21,6 @@ struct x86_quirks { int (*arch_trap_init)(void); int (*mach_get_smp_config)(unsigned int early); int (*mach_find_smp_config)(unsigned int reserve); - - void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); }; extern void x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 404e2d2b06d..2833a873a90 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -12,6 +12,7 @@ struct mpc_table; * @mpc_apic_id: platform specific mpc apic id assignment * @smp_read_mpc_oem: platform specific oem mpc table setup * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL) + * @mpc_oem_bus_info: platform specific mpc bus info */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); @@ -19,6 +20,7 @@ struct x86_init_mpparse { int (*mpc_apic_id)(struct mpc_cpu *m); void (*smp_read_mpc_oem)(struct mpc_table *mpc); void (*mpc_oem_pci_bus)(struct mpc_bus *m); + void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index feebe8eed7d..700273dca68 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -270,7 +270,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_trap_init = NULL, .mach_get_smp_config = NULL, .mach_find_smp_config = NULL, - .mpc_oem_bus_info = mpc_oem_bus_info, }; static __init void early_check_numaq(void) @@ -293,6 +292,7 @@ static __init void early_check_numaq(void) x86_init.mpparse.mpc_apic_id = mpc_apic_id; x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; + x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 72e1140723c..a42f23f1dc7 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -72,16 +72,18 @@ static void __init 
MP_processor_info(struct mpc_cpu *m) } #ifdef CONFIG_X86_IO_APIC -static void __init MP_bus_info(struct mpc_bus *m) +void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) { - char str[7]; memcpy(str, m->bustype, 6); str[6] = 0; + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} + +static void __init MP_bus_info(struct mpc_bus *m) +{ + char str[7]; - if (x86_quirks->mpc_oem_bus_info) - x86_quirks->mpc_oem_bus_info(m, str); - else - apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); + x86_init.mpparse.mpc_oem_bus_info(m, str); #if MAX_MP_BUSSES < 256 if (m->busid >= MAX_MP_BUSSES) { diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index fb5d93c077d..27685edc546 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -31,5 +31,6 @@ struct __initdata x86_init_ops x86_init = { .setup_ioapic_ids = x86_init_noop, .mpc_apic_id = default_mpc_apic_id, .smp_read_mpc_oem = default_smp_read_mpc_oem, + .mpc_oem_bus_info = default_mpc_oem_bus_info, }, }; -- cgit v1.2.3 From 7285dd7fd375763bfb8ab1ac9cf3f1206f503c16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 28 Aug 2009 20:25:24 +0200 Subject: clocksource: Resolve cpu hotplug dead lock with TSC unstable Martin Schwidefsky analyzed it: To register a clocksource the clocksource_mutex is acquired and if necessary timekeeping_notify is called to install the clocksource as the timekeeper clock. timekeeping_notify uses stop_machine which needs to take cpu_add_remove_lock mutex. Starting a new cpu is done with the cpu_add_remove_lock mutex held. native_cpu_up checks the tsc of the new cpu and if the tsc is no good clocksource_change_rating is called. Which needs the clocksource_mutex and the deadlock is complete. The solution is to replace the TSC via the clocksource watchdog mechanism. Mark the TSC as unstable and schedule the watchdog work so it gets removed in the watchdog thread context. 
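The fix is an instance of a general escape from lock-order inversion: a context that already holds lock A must not acquire lock B when another path takes B before A, so it records the state change under a cheap spinlock and leaves the B-protected work to a worker that holds nothing else. A rough user-space model of that deferral follows; the names merely stand in for clocksource_mutex, watchdog_lock and the watchdog work, and the polling loop stands in for the kernel's work queue.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t heavy_lock = PTHREAD_MUTEX_INITIALIZER; /* ~clocksource_mutex */
static pthread_mutex_t flag_lock  = PTHREAD_MUTEX_INITIALIZER; /* ~watchdog_lock */
static int unstable;

/* ~clocksource_mark_unstable(): safe while other locks are held */
static void mark_unstable(void)
{
	pthread_mutex_lock(&flag_lock);
	unstable = 1;				/* record state, defer the rest */
	pthread_mutex_unlock(&flag_lock);
}

/* ~the watchdog worker: the only context taking the heavy lock */
static void *worker(void *arg)
{
	(void)arg;
	for (;;) {
		int u;

		pthread_mutex_lock(&flag_lock);
		u = unstable;
		pthread_mutex_unlock(&flag_lock);
		if (u) {
			pthread_mutex_lock(&heavy_lock);
			puts("demoting clocksource");	/* ~change_rating */
			pthread_mutex_unlock(&heavy_lock);
			return NULL;
		}
		usleep(1000);
	}
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	mark_unstable();	/* called from "cpu bringup" without deadlock */
	pthread_join(t, NULL);
	return 0;
}

Build with -pthread; the point is only that mark_unstable() never touches heavy_lock itself.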
Signed-off-by: Thomas Gleixner LKML-Reference: Cc: Martin Schwidefsky Cc: John Stultz --- arch/x86/kernel/tsc.c | 8 +++++--- include/linux/clocksource.h | 1 + kernel/time/clocksource.c | 33 ++++++++++++++++++++++++++++++--- 3 files changed, 36 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 968425422c4..fc3672a303d 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -767,12 +767,14 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - printk("Marking TSC unstable due to %s\n", reason); + printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else + clocksource_mark_unstable(&clocksource_tsc); + else { + clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; clocksource_tsc.rating = 0; + } } } diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 9ea40ff26f0..83d2fbd81b9 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -277,6 +277,7 @@ extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_resume(void); extern struct clocksource * __init __weak clocksource_default_clock(void); +extern void clocksource_mark_unstable(struct clocksource *cs); #ifdef CONFIG_GENERIC_TIME_VSYSCALL extern void update_vsyscall(struct timespec *ts, struct clocksource *c); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e0c86ad6e9f..a0af4ffcb6e 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -149,15 +149,42 @@ static void clocksource_watchdog_work(struct work_struct *work) kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); } -static void clocksource_unstable(struct clocksource *cs, int64_t delta) +static void __clocksource_unstable(struct clocksource *cs) { - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", - cs->name, delta); cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; schedule_work(&watchdog_work); } +static void clocksource_unstable(struct clocksource *cs, int64_t delta) +{ + printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", + cs->name, delta); + __clocksource_unstable(cs); +} + +/** + * clocksource_mark_unstable - mark clocksource unstable via watchdog + * @cs: clocksource to be marked unstable + * + * This function is called instead of clocksource_change_rating from + * cpu hotplug code to avoid a deadlock between the clocksource mutex + * and the cpu hotplug mutex. It defers the update of the clocksource + * to the watchdog thread. 
+ */ +void clocksource_mark_unstable(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { + if (list_empty(&cs->wd_list)) + list_add(&cs->wd_list, &watchdog_list); + __clocksource_unstable(cs); + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + static void clocksource_watchdog(unsigned long data) { struct clocksource *cs; -- cgit v1.2.3 From a192a9580bcc41692be1f36b77c3b681827f566a Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 28 Jul 2009 16:45:54 -0400 Subject: ACPI: Move definition of PREFIX from acpi_bus.h to internal.h Linux/ACPI core files using internal.h all use PREFIX "ACPI: ", however, not all ACPI drivers use/want it -- and they should not have to #undef PREFIX to define their own. Add GPL comment to internal.h while we are there. This does not change any actual console output, aside from a whitespace fix. Signed-off-by: Len Brown --- arch/x86/pci/mmconfig-shared.c | 2 ++ drivers/acpi/ac.c | 2 ++ drivers/acpi/battery.c | 2 ++ drivers/acpi/blacklist.c | 2 ++ drivers/acpi/button.c | 2 ++ drivers/acpi/cm_sbs.c | 2 ++ drivers/acpi/container.c | 2 ++ drivers/acpi/dock.c | 2 ++ drivers/acpi/ec.c | 1 - drivers/acpi/event.c | 2 ++ drivers/acpi/fan.c | 2 ++ drivers/acpi/glue.c | 2 ++ drivers/acpi/internal.h | 22 +++++++++++++++++++++- drivers/acpi/numa.c | 2 ++ drivers/acpi/pci_irq.c | 2 ++ drivers/acpi/pci_link.c | 2 ++ drivers/acpi/pci_root.c | 2 ++ drivers/acpi/power.c | 2 ++ drivers/acpi/processor_core.c | 2 ++ drivers/acpi/processor_idle.c | 2 ++ drivers/acpi/processor_perflib.c | 2 ++ drivers/acpi/processor_thermal.c | 2 ++ drivers/acpi/processor_throttling.c | 2 ++ drivers/acpi/sbs.c | 2 ++ drivers/acpi/sbshc.c | 2 ++ drivers/acpi/system.c | 2 ++ drivers/acpi/thermal.c | 2 ++ drivers/acpi/utils.c | 2 ++ drivers/acpi/video.c | 2 ++ drivers/acpi/video_detect.c | 2 ++ drivers/pci/dmar.c | 3 +-- drivers/platform/x86/fujitsu-laptop.c | 4 ++-- drivers/platform/x86/wmi.c | 1 - include/acpi/acpi_bus.h | 2 -- 34 files changed, 80 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 712443ec6d4..81d3466765c 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -18,6 +18,8 @@ #include #include +#define PREFIX "ACPI: " + /* aperture is up to 256MB but BIOS may reserve less */ #define MMCONFIG_APER_MIN (2 * 1024*1024) #define MMCONFIG_APER_MAX (256 * 1024*1024) diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index 0df8fcb687d..98b9690b015 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -37,6 +37,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_AC_CLASS "ac_adapter" #define ACPI_AC_DEVICE_NAME "AC Adapter" #define ACPI_AC_FILE_STATE "state" diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 58b4517ce71..f8c3d1bb696 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -45,6 +45,8 @@ #include #endif +#define PREFIX "ACPI: " + #define ACPI_BATTERY_VALUE_UNKNOWN 0xFFFFFFFF #define ACPI_BATTERY_CLASS "battery" diff --git a/drivers/acpi/blacklist.c b/drivers/acpi/blacklist.c index f6baa77deef..19152ea2b10 100644 --- a/drivers/acpi/blacklist.c +++ b/drivers/acpi/blacklist.c @@ -34,6 +34,8 @@ #include #include +#include "internal.h" + enum acpi_blacklist_predicates { all_versions, less_than_or_equal, diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index 9195deba9d9..d295bdccc09 100644 --- a/drivers/acpi/button.c +++
b/drivers/acpi/button.c @@ -33,6 +33,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_BUTTON_CLASS "button" #define ACPI_BUTTON_FILE_INFO "info" #define ACPI_BUTTON_FILE_STATE "state" diff --git a/drivers/acpi/cm_sbs.c b/drivers/acpi/cm_sbs.c index 332fe4b2170..6c9ee68e46f 100644 --- a/drivers/acpi/cm_sbs.c +++ b/drivers/acpi/cm_sbs.c @@ -28,6 +28,8 @@ #include #include +#define PREFIX "ACPI: " + ACPI_MODULE_NAME("cm_sbs"); #define ACPI_AC_CLASS "ac_adapter" #define ACPI_BATTERY_CLASS "battery" diff --git a/drivers/acpi/container.c b/drivers/acpi/container.c index fe0cdf83641..5f2c3c00a31 100644 --- a/drivers/acpi/container.c +++ b/drivers/acpi/container.c @@ -35,6 +35,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_CONTAINER_DEVICE_NAME "ACPI container device" #define ACPI_CONTAINER_CLASS "container" diff --git a/drivers/acpi/dock.c b/drivers/acpi/dock.c index efb959d6c8a..9a855669ff1 100644 --- a/drivers/acpi/dock.c +++ b/drivers/acpi/dock.c @@ -33,6 +33,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_DOCK_DRIVER_DESCRIPTION "ACPI Dock Station Driver" ACPI_MODULE_NAME("dock"); diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 391f331674c..5180f0f1dd0 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -47,7 +47,6 @@ #define ACPI_EC_DEVICE_NAME "Embedded Controller" #define ACPI_EC_FILE_INFO "info" -#undef PREFIX #define PREFIX "ACPI: EC: " /* EC status register */ diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c index aeb7e5fb4a0..c511071bfd7 100644 --- a/drivers/acpi/event.c +++ b/drivers/acpi/event.c @@ -14,6 +14,8 @@ #include #include +#include "internal.h" + #define _COMPONENT ACPI_SYSTEM_COMPONENT ACPI_MODULE_NAME("event"); diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index 53698ea0837..f419849a0d3 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -34,6 +34,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_FAN_CLASS "fan" #define ACPI_FAN_FILE_STATE "state" diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index a8a5c29958c..dc36a448de4 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -12,6 +12,8 @@ #include #include +#include "internal.h" + #define ACPI_GLUE_DEBUG 0 #if ACPI_GLUE_DEBUG #define DBG(x...) printk(PREFIX x) diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 11a69b53004..074cf8682d5 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -1,4 +1,24 @@ -/* For use by Linux/ACPI infrastructure, not drivers */ +/* + * acpi/internal.h + * For use by Linux/ACPI infrastructure, not drivers + * + * Copyright (c) 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#define PREFIX "ACPI: " int init_acpi_device_notify(void); int acpi_scan_init(void); diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index d440ccd27d9..202dd0c976a 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -30,6 +30,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_NUMA 0x80000000 #define _COMPONENT ACPI_NUMA ACPI_MODULE_NAME("numa"); diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index b794eb88ab9..843699ed93f 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -40,6 +40,8 @@ #include #include +#define PREFIX "ACPI: " + #define _COMPONENT ACPI_PCI_COMPONENT ACPI_MODULE_NAME("pci_irq"); diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index 16e0f9d3d17..394ae89409c 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -43,6 +43,8 @@ #include #include +#define PREFIX "ACPI: " + #define _COMPONENT ACPI_PCI_COMPONENT ACPI_MODULE_NAME("pci_link"); #define ACPI_PCI_LINK_CLASS "pci_irq_routing" diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 55b5b90c2a4..dee916707a7 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -36,6 +36,8 @@ #include #include +#define PREFIX "ACPI: " + #define _COMPONENT ACPI_PCI_COMPONENT ACPI_MODULE_NAME("pci_root"); #define ACPI_PCI_ROOT_CLASS "pci_bridge" diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index d74365d4a6e..e86603f37de 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -44,6 +44,8 @@ #include #include +#define PREFIX "ACPI: " + #define _COMPONENT ACPI_POWER_COMPONENT ACPI_MODULE_NAME("power"); #define ACPI_POWER_CLASS "power_resource" diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 2cc4b303387..b4a1ab297e7 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -59,6 +59,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_PROCESSOR_CLASS "processor" #define ACPI_PROCESSOR_DEVICE_NAME "Processor" #define ACPI_PROCESSOR_FILE_INFO "info" diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 66393d5c4c7..22aab1fc9b4 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -60,6 +60,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_PROCESSOR_CLASS "processor" #define _COMPONENT ACPI_PROCESSOR_COMPONENT ACPI_MODULE_NAME("processor_idle"); diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 60e543d3234..11088cf1031 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -39,6 +39,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_PROCESSOR_CLASS "processor" #define ACPI_PROCESSOR_FILE_PERFORMANCE "performance" #define _COMPONENT ACPI_PROCESSOR_COMPONENT diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c index 31adda1099e..3e3181c0efc 100644 --- a/drivers/acpi/processor_thermal.c +++ b/drivers/acpi/processor_thermal.c @@ -40,6 +40,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_PROCESSOR_CLASS "processor" #define _COMPONENT ACPI_PROCESSOR_COMPONENT ACPI_MODULE_NAME("processor_thermal"); diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index ae39797aab5..b366b9c13d4 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -41,6 +41,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_PROCESSOR_CLASS "processor" #define _COMPONENT ACPI_PROCESSOR_COMPONENT 
ACPI_MODULE_NAME("processor_throttling"); diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index 4b214b74eba..52b9db8afc2 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -46,6 +46,8 @@ #include "sbshc.h" +#define PREFIX "ACPI: " + #define ACPI_SBS_CLASS "sbs" #define ACPI_AC_CLASS "ac_adapter" #define ACPI_BATTERY_CLASS "battery" diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c index 0619734895b..d9339806df4 100644 --- a/drivers/acpi/sbshc.c +++ b/drivers/acpi/sbshc.c @@ -15,6 +15,8 @@ #include #include "sbshc.h" +#define PREFIX "ACPI: " + #define ACPI_SMB_HC_CLASS "smbus_host_controller" #define ACPI_SMB_HC_DEVICE_NAME "ACPI SMBus HC" diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c index 9c61ab2177c..d11282975f3 100644 --- a/drivers/acpi/system.c +++ b/drivers/acpi/system.c @@ -31,6 +31,8 @@ #include +#define PREFIX "ACPI: " + #define _COMPONENT ACPI_SYSTEM_COMPONENT ACPI_MODULE_NAME("system"); diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 564ea142428..65f67815902 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -47,6 +47,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_THERMAL_CLASS "thermal_zone" #define ACPI_THERMAL_DEVICE_NAME "Thermal Zone" #define ACPI_THERMAL_FILE_STATE "state" diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index f844941089b..811fec10462 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -30,6 +30,8 @@ #include #include +#include "internal.h" + #define _COMPONENT ACPI_BUS_COMPONENT ACPI_MODULE_NAME("utils"); diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 8851315ce85..a0fa3946b50 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -44,6 +44,8 @@ #include #include +#define PREFIX "ACPI: " + #define ACPI_VIDEO_CLASS "video" #define ACPI_VIDEO_BUS_NAME "Video Bus" #define ACPI_VIDEO_DEVICE_NAME "Video Device" diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index 7cd2b63435e..7032f25da9b 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -38,6 +38,8 @@ #include #include +#define PREFIX "ACPI: " + ACPI_MODULE_NAME("video"); #define _COMPONENT ACPI_VIDEO_COMPONENT diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 7b287cb38b7..998f02d2ba4 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -34,8 +34,7 @@ #include #include -#undef PREFIX -#define PREFIX "DMAR:" +#define PREFIX "DMAR: " /* No locks are needed as DMA remapping hardware unit * list is constructed at boot time and hotplug of diff --git a/drivers/platform/x86/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c index 218b9a16ac3..eabddc9c192 100644 --- a/drivers/platform/x86/fujitsu-laptop.c +++ b/drivers/platform/x86/fujitsu-laptop.c @@ -700,7 +700,7 @@ static int acpi_fujitsu_add(struct acpi_device *device) goto end; } - printk(KERN_INFO PREFIX "%s [%s] (%s)\n", + printk(KERN_INFO "ACPI: %s [%s] (%s)\n", acpi_device_name(device), acpi_device_bid(device), !device->power.state ? "on" : "off"); @@ -874,7 +874,7 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) goto end; } - printk(KERN_INFO PREFIX "%s [%s] (%s)\n", + printk(KERN_INFO "ACPI: %s [%s] (%s)\n", acpi_device_name(device), acpi_device_bid(device), !device->power.state ? 
"on" : "off"); diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index f215a591919..177f8d767df 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -42,7 +42,6 @@ MODULE_LICENSE("GPL"); #define ACPI_WMI_CLASS "wmi" -#undef PREFIX #define PREFIX "ACPI: WMI: " static DEFINE_MUTEX(wmi_data_lock); diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index c65e4ce6c3a..f485107ddc4 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -30,8 +30,6 @@ #include -#define PREFIX "ACPI: " - /* TBD: Make dynamic */ #define ACPI_MAX_HANDLES 10 struct acpi_handle_list { -- cgit v1.2.3 From f4a2d5840e9f0e48d1a787b66e7346087a756029 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 28 Jul 2009 16:48:02 -0400 Subject: ACPI, PCI: Change PREFIX to "PCI" from "ACPI" in mmconfig-shared.c Signed-off-by: Len Brown Acked-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 81d3466765c..b707a0141d3 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -18,7 +18,7 @@ #include #include -#define PREFIX "ACPI: " +#define PREFIX "PCI: " /* aperture is up to 256MB but BIOS may reserve less */ #define MMCONFIG_APER_MIN (2 * 1024*1024) -- cgit v1.2.3 From e55a5999ffcf72dc4d43d73618957964cb87065a Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 28 Jul 2009 17:41:53 +0800 Subject: ACPI: Handle CONFIG_ACPI=n better from linux/acpi.h linux/acpi.h is the top level header for interfacing with the ACPI sub-system, so acpi_disabled should be up there instead of down in asm/acpi.h -- particularly since asm/acpi.h doesn't exist for all architectures. Same story for acpi_table_parse(), which is a top-level API to Linux/ACPI. This is necessary for building some code that used to always depend on CONFIG_ACPI=y, but will soon also need to build with CONFIG_ACPI=n. 
Signed-off-by: Feng Tang Signed-off-by: Len Brown --- arch/x86/include/asm/acpi.h | 1 - include/linux/acpi.h | 11 ++++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 20d1465a2ab..4518dc50090 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -144,7 +144,6 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) #else /* !CONFIG_ACPI */ -#define acpi_disabled 1 #define acpi_lapic 0 #define acpi_ioapic 0 static inline void acpi_noirq_set(void) { } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 34321cfffea..3fce811bf9a 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -292,7 +292,10 @@ void __init acpi_s4_no_nvs(void); extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags); extern void acpi_early_init(void); -#else /* CONFIG_ACPI */ +#else /* !CONFIG_ACPI */ + +#define acpi_disabled 1 + static inline void acpi_early_init(void) { } static inline int early_acpi_boot_init(void) @@ -331,5 +334,11 @@ static inline int acpi_check_mem_region(resource_size_t start, return 0; } +struct acpi_table_header; +static inline int acpi_table_parse(char *id, + int (*handler)(struct acpi_table_header *)) +{ + return -1; +} #endif /* !CONFIG_ACPI */ #endif /*_LINUX_ACPI_H*/ -- cgit v1.2.3 From efafc8b213e67ed148a5b53ade29ee7b48af907d Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 14 Aug 2009 15:23:29 -0400 Subject: x86: add arch-specific SFI support arch/x86/kernel/sfi.c serves the dual-purpose of supporting the SFI core with arch specific code, as well as a home for the arch-specific code that uses SFI. analogous to ACPI, drivers/sfi/Kconfig is pulled in by arch/x86/Kconfig Signed-off-by: Feng Tang Signed-off-by: Len Brown Cc: x86@kernel.org --- arch/x86/Kconfig | 2 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/setup.c | 3 ++ arch/x86/kernel/sfi.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 139 insertions(+) create mode 100644 arch/x86/kernel/sfi.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 13ffa5df37d..d8ba42418d3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1683,6 +1683,8 @@ source "kernel/power/Kconfig" source "drivers/acpi/Kconfig" +source "drivers/sfi/Kconfig" + config X86_APM_BOOT bool default y diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7..6321afaafb2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -55,6 +55,7 @@ obj-y += step.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ +obj-$(CONFIG_SFI) += sfi.o obj-y += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63f32d220ef..d784ea20760 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -990,6 +991,8 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_init(); + sfi_init(); + #if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) /* * get boot-time SMP configuration: diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c new file mode 100644 index 00000000000..761df3f759c --- /dev/null +++ b/arch/x86/kernel/sfi.c @@ -0,0 +1,133 @@ +/* + * sfi.c - x86 architecture SFI support. + * + * Copyright (c) 2009, Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#define KMSG_COMPONENT "SFI" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; + +void __init mp_sfi_register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = read_apic_id(); + + pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid); +} + +/* All CPUs enumerated by SFI must be present and enabled */ +void __cpuinit mp_sfi_register_lapic(u8 id) +{ + int boot_cpu = 0; + + if (MAX_APICS - id <= 0) { + pr_warning("Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + if (id == boot_cpu_physical_apicid) + boot_cpu = 1; + pr_info("registering lapic[%d]\n", id); + + generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); +} + +static int __init sfi_parse_cpus(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_cpu_table_entry *pentry; + int i; + int cpu_num; + + sb = (struct sfi_table_simple *)table; + cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry); + pentry = (struct sfi_cpu_table_entry *)sb->pentry; + + for (i = 0; i < cpu_num; i++) { + mp_sfi_register_lapic(pentry->apic_id); + pentry++; + } + + smp_found_config = 1; + return 0; +} +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +static u32 gsi_base; + +static int __init sfi_parse_ioapic(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_apic_table_entry *pentry; + int i, num; + + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); + pentry = (struct sfi_apic_table_entry *)sb->pentry; + + for (i = 0; i < num; i++) { + mp_register_ioapic(i, pentry->phys_addr, gsi_base); + gsi_base += io_apic_get_redir_entries(i); + pentry++; + } + + WARN(pic_mode, KERN_WARNING + "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); + pic_mode = 0; + return 0; +} +#endif /* CONFIG_X86_IO_APIC */ + +/* + * sfi_platform_init(): register lapics & io-apics + */ +int __init sfi_platform_init(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + mp_sfi_register_lapic_address(sfi_lapic_addr); + sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); +#endif +#ifdef CONFIG_X86_IO_APIC + sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic); +#endif + return 0; +} -- cgit v1.2.3 From 5f0db7a2fb78895a197f64e548333b3bbd433996 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 14 Aug 2009 15:37:50 -0400 Subject: SFI: Hook PCI MMCONFIG First check ACPI, and if that fails, ask SFI to find the MCFG. 
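The contract is exactly what the subject line says: first check ACPI, and if that fails, ask SFI. The helper's internals are not part of this patch, so the sketch below is only a plausible shape for such a fallback, with every function invented for illustration.

#include <stdio.h>

static int acpi_disabled = 1;		/* assumed for the demo */

static int acpi_find_mcfg(void)
{
	return acpi_disabled ? -1 : 0;	/* -1: nothing found */
}

static int sfi_find_mcfg(void)
{
	puts("MCFG located via SFI");
	return 0;
}

/* try the primary firmware interface, fall back to the secondary one */
static int firmware_find_mcfg(void)
{
	if (!acpi_find_mcfg())
		return 0;
	return sfi_find_mcfg();
}

int main(void)
{
	return firmware_find_mcfg();
}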
Signed-off-by: Feng Tang Signed-off-by: Len Brown Cc: Jesse Barnes --- arch/x86/Kconfig | 2 +- arch/x86/pci/mmconfig-shared.c | 6 ++++-- arch/x86/pci/mmconfig_32.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d8ba42418d3..4c92c91f354 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1880,7 +1880,7 @@ config PCI_DIRECT config PCI_MMCONFIG def_bool y - depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) + depends on X86_32 && PCI && (ACPI || SFI) && (PCI_GOMMCONFIG || PCI_GOANY) config PCI_OLPC def_bool y diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index b707a0141d3..602c172d3bd 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -13,10 +13,12 @@ #include #include #include +#include #include #include #include #include +#include #define PREFIX "PCI: " @@ -493,7 +495,7 @@ static void __init pci_mmcfg_reject_broken(int early) (unsigned int)cfg->start_bus_number, (unsigned int)cfg->end_bus_number); - if (!early) + if (!early && !acpi_disabled) valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); if (valid) @@ -608,7 +610,7 @@ static void __init __pci_mmcfg_init(int early) } if (!known_bridge) - acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); + acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); pci_mmcfg_reject_broken(early); diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 8b2d561046a..f10a7e94a84 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -11,9 +11,9 @@ #include #include -#include #include #include +#include /* Assume systems with more busses have correct MCFG */ #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) -- cgit v1.2.3 From 47d25003cbd9e9030a95f7ccc4e70fec6aa7b844 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 28 Aug 2009 14:11:57 +0100 Subject: x86: Fix earlyprintk=dbgp for machines without NX Since parse_early_param() may (e.g. for earlyprintk=dbgp) involve calls to page table manipulation functions (here set_fixmap_nocache()), NX hardware support must be determined before calling that function (so that __supported_pte_mask gets properly set up). But the call after parse_early_param() can also not go away, as that will honor eventual command line specified disabling of the NX functionality. ( This will then just result in whatever mappings got established during parse_early_param() having the NX bit set despite it being disabled on the command line, but I think that's tolerable). Signed-off-by: Jan Beulich Cc: Yinghai Lu LKML-Reference: <4A97F3BD02000078000121B9@vpn.id2.novell.com> [ merged to x86/pat to resolve a conflict. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 02643cc3bf2..eb1f1e6e52b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -714,6 +714,16 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; +#ifdef CONFIG_X86_64 + /* + * Must call this twice: Once just to detect whether hardware doesn't + * support NX (so that the early EHCI debug console setup can safely + * call set_fixmap(), and then again after parsing early parameters to + * honor the respective command line option. 
+ */ + check_efer(); +#endif + parse_early_param(); /* VMI may relocate the fixmap; do this before touching ioremap area */ -- cgit v1.2.3 From b3f1b617f49447df6c3f5fac9c225aaea8b724ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 11:11:52 +0200 Subject: x86: Move get/find_smp_config to x86_init_ops Replace the quirk machinery by a x86_init_ops function which defaults to the standard implementation. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 37 ++++++++++++++++++++++++++++++------- arch/x86/include/asm/setup.h | 2 -- arch/x86/include/asm/x86_init.h | 4 ++++ arch/x86/kernel/apic/numaq_32.c | 2 -- arch/x86/kernel/mpparse.c | 33 ++------------------------------- arch/x86/kernel/setup.c | 2 -- arch/x86/kernel/visws_quirks.c | 14 ++++---------- arch/x86/kernel/x86_init.c | 2 ++ 8 files changed, 42 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index e3c579efde7..79c94500c0b 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -4,6 +4,7 @@ #include #include +#include extern int apic_version[MAX_APICS]; extern int pic_mode; @@ -41,9 +42,6 @@ extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; #endif /* CONFIG_X86_64 */ -extern void early_find_smp_config(void); -extern void early_get_smp_config(void); - #if defined(CONFIG_MCA) || defined(CONFIG_EISA) extern int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -52,14 +50,36 @@ extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern unsigned int boot_cpu_physical_apicid; extern unsigned int max_physical_apicid; -extern int smp_found_config; extern int mpc_default_type; extern unsigned long mp_lapic_addr; -extern void get_smp_config(void); +#ifdef CONFIG_X86_LOCAL_APIC +extern int smp_found_config; +#else +# define smp_found_config 0 +#endif + +static inline void get_smp_config(void) +{ + x86_init.mpparse.get_smp_config(0); +} + +static inline void early_get_smp_config(void) +{ + x86_init.mpparse.get_smp_config(1); +} + +static inline void find_smp_config(void) +{ + x86_init.mpparse.find_smp_config(1); +} + +static inline void early_find_smp_config(void) +{ + x86_init.mpparse.find_smp_config(0); +} #ifdef CONFIG_X86_MPPARSE -extern void find_smp_config(void); extern void early_reserve_e820_mpc_new(void); extern int enable_update_mptable; extern int default_mpc_apic_id(struct mpc_cpu *m); @@ -69,13 +89,16 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str); # else # define default_mpc_oem_bus_info NULL # endif +extern void default_find_smp_config(unsigned int reserve); +extern void default_get_smp_config(unsigned int early); #else -static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } #define enable_update_mptable 0 #define default_mpc_apic_id NULL #define default_smp_read_mpc_oem NULL #define default_mpc_oem_bus_info NULL +#define default_find_smp_config x86_init_uint_noop +#define default_get_smp_config x86_init_uint_noop #endif void __cpuinit generic_processor_info(int apicid, int version); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 6121a8ac7b0..345a2551af9 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -19,8 +19,6 @@ struct x86_quirks { int (*arch_pre_intr_init)(void); int (*arch_intr_init)(void); int (*arch_trap_init)(void); - int (*mach_get_smp_config)(unsigned int early); - int (*mach_find_smp_config)(unsigned int reserve); }; extern void 
x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 2833a873a90..e0d4729c905 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -13,6 +13,8 @@ struct mpc_table; * @smp_read_mpc_oem: platform specific oem mpc table setup * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL) * @mpc_oem_bus_info: platform specific mpc bus info + * @find_smp_config: find the smp configuration + * @get_smp_config: get the smp configuration */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); @@ -21,6 +23,8 @@ struct x86_init_mpparse { void (*smp_read_mpc_oem)(struct mpc_table *mpc); void (*mpc_oem_pci_bus)(struct mpc_bus *m); void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); + void (*find_smp_config)(unsigned int reserve); + void (*get_smp_config)(unsigned int early); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 700273dca68..3dd5fd76534 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -268,8 +268,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_intr_init = NULL, .arch_intr_init = NULL, .arch_trap_init = NULL, - .mach_get_smp_config = NULL, - .mach_find_smp_config = NULL, }; static __init void early_check_numaq(void) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index a42f23f1dc7..75357647b6e 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -610,7 +610,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) /* * Scan the memory blocks for an SMP configuration block. */ -static void __init __get_smp_config(unsigned int early) +void __init default_get_smp_config(unsigned int early) { struct mpf_intel *mpf = mpf_found; @@ -627,11 +627,6 @@ static void __init __get_smp_config(unsigned int early) if (acpi_lapic && acpi_ioapic) return; - if (x86_quirks->mach_get_smp_config) { - if (x86_quirks->mach_get_smp_config(early)) - return; - } - printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) @@ -672,16 +667,6 @@ static void __init __get_smp_config(unsigned int early) */ } -void __init early_get_smp_config(void) -{ - __get_smp_config(1); -} - -void __init get_smp_config(void) -{ - __get_smp_config(0); -} - static void __init smp_reserve_bootmem(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); @@ -747,14 +732,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -static void __init __find_smp_config(unsigned int reserve) +void __init default_find_smp_config(unsigned int reserve) { unsigned int address; - if (x86_quirks->mach_find_smp_config) { - if (x86_quirks->mach_find_smp_config(reserve)) - return; - } /* * FIXME: Linux assumes you have 640K of base ram.. * this continues the error... 
@@ -789,16 +770,6 @@ static void __init __find_smp_config(unsigned int reserve) smp_scan_config(address, 0x400, reserve); } -void __init early_find_smp_config(void) -{ - __find_smp_config(0); -} - -void __init find_smp_config(void) -{ - __find_smp_config(1); -} - #ifdef CONFIG_X86_IO_APIC static u8 __initdata irq_used[MAX_IRQ_SOURCES]; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c2a8090e831..54043cb7ba6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -981,13 +981,11 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_init(); -#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) /* * get boot-time SMP configuration: */ if (smp_found_config) get_smp_config(); -#endif prefill_possible_map(); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 97c670df1ae..31e828118f8 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -156,12 +156,8 @@ static void visws_machine_power_off(void) outl(PIIX_SPECIAL_STOP, 0xCFC); } -static int __init visws_get_smp_config(unsigned int early) +static void __init visws_get_smp_config(unsigned int early) { - /* - * Prevent MP-table parsing by the generic code: - */ - return 1; } /* @@ -208,7 +204,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) apic_version[m->apicid] = ver; } -static int __init visws_find_smp_config(unsigned int reserve) +static void __init visws_find_smp_config(unsigned int reserve) { struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); @@ -230,8 +226,6 @@ static int __init visws_find_smp_config(unsigned int reserve) MP_processor_info(mp++); mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - return 1; } static int visws_trap_init(void); @@ -241,8 +235,6 @@ static struct x86_quirks visws_x86_quirks __initdata = { .arch_pre_intr_init = visws_pre_intr_init, .arch_intr_init = NULL, .arch_trap_init = visws_trap_init, - .mach_get_smp_config = visws_get_smp_config, - .mach_find_smp_config = visws_find_smp_config, }; void __init visws_early_detect(void) @@ -263,6 +255,8 @@ void __init visws_early_detect(void) x86_quirks = &visws_x86_quirks; x86_init.resources.memory_setup = visws_memory_setup; + x86_init.mpparse.get_smp_config = visws_get_smp_config; + x86_init.mpparse.find_smp_config = visws_find_smp_config; /* * Install reboot quirks: diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 27685edc546..3488fb62ac0 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -32,5 +32,7 @@ struct __initdata x86_init_ops x86_init = { .mpc_apic_id = default_mpc_apic_id, .smp_read_mpc_oem = default_smp_read_mpc_oem, .mpc_oem_bus_info = default_mpc_oem_bus_info, + .find_smp_config = default_find_smp_config, + .get_smp_config = default_get_smp_config, }, }; -- cgit v1.2.3 From d9112f43021554ded2ef2b9bea5f88ba4b52abe0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 09:41:38 +0200 Subject: x86: Move pre_intr_init to x86_init_ops Replace the quirk machinery by a x86_init_ops function which defaults to the standard implementation. This is also a preparatory patch for Moorestown support which needs to replace the default init_ISA_irqs as well. 
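There is a contract change buried in these conversions: the old quirks returned nonzero to mean "handled, skip the default", while the new ops slot simply is the behavior, so a platform that wants to suppress the default installs its own (possibly empty) function. A before/after sketch in plain C, with all names invented:

#include <stdio.h>

/* old style: an optional quirk consulted before the default */
static int (*quirk_get_config)(void);	/* may be NULL; nonzero = handled */

static void get_config_old(void)
{
	if (quirk_get_config && quirk_get_config())
		return;			/* quirk suppressed the default */
	puts("default config scan");
}

/* new style: the slot always holds the whole behavior */
static void default_get_config(void) { puts("default config scan"); }
static void visws_like_get_config(void) { /* intentionally empty */ }

static void (*get_config)(void) = default_get_config;

int main(void)
{
	get_config_old();		/* no quirk installed: default runs */
	get_config = visws_like_get_config;
	get_config();			/* platform opted out cleanly */
	return 0;
}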
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq.h | 2 ++ arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 10 ++++++++++ arch/x86/kernel/apic/numaq_32.c | 1 - arch/x86/kernel/irqinit.c | 24 ++---------------------- arch/x86/kernel/visws_quirks.c | 10 +++------- arch/x86/kernel/x86_init.c | 5 +++++ 7 files changed, 22 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index f38481bcd45..8fe2782a253 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -47,4 +47,6 @@ extern unsigned int do_IRQ(struct pt_regs *regs); extern DECLARE_BITMAP(used_vectors, NR_VECTORS); extern int vector_used_by_percpu_irq(unsigned int vector); +extern void init_ISA_irqs(void); + #endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 345a2551af9..66a319709d6 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -16,7 +16,6 @@ struct x86_quirks { int (*arch_pre_time_init)(void); int (*arch_time_init)(void); - int (*arch_pre_intr_init)(void); int (*arch_intr_init)(void); int (*arch_trap_init)(void); }; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index e0d4729c905..65e3394c77f 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -43,6 +43,15 @@ struct x86_init_resources { char *(*memory_setup)(void); }; +/** + * struct x86_init_irqs - platform specific interrupt setup + * @pre_vector_init: init code to run before interrupt vectors + * are set up. + */ +struct x86_init_irqs { + void (*pre_vector_init)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -50,6 +59,7 @@ struct x86_init_resources { struct x86_init_ops { struct x86_init_resources resources; struct x86_init_mpparse mpparse; + struct x86_init_irqs irqs; }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 3dd5fd76534..ec8b3113716 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -265,7 +265,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, - .arch_pre_intr_init = NULL, .arch_intr_init = NULL, .arch_trap_init = NULL, }; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 92b7703d3d5..acdf088c758 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -116,7 +116,7 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -static void __init init_ISA_irqs(void) +void __init init_ISA_irqs(void) { int i; @@ -213,32 +213,12 @@ static void __init apic_intr_init(void) #endif } -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. 
- **/ -static void __init x86_quirk_pre_intr_init(void) -{ -#ifdef CONFIG_X86_32 - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } -#endif - init_ISA_irqs(); -} - void __init native_init_IRQ(void) { int i; /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); + x86_init.irqs.pre_vector_init(); apic_intr_init(); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 31e828118f8..1d6309d70df 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -73,14 +73,10 @@ static int __init visws_time_init(void) return 0; } -static int __init visws_pre_intr_init(void) +/* Replaces the default init_ISA_irqs in the generic setup */ +static void __init visws_pre_intr_init(void) { init_VISWS_APIC_irqs(); - - /* - * We dont want ISA irqs to be set up by the generic code: - */ - return 1; } /* Quirk for machine specific memory setup. */ @@ -232,7 +228,6 @@ static int visws_trap_init(void); static struct x86_quirks visws_x86_quirks __initdata = { .arch_time_init = visws_time_init, - .arch_pre_intr_init = visws_pre_intr_init, .arch_intr_init = NULL, .arch_trap_init = visws_trap_init, }; @@ -257,6 +252,7 @@ void __init visws_early_detect(void) x86_init.resources.memory_setup = visws_memory_setup; x86_init.mpparse.get_smp_config = visws_get_smp_config; x86_init.mpparse.find_smp_config = visws_find_smp_config; + x86_init.irqs.pre_vector_init = visws_pre_intr_init; /* * Install reboot quirks: diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 3488fb62ac0..f2abe2136da 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -9,6 +9,7 @@ #include #include #include +#include void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -35,4 +36,8 @@ struct __initdata x86_init_ops x86_init = { .find_smp_config = default_find_smp_config, .get_smp_config = default_get_smp_config, }, + + .irqs = { + .pre_vector_init = init_ISA_irqs, + }, }; -- cgit v1.2.3 From 66bcaf0bde100a4b54b82fc6fea6ceee2212ffb4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 09:59:09 +0200 Subject: x86: Move irq_init to x86_init_ops irq_init is overridden by x86_quirks and by paravirts. Unify the whole mess and make it an unconditional x86_init_ops function which defaults to the standard function and can be overridden by the early platform code. 
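Before this patch two override mechanisms were stacked: init_IRQ() was a weak alias for native_init_IRQ() that paravirt.c shadowed with a wrapper dispatching through pv_irq_ops.init_IRQ. Afterwards there is exactly one hop: init_IRQ() reads a single x86_init slot, and native, lguest and Xen all rebind that same slot. A compressed model of the two layerings; the struct and function names here are illustrative only.

#include <stdio.h>

static void native_init_IRQ(void) { puts("native vectors"); }
static void xen_init_IRQ(void)    { puts("xen event channels"); }

/* before: init_IRQ() -> pv_irq_ops.init_IRQ -> implementation */
struct pv_irq_model { void (*init_IRQ)(void); };
static struct pv_irq_model pv = { .init_IRQ = native_init_IRQ };
static void init_IRQ_before(void) { pv.init_IRQ(); }

/* after: the paravirt hop is gone; one slot serves every environment */
struct irq_model { void (*intr_init)(void); };
static struct irq_model irqs = { .intr_init = native_init_IRQ };
static void init_IRQ_after(void) { irqs.intr_init(); }

int main(void)
{
	init_IRQ_before();		/* native by default */
	irqs.intr_init = xen_init_IRQ;	/* e.g. from xen_init_irq_ops() */
	init_IRQ_after();
	return 0;
}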
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq.h | 1 - arch/x86/include/asm/paravirt_types.h | 2 -- arch/x86/include/asm/setup.h | 3 --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/numaq_32.c | 1 - arch/x86/kernel/irqinit.c | 12 ++++-------- arch/x86/kernel/paravirt.c | 6 ------ arch/x86/kernel/setup.c | 17 ----------------- arch/x86/kernel/visws_quirks.c | 1 - arch/x86/kernel/x86_init.c | 1 + arch/x86/lguest/boot.c | 2 +- arch/x86/xen/irq.c | 5 +++-- 12 files changed, 11 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 8fe2782a253..ddda6cbed6f 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -37,7 +37,6 @@ extern void fixup_irqs(void); #endif extern void (*generic_interrupt_extension)(void); -extern void init_IRQ(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 6d668968b6b..25922afb634 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -201,8 +201,6 @@ struct pv_cpu_ops { }; struct pv_irq_ops { - void (*init_IRQ)(void); - /* * Get/set interrupt state. save_fl and restore_fl are only * expected to use X86_EFLAGS_IF; all other bits diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 66a319709d6..404086f9411 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -16,12 +16,9 @@ struct x86_quirks { int (*arch_pre_time_init)(void); int (*arch_time_init)(void); - int (*arch_intr_init)(void); int (*arch_trap_init)(void); }; -extern void x86_quirk_intr_init(void); - extern void x86_quirk_trap_init(void); extern void x86_quirk_pre_time_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 65e3394c77f..8d7be65ccf7 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -47,9 +47,11 @@ struct x86_init_resources { * struct x86_init_irqs - platform specific interrupt setup * @pre_vector_init: init code to run before interrupt vectors * are set up. + * @intr_init: interrupt init code */ struct x86_init_irqs { void (*pre_vector_init)(void); + void (*intr_init)(void); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index ec8b3113716..eafd341e42d 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -265,7 +265,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, - .arch_intr_init = NULL, .arch_trap_init = NULL, }; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index acdf088c758..e0142cda239 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -140,8 +140,10 @@ void __init init_ISA_irqs(void) } } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); +void init_IRQ(void) +{ + x86_init.irqs.intr_init(); +} static void __init smp_intr_init(void) { @@ -237,12 +239,6 @@ void __init native_init_IRQ(void) setup_irq(2, &irq2); #ifdef CONFIG_X86_32 - /* - * Call quirks after call gates are initialised (usually add in - * the architecture specific gates): - */ - x86_quirk_intr_init(); - /* * External FPU? Set up irq13 if so, for * original braindamaged IBM FERR coupling. 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 532c9a2626c..d76bfbec71a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -183,11 +183,6 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len, return insn_len; } -void init_IRQ(void) -{ - pv_irq_ops.init_IRQ(); -} - static void native_flush_tlb(void) { __native_flush_tlb(); @@ -328,7 +323,6 @@ struct pv_time_ops pv_time_ops = { }; struct pv_irq_ops pv_irq_ops = { - .init_IRQ = native_init_IRQ, .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 54043cb7ba6..d3da0f7333f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1020,23 +1020,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 -/** - * x86_quirk_intr_init - post gate setup interrupt initialisation - * - * Description: - * Fill in any interrupts that may have been left out by the general - * init_IRQ() routine. interrupts having to do with the machine rather - * than the devices on the I/O bus (like APIC interrupts in intel MP - * systems) are started here. - **/ -void __init x86_quirk_intr_init(void) -{ - if (x86_quirks->arch_intr_init) { - if (x86_quirks->arch_intr_init()) - return; - } -} - /** * x86_quirk_trap_init - initialise system specific traps * diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 1d6309d70df..a49013716da 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -228,7 +228,6 @@ static int visws_trap_init(void); static struct x86_quirks visws_x86_quirks __initdata = { .arch_time_init = visws_time_init, - .arch_intr_init = NULL, .arch_trap_init = visws_trap_init, }; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index f2abe2136da..8cb59332e3b 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -39,5 +39,6 @@ struct __initdata x86_init_ops x86_init = { .irqs = { .pre_vector_init = init_ISA_irqs, + .intr_init = native_init_IRQ, }, }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 11445c176de..1ff986511f1 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1262,7 +1262,6 @@ __init void lguest_init(void) */ /* Interrupt-related operations */ - pv_irq_ops.init_IRQ = lguest_init_IRQ; pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); @@ -1325,6 +1324,7 @@ __init void lguest_init(void) pv_time_ops.get_tsc_khz = lguest_tsc_khz; x86_init.resources.memory_setup = lguest_memory_setup; + x86_init.irqs.intr_init = lguest_init_IRQ; /* * Now is a good time to look at the implementations of these functions diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index cfd17799bd6..9d30105a0c4 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -1,5 +1,7 @@ #include +#include + #include #include #include @@ -112,8 +114,6 @@ static void xen_halt(void) } static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = xen_init_IRQ, - .save_fl = PV_CALLEE_SAVE(xen_save_fl), .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), @@ -129,4 +129,5 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { void __init xen_init_irq_ops() { pv_irq_ops = xen_irq_ops; + x86_init.irqs.intr_init = xen_init_IRQ; } -- cgit v1.2.3 
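The pattern this series converges on is plain function-pointer substitution: x86_init.c carries a default table of native functions, and early platform or guest code replaces individual pointers before the generic code dispatches through them. A minimal sketch of the override side, using hypothetical myplat_* names that are not part of this series and modeled on the lguest/Xen hunks above, assuming the detect function runs before init_IRQ():

static void __init myplat_intr_init(void)
{
	init_ISA_irqs();	/* reuse the generic ISA vector setup */
	/* install additional platform specific vectors here */
}

static void __init myplat_early_detect(void)
{
	/* must run before init_IRQ(), e.g. from early setup_arch() code */
	x86_init.irqs.intr_init = myplat_intr_init;
}

No weak alias, #ifdef or quirk table is needed; init_IRQ() simply calls x86_init.irqs.intr_init().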
From 428cf9025b15573e16e658032f2b963283e34ae0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 10:35:46 +0200 Subject: x86: Move traps_init to x86_init_ops Replace the quirks by a simple x86_init_ops function. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 3 --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/numaq_32.c | 1 - arch/x86/kernel/setup.c | 15 --------------- arch/x86/kernel/traps.c | 5 ++--- arch/x86/kernel/visws_quirks.c | 8 +++----- arch/x86/kernel/x86_init.c | 1 + 7 files changed, 8 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 404086f9411..7751d1f92bc 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -16,11 +16,8 @@ struct x86_quirks { int (*arch_pre_time_init)(void); int (*arch_time_init)(void); - int (*arch_trap_init)(void); }; -extern void x86_quirk_trap_init(void); - extern void x86_quirk_pre_time_init(void); extern void x86_quirk_time_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 8d7be65ccf7..07c37bd879f 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -48,10 +48,12 @@ struct x86_init_resources { * @pre_vector_init: init code to run before interrupt vectors * are set up. * @intr_init: interrupt init code + * @trap_init: platform specific trap setup */ struct x86_init_irqs { void (*pre_vector_init)(void); void (*intr_init)(void); + void (*trap_init)(void); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index eafd341e42d..71c5ea64586 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -265,7 +265,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, - .arch_trap_init = NULL, }; static __init void early_check_numaq(void) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d3da0f7333f..bf3b87f1f7d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1020,21 +1020,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 -/** - * x86_quirk_trap_init - initialise system specific traps - * - * Description: - * Called as the final act of trap_init(). Used in VISWS to initialise - * the various board specific APIC traps. 
- **/ -void __init x86_quirk_trap_init(void) -{ - if (x86_quirks->arch_trap_init) { - if (x86_quirks->arch_trap_init()) - return; - } -} - static struct irqaction irq0 = { .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7e4b1f5dec8..ed96ed53f69 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -59,6 +59,7 @@ #include #ifdef CONFIG_X86_64 +#include #include #include #else @@ -980,7 +981,5 @@ void __init trap_init(void) */ cpu_init(); -#ifdef CONFIG_X86_32 - x86_quirk_trap_init(); -#endif + x86_init.irqs.trap_init(); } diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index a49013716da..2719091b335 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -224,11 +224,10 @@ static void __init visws_find_smp_config(unsigned int reserve) mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; } -static int visws_trap_init(void); +static void visws_trap_init(void); static struct x86_quirks visws_x86_quirks __initdata = { .arch_time_init = visws_time_init, - .arch_trap_init = visws_trap_init, }; void __init visws_early_detect(void) @@ -252,6 +251,7 @@ void __init visws_early_detect(void) x86_init.mpparse.get_smp_config = visws_get_smp_config; x86_init.mpparse.find_smp_config = visws_find_smp_config; x86_init.irqs.pre_vector_init = visws_pre_intr_init; + x86_init.irqs.trap_init = visws_trap_init; /* * Install reboot quirks: @@ -390,12 +390,10 @@ static __init void cobalt_init(void) co_apic_read(CO_APIC_ID)); } -static int __init visws_trap_init(void) +static void __init visws_trap_init(void) { lithium_init(); cobalt_init(); - - return 1; } /* diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 8cb59332e3b..9f2b775dc72 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -40,5 +40,6 @@ struct __initdata x86_init_ops x86_init = { .irqs = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, + .trap_init = x86_init_noop, }, }; -- cgit v1.2.3 From 42bbdb43b16d233b2dacb4cd76e28f61c2a86dc6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 13:04:10 +0200 Subject: x86: Replace ARCH_SETUP by a proper x86_init_ops ARCH_SETUP is a horrible leftover from the old arch/i386 mach support code. It still has a lonely user in xen. Move it to x86_init_ops. 
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 1 - arch/x86/include/asm/paravirt_types.h | 1 - arch/x86/include/asm/x86_init.h | 9 +++++++++ arch/x86/kernel/paravirt.c | 1 - arch/x86/kernel/setup.c | 6 +----- arch/x86/kernel/x86_init.c | 4 ++++ arch/x86/xen/enlighten.c | 2 +- 7 files changed, 15 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6a07af432c8..22cb3872f6d 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -24,7 +24,6 @@ static inline void load_sp0(struct tss_struct *tss, PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); } -#define ARCH_SETUP pv_init_ops.arch_setup(); static inline unsigned long get_wallclock(void) { return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 25922afb634..a05085e5fdb 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -80,7 +80,6 @@ struct pv_init_ops { unsigned long addr, unsigned len); /* Basic arch-specific setup */ - void (*arch_setup)(void); void (*post_allocator_init)(void); /* Print a banner to identify the environment */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 07c37bd879f..ceffbf358fc 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -56,6 +56,14 @@ struct x86_init_irqs { void (*trap_init)(void); }; +/** + * struct x86_init_oem - oem platform specific customizing functions + * @arch_setup: platform specific architecure setup + */ +struct x86_init_oem { + void (*arch_setup)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -64,6 +72,7 @@ struct x86_init_ops { struct x86_init_resources resources; struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; + struct x86_init_oem oem; }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index d76bfbec71a..80275ef1651 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -311,7 +311,6 @@ struct pv_info pv_info = { struct pv_init_ops pv_init_ops = { .patch = native_patch, .banner = default_banner, - .arch_setup = paravirt_nop, }; struct pv_time_ops pv_time_ops = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bf3b87f1f7d..d12aa82c9c3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -108,10 +108,6 @@ #include #endif -#ifndef ARCH_SETUP -#define ARCH_SETUP -#endif - /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 
* The direct mapping extends to max_pfn_mapped, so that we can directly access @@ -750,7 +746,7 @@ void __init setup_arch(char **cmdline_p) } #endif - ARCH_SETUP + x86_init.oem.arch_setup(); setup_memory_map(); parse_setup_data(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 9f2b775dc72..fa2d849be35 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -42,4 +42,8 @@ struct __initdata x86_init_ops x86_init = { .intr_init = native_init_IRQ, .trap_init = x86_init_noop, }, + + .oem = { + .arch_setup = x86_init_noop, + }, }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 50b20c64f0b..73c7b1d610f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -841,7 +841,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, .banner = xen_banner, - .arch_setup = xen_arch_setup, .post_allocator_init = xen_post_allocator_init, }; @@ -982,6 +981,7 @@ asmlinkage void __init xen_start_kernel(void) pv_mmu_ops = xen_mmu_ops; x86_init.resources.memory_setup = xen_memory_setup; + x86_init.oem.arch_setup = xen_arch_setup; #ifdef CONFIG_X86_64 /* -- cgit v1.2.3 From 6f30c1ac3fcf11e08f00670f293546a112cdf4e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 13:19:57 +0200 Subject: x86: Move paravirt banner printout to x86_init_ops Replace another obscure paravirt magic and move it to x86_init_ops. Such a hook is also useful for embedded and special hardware. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 6 +++++- arch/x86/include/asm/paravirt_types.h | 3 --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/paravirt.c | 10 +--------- arch/x86/kernel/setup.c | 1 + arch/x86/kernel/x86_init.c | 2 ++ arch/x86/xen/enlighten.c | 2 +- 7 files changed, 12 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 22cb3872f6d..3de6435a106 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -918,6 +918,8 @@ static inline unsigned long __raw_local_irq_save(void) #undef PVOP_VCALL4 #undef PVOP_CALL4 +extern void default_banner(void); + #else /* __ASSEMBLY__ */ #define _PVSITE(ptype, clobbers, ops, word, algn) \ @@ -1058,5 +1060,7 @@ static inline unsigned long __raw_local_irq_save(void) #endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ -#endif /* CONFIG_PARAVIRT */ +#else /* CONFIG_PARAVIRT */ +# define default_banner x86_init_noop +#endif /* !CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index a05085e5fdb..ce7723c81a1 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -81,9 +81,6 @@ struct pv_init_ops { /* Basic arch-specific setup */ void (*post_allocator_init)(void); - - /* Print a banner to identify the environment */ - void (*banner)(void); }; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ceffbf358fc..ee7c59df781 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -59,9 +59,11 @@ struct x86_init_irqs { /** * struct x86_init_oem - oem platform specific customizing functions * @arch_setup: platform specific architecure setup + * @banner: print a platform specific banner */ struct x86_init_oem { void (*arch_setup)(void); + void (*banner)(void); }; /** diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 
80275ef1651..f7a5fb79d18 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -54,7 +54,7 @@ u64 _paravirt_ident_64(u64 x) return x; } -static void __init default_banner(void) +void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); @@ -208,13 +208,6 @@ extern void native_irq_enable_sysexit(void); extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); -static int __init print_banner(void) -{ - pv_init_ops.banner(); - return 0; -} -core_initcall(print_banner); - static struct resource reserve_ioports = { .start = 0, .end = IO_SPACE_LIMIT, @@ -310,7 +303,6 @@ struct pv_info pv_info = { struct pv_init_ops pv_init_ops = { .patch = native_patch, - .banner = default_banner, }; struct pv_time_ops pv_time_ops = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d12aa82c9c3..bc5f0e561cf 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1012,6 +1012,7 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif + x86_init.oem.banner(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index fa2d849be35..08fea49d59a 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -45,5 +46,6 @@ struct __initdata x86_init_ops x86_init = { .oem = { .arch_setup = x86_init_noop, + .banner = default_banner, }, }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 73c7b1d610f..46e23cde143 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -840,7 +840,6 @@ static const struct pv_info xen_info __initdata = { static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, - .banner = xen_banner, .post_allocator_init = xen_post_allocator_init, }; @@ -982,6 +981,7 @@ asmlinkage void __init xen_start_kernel(void) x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; + x86_init.oem.banner = xen_banner; #ifdef CONFIG_X86_64 /* -- cgit v1.2.3 From 030cb6c00d242c20e92a3327d0cac17ce02d0cc3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 14:30:02 +0200 Subject: x86: Move paravirt pagetable_setup to x86_init_ops Replace more paravirt hackery by proper x86_init_ops. 
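Condensing the hunks that follow (a summary, not a literal quote of the diff): the native pagetable setup functions stay as the default table entries, Xen installs its own pair from xen_init_mmu_ops(), and setup_arch() brackets paging_init() with the two hooks:

	x86_init.paging.pagetable_setup_start(swapper_pg_dir);
	paging_init();
	x86_init.paging.pagetable_setup_done(swapper_pg_dir);

This replaces the old paravirt_pagetable_setup_{start,done} wrappers and the pv_mmu_ops entries with a single override point.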
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 10 ---------- arch/x86/include/asm/paravirt_types.h | 9 --------- arch/x86/include/asm/pgtable.h | 10 ---------- arch/x86/include/asm/pgtable_types.h | 4 ++-- arch/x86/include/asm/x86_init.h | 13 +++++++++++++ arch/x86/kernel/paravirt.c | 7 ------- arch/x86/kernel/setup.c | 4 ++-- arch/x86/kernel/x86_init.c | 6 ++++++ arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/mmu.c | 11 +++++++---- arch/x86/xen/mmu.h | 2 +- 11 files changed, 32 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 3de6435a106..1caf25b91e6 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -351,16 +351,6 @@ static inline void paravirt_post_allocator_init(void) (*pv_init_ops.post_allocator_init)(); } -static inline void paravirt_pagetable_setup_start(pgd_t *base) -{ - (*pv_mmu_ops.pagetable_setup_start)(base); -} - -static inline void paravirt_pagetable_setup_done(pgd_t *base) -{ - (*pv_mmu_ops.pagetable_setup_done)(base); -} - #ifdef CONFIG_SMP static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index ce7723c81a1..4039eefd3eb 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -231,15 +231,6 @@ struct pv_apic_ops { }; struct pv_mmu_ops { - /* - * Called before/after init_mm pagetable setup. setup_start - * may reset %cr3, and may pre-install parts of the pagetable; - * pagetable setup is expected to preserve any existing - * mapping. - */ - void (*pagetable_setup_start)(pgd_t *pgd_base); - void (*pagetable_setup_done)(pgd_t *pgd_base); - unsigned long (*read_cr2)(void); void (*write_cr2)(unsigned long); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 16748077559..60d422adf70 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -56,16 +56,6 @@ extern struct list_head pgd_list; #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) -static inline void __init paravirt_pagetable_setup_start(pgd_t *base) -{ - native_pagetable_setup_start(base); -} - -static inline void __init paravirt_pagetable_setup_done(pgd_t *base) -{ - native_pagetable_setup_done(base); -} - #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 54cb697f490..7b467bf3c68 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -299,8 +299,8 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte); extern void native_pagetable_setup_start(pgd_t *base); extern void native_pagetable_setup_done(pgd_t *base); #else -static inline void native_pagetable_setup_start(pgd_t *base) {} -static inline void native_pagetable_setup_done(pgd_t *base) {} +#define native_pagetable_setup_start x86_init_pgd_noop +#define native_pagetable_setup_done x86_init_pgd_noop #endif struct seq_file; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ee7c59df781..b9bb4faefc4 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +#include + struct mpc_bus; struct mpc_cpu; struct mpc_table; @@ -66,6 +68,16 @@ struct 
x86_init_oem { void (*banner)(void); }; +/** + * struct x86_init_paging - platform specific paging functions + * @pagetable_setup_start: platform specific pre paging_init() call + * @pagetable_setup_done: platform specific post paging_init() call + */ +struct x86_init_paging { + void (*pagetable_setup_start)(pgd_t *base); + void (*pagetable_setup_done)(pgd_t *base); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -75,6 +87,7 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; + struct x86_init_paging paging; }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f7a5fb79d18..8167be0b68c 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -402,13 +402,6 @@ struct pv_apic_ops pv_apic_ops = { #endif struct pv_mmu_ops pv_mmu_ops = { -#ifndef CONFIG_X86_64 - .pagetable_setup_start = native_pagetable_setup_start, - .pagetable_setup_done = native_pagetable_setup_done, -#else - .pagetable_setup_start = paravirt_nop, - .pagetable_setup_done = paravirt_nop, -#endif .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bc5f0e561cf..4952d63dd67 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -959,9 +959,9 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif - paravirt_pagetable_setup_start(swapper_pg_dir); + x86_init.paging.pagetable_setup_start(swapper_pg_dir); paging_init(); - paravirt_pagetable_setup_done(swapper_pg_dir); + x86_init.paging.pagetable_setup_done(swapper_pg_dir); paravirt_post_allocator_init(); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 08fea49d59a..7df020e6740 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,6 +14,7 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } +void __init x86_init_pgd_noop(pgd_t *unused) { } /* * The platform setup functions are preset with the default functions @@ -48,4 +49,9 @@ struct __initdata x86_init_ops x86_init = { .arch_setup = x86_init_noop, .banner = default_banner, }, + + .paging = { + .pagetable_setup_start = native_pagetable_setup_start, + .pagetable_setup_done = native_pagetable_setup_done, + }, }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 46e23cde143..12ea09ec39b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -977,7 +977,6 @@ asmlinkage void __init xen_start_kernel(void) pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; pv_apic_ops = xen_apic_ops; - pv_mmu_ops = xen_mmu_ops; x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; @@ -991,6 +990,7 @@ asmlinkage void __init xen_start_kernel(void) load_percpu_segment(0); #endif + xen_init_mmu_ops(); xen_init_irq_ops(); xen_init_cpuid_mask(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4ceb2858165..dbec51da930 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1875,10 +1875,7 @@ static void xen_leave_lazy_mmu(void) preempt_enable(); } -const struct pv_mmu_ops xen_mmu_ops __initdata = { - .pagetable_setup_start = xen_pagetable_setup_start, - .pagetable_setup_done = xen_pagetable_setup_done, - +static const struct pv_mmu_ops xen_mmu_ops __initdata = { .read_cr2 = xen_read_cr2, .write_cr2 = xen_write_cr2, @@ -1954,6 +1951,12 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = 
{ .set_fixmap = xen_set_fixmap, }; +void __init xen_init_mmu_ops(void) +{ + x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; + x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; + pv_mmu_ops = xen_mmu_ops; +} #ifdef CONFIG_XEN_DEBUG_FS diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index da730262489..5fe6bc7f5ec 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -59,5 +59,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, unsigned long xen_read_cr2_direct(void); -extern const struct pv_mmu_ops xen_mmu_ops; +extern void xen_init_mmu_ops(void); #endif /* _XEN_MMU_H */ -- cgit v1.2.3 From f1d7062a235d057e5d85ed2860bef609e0160cde Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 13:13:52 +0200 Subject: x86: Move xen_post_allocator_init into xen_pagetable_setup_done We really do not need two paravirt/x86_init_ops functions which are called in two consecutive source lines. Move the only user of post_allocator_init into the already existing pagetable_setup_done function. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 6 ------ arch/x86/include/asm/paravirt_types.h | 3 --- arch/x86/include/asm/setup.h | 4 ---- arch/x86/kernel/setup.c | 1 - arch/x86/xen/enlighten.c | 2 -- arch/x86/xen/mmu.c | 5 ++++- arch/x86/xen/xen-ops.h | 2 -- 7 files changed, 4 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 1caf25b91e6..7ce415e844b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -345,12 +345,6 @@ static inline void setup_secondary_clock(void) } #endif -static inline void paravirt_post_allocator_init(void) -{ - if (pv_init_ops.post_allocator_init) - (*pv_init_ops.post_allocator_init)(); -} - #ifdef CONFIG_SMP static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 4039eefd3eb..ecc74e5ad40 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -78,9 +78,6 @@ struct pv_init_ops { */ unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, unsigned long addr, unsigned len); - - /* Basic arch-specific setup */ - void (*post_allocator_init)(void); }; diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 7751d1f92bc..58b58952b80 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -63,10 +63,6 @@ static inline int is_visws_box(void) { return 0; } extern struct x86_quirks *x86_quirks; extern unsigned long saved_video_mode; -#ifndef CONFIG_PARAVIRT -#define paravirt_post_allocator_init() do {} while (0) -#endif - extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4952d63dd67..43ec6aa175b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -962,7 +962,6 @@ void __init setup_arch(char **cmdline_p) x86_init.paging.pagetable_setup_start(swapper_pg_dir); paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); - paravirt_post_allocator_init(); #ifdef CONFIG_X86_64 map_vsyscall(); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 12ea09ec39b..a924caa168d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -839,8 +839,6 @@ static const struct pv_info xen_info __initdata = { 
static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, - - .post_allocator_init = xen_post_allocator_init, }; static const struct pv_time_ops xen_time_ops __initdata = { diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index dbec51da930..093dd59b538 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1229,9 +1229,12 @@ static __init void xen_pagetable_setup_start(pgd_t *base) { } +static void xen_post_allocator_init(void); + static __init void xen_pagetable_setup_done(pgd_t *base) { xen_setup_shared_info(); + xen_post_allocator_init(); } static void xen_write_cr2(unsigned long cr2) @@ -1841,7 +1844,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } -__init void xen_post_allocator_init(void) +static __init void xen_post_allocator_init(void) { pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 22494fd4c9b..355fa6b99c9 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -30,8 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); void xen_reserve_top(void); -void xen_post_allocator_init(void); - char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); -- cgit v1.2.3 From 736decac643e8982655e22ac7f0e5e61c5b7f9bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 12:35:53 +0200 Subject: x86: Move percpu clockevents setup to x86_init_ops paravirt overrides the setup of the default apic timers as per cpu timers. Moorestown needs to override that as well. Move it to x86_init_ops setup and create a separate x86_cpuinit struct which holds the function for the secondary, eventually hotpluggable CPUs.
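The boot CPU and the secondary CPUs reach their clock event setup through different paths, which is why two hooks exist: native_smp_prepare_cpus() and APIC_init_uniprocessor() call x86_init.timers.setup_percpu_clockev for the boot CPU, while start_secondary() calls x86_cpuinit.setup_percpu_clockev for each AP. A guest that only needs to hook the secondary path can leave the boot default untouched; a sketch with a hypothetical myguest_* name, modeled on the kvmclock hunk below:

	/* Boot CPU keeps the default setup_boot_APIC_clock */
	x86_cpuinit.setup_percpu_clockev = myguest_setup_secondary_clock;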
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 5 ++--- arch/x86/include/asm/paravirt.h | 12 ------------ arch/x86/include/asm/paravirt_types.h | 3 --- arch/x86/include/asm/x86_init.h | 19 +++++++++++++++++++ arch/x86/kernel/apic/apic.c | 3 ++- arch/x86/kernel/kvmclock.c | 5 ++++- arch/x86/kernel/paravirt.c | 2 -- arch/x86/kernel/smpboot.c | 4 ++-- arch/x86/kernel/vmi_32.c | 4 ++-- arch/x86/kernel/x86_init.c | 9 +++++++++ arch/x86/xen/enlighten.c | 4 ++-- 11 files changed, 42 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bb7d4792584..6f15b29005a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -70,9 +70,6 @@ static inline void default_inquire_remote_apic(int apicid) */ #ifdef CONFIG_PARAVIRT #include -#else -#define setup_boot_clock setup_boot_APIC_clock -#define setup_secondary_clock setup_secondary_APIC_clock #endif #ifdef CONFIG_X86_64 @@ -245,6 +242,8 @@ static inline void lapic_shutdown(void) { } static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } static inline void apic_disable(void) { } +# define setup_boot_APIC_clock x86_init_noop +# define setup_secondary_APIC_clock x86_init_noop #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 7ce415e844b..825674a968d 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -333,18 +333,6 @@ static inline void slow_down_io(void) #endif } -#ifdef CONFIG_X86_LOCAL_APIC -static inline void setup_boot_clock(void) -{ - PVOP_VCALL0(pv_apic_ops.setup_boot_clock); -} - -static inline void setup_secondary_clock(void) -{ - PVOP_VCALL0(pv_apic_ops.setup_secondary_clock); -} -#endif - #ifdef CONFIG_SMP static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index ecc74e5ad40..1da89276d14 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -218,9 +218,6 @@ struct pv_irq_ops { struct pv_apic_ops { #ifdef CONFIG_X86_LOCAL_APIC - void (*setup_boot_clock)(void); - void (*setup_secondary_clock)(void); - void (*startup_ipi_hook)(int phys_apicid, unsigned long start_eip, unsigned long start_esp); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index b9bb4faefc4..b7d258f4c40 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -78,6 +78,15 @@ struct x86_init_paging { void (*pagetable_setup_done)(pgd_t *base); }; +/** + * struct x86_init_timers - platform specific timer setup + * @setup_perpcu_clockev: set up the per cpu clock event device for the + * boot cpu + */ +struct x86_init_timers { + void (*setup_percpu_clockev)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -88,9 +97,19 @@ struct x86_init_ops { struct x86_init_irqs irqs; struct x86_init_oem oem; struct x86_init_paging paging; + struct x86_init_timers timers; +}; + +/** + * struct x86_cpuinit_ops - platform specific cpu hotplug setups + * @setup_percpu_clockev: set up the per cpu clock event device + */ +struct x86_cpuinit_ops { + void (*setup_percpu_clockev)(void); }; extern struct x86_init_ops x86_init; +extern struct x86_cpuinit_ops x86_cpuinit; extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff 
--git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0a1c2830ec6..ce0098066e9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -1701,7 +1702,7 @@ int __init APIC_init_uniprocessor(void) localise_nmi_watchdog(); #endif - setup_boot_clock(); + x86_init.timers.setup_percpu_clockev(); #ifdef CONFIG_X86_64 check_nmi_watchdog(); #endif diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 223af43f152..64e9b5f59d2 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -22,6 +22,8 @@ #include #include #include + +#include #include #define KVM_SCALE 22 @@ -187,7 +189,8 @@ void __init kvmclock_init(void) pv_time_ops.sched_clock = kvm_clock_read; pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC - pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; + x86_cpuinit.setup_percpu_clockev = + kvm_setup_secondary_clock; #endif #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 8167be0b68c..1ed32c79679 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -387,8 +387,6 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC - .setup_boot_clock = setup_boot_APIC_clock, - .setup_secondary_clock = setup_secondary_APIC_clock, .startup_ipi_hook = paravirt_nop, #endif }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..6eb81a87b4b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -323,7 +323,7 @@ notrace static void __cpuinit start_secondary(void *unused) /* enable local interrupts */ local_irq_enable(); - setup_secondary_clock(); + x86_cpuinit.setup_percpu_clockev(); wmb(); cpu_idle(); @@ -1112,7 +1112,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) printk(KERN_INFO "CPU%d: ", 0); print_cpu_info(&cpu_data(0)); - setup_boot_clock(); + x86_init.timers.setup_percpu_clockev(); if (is_uv_system()) uv_system_init(); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 95a7289e4b0..b43b6685cae 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -821,8 +821,8 @@ static inline int __init activate_vmi(void) pv_time_ops.get_wallclock = vmi_get_wallclock; pv_time_ops.set_wallclock = vmi_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - pv_apic_ops.setup_boot_clock = vmi_time_bsp_init; - pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; + x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init; + x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init; #endif pv_time_ops.sched_clock = vmi_sched_clock; pv_time_ops.get_tsc_khz = vmi_tsc_khz; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 7df020e6740..e666a98db7c 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -54,4 +55,12 @@ struct __initdata x86_init_ops x86_init = { .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, }, + + .timers = { + .setup_percpu_clockev = setup_boot_APIC_clock, + }, +}; + +__cpuinitdata struct x86_cpuinit_ops x86_cpuinit = { + .setup_percpu_clockev = setup_secondary_APIC_clock, }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a924caa168d..14e597e0c16 100644 --- 
a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -912,8 +912,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC - .setup_boot_clock = paravirt_nop, - .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, #endif }; @@ -979,6 +977,8 @@ asmlinkage void __init xen_start_kernel(void) x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; + x86_init.timers.setup_percpu_clockev = x86_init_noop; + x86_cpuinit.setup_percpu_clockev = x86_init_noop; #ifdef CONFIG_X86_64 /* -- cgit v1.2.3 From 845b3944bbdf9e9247849bf037f27ff3a3f26d87 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 15:37:03 +0200 Subject: x86: Add timer_init to x86_init_ops The timer init code is convoluted with several quirks and the paravirt timer chooser. Figuring out which code path is actually taken is not for the faint hearted. Move the numaq TSC quirk to tsc_pre_init x86_init_ops function and replace the paravirt time chooser and the remaining x86 quirk with a simple x86_init_ops function. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 5 ---- arch/x86/include/asm/paravirt_types.h | 2 -- arch/x86/include/asm/setup.h | 21 ++--------------- arch/x86/include/asm/time.h | 1 - arch/x86/include/asm/timer.h | 3 +-- arch/x86/include/asm/x86_init.h | 4 ++++ arch/x86/kernel/apic/numaq_32.c | 10 ++------ arch/x86/kernel/paravirt.c | 1 - arch/x86/kernel/setup.c | 43 ----------------------------------- arch/x86/kernel/time_32.c | 34 ++++++++++++++++++--------- arch/x86/kernel/time_64.c | 9 ++++++-- arch/x86/kernel/tsc.c | 2 ++ arch/x86/kernel/visws_quirks.c | 20 ++++------------ arch/x86/kernel/vmi_32.c | 2 +- arch/x86/kernel/x86_init.c | 3 +++ arch/x86/lguest/boot.c | 2 +- arch/x86/xen/enlighten.c | 4 ++-- 17 files changed, 53 insertions(+), 113 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 825674a968d..11a4ba7b209 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -34,11 +34,6 @@ static inline int set_wallclock(unsigned long nowtime) return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime); } -static inline void (*choose_time_init(void))(void) -{ - return pv_time_ops.time_init; -} - /* The paravirtualized CPUID instruction. */ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 1da89276d14..0d812e592e3 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -88,8 +88,6 @@ struct pv_lazy_ops { }; struct pv_time_ops { - void (*time_init)(void); - /* Set and set time of day */ unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 58b58952b80..861e1fe2303 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -5,24 +5,6 @@ #define COMMAND_LINE_SIZE 2048 -#ifndef __ASSEMBLY__ - -#include - -/* - * Any setup quirks to be performed? 
- */ - -struct x86_quirks { - int (*arch_pre_time_init)(void); - int (*arch_time_init)(void); -}; - -extern void x86_quirk_pre_time_init(void); -extern void x86_quirk_time_init(void); - -#endif /* __ASSEMBLY__ */ - #ifdef __i386__ #include @@ -42,6 +24,7 @@ extern void x86_quirk_time_init(void); #ifndef __ASSEMBLY__ #include +#include /* Interrupt control for vSMPowered x86_64 systems */ #ifdef CONFIG_X86_64 @@ -60,11 +43,11 @@ static inline void visws_early_detect(void) { } static inline int is_visws_box(void) { return 0; } #endif -extern struct x86_quirks *x86_quirks; extern unsigned long saved_video_mode; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); +extern void setup_default_timer_irq(void); #ifndef _SETUP diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index 50c733aac42..91bb162b5a3 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -54,7 +54,6 @@ extern void time_init(void); #define get_wallclock() native_get_wallclock() #define set_wallclock(x) native_set_wallclock(x) -#define choose_time_init() hpet_time_init #endif /* CONFIG_PARAVIRT */ diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 20ca9c4d468..e854c7ab416 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -12,8 +12,7 @@ unsigned long native_calibrate_tsc(void); #ifdef CONFIG_X86_32 extern int timer_ack; -extern irqreturn_t timer_interrupt(int irq, void *dev_id); -#endif /* CONFIG_X86_32 */ +#endif extern int recalibrate_cpu_khz(void); extern int no_timer_check; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index b7d258f4c40..f8bdd2271a0 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -82,9 +82,13 @@ struct x86_init_paging { * struct x86_init_timers - platform specific timer setup * @setup_perpcu_clockev: set up the per cpu clock event device for the * boot cpu + * @tsc_pre_init: platform function called before TSC init + * @timer_init: initialize the platform timer (default PIT/HPET) */ struct x86_init_timers { void (*setup_percpu_clockev)(void); + void (*tsc_pre_init)(void); + void (*timer_init)(void); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 71c5ea64586..f1ebed6bd15 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -129,10 +129,9 @@ void __cpuinit numaq_tsc_disable(void) } } -static int __init numaq_pre_time_init(void) +static void __init numaq_tsc_init(void) { numaq_tsc_disable(); - return 0; } static inline int generate_logical_apicid(int quad, int phys_apicid) @@ -262,11 +261,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) } } -static struct x86_quirks numaq_x86_quirks __initdata = { - .arch_pre_time_init = numaq_pre_time_init, - .arch_time_init = NULL, -}; - static __init void early_check_numaq(void) { /* @@ -281,13 +275,13 @@ static __init void early_check_numaq(void) early_get_smp_config(); if (found_numaq) { - x86_quirks = &numaq_x86_quirks; x86_init.mpparse.mpc_record = numaq_mpc_record; x86_init.mpparse.setup_ioapic_ids = x86_init_noop; x86_init.mpparse.mpc_apic_id = mpc_apic_id; x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; + x86_init.timers.tsc_pre_init = numaq_tsc_init; } } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1ed32c79679..9c0e644a76d 100644 
--- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -306,7 +306,6 @@ struct pv_init_ops pv_init_ops = { }; struct pv_time_ops pv_time_ops = { - .time_init = hpet_time_init, .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, .sched_clock = native_sched_clock, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 43ec6aa175b..bb207a47c63 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -626,10 +626,6 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; - -struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; - #ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { @@ -1016,45 +1012,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" -}; - -/** - * x86_quirk_pre_time_init - do any specific initialisations before. - * - **/ -void __init x86_quirk_pre_time_init(void) -{ - if (x86_quirks->arch_pre_time_init) - x86_quirks->arch_pre_time_init(); -} - -/** - * x86_quirk_time_init - do any specific initialisations for the system timer. - * - * Description: - * Must plug the system timer interrupt source at HZ into the IRQ listed - * in irq_vectors.h:TIMER_IRQ - **/ -void __init x86_quirk_time_init(void) -{ - if (x86_quirks->arch_time_init) { - /* - * A nonzero return code does not mean failure, it means - * that the architecture quirk does not want any - * generic (timer) setup to be performed after this: - */ - if (x86_quirks->arch_time_init()) - return; - } - - irq0.mask = cpumask_of_cpu(0); - setup_irq(0, &irq0); -} - static struct resource video_ram_resource = { .name = "Video RAM area", .start = 0xa0000, diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 5c5d87f0b2e..89bbb52218b 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -72,7 +72,7 @@ EXPORT_SYMBOL(profile_pc); * Time Stamp Counter value at the time of the timer interrupt, so that * we later on can estimate the time of day more exactly. */ -irqreturn_t timer_interrupt(int irq, void *dev_id) +static irqreturn_t timer_interrupt(int irq, void *dev_id) { /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); @@ -113,25 +113,37 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* Duplicate of time_init() below, with hpet_enable part added */ +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" +}; + +void __init setup_default_timer_irq(void) +{ + irq0.mask = cpumask_of_cpu(0); + setup_irq(0, &irq0); +} + +/* Default timer init function */ void __init hpet_time_init(void) { if (!hpet_enable()) setup_pit_timer(); - x86_quirk_time_init(); + setup_default_timer_irq(); +} + +static void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); } /* - * This is called directly from init code; we must delay timer setup in the - * HPET case as we can't make the decision to turn on HPET this early in the - * boot process. - * - * The chosen time_init function will usually be hpet_time_init, above, but - * in the case of virtual hardware, an alternative function may be substituted. 
+ * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. */ void __init time_init(void) { - x86_quirk_pre_time_init(); tsc_init(); - late_time_init = choose_time_init(); + late_time_init = x86_late_time_init; } diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 5ba343e6184..38a7df94c10 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -127,9 +128,13 @@ void __init hpet_time_init(void) setup_irq(0, &irq0); } +static void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); +} + void __init time_init(void) { tsc_init(); - - late_time_init = choose_time_init(); + late_time_init = x86_late_time_init; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 71f4368b357..652bc214eeb 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -857,6 +857,8 @@ void __init tsc_init(void) u64 lpj; int cpu; + x86_init.timers.tsc_pre_init(); + if (!cpu_has_tsc) return; diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 2719091b335..f068553a1b1 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -53,7 +54,7 @@ int is_visws_box(void) return visws_board_type >= 0; } -static int __init visws_time_init(void) +static void __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -66,11 +67,7 @@ static int __init visws_time_init(void) /* Enable (unmask) the timer interrupt */ co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); - /* - * Zero return means the generic timer setup code will set up - * the standard vector: - */ - return 0; + setup_default_timer_irq(); } /* Replaces the default init_ISA_irqs in the generic setup */ @@ -226,10 +223,6 @@ static void __init visws_find_smp_config(unsigned int reserve) static void visws_trap_init(void); -static struct x86_quirks visws_x86_quirks __initdata = { - .arch_time_init = visws_time_init, -}; - void __init visws_early_detect(void) { int raw; @@ -241,17 +234,14 @@ void __init visws_early_detect(void) return; /* - * Install special quirks for timer, interrupt and memory setup: - * Fall back to generic behavior for traps: - * Override generic MP-table parsing: + * Override the default platform setup functions */ - x86_quirks = &visws_x86_quirks; - x86_init.resources.memory_setup = visws_memory_setup; x86_init.mpparse.get_smp_config = visws_get_smp_config; x86_init.mpparse.find_smp_config = visws_find_smp_config; x86_init.irqs.pre_vector_init = visws_pre_intr_init; x86_init.irqs.trap_init = visws_trap_init; + x86_init.timers.timer_init = visws_time_init; /* * Install reboot quirks: diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b43b6685cae..cd7d0fbbf66 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -817,7 +817,7 @@ static inline int __init activate_vmi(void) vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); vmi_timer_ops.cancel_alarm = vmi_get_function(VMI_CALL_CancelAlarm); - pv_time_ops.time_init = vmi_time_init; + x86_init.timers.timer_init = vmi_time_init; pv_time_ops.get_wallclock = vmi_get_wallclock; pv_time_ops.set_wallclock = vmi_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index e666a98db7c..4790b92714a 100644 --- a/arch/x86/kernel/x86_init.c +++ 
b/arch/x86/kernel/x86_init.c @@ -11,6 +11,7 @@ #include #include #include +#include #include void __cpuinit x86_init_noop(void) { } @@ -58,6 +59,8 @@ struct __initdata x86_init_ops x86_init = { .timers = { .setup_percpu_clockev = setup_boot_APIC_clock, + .tsc_pre_init = x86_init_noop, + .timer_init = hpet_time_init, }, }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 1ff986511f1..6caa8c0c793 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1320,11 +1320,11 @@ __init void lguest_init(void) /* Time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; - pv_time_ops.time_init = lguest_time_init; pv_time_ops.get_tsc_khz = lguest_tsc_khz; x86_init.resources.memory_setup = lguest_memory_setup; x86_init.irqs.intr_init = lguest_init_IRQ; + x86_init.timers.timer_init = lguest_time_init; /* * Now is a good time to look at the implementations of these functions diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 14e597e0c16..84826b842b5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -842,8 +842,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { }; static const struct pv_time_ops xen_time_ops __initdata = { - .time_init = xen_time_init, - .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, .get_tsc_khz = xen_tsc_khz, @@ -977,6 +975,8 @@ asmlinkage void __init xen_start_kernel(void) x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; + + x86_init.timers.timer_init = xen_time_init; x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_cpuinit.setup_percpu_clockev = x86_init_noop; -- cgit v1.2.3 From ecce85089e6d31eed7535b68f5acdd194265690c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 15:28:50 +0200 Subject: x86: Remove do_timer hook This is a left over of the old x86 sub arch support. Remove it and open code it like we do in time_64.c Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/do_timer.h | 16 ---------------- arch/x86/kernel/time_32.c | 7 ++++--- 2 files changed, 4 insertions(+), 19 deletions(-) delete mode 100644 arch/x86/include/asm/do_timer.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/do_timer.h b/arch/x86/include/asm/do_timer.h deleted file mode 100644 index 23ecda0b28a..00000000000 --- a/arch/x86/include/asm/do_timer.h +++ /dev/null @@ -1,16 +0,0 @@ -/* defines for inline arch setup functions */ -#include - -#include -#include - -/** - * do_timer_interrupt_hook - hook into timer tick - * - * Call the pit clock event handler. see asm/i8253.h - **/ - -static inline void do_timer_interrupt_hook(void) -{ - global_clock_event->event_handler(global_clock_event); -} diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 89bbb52218b..6fef4ea1e7a 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -28,6 +28,7 @@ * serialize accesses to xtime/lost_ticks). 
*/ +#include #include #include #include @@ -37,8 +38,8 @@ #include #include #include - -#include +#include +#include int timer_ack; @@ -92,7 +93,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) } #endif - do_timer_interrupt_hook(); + global_clock_event->event_handler(global_clock_event); #ifdef CONFIG_MCA if (MCA_bus) { -- cgit v1.2.3 From dd3e6e8c6e7a2294f137c4dbccb3e73e7fa8ba15 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 15:35:23 +0200 Subject: x86: Prepare unification of time_32/64.c Unify the top comment and the includes. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time_32.c | 44 +++++++++++++------------------------------- arch/x86/kernel/time_64.c | 13 +++++-------- 2 files changed, 18 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 6fef4ea1e7a..acbaefd61e8 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -1,45 +1,27 @@ /* - * Copyright (C) 1991, 1992, 1995 Linus Torvalds + * Copyright (c) 1991,1992,1995 Linus Torvalds + * Copyright (c) 1994 Alan Modra + * Copyright (c) 1995 Markus Kuhn + * Copyright (c) 1996 Ingo Molnar + * Copyright (c) 1998 Andrea Arcangeli + * Copyright (c) 2002,2006 Vojtech Pavlik + * Copyright (c) 2003 Andi Kleen * - * This file contains the PC-specific time handling details: - * reading the RTC at bootup, etc.. - * 1994-07-02 Alan Modra - * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime - * 1995-03-26 Markus Kuhn - * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 - * precision CMOS clock update - * 1996-05-03 Ingo Molnar - * fixed time warps in do_[slow|fast]_gettimeoffset() - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-09-05 (Various) - * More robust do_fast_gettimeoffset() algorithm implemented - * (works with APM, Cyrix 6x86MX and Centaur C6), - * monotonic gettimeofday() with fast_get_timeoffset(), - * drift-proof precision TSC calibration on boot - * (C. Scott Ananian , Andrew D. - * Balsa , Philip Gladstone ; - * ported from 2.0.35 Jumbo-9 by Michael Krause ). - * 1998-12-16 Andrea Arcangeli - * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy - * because was not accounting lost_ticks. - * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli - * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). */ #include -#include #include #include #include -#include -#include -#include -#include +#include +#include #include #include +#include +#include +#include +#include int timer_ack; diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 38a7df94c10..45914f8844a 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -1,6 +1,4 @@ /* - * "High Precision Event Timer" based timekeeping. 
- * * Copyright (c) 1991,1992,1995 Linus Torvalds * Copyright (c) 1994 Alan Modra * Copyright (c) 1995 Markus Kuhn @@ -8,23 +6,22 @@ * Copyright (c) 1998 Andrea Arcangeli * Copyright (c) 2002,2006 Vojtech Pavlik * Copyright (c) 2003 Andi Kleen - * RTC support code taken from arch/i386/kernel/timers/time_hpet.c + * */ #include -#include #include -#include #include #include -#include +#include #include +#include #include +#include #include -#include #include -#include +#include volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -- cgit v1.2.3 From 64fcbac1f38882d8ae82c44a1c2a676cfa5e79e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 15:54:21 +0200 Subject: x86: Simplify timer_ack magic in time_32.c Let the compiler optimize the timer_ack magic away in the 32bit timer interrupt and put the same code into time_64.c. It's optimized out for CONFIG_X86_IO_APIC on 32bit and for 64bit because timer_ack is const 0 in both cases. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/timer.h | 6 ++++-- arch/x86/kernel/time_32.c | 5 +++-- arch/x86/kernel/time_64.c | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index e854c7ab416..65228ccc5f0 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -9,11 +9,13 @@ unsigned long long native_sched_clock(void); unsigned long native_calibrate_tsc(void); +extern int recalibrate_cpu_khz(void); -#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) extern int timer_ack; +#else +# define timer_ack (0) #endif -extern int recalibrate_cpu_khz(void); extern int no_timer_check; diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index acbaefd61e8..7a26bcf887f 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -23,7 +23,9 @@ #include #include +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) int timer_ack; +#endif unsigned long profile_pc(struct pt_regs *regs) { @@ -60,7 +62,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); -#ifdef CONFIG_X86_IO_APIC + /* Optimized out for !IO_APIC and x86_64 */ if (timer_ack) { /* * Subtle, when I/O APICs are used we have to ack timer IRQ @@ -73,7 +75,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) inb(PIC_MASTER_POLL); spin_unlock(&i8259A_lock); } -#endif global_clock_event->event_handler(global_clock_event); diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 45914f8844a..35e0a925da5 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -51,6 +51,20 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) { inc_irq_stat(irq0_irqs); + /* Optimized out for !IO_APIC and x86_64 */ + if (timer_ack) { + /* + * Subtle, when I/O APICs are used we have to ack timer IRQ + * manually to deassert NMI lines for the watchdog if run + * on an 82489DX-based system. + */ + spin_lock(&i8259A_lock); + outb(0x0c, PIC_MASTER_OCW3); + /* Ack the IRQ; AEOI will end it automatically. */ + inb(PIC_MASTER_POLL); + spin_unlock(&i8259A_lock); + } + global_clock_event->event_handler(global_clock_event); #ifdef CONFIG_MCA -- cgit v1.2.3 From 0be6939422eb2f54df4b3d8763c569c6759c1a42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 15:59:35 +0200 Subject: x86: Remove mca bus ifdef from timer interrupt MCA_bus is constant 0 when CONFIG_MCA=n. 
So the compiler removes that code w/o needing an extra #ifdef Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time_32.c | 18 +++--------------- arch/x86/kernel/time_64.c | 9 +++------ 2 files changed, 6 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 7a26bcf887f..ec729cdcfa3 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -78,21 +78,9 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) global_clock_event->event_handler(global_clock_event); -#ifdef CONFIG_MCA - if (MCA_bus) { - /* The PS/2 uses level-triggered interrupts. You can't - turn them off, nor would you want to (any attempt to - enable edge-triggered interrupts usually gets intercepted by a - special hardware circuit). Hence we have to acknowledge - the timer interrupt. Through some incredibly stupid - design idea, the reset for IRQ 0 is done by setting the - high bit of the PPI port B (0x61). Note that some PS/2s, - notably the 55SX, work fine if this is removed. */ - - u8 irq_v = inb_p(0x61); /* read the current state */ - outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */ - } -#endif + /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ + if (MCA_bus) + outb_p(inb_p(0x61)| 0x80, 0x61); return IRQ_HANDLED; } diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 35e0a925da5..7db3912b869 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -67,12 +67,9 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) global_clock_event->event_handler(global_clock_event); -#ifdef CONFIG_MCA - if (MCA_bus) { - u8 irq_v = inb_p(0x61); /* read the current state */ - outb_p(irq_v|0x80, 0x61); /* reset the IRQ */ - } -#endif + /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ + if (MCA_bus) + outb_p(inb_p(0x61)| 0x80, 0x61); return IRQ_HANDLED; } -- cgit v1.2.3 From 454ede7eebf91b92ab1eafe10c6b6ed04de29bf8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 16:07:40 +0200 Subject: x86: Make timer setup and global variables the same in time_32/64.c The timer and timer irq setup code is identical in 32 and 64 bit. Make it the same formatting as well. Also add the global variables under the necessary ifdefs to both files. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time_32.c | 8 +++++--- arch/x86/kernel/time_64.c | 38 ++++++++++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index ec729cdcfa3..186abc577b2 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -27,6 +27,10 @@ int timer_ack; #endif +#ifdef CONFIG_X86_64 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +#endif + unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -53,9 +57,7 @@ unsigned long profile_pc(struct pt_regs *regs) EXPORT_SYMBOL(profile_pc); /* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. 
+ * Default timer interrupt handler for PIT/HPET */ static irqreturn_t timer_interrupt(int irq, void *dev_id) { diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 7db3912b869..78cbdf5c006 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -23,7 +23,13 @@ #include #include +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) +int timer_ack; +#endif + +#ifdef CONFIG_X86_64 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +#endif unsigned long profile_pc(struct pt_regs *regs) { @@ -47,8 +53,12 @@ unsigned long profile_pc(struct pt_regs *regs) } EXPORT_SYMBOL(profile_pc); +/* + * Default timer interrupt handler for PIT/HPET + */ static irqreturn_t timer_interrupt(int irq, void *dev_id) { + /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); /* Optimized out for !IO_APIC and x86_64 */ @@ -74,8 +84,10 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency */ +/* + * calibrate_cpu is used on systems with fixed rate TSCs to determine + * processor frequency + */ #define TICK_COUNT 100000000 unsigned long __init calibrate_cpu(void) { @@ -122,18 +134,24 @@ unsigned long __init calibrate_cpu(void) return pmc_now * tsc_khz / (tsc_now - tsc_start); } -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER, - .name = "timer" +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" }; +void __init setup_default_timer_irq(void) +{ + irq0.mask = cpumask_of_cpu(0); + setup_irq(0, &irq0); +} + +/* Default timer init function */ void __init hpet_time_init(void) { if (!hpet_enable()) setup_pit_timer(); - - setup_irq(0, &irq0); + setup_default_timer_irq(); } static void x86_late_time_init(void) @@ -141,6 +159,10 @@ static void x86_late_time_init(void) x86_init.timers.timer_init(); } +/* + * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. + */ void __init time_init(void) { tsc_init(); -- cgit v1.2.3 From 08047c4f1740c7cee75d58e2919d48c09f951649 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 16:27:41 +0200 Subject: x86: Move calibrate_cpu to tsc.c Move the code where its only user is. Also we need to look whether this hardwired hackery might interfere with perfcounters.
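[ Editor's sketch, not part of the patch: calibrate_cpu() derives the core frequency from two counts taken over the same interval - unhalted core cycles (AMD event 0x76) read from a perfctr and TSC ticks from rdtscl() - as pmc_delta * tsc_khz / tsc_delta. A self-contained illustration of that arithmetic with made-up sample counts: ]

#include <stdio.h>

int main(void)
{
	/* made-up sample counts; the kernel reads these via RDMSR/RDTSC */
	unsigned long long tsc_khz   = 2200000ULL;	/* assumed TSC rate */
	unsigned long long tsc_delta = 100000000ULL;	/* TICK_COUNT TSC ticks */
	unsigned long long pmc_delta = 99875000ULL;	/* core cycles meanwhile */

	/* same ratio as calibrate_cpu(): core cycles per TSC tick, scaled */
	printf("cpu_khz ~= %llu\n", pmc_delta * tsc_khz / tsc_delta);
	return 0;
}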
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/time.h | 2 -- arch/x86/kernel/time_32.c | 1 - arch/x86/kernel/time_64.c | 51 ---------------------------------------- arch/x86/kernel/tsc.c | 57 +++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 55 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index 91bb162b5a3..9c5608b21c2 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -57,6 +57,4 @@ extern void time_init(void); #endif /* CONFIG_PARAVIRT */ -extern unsigned long __init calibrate_cpu(void); - #endif /* _ASM_X86_TIME_H */ diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 186abc577b2..fd876cc7748 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -21,7 +21,6 @@ #include #include #include -#include #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) int timer_ack; diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 78cbdf5c006..e59a40ebff1 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -21,7 +21,6 @@ #include #include #include -#include #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) int timer_ack; @@ -84,56 +83,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* - * calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency - */ -#define TICK_COUNT 100000000 -unsigned long __init calibrate_cpu(void) -{ - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... 
" - "cpu_khz value may be incorrect.\n"); - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start measuring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); -} - static struct irqaction irq0 = { .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 652bc214eeb..97a0bcbad10 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -17,6 +17,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -852,6 +853,60 @@ static void __init init_tsc_clocksource(void) clocksource_register(&clocksource_tsc); } +#ifdef CONFIG_X86_64 +/* + * calibrate_cpu is used on systems with fixed rate TSCs to determine + * processor frequency + */ +#define TICK_COUNT 100000000 +static unsigned long __init calibrate_cpu(void) +{ + int tsc_start, tsc_now; + int i, no_ctr_free; + unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; + unsigned long flags; + + for (i = 0; i < 4; i++) + if (avail_to_resrv_perfctr_nmi_bit(i)) + break; + no_ctr_free = (i == 4); + if (no_ctr_free) { + WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " + "cpu_khz value may be incorrect.\n"); + i = 3; + rdmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + rdmsrl(MSR_K7_PERFCTR3, pmc3); + } else { + reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); + reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + local_irq_save(flags); + /* start measuring cycles, incrementing from 0 */ + wrmsrl(MSR_K7_PERFCTR0 + i, 0); + wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); + rdtscl(tsc_start); + do { + rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); + tsc_now = get_cycles(); + } while ((tsc_now - tsc_start) < TICK_COUNT); + + local_irq_restore(flags); + if (no_ctr_free) { + wrmsrl(MSR_K7_EVNTSEL3, 0); + wrmsrl(MSR_K7_PERFCTR3, pmc3); + wrmsrl(MSR_K7_EVNTSEL3, evntsel3); + } else { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + + return pmc_now * tsc_khz / (tsc_now - tsc_start); +} +#else +static inline unsigned long calibrate_cpu(void) { return cpu_khz; } +#endif + void __init tsc_init(void) { u64 lpj; @@ -870,11 +925,9 @@ void __init tsc_init(void) return; } -#ifdef CONFIG_X86_64 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) cpu_khz = calibrate_cpu(); -#endif printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, -- cgit v1.2.3 From ef4512882dbe9978e7a18ccbcb4cb45705ce5560 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 21 Aug 2009 13:24:08 +0200 Subject: x86: time_32/64.c unify profile_pc The code is identical except for the formatting and a useless #ifdef. Make it the same. 
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time_32.c | 13 ++++++------- arch/x86/kernel/time_64.c | 8 +++++--- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index fd876cc7748..fda0c34da75 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -34,23 +34,22 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); -#ifdef CONFIG_SMP if (!user_mode_vm(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else - unsigned long *sp = (unsigned long *)®s->sp; - - /* Return address is either directly at stack pointer - or above a saved flags. Eflags has bits 22-31 zero, - kernel addresses don't. */ + unsigned long *sp = (unsigned long *)regs->sp; + /* + * Return address is either directly at stack pointer + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't. + */ if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) return sp[1]; #endif } -#endif return pc; } EXPORT_SYMBOL(profile_pc); diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index e59a40ebff1..fda0c34da75 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -34,14 +34,16 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - /* Assume the lock function has either no stack frame or a copy - of flags from PUSHF - Eflags always has bits 22 and up cleared unlike kernel addresses. */ if (!user_mode_vm(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else unsigned long *sp = (unsigned long *)regs->sp; + /* + * Return address is either directly at stack pointer + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't. + */ if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) -- cgit v1.2.3 From 47926214d8b2bef13b2be57c500194a804f16198 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 16:47:19 +0200 Subject: x86: Replace the now identical time_32/64.c by time.c Remove the redundant copy. 
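[ Editor's note, not part of the patch: the stack heuristic kept in the unified profile_pc() below relies on a saved EFLAGS word always having bits 22-31 clear, while kernel text addresses never do. A minimal standalone check of that invariant: ]

#include <stdio.h>

int main(void)
{
	unsigned long eflags = 0x00000246UL;	/* a typical saved EFLAGS */
	unsigned long kaddr  = 0xc01234f0UL;	/* a 32-bit kernel text address */

	/* sp[n] >> 22: zero for EFLAGS, non-zero for a return address */
	printf("eflags >> 22 = %lu, kaddr >> 22 = %lu\n",
	       eflags >> 22, kaddr >> 22);
	return 0;
}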
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/time.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/time_32.c | 121 ---------------------------------------------- arch/x86/kernel/time_64.c | 121 ---------------------------------------------- 4 files changed, 122 insertions(+), 243 deletions(-) create mode 100644 arch/x86/kernel/time.c delete mode 100644 arch/x86/kernel/time_32.c delete mode 100644 arch/x86/kernel/time_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 313ed6fca9b..ccf3db607c2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -31,7 +31,7 @@ GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o +obj-y += time.o ioport.o ldt.o dumpstack.o obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c new file mode 100644 index 00000000000..fda0c34da75 --- /dev/null +++ b/arch/x86/kernel/time.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 1991,1992,1995 Linus Torvalds + * Copyright (c) 1994 Alan Modra + * Copyright (c) 1995 Markus Kuhn + * Copyright (c) 1996 Ingo Molnar + * Copyright (c) 1998 Andrea Arcangeli + * Copyright (c) 2002,2006 Vojtech Pavlik + * Copyright (c) 2003 Andi Kleen + * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) +int timer_ack; +#endif + +#ifdef CONFIG_X86_64 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +#endif + +unsigned long profile_pc(struct pt_regs *regs) +{ + unsigned long pc = instruction_pointer(regs); + + if (!user_mode_vm(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER + return *(unsigned long *)(regs->bp + sizeof(long)); +#else + unsigned long *sp = (unsigned long *)regs->sp; + /* + * Return address is either directly at stack pointer + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't. + */ + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; +#endif + } + return pc; +} +EXPORT_SYMBOL(profile_pc); + +/* + * Default timer interrupt handler for PIT/HPET + */ +static irqreturn_t timer_interrupt(int irq, void *dev_id) +{ + /* Keep nmi watchdog up to date */ + inc_irq_stat(irq0_irqs); + + /* Optimized out for !IO_APIC and x86_64 */ + if (timer_ack) { + /* + * Subtle, when I/O APICs are used we have to ack timer IRQ + * manually to deassert NMI lines for the watchdog if run + * on an 82489DX-based system. + */ + spin_lock(&i8259A_lock); + outb(0x0c, PIC_MASTER_OCW3); + /* Ack the IRQ; AEOI will end it automatically. 
*/ + inb(PIC_MASTER_POLL); + spin_unlock(&i8259A_lock); + } + + global_clock_event->event_handler(global_clock_event); + + /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ + if (MCA_bus) + outb_p(inb_p(0x61)| 0x80, 0x61); + + return IRQ_HANDLED; +} + +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" +}; + +void __init setup_default_timer_irq(void) +{ + irq0.mask = cpumask_of_cpu(0); + setup_irq(0, &irq0); +} + +/* Default timer init function */ +void __init hpet_time_init(void) +{ + if (!hpet_enable()) + setup_pit_timer(); + setup_default_timer_irq(); +} + +static void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); +} + +/* + * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. + */ +void __init time_init(void) +{ + tsc_init(); + late_time_init = x86_late_time_init; +} diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c deleted file mode 100644 index fda0c34da75..00000000000 --- a/arch/x86/kernel/time_32.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 1991,1992,1995 Linus Torvalds - * Copyright (c) 1994 Alan Modra - * Copyright (c) 1995 Markus Kuhn - * Copyright (c) 1996 Ingo Molnar - * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002,2006 Vojtech Pavlik - * Copyright (c) 2003 Andi Kleen - * - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) -int timer_ack; -#endif - -#ifdef CONFIG_X86_64 -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -#endif - -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - - if (!user_mode_vm(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long *sp = (unsigned long *)regs->sp; - /* - * Return address is either directly at stack pointer - * or above a saved flags. Eflags has bits 22-31 zero, - * kernel addresses don't. - */ - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } - return pc; -} -EXPORT_SYMBOL(profile_pc); - -/* - * Default timer interrupt handler for PIT/HPET - */ -static irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - /* Keep nmi watchdog up to date */ - inc_irq_stat(irq0_irqs); - - /* Optimized out for !IO_APIC and x86_64 */ - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. 
*/ - inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); - } - - global_clock_event->event_handler(global_clock_event); - - /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ - if (MCA_bus) - outb_p(inb_p(0x61)| 0x80, 0x61); - - return IRQ_HANDLED; -} - -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" -}; - -void __init setup_default_timer_irq(void) -{ - irq0.mask = cpumask_of_cpu(0); - setup_irq(0, &irq0); -} - -/* Default timer init function */ -void __init hpet_time_init(void) -{ - if (!hpet_enable()) - setup_pit_timer(); - setup_default_timer_irq(); -} - -static void x86_late_time_init(void) -{ - x86_init.timers.timer_init(); -} - -/* - * Initialize TSC and delay the periodic timer init to - * late x86_late_time_init() so ioremap works. - */ -void __init time_init(void) -{ - tsc_init(); - late_time_init = x86_late_time_init; -} diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c deleted file mode 100644 index fda0c34da75..00000000000 --- a/arch/x86/kernel/time_64.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 1991,1992,1995 Linus Torvalds - * Copyright (c) 1994 Alan Modra - * Copyright (c) 1995 Markus Kuhn - * Copyright (c) 1996 Ingo Molnar - * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002,2006 Vojtech Pavlik - * Copyright (c) 2003 Andi Kleen - * - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) -int timer_ack; -#endif - -#ifdef CONFIG_X86_64 -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -#endif - -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - - if (!user_mode_vm(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long *sp = (unsigned long *)regs->sp; - /* - * Return address is either directly at stack pointer - * or above a saved flags. Eflags has bits 22-31 zero, - * kernel addresses don't. - */ - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } - return pc; -} -EXPORT_SYMBOL(profile_pc); - -/* - * Default timer interrupt handler for PIT/HPET - */ -static irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - /* Keep nmi watchdog up to date */ - inc_irq_stat(irq0_irqs); - - /* Optimized out for !IO_APIC and x86_64 */ - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. 
*/ - inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); - } - - global_clock_event->event_handler(global_clock_event); - - /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ - if (MCA_bus) - outb_p(inb_p(0x61)| 0x80, 0x61); - - return IRQ_HANDLED; -} - -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" -}; - -void __init setup_default_timer_irq(void) -{ - irq0.mask = cpumask_of_cpu(0); - setup_irq(0, &irq0); -} - -/* Default timer init function */ -void __init hpet_time_init(void) -{ - if (!hpet_enable()) - setup_pit_timer(); - setup_default_timer_irq(); -} - -static void x86_late_time_init(void) -{ - x86_init.timers.timer_init(); -} - -/* - * Initialize TSC and delay the periodic timer init to - * late x86_late_time_init() so ioremap works. - */ -void __init time_init(void) -{ - tsc_init(); - late_time_init = x86_late_time_init; -} -- cgit v1.2.3 From 2d826404f0bdcac2a4dd7e3c446b70d6a3b63b78 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 17:06:25 +0200 Subject: x86: Move tsc_calibration to x86_init_ops TSC calibration is modified by the vmware hypervisor and paravirt by separate means. Moorestown wants to add its own calibration routine as well. So make calibrate_tsc a proper x86_init_ops function and override it by paravirt or by the early setup of the vmware hypervisor. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hypervisor.h | 2 +- arch/x86/include/asm/paravirt.h | 1 - arch/x86/include/asm/timer.h | 5 ----- arch/x86/include/asm/tsc.h | 3 ++- arch/x86/include/asm/vmware.h | 2 +- arch/x86/include/asm/x86_init.h | 9 +++++++++ arch/x86/kernel/cpu/hypervisor.c | 14 +++++++------- arch/x86/kernel/cpu/vmware.c | 21 ++++++++++++--------- arch/x86/kernel/kvmclock.c | 2 +- arch/x86/kernel/paravirt.c | 1 - arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/tsc.c | 13 ++++--------- arch/x86/kernel/vmi_32.c | 2 +- arch/x86/kernel/vmiclock_32.c | 2 +- arch/x86/kernel/x86_init.c | 5 +++++ arch/x86/lguest/boot.c | 2 +- arch/x86/xen/enlighten.c | 3 ++- 17 files changed, 48 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 369f5c5d09a..b78c0941e42 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -20,7 +20,7 @@ #ifndef ASM_X86__HYPERVISOR_H #define ASM_X86__HYPERVISOR_H -extern unsigned long get_hypervisor_tsc_freq(void); extern void init_hypervisor(struct cpuinfo_x86 *c); +extern void init_hypervisor_platform(void); #endif diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 11a4ba7b209..1e458a55330 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -210,7 +210,6 @@ static inline unsigned long long paravirt_sched_clock(void) { return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } -#define calibrate_tsc() (pv_time_ops.get_tsc_khz()) static inline unsigned long long paravirt_read_pmc(int counter) { diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 65228ccc5f0..5469630b27f 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -8,7 +8,6 @@ #define TICK_SIZE (tick_nsec / 1000) unsigned long long native_sched_clock(void); -unsigned long native_calibrate_tsc(void); extern int recalibrate_cpu_khz(void); #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) @@ -19,10 +18,6 @@ extern int 
timer_ack; extern int no_timer_check; -#ifndef CONFIG_PARAVIRT -#define calibrate_tsc() native_calibrate_tsc() -#endif - /* Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 38ae163cc91..c0427295e8f 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -48,7 +48,8 @@ static __always_inline cycles_t vget_cycles(void) extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); -int check_tsc_unstable(void); +extern int check_tsc_unstable(void); +extern unsigned long native_calibrate_tsc(void); /* * Boot-time check whether the TSCs are synchronized across diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h index c11b7e100d8..e49ed6d2fd4 100644 --- a/arch/x86/include/asm/vmware.h +++ b/arch/x86/include/asm/vmware.h @@ -20,7 +20,7 @@ #ifndef ASM_X86__VMWARE_H #define ASM_X86__VMWARE_H -extern unsigned long vmware_get_tsc_khz(void); +extern void vmware_platform_setup(void); extern int vmware_platform(void); extern void vmware_set_feature_bits(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f8bdd2271a0..20df5187171 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -112,8 +112,17 @@ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); }; +/** + * struct x86_platform_ops - platform specific runtime functions + * @calibrate_tsc: calibrate TSC + */ +struct x86_platform_ops { + unsigned long (*calibrate_tsc)(void); +}; + extern struct x86_init_ops x86_init; extern struct x86_cpuinit_ops x86_cpuinit; +extern struct x86_platform_ops x86_platform; extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 93ba8eeb100..08be922de33 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,13 +34,6 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c) c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; } -unsigned long get_hypervisor_tsc_freq(void) -{ - if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) - return vmware_get_tsc_khz(); - return 0; -} - static inline void __cpuinit hypervisor_set_feature_bits(struct cpuinfo_x86 *c) { @@ -55,3 +48,10 @@ void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) detect_hypervisor_vendor(c); hypervisor_set_feature_bits(c); } + +void __init init_hypervisor_platform(void) +{ + init_hypervisor(&boot_cpu_data); + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) + vmware_platform_setup(); +} diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index bc24f514ec9..0a46b4df5d8 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,6 +24,7 @@ #include #include #include +#include #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -47,21 +48,29 @@ static inline int __vmware_platform(void) return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; } -static unsigned long __vmware_get_tsc_khz(void) +static unsigned long vmware_get_tsc_khz(void) { uint64_t tsc_hz; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - if (ebx == UINT_MAX) - return 0; tsc_hz = eax | (((uint64_t)ebx) << 32); do_div(tsc_hz, 1000); BUG_ON(tsc_hz >> 32); return tsc_hz; } +void __init vmware_platform_setup(void) +{ + 
uint32_t eax, ebx, ecx, edx; + + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + + if (ebx != UINT_MAX) + x86_platform.calibrate_tsc = vmware_get_tsc_khz; +} + /* * While checking the dmi string infomation, just checking the product * serial key should be enough, as this will always have a VMware @@ -87,12 +96,6 @@ int vmware_platform(void) return 0; } -unsigned long vmware_get_tsc_khz(void) -{ - BUG_ON(!vmware_platform()); - return __vmware_get_tsc_khz(); -} - /* * VMware hypervisor takes care of exporting a reliable TSC to the guest. * Still, due to timing difference when running on virtual cpus, the TSC can diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 64e9b5f59d2..75a21b61b86 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -187,7 +187,7 @@ void __init kvmclock_init(void) pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; - pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; + x86_platform.calibrate_tsc = kvm_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC x86_cpuinit.setup_percpu_clockev = kvm_setup_secondary_clock; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9c0e644a76d..7cbf898d839 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -309,7 +309,6 @@ struct pv_time_ops pv_time_ops = { .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, .sched_clock = native_sched_clock, - .get_tsc_khz = native_calibrate_tsc, }; struct pv_irq_ops pv_irq_ops = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bb207a47c63..2d93026af7c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -818,7 +818,7 @@ void __init setup_arch(char **cmdline_p) * VMware detection requires dmi to be available, so this * needs to be done after dmi_scan_machine, for the BP. 
*/ - init_hypervisor(&boot_cpu_data); + init_hypervisor_platform(); x86_init.resources.probe_roms(); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 97a0bcbad10..9917632a8b4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -18,6 +18,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -401,15 +402,9 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; + unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; - hv_tsc_khz = get_hypervisor_tsc_freq(); - if (hv_tsc_khz) { - printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return hv_tsc_khz; - } - local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); @@ -567,7 +562,7 @@ int recalibrate_cpu_khz(void) unsigned long cpu_khz_old = cpu_khz; if (cpu_has_tsc) { - tsc_khz = calibrate_tsc(); + tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, @@ -917,7 +912,7 @@ void __init tsc_init(void) if (!cpu_has_tsc) return; - tsc_khz = calibrate_tsc(); + tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; if (!tsc_khz) { diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index cd7d0fbbf66..052ae81ee08 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -825,7 +825,7 @@ static inline int __init activate_vmi(void) x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init; #endif pv_time_ops.sched_clock = vmi_sched_clock; - pv_time_ops.get_tsc_khz = vmi_tsc_khz; + x86_platform.calibrate_tsc = vmi_tsc_khz; /* We have true wallclock functions; disable CMOS clock sync */ no_sync_cmos_clock = 1; diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 2b3eb82efee..611b9e2360d 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -68,7 +68,7 @@ unsigned long long vmi_sched_clock(void) return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } -/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ +/* x86_platform.calibrate_tsc = vmi_tsc_khz */ unsigned long vmi_tsc_khz(void) { unsigned long long khz; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 4790b92714a..13081b92191 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -13,6 +13,7 @@ #include #include #include +#include void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -67,3 +68,7 @@ struct __initdata x86_init_ops x86_init = { __cpuinitdata struct x86_cpuinit_ops x86_cpuinit = { .setup_percpu_clockev = setup_secondary_APIC_clock, }; + +struct x86_platform_ops x86_platform = { + .calibrate_tsc = native_calibrate_tsc, +}; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 6caa8c0c793..fabe745513d 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1320,11 +1320,11 @@ __init void lguest_init(void) /* Time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; - pv_time_ops.get_tsc_khz = lguest_tsc_khz; x86_init.resources.memory_setup = lguest_memory_setup; x86_init.irqs.intr_init = lguest_init_IRQ; x86_init.timers.timer_init = lguest_time_init; + x86_platform.calibrate_tsc = lguest_tsc_khz; /* * Now is a good time to look at the implementations of 
these functions diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 84826b842b5..ee8cac77c8a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -844,7 +844,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { static const struct pv_time_ops xen_time_ops __initdata = { .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, - .get_tsc_khz = xen_tsc_khz, .sched_clock = xen_sched_clock, }; @@ -980,6 +979,8 @@ asmlinkage void __init xen_start_kernel(void) x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_cpuinit.setup_percpu_clockev = x86_init_noop; + x86_platform.calibrate_tsc = xen_tsc_khz; + #ifdef CONFIG_X86_64 /* * Setup percpu state. We only need to do this for 64-bit -- cgit v1.2.3 From dd0a70c8f921708eba29ef9f30dde1f14a74af05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 16:51:07 +0200 Subject: x86: Move tsc_init to late_time_init We do not need the TSC before late_time_init. Move the tsc_init to the late time init code so we can also utilize HPET for calibration (which we claimed to do but never did except in some older kernel version). This also helps Moorestown to calibrate the TSC with the AHBT timer which needs to be initialized in late_time_init like HPET. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index fda0c34da75..fcece00356a 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -108,6 +108,7 @@ void __init hpet_time_init(void) static void x86_late_time_init(void) { x86_init.timers.timer_init(); + tsc_init(); } /* @@ -116,6 +117,5 @@ static void x86_late_time_init(void) */ void __init time_init(void) { - tsc_init(); late_time_init = x86_late_time_init; } -- cgit v1.2.3 From 47a3d5da70f411bc044ecd3c0593b158b09d0efa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 15:03:59 +0200 Subject: x86: Add early platform detection Platforms like Moorestown require early setup and want to avoid the call to reserve_ebda_region. The x86_init override is too late when the MRST detection happens in setup_arch. Move the default i386 x86_init overrides and the call to reserve_ebda_region into a separate function which is called as the default of a switch case depending on the hardware_subarch id in boot params. This allows us to add a case for MRST and let MRST have its own early setup function. 
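[ Editor's sketch, not part of the patch: the dispatch pattern described above, reduced to a standalone program. The real switch lives in i386_start_kernel() in the diff below; the X86_SUBARCH_* names and the MRST case are wired up by later patches in this series. ]

#include <stdio.h>

enum {
	X86_SUBARCH_PC = 0,
	X86_SUBARCH_LGUEST,
	X86_SUBARCH_XEN,
	X86_SUBARCH_MRST,
};

static void i386_default_early_setup(void)
{
	puts("default: install i386 x86_init overrides, reserve EBDA region");
}

static void x86_mrst_early_setup(void)
{
	puts("mrst: no EBDA to reserve, stub out ROM/resource setup");
}

/* boot_params.hdr.hardware_subarch selects the early setup path */
static void early_setup(unsigned int hardware_subarch)
{
	switch (hardware_subarch) {
	case X86_SUBARCH_MRST:
		x86_mrst_early_setup();
		break;
	default:
		i386_default_early_setup();
		break;
	}
}

int main(void)
{
	early_setup(X86_SUBARCH_PC);
	early_setup(X86_SUBARCH_MRST);
	return 0;
}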
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/x86_init.h | 3 +-- arch/x86/kernel/head32.c | 22 +++++++++++++++++----- arch/x86/kernel/head64.c | 3 ++- arch/x86/kernel/x86_init.c | 1 - 4 files changed, 20 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 20df5187171..b6c89428137 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -2,6 +2,7 @@ #define _ASM_X86_PLATFORM_H #include +#include struct mpc_bus; struct mpc_cpu; @@ -34,14 +35,12 @@ struct x86_init_mpparse { * @probe_roms: probe BIOS roms * @reserve_resources: reserve the standard resources for the * platform - * @reserve_ebda_region: reserve the extended bios data area * @memory_setup: platform specific memory setup * */ struct x86_init_resources { void (*probe_roms)(void); void (*reserve_resources)(void); - void (*reserve_ebda_region)(void); char *(*memory_setup)(void); }; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index a21398fac4f..441c075e2b8 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -15,6 +15,17 @@ #include #include #include +#include + +static void __init i386_default_early_setup(void) +{ + /* Initilize 32bit specific setup functions */ + x86_init.resources.probe_roms = probe_roms; + x86_init.resources.reserve_resources = i386_reserve_resources; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; + + reserve_ebda_region(); +} void __init i386_start_kernel(void) { @@ -31,12 +42,13 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif - /* Initilize 32bit specific setup functions */ - x86_init.resources.probe_roms = probe_roms; - x86_init.resources.reserve_resources = i386_reserve_resources; - x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; - x86_init.resources.reserve_ebda_region(); + /* Call the subarch specific early setup function */ + switch (boot_params.hdr.hardware_subarch) { + default: + i386_default_early_setup(); + break; + } /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index cead8149c3d..0b06cd778fd 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -24,6 +24,7 @@ #include #include #include +#include static void __init zap_identity_mappings(void) { @@ -111,7 +112,7 @@ void __init x86_64_start_reservations(char *real_mode_data) } #endif - x86_init.resources.reserve_ebda_region(); + reserve_ebda_region(); /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 13081b92191..24be7f39789 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -28,7 +28,6 @@ struct __initdata x86_init_ops x86_init = { .resources = { .probe_roms = x86_init_noop, .reserve_resources = reserve_standard_io_resources, - .reserve_ebda_region = reserve_ebda_region, .memory_setup = default_machine_specific_memory_setup, }, -- cgit v1.2.3 From 162bc7ab01a00eba1c5d614e64a51e1268ee3f96 Mon Sep 17 00:00:00 2001 From: "Pan, Jacob jun" Date: Fri, 28 Aug 2009 14:52:47 -0700 Subject: x86: Add hardware_subarch ID for Moorestown x86 bootprotocol 2.07 has introduced hardware_subarch ID in the boot parameters provided by FW. We use it to identify Moorestown platforms. 
[ tglx: Cleanup and paravirt fix ] Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner --- Documentation/x86/boot.txt | 1 + arch/x86/include/asm/bootparam.h | 10 ++++++++++ arch/x86/kernel/head_32.S | 1 + 3 files changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 8da3a795083..30b43e1b269 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -599,6 +599,7 @@ Protocol: 2.07+ 0x00000000 The default x86/PC environment 0x00000001 lguest 0x00000002 Xen + 0x00000003 Moorestown MID Field name: hardware_subarch_data Type: write (subarch-dependent) diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 1724e8de317..283a9a1b3ef 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -109,4 +109,14 @@ struct boot_params { __u8 _pad9[276]; /* 0xeec */ } __attribute__((packed)); +enum { + X86_SUBARCH_PC = 0, + X86_SUBARCH_LGUEST, + X86_SUBARCH_XEN, + X86_SUBARCH_MRST, + X86_NR_SUBARCHS, +}; + + + #endif /* _ASM_X86_BOOTPARAM_H */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index cc827ac9e8d..304e3f3d747 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -157,6 +157,7 @@ subarch_entries: .long default_entry /* normal x86/PC */ .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ + .long default_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 .previous #endif /* CONFIG_PARAVIRT */ -- cgit v1.2.3 From 3f4110a48a749a1aa1c54fb807afb3f32f49711c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 14:54:20 +0200 Subject: x86: Add Moorestown early detection Moorestown MID devices need to be detected early in the boot process to setup and do not call x86_default_early_setup as there is no EBDA region to reserve. [ Copied the minimal code from Jacobs latest MRST series ] Signed-off-by: Thomas Gleixner Cc: Jacob Pan --- arch/x86/Kconfig | 13 +++++++++++++ arch/x86/include/asm/setup.h | 6 ++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/head32.c | 3 +++ arch/x86/kernel/mrst.c | 24 ++++++++++++++++++++++++ 5 files changed, 47 insertions(+) create mode 100644 arch/x86/kernel/mrst.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 13ffa5df37d..586d84557f7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -318,6 +318,7 @@ config X86_EXTENDED_PLATFORM SGI 320/540 (Visual Workstation) Summit/EXA (IBM x440) Unisys ES7000 IA32 series + Moorestown MID devices If you have one of these systems, or if you want to build a generic distribution kernel, say Y here - otherwise say N. @@ -377,6 +378,18 @@ config X86_ELAN If unsure, choose "PC-compatible" instead. +config X86_MRST + bool "Moorestown MID platform" + depends on X86_32 + depends on X86_EXTENDED_PLATFORM + ---help--- + Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin + Internet Device(MID) platform. Moorestown consists of two chips: + Lincroft (CPU core, graphics, and memory controller) and Langwell IOH. + Unlike standard x86 PCs, Moorestown does not have many legacy devices + nor standard legacy replacement devices/features. e.g. Moorestown does + not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. 
+ config X86_RDC321X bool "RDC R-321x SoC" depends on X86_32 diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 861e1fe2303..18e496c98ff 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -49,6 +49,12 @@ extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern void setup_default_timer_irq(void); +#ifdef CONFIG_X86_MRST +extern void x86_mrst_early_setup(void); +#else +static inline void x86_mrst_early_setup(void) { } +#endif + #ifndef _SETUP /* diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index ccf3db607c2..5f33316610d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_X86_MRST) += mrst.o microcode-y := microcode_core.o microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 441c075e2b8..4f8e2507e8f 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -45,6 +45,9 @@ void __init i386_start_kernel(void) /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_MRST: + x86_mrst_early_setup(); + break; default: i386_default_early_setup(); break; diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c new file mode 100644 index 00000000000..3b7078abc87 --- /dev/null +++ b/arch/x86/kernel/mrst.c @@ -0,0 +1,24 @@ +/* + * mrst.c: Intel Moorestown platform specific setup code + * + * (C) Copyright 2008 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include + +#include + +/* + * Moorestown specific x86_init function overrides and early setup + * calls. + */ +void __init x86_mrst_early_setup(void) +{ + x86_init.resources.probe_roms = x86_init_noop; + x86_init.resources.reserve_resources = x86_init_noop; +} -- cgit v1.2.3 From bc07844a33734c4b2f32ef26d942d2f3ef9302ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 18:09:57 +0200 Subject: x86: Disentangle ioapic and i8259 The proposed Moorestown support patches use an extra feature flag mechanism to make the ioapic work w/o an i8259. There is a much simpler solution. Most i8259 specific functions are already called depending on the irq number being less than NR_IRQS_LEGACY. Replacing that constant by a read_mostly variable which can be set to 0 by the platform setup code allows us to achieve the same without any special feature flags. That trivial change allows us to proceed with MRST w/o doing a full blown overhaul of the ioapic code which would delay MRST unduly.
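[ Editor's sketch, not part of the patch: the conversion replaces the compile-time bound NR_IRQS_LEGACY with a runtime variable that platform setup code can zero, so every "irq < bound" check skips the i8259 paths automatically. Reduced to a standalone program: ]

#include <stdio.h>

#define NR_IRQS_LEGACY 16

static int nr_legacy_irqs = NR_IRQS_LEGACY;	/* __read_mostly in the kernel */

static void io_apic_disable_legacy(void)
{
	nr_legacy_irqs = 0;	/* platform without an i8259, e.g. Moorestown */
}

static void setup_one_irq(int irq)
{
	if (irq < nr_legacy_irqs)
		printf("irq %d: legacy 8259 handling\n", irq);
	else
		printf("irq %d: pure IO-APIC handling\n", irq);
}

int main(void)
{
	setup_one_irq(5);		/* legacy path on a standard PC */
	io_apic_disable_legacy();
	setup_one_irq(5);		/* IO-APIC path after the override */
	return 0;
}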
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 2 ++ arch/x86/kernel/apic/io_apic.c | 41 +++++++++++++++++++++++++++++------------ 2 files changed, 31 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 2b8aeb89933..e1f89a1a07e 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -143,6 +143,8 @@ extern int noioapicreroute; /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ extern int timer_through_8259; +extern void io_apic_disable_legacy(void); + /* * If we use the IO-APIC for IRQ routing, disable automatic * assignment of PCI IRQ's. diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5f4687187ce..6c961290a5f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -91,6 +91,11 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +/* Number of legacy interrupts */ +static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; +/* GSI interrupts */ +static int nr_irqs_gsi = NR_IRQS_LEGACY; + #if defined (CONFIG_MCA) || defined (CONFIG_EISA) int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -172,6 +177,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = { [15] = { .vector = IRQ15_VECTOR, }, }; +void __init io_apic_disable_legacy(void) +{ + nr_legacy_irqs = 0; + nr_irqs_gsi = 0; +} + int __init arch_early_irq_init(void) { struct irq_cfg *cfg; @@ -189,7 +200,7 @@ int __init arch_early_irq_init(void) desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < NR_IRQS_LEGACY) + if (i < nr_legacy_irqs) cpumask_setall(cfg[i].domain); } @@ -883,7 +894,7 @@ static int __init find_isa_irq_apic(int irq, int type) */ static int EISA_ELCR(unsigned int irq) { - if (irq < NR_IRQS_LEGACY) { + if (irq < nr_legacy_irqs) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1480,7 +1491,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq } ioapic_register_intr(irq, desc, trigger); - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs) disable_8259A_irq(irq); ioapic_write_entry(apic_id, pin, entry); @@ -1851,7 +1862,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (apic_verbosity == APIC_QUIET) + if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1914,6 +1925,10 @@ void __init enable_IO_APIC(void) spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } + + if (!nr_legacy_irqs) + return; + for(apic = 0; apic < nr_ioapics; apic++) { int pin; /* See if any of the pins is in ExtINT mode */ @@ -1968,6 +1983,9 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); + if (!nr_legacy_irqs) + return; + /* * If the i8259 is routed through an IOAPIC * Put that IOAPIC in virtual wire mode @@ -2198,7 +2216,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < NR_IRQS_LEGACY) { + if (irq < nr_legacy_irqs) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) was_pending = 1; @@ -2709,7 +2727,7 @@ static inline void init_IO_APIC_traps(void) * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs) make_8259A_irq(irq); else /* Strange. Oh, well.. 
*/ @@ -3045,7 +3063,7 @@ out: * the I/O APIC in all cases now. No actual device should request * it anyway. --macro */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) +#define PIC_IRQS (1UL << PIC_CASCADE_IR) void __init setup_IO_APIC(void) { @@ -3053,8 +3071,7 @@ void __init setup_IO_APIC(void) /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - - io_apic_irqs = ~PIC_IRQS; + io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); /* @@ -3065,7 +3082,8 @@ void __init setup_IO_APIC(void) sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); - check_timer(); + if (nr_legacy_irqs) + check_timer(); } /* @@ -3166,7 +3184,6 @@ static int __init ioapic_init_sysfs(void) device_initcall(ioapic_init_sysfs); -static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ @@ -3907,7 +3924,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= NR_IRQS_LEGACY) { + if (irq >= nr_legacy_irqs) { cfg = desc->chip_data; add_pin_to_irq_node(cfg, node, ioapic, pin); } -- cgit v1.2.3 From e11dadabf443dc3101f28b74d8b9d56870a87db4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 31 Aug 2009 15:18:40 +0200 Subject: x86: apic namespace cleanup boot_cpu_physical_apicid is a global variable and used as function argument as well. Rename the function arguments to avoid confusion. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 14 +++++++------- arch/x86/kernel/apic/bigsmp_32.c | 2 +- arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/apic/summit_32.c | 2 +- arch/x86/kernel/setup.c | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 6f15b29005a..d6a0f2636a6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -292,7 +292,7 @@ struct apic { int (*cpu_present_to_apicid)(int mps_cpu); physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); void (*setup_portio_remap)(void); - int (*check_phys_apicid_present)(int boot_cpu_physical_apicid); + int (*check_phys_apicid_present)(int phys_apicid); void (*enable_apic_mode)(void); int (*phys_pkg_id)(int cpuid_apic, int index_msb); @@ -426,7 +426,7 @@ extern struct apic apic_x2apic_uv_x; DECLARE_PER_CPU(int, x2apic_extra_bits); extern int default_cpu_present_to_apicid(int mps_cpu); -extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); +extern int default_check_phys_apicid_present(int phys_apicid); #endif static inline void default_wait_for_init_deassert(atomic_t *deassert) @@ -542,9 +542,9 @@ static inline int __default_cpu_present_to_apicid(int mps_cpu) } static inline int -__default_check_phys_apicid_present(int boot_cpu_physical_apicid) +__default_check_phys_apicid_present(int phys_apicid) { - return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map); + return physid_isset(phys_apicid, phys_cpu_present_map); } #ifdef CONFIG_X86_32 @@ -554,13 +554,13 @@ static inline int default_cpu_present_to_apicid(int mps_cpu) } static inline int -default_check_phys_apicid_present(int boot_cpu_physical_apicid) +default_check_phys_apicid_present(int phys_apicid) { - return __default_check_phys_apicid_present(boot_cpu_physical_apicid); + return __default_check_phys_apicid_present(phys_apicid); } #else extern int default_cpu_present_to_apicid(int mps_cpu); -extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); 
+extern int default_check_phys_apicid_present(int phys_apicid); #endif static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid) diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 676cdac385c..77a06413b6b 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -112,7 +112,7 @@ static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) return physids_promote(0xFFL); } -static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid) +static int bigsmp_check_phys_apicid_present(int phys_apicid) { return 1; } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index f1ebed6bd15..efa00e2b850 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -413,7 +413,7 @@ static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; -static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid) +static inline int numaq_check_phys_apicid_present(int phys_apicid) { return 1; } diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index eafdfbd1ea9..645ecc4ff0b 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -272,7 +272,7 @@ static physid_mask_t summit_apicid_to_cpu_present(int apicid) return physid_mask_of_physid(0); } -static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) +static int summit_check_phys_apicid_present(int physical_apicid) { return 1; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2d93026af7c..fda22ec1a93 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -129,9 +129,9 @@ int default_cpu_present_to_apicid(int mps_cpu) return __default_cpu_present_to_apicid(mps_cpu); } -int default_check_phys_apicid_present(int boot_cpu_physical_apicid) +int default_check_phys_apicid_present(int phys_apicid) { - return __default_check_phys_apicid_present(boot_cpu_physical_apicid); + return __default_check_phys_apicid_present(phys_apicid); } #endif -- cgit v1.2.3 From db39d5529d347de5e2eec1a72d67fcfacae6c5a2 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Fri, 21 Aug 2009 19:15:28 -0500 Subject: [CPUFREQ] Powernow-k8: Enable more than 2 low P-states Remove an obsolete check that used to prevent there being more than 2 low P-states. Now that low-to-low P-state changes are enabled, the old check rejects otherwise workable configurations with multiple low P-states.
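As a standalone sketch of what the removed guard enforced: a P-state whose FID sits below HI_FID_TABLE_BOTTOM counts as "low", and the old code bailed out as soon as it saw a second distinct low entry; counting them is now enough. HI_FID_TABLE_BOTTOM is real driver terminology, but the threshold value and the loop below are simplified stand-ins, not the driver code:

#include <stdio.h>

#define HI_FID_TABLE_BOTTOM 0x20	/* stand-in threshold, not the driver's value */

/* Count "low" P-states in a FID table.  The removed check instead
 * treated a second distinct low entry as a fatal table error; with
 * low-to-low transitions supported, any count is acceptable. */
static int count_low_pstates(const unsigned int *fid, int n)
{
	int i, nlow = 0;

	for (i = 0; i < n; i++)
		if (fid[i] < HI_FID_TABLE_BOTTOM)
			nlow++;
	return nlow;
}

int main(void)
{
	unsigned int fids[] = { 0x02, 0x0a, 0x10, 0x2c };

	/* three low entries: previously rejected, now a valid configuration */
	printf("low P-states: %d\n", count_low_pstates(fids, 4));
	return 0;
}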
Signed-off-by: Mark Langsdorf Tested-by: Krists Krilovs Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 2a50ef89100..0cbce0481a5 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -854,6 +854,10 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) goto err_out; } + /* fill in data */ + data->numps = data->acpi_data.state_count; + powernow_k8_acpi_pst_values(data, 0); + if (cpu_family == CPU_HW_PSTATE) ret_val = fill_powernow_table_pstate(data, powernow_table); else @@ -866,11 +870,8 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) powernow_table[data->acpi_data.state_count].index = 0; data->powernow_table = powernow_table; - /* fill in data */ - data->numps = data->acpi_data.state_count; if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) print_basics(data); - powernow_k8_acpi_pst_values(data, 0); /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); @@ -941,7 +942,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table) { int i; - int cntlofreq = 0; for (i = 0; i < data->acpi_data.state_count; i++) { u32 fid; @@ -982,27 +982,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, continue; } - /* verify only 1 entry from the lo frequency table */ - if (fid < HI_FID_TABLE_BOTTOM) { - if (cntlofreq) { - /* if both entries are the same, - * ignore this one ... */ - if ((freq != powernow_table[cntlofreq].frequency) || - (index != powernow_table[cntlofreq].index)) { - printk(KERN_ERR PFX - "Too many lo freq table " - "entries\n"); - return 1; - } - - dprintk("double low frequency table entry, " - "ignoring it.\n"); - invalidate_entry(data, i); - continue; - } else - cntlofreq = i; - } - if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { printk(KERN_INFO PFX "invalid freq entries " "%u kHz vs. %u kHz\n", freq, -- cgit v1.2.3 From 1a8e42fa81e62d47cc471f7764f906bb42b27a54 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 26 Aug 2009 13:19:37 -0400 Subject: [CPUFREQ] Create a blacklist for processors that should not load the acpi-cpufreq module. Create a blacklist for processors that should not load the acpi-cpufreq module. The initial entry in the blacklist function is the Intel 0f68 processor. Its specification update mentions erratum AL30, which implies that cpufreq should not run on this processor.
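The "0f68" name is the part's raw CPUID signature: 0x0f68 decodes to family 15, model 6, stepping 8, exactly the triple the new blacklist function matches below. A minimal standalone sketch of that decoding (the helper name is ours; the extended family/model bits of CPUID leaf 1 are ignored here since they are zero for this part):

#include <stdio.h>

/* Decode the base fields of a CPUID leaf-1 signature (EAX).  For
 * family-0xf parts such as 0x0f68 the extended family/model bits
 * happen to be zero, so the base fields alone give the full triple. */
static void decode_sig(unsigned int sig, unsigned int *family,
		       unsigned int *model, unsigned int *stepping)
{
	*family = (sig >> 8) & 0xf;
	*model = (sig >> 4) & 0xf;
	*stepping = sig & 0xf;
}

int main(void)
{
	unsigned int f, m, s;

	decode_sig(0x0f68, &f, &m, &s);
	/* prints: family 15, model 6, stepping 8 */
	printf("family %u, model %u, stepping %u\n", f, m, s);
	return 0;
}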
Signed-off-by: Prarit Bhargava Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index ae9b503220c..badce508406 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -588,6 +588,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = { }, { } }; + +static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) +{ + /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf + * AL30: A Machine Check Exception (MCE) Occurring during an + * Enhanced Intel SpeedStep Technology Ratio Change May Cause + * Both Processor Cores to Lock Up when HT is enabled*/ + if (c->x86_vendor == X86_VENDOR_INTEL) { + if ((c->x86 == 15) && + (c->x86_model == 6) && + (c->x86_mask == 8) && smt_capable()) + return -ENODEV; + } + return 0; +} #endif static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) @@ -602,6 +617,12 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) dprintk("acpi_cpufreq_cpu_init\n"); +#ifdef CONFIG_SMP + result = acpi_cpufreq_blacklist(c); + if (result) + return result; +#endif + data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); if (!data) return -ENOMEM; -- cgit v1.2.3 From 69575d388603365f2afbf4166df93152df59b165 Mon Sep 17 00:00:00 2001 From: Shane Wang Date: Tue, 1 Sep 2009 18:25:07 -0700 Subject: x86, intel_txt: clean up the impact on generic code, unbreak non-x86 Move tboot.h from asm to linux to fix the build errors of the intel_txt patch on non-x86 platforms. Remove the tboot code from the generic code in init/main.c and kernel/cpu.c. Signed-off-by: Shane Wang Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 4 + arch/x86/include/asm/tboot.h | 197 ------------------------------------------ arch/x86/kernel/reboot.c | 3 +- arch/x86/kernel/setup.c | 3 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/tboot.c | 58 ++++++++++--- drivers/acpi/acpica/hwsleep.c | 2 +- drivers/pci/dmar.c | 2 +- drivers/pci/intel-iommu.c | 2 +- include/linux/tboot.h | 162 ++++++++++++++++++++++++++++++++++ init/main.c | 3 - kernel/cpu.c | 6 +- security/Kconfig | 2 +- 13 files changed, 221 insertions(+), 225 deletions(-) delete mode 100644 arch/x86/include/asm/tboot.h create mode 100644 include/linux/tboot.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 738bdc6b0f8..b66f2102c35 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -178,6 +178,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_INTEL_TXT + def_bool y + depends on EXPERIMENTAL && DMAR && ACPI + # Use the generic interrupt handling code in kernel/irq/: config GENERIC_HARDIRQS bool diff --git a/arch/x86/include/asm/tboot.h b/arch/x86/include/asm/tboot.h deleted file mode 100644 index b13929d4e5f..00000000000 --- a/arch/x86/include/asm/tboot.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * tboot.h: shared data structure with tboot and kernel and functions - * used by kernel for runtime support of Intel(R) Trusted - * Execution Technology - * - * Copyright (c) 2006-2009, Intel Corporation - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation.
- * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - */ - -#ifndef _ASM_TBOOT_H -#define _ASM_TBOOT_H - -#include - -/* these must have the values from 0-5 in this order */ -enum { - TB_SHUTDOWN_REBOOT = 0, - TB_SHUTDOWN_S5, - TB_SHUTDOWN_S4, - TB_SHUTDOWN_S3, - TB_SHUTDOWN_HALT, - TB_SHUTDOWN_WFS -}; - -#ifdef CONFIG_INTEL_TXT - -/* used to communicate between tboot and the launched kernel */ - -#define TB_KEY_SIZE 64 /* 512 bits */ - -#define MAX_TB_MAC_REGIONS 32 - -struct tboot_mac_region { - u64 start; /* must be 64 byte -aligned */ - u32 size; /* must be 64 byte -granular */ -} __packed; - -/* GAS - Generic Address Structure (ACPI 2.0+) */ -struct tboot_acpi_generic_address { - u8 space_id; - u8 bit_width; - u8 bit_offset; - u8 access_width; - u64 address; -} __packed; - -/* - * combines Sx info from FADT and FACS tables per ACPI 2.0+ spec - * (http://www.acpi.info/) - */ -struct tboot_acpi_sleep_info { - struct tboot_acpi_generic_address pm1a_cnt_blk; - struct tboot_acpi_generic_address pm1b_cnt_blk; - struct tboot_acpi_generic_address pm1a_evt_blk; - struct tboot_acpi_generic_address pm1b_evt_blk; - u16 pm1a_cnt_val; - u16 pm1b_cnt_val; - u64 wakeup_vector; - u32 vector_width; - u64 kernel_s3_resume_vector; -} __packed; - -/* - * shared memory page used for communication between tboot and kernel - */ -struct tboot { - /* - * version 3+ fields: - */ - - /* TBOOT_UUID */ - u8 uuid[16]; - - /* version number: 5 is current */ - u32 version; - - /* physical addr of tb_log_t log */ - u32 log_addr; - - /* - * physical addr of entry point for tboot shutdown and - * type of shutdown (TB_SHUTDOWN_*) being requested - */ - u32 shutdown_entry; - u32 shutdown_type; - - /* kernel-specified ACPI info for Sx shutdown */ - struct tboot_acpi_sleep_info acpi_sinfo; - - /* tboot location in memory (physical) */ - u32 tboot_base; - u32 tboot_size; - - /* memory regions (phys addrs) for tboot to MAC on S3 */ - u8 num_mac_regions; - struct tboot_mac_region mac_regions[MAX_TB_MAC_REGIONS]; - - - /* - * version 4+ fields: - */ - - /* symmetric key for use by kernel; will be encrypted on S3 */ - u8 s3_key[TB_KEY_SIZE]; - - - /* - * version 5+ fields: - */ - - /* used to 4byte-align num_in_wfs */ - u8 reserved_align[3]; - - /* number of processors in wait-for-SIPI */ - u32 num_in_wfs; -} __packed; - -/* - * UUID for tboot data struct to facilitate matching - * defined as {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} by tboot, which is - * represented as {} in the char array used here - */ -#define TBOOT_UUID {0xff, 0x8d, 0x3c, 0x66, 0xb3, 0xe8, 0x82, 0x4b, 0xbf,\ - 0xaa, 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8} - -extern struct tboot *tboot; - -static inline int tboot_enabled(void) -{ - return tboot != NULL; -} - -extern void tboot_probe(void); -extern void tboot_create_trampoline(void); -extern void tboot_shutdown(u32 shutdown_type); -extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); -extern int tboot_wait_for_aps(int num_aps); -extern struct acpi_table_header *tboot_get_dmar_table( - struct acpi_table_header *dmar_tbl); -extern int tboot_force_iommu(void); - -#else /* 
CONFIG_INTEL_TXT */ -static inline int tboot_enabled(void) -{ - return 0; -} - -static inline void tboot_probe(void) -{ -} - -static inline void tboot_create_trampoline(void) -{ -} - -static inline void tboot_shutdown(u32 shutdown_type) -{ -} - -static inline void tboot_sleep(u8 sleep_state, u32 pm1a_control, - u32 pm1b_control) -{ -} - -static inline int tboot_wait_for_aps(int num_aps) -{ - return 0; -} - -static inline struct acpi_table_header *tboot_get_dmar_table( - struct acpi_table_header *dmar_tbl) -{ - return dmar_tbl; -} - -static inline int tboot_force_iommu(void) -{ - return 0; -} - -#endif /* !CONFIG_INTEL_TXT */ - -#endif /* _ASM_TBOOT_H */ diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9de01c5d979..18ce5c04242 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -3,6 +3,7 @@ #include #include #include +#include <linux/tboot.h> #include #include #include @@ -24,8 +25,6 @@ # include #endif -#include <asm/tboot.h> - /* * Power off function, if any */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 80d6e9e3248..6ce0d6f38f7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -66,6 +66,7 @@ #include #include +#include <linux/tboot.h> #include