From f584ab5b90eee42b9399ada4f7078157b98fcbaa Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 20 Sep 2011 13:55:04 -0700 Subject: x86: uv2: Workaround for UV2 Hub bug (system global address format) commit 6a469e4665bc158599de55d64388861d0a9f10f4 upstream. This is a workaround for a UV2 hub bug that affects the format of system global addresses. The GRU API for UV2 was inadvertently broken by a hardware change. The format of the physical address used for TLB dropins and for addresses used with instructions running in unmapped mode has changed. This change was not documented and became apparent only when diags failed running on system simulators. For UV1, TLB and GRU instruction physical addresses are identical to socket physical addresses (although high NASID bits must be OR'ed into the address). For UV2, socket physical addresses need to be converted. The NODE portion of the physical address needs to be shifted so that the low bit is in bit 39 or bit 40, depending on an MMR value. It is not yet clear if this bug will be fixed in a silicon respin. If it is fixed, the hub revision will be incremented & the workaround disabled. Signed-off-by: Jack Steiner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/uv/uv_bau.h | 1 + arch/x86/include/asm/uv/uv_hub.h | 37 ++++++++++++++++++++++++++++++++++--- arch/x86/kernel/apic/x2apic_uv_x.c | 7 +++++-- arch/x86/platform/uv/tlb_uv.c | 17 ++++++----------- 4 files changed, 46 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 37d369859c8..0c767a8e000 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -55,6 +55,7 @@ #define UV_BAU_TUNABLES_DIR "sgi_uv" #define UV_BAU_TUNABLES_FILE "bau_tunables" #define WHITESPACE " \t\n" +#define uv_mmask ((1UL << uv_hub_info->m_val) - 1) #define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) #define cpubit_isset(cpu, bau_local_cpumask) \ test_bit((cpu), (bau_local_cpumask).bits) diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index f26544a1521..54a13aaebc4 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -46,6 +46,13 @@ * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant * of the nasid for socket usage. * + * GPA - (global physical address) a socket physical address converted + * so that it can be used by the GRU as a global address. Socket + * physical addresses 1) need additional NASID (node) bits added + * to the high end of the address, and 2) unaliased if the + * partition does not have a physical address 0. In addition, on + * UV2 rev 1, GPAs need the gnode left shifted to bits 39 or 40. 
+ * * * NumaLink Global Physical Address Format: * +--------------------------------+---------------------+ @@ -141,6 +148,8 @@ struct uv_hub_info_s { unsigned int gnode_extra; unsigned char hub_revision; unsigned char apic_pnode_shift; + unsigned char m_shift; + unsigned char n_lshift; unsigned long gnode_upper; unsigned long lowmem_remap_top; unsigned long lowmem_remap_base; @@ -177,6 +186,16 @@ static inline int is_uv2_hub(void) return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE; } +static inline int is_uv2_1_hub(void) +{ + return uv_hub_info->hub_revision == UV2_HUB_REVISION_BASE; +} + +static inline int is_uv2_2_hub(void) +{ + return uv_hub_info->hub_revision == UV2_HUB_REVISION_BASE + 1; +} + union uvh_apicid { unsigned long v; struct uvh_apicid_s { @@ -276,7 +295,10 @@ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) { if (paddr < uv_hub_info->lowmem_remap_top) paddr |= uv_hub_info->lowmem_remap_base; - return paddr | uv_hub_info->gnode_upper; + paddr |= uv_hub_info->gnode_upper; + paddr = ((paddr << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | + ((paddr >> uv_hub_info->m_val) << uv_hub_info->n_lshift); + return paddr; } @@ -300,16 +322,19 @@ static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa) unsigned long remap_base = uv_hub_info->lowmem_remap_base; unsigned long remap_top = uv_hub_info->lowmem_remap_top; + gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | + ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val); + gpa = gpa & uv_hub_info->gpa_mask; if (paddr >= remap_base && paddr < remap_base + remap_top) paddr -= remap_base; return paddr; } -/* gnode -> pnode */ +/* gpa -> pnode */ static inline unsigned long uv_gpa_to_gnode(unsigned long gpa) { - return gpa >> uv_hub_info->m_val; + return gpa >> uv_hub_info->n_lshift; } /* gpa -> pnode */ @@ -320,6 +345,12 @@ static inline int uv_gpa_to_pnode(unsigned long gpa) return uv_gpa_to_gnode(gpa) & n_mask; } +/* gpa -> node offset*/ +static inline unsigned long uv_gpa_to_offset(unsigned long gpa) +{ + return (gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift; +} + /* pnode, offset --> socket virtual */ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) { diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 34b18594e72..cfeb978f49f 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -832,6 +832,10 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision; + uv_cpu_hub_info(cpu)->m_shift = 64 - m_val; + uv_cpu_hub_info(cpu)->n_lshift = is_uv2_1_hub() ? + (m_val == 40 ? 
40 : 39) : m_val; + pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; @@ -862,8 +866,7 @@ void __init uv_system_init(void) if (uv_node_to_blade[nid] >= 0) continue; paddr = node_start_pfn(nid) << PAGE_SHIFT; - paddr = uv_soc_phys_ram_to_gpa(paddr); - pnode = (paddr >> m_val) & pnode_mask; + pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr)); blade = boot_pnode_to_blade(pnode); uv_node_to_blade[nid] = blade; } diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index db8b915f54b..5b552198f77 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -115,9 +115,6 @@ early_param("nobau", setup_nobau); /* base pnode in this partition */ static int uv_base_pnode __read_mostly; -/* position of pnode (which is nasid>>1): */ -static int uv_nshift __read_mostly; -static unsigned long uv_mmask __read_mostly; static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); @@ -1435,7 +1432,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) { int i; int cpu; - unsigned long pa; + unsigned long gpa; unsigned long m; unsigned long n; size_t dsize; @@ -1451,9 +1448,9 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) bau_desc = kmalloc_node(dsize, GFP_KERNEL, node); BUG_ON(!bau_desc); - pa = uv_gpa(bau_desc); /* need the real nasid*/ - n = pa >> uv_nshift; - m = pa & uv_mmask; + gpa = uv_gpa(bau_desc); + n = uv_gpa_to_gnode(gpa); + m = uv_gpa_to_offset(gpa); /* the 14-bit pnode */ write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m)); @@ -1525,9 +1522,9 @@ static void pq_init(int node, int pnode) bcp->queue_last = pqp + (DEST_Q_SIZE - 1); } /* - * need the pnode of where the memory was really allocated + * need the gnode of where the memory was really allocated */ - pn = uv_gpa(pqp) >> uv_nshift; + pn = uv_gpa_to_gnode(uv_gpa(pqp)); first = uv_physnodeaddr(pqp); pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first; last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)); @@ -1837,8 +1834,6 @@ static int __init uv_bau_init(void) zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); } - uv_nshift = uv_hub_info->m_val; - uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); spin_lock_init(&disable_lock); congested_cycles = usec_2_cycles(congested_respns_us); -- cgit v1.2.3 From 2b810d996387de3ed71e22f324707a6ca9d891ed Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Mon, 24 Oct 2011 10:15:51 -0700 Subject: x86: Fix compilation bug in kprobes' twobyte_is_boostable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 315eb8a2a1b7f335d40ceeeb11b9e067475eb881 upstream. When compiling an i386_defconfig kernel with gcc-4.6.1-9.fc15.i686, I noticed a warning about the asm operand for test_bit in kprobes' can_boost. I discovered that this caused only the first long of twobyte_is_boostable[] to be output. Jakub filed and fixed gcc PR50571 to correct the warning and this output issue. But to solve it for less current gcc, we can make kprobes' twobyte_is_boostable[] non-const, and it won't be optimized out. 
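For context on the table layout: the 256 two-byte opcodes are packed one flag bit each into 256/32 = 8 words, and test_bit() indexes across all of them, which is why an object file containing only the first long is fatal. A minimal userspace sketch of the same indexing (table contents below are illustrative, not the kernel's actual bitmap):

#include <stdio.h>

/* 256 opcode flags packed into 256/32 = 8 words, mirroring the
 * shape of twobyte_is_boostable[]; values here are made up. */
static unsigned int boostable[256 / 32];

/* Same word/bit split that test_bit() performs on the real table */
static int test_opcode(unsigned int op)
{
	return (boostable[op / 32] >> (op % 32)) & 1;
}

int main(void)
{
	boostable[0] = 0x4c;	/* mark opcodes 0x02, 0x03 and 0x06 */
	printf("opcode 0x02 boostable: %d\n", test_opcode(0x02));
	printf("opcode 0x05 boostable: %d\n", test_opcode(0x05));
	return 0;
}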
Before: CC arch/x86/kernel/kprobes.o In file included from include/linux/bitops.h:22:0, from include/linux/kernel.h:17, from [...]/arch/x86/include/asm/percpu.h:44, from [...]/arch/x86/include/asm/current.h:5, from [...]/arch/x86/include/asm/processor.h:15, from [...]/arch/x86/include/asm/atomic.h:6, from include/linux/atomic.h:4, from include/linux/mutex.h:18, from include/linux/notifier.h:13, from include/linux/kprobes.h:34, from arch/x86/kernel/kprobes.c:43: [...]/arch/x86/include/asm/bitops.h: In function ‘can_boost.part.1’: [...]/arch/x86/include/asm/bitops.h:319:2: warning: use of memory input without lvalue in asm operand 1 is deprecated [enabled by default] $ objdump -rd arch/x86/kernel/kprobes.o | grep -A1 -w bt 551: 0f a3 05 00 00 00 00 bt %eax,0x0 554: R_386_32 .rodata.cst4 $ objdump -s -j .rodata.cst4 -j .data arch/x86/kernel/kprobes.o arch/x86/kernel/kprobes.o: file format elf32-i386 Contents of section .data: 0000 48000000 00000000 00000000 00000000 H............... Contents of section .rodata.cst4: 0000 4c030000 L... Only a single long of twobyte_is_boostable[] is in the object file. After, without the const on twobyte_is_boostable: $ objdump -rd arch/x86/kernel/kprobes.o | grep -A1 -w bt 551: 0f a3 05 20 00 00 00 bt %eax,0x20 554: R_386_32 .data $ objdump -s -j .rodata.cst4 -j .data arch/x86/kernel/kprobes.o arch/x86/kernel/kprobes.o: file format elf32-i386 Contents of section .data: 0000 48000000 00000000 00000000 00000000 H............... 0010 00000000 00000000 00000000 00000000 ................ 0020 4c030000 0f000200 ffff0000 ffcff0c0 L............... 0030 0000ffff 3bbbfff8 03ff2ebb 26bb2e77 ....;.......&..w Now all 32 bytes are output into .data instead. Signed-off-by: Josh Stone Cc: Masami Hiramatsu Cc: Jakub Jelinek Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/kprobes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index f1a6244d7d9..794bc95134c 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -75,8 +75,10 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); /* * Undefined/reserved opcodes, conditional jump, Opcode Extension * Groups, and some special opcodes can not boost. + * This is non-const to keep gcc from statically optimizing it out, as + * variable_test_bit makes gcc think only *(unsigned long*) is used. */ -static const u32 twobyte_is_boostable[256 / 32] = { +static u32 twobyte_is_boostable[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ -- cgit v1.2.3 From 6ed5bebf24c72d2c9c4950826335fff2dca8da22 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 20 Oct 2011 22:04:18 +0100 Subject: ARM: smp: fix clipping of number of CPUs commit a06f916b7a9b57447ceb875eb0a89f1a66b31bca upstream. Rather than clipping the number of CPUs using the compile-time NR_CPUS constant, use the runtime nr_cpu_ids value instead. This allows the nr_cpus command line option to work as expected. 
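Each platform below picks up the same clamp; as a standalone sketch of the pattern, with the core count and nr_cpu_ids hardcoded for illustration:

#include <stdio.h>

/* Sketch of the clipping pattern the patch introduces. nr_cpu_ids
 * is normally derived from the nr_cpus= command line; ncores comes
 * from the hardware (e.g. the SCU). */
int main(void)
{
	unsigned int nr_cpu_ids = 2;	/* e.g. booted with nr_cpus=2 */
	unsigned int ncores = 4;	/* hardware core count */
	unsigned int i;

	if (ncores > nr_cpu_ids) {
		printf("SMP: %u cores greater than maximum (%u), clipping\n",
		       ncores, nr_cpu_ids);
		ncores = nr_cpu_ids;
	}
	for (i = 0; i < ncores; i++)
		printf("cpu%u marked possible\n", i);
	return 0;
}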
Reported-by: Mark Salter Signed-off-by: Russell King Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-exynos4/platsmp.c | 10 ++++------ arch/arm/mach-msm/platsmp.c | 6 ++++++ arch/arm/mach-omap2/omap-smp.c | 10 ++++------ arch/arm/mach-realview/platsmp.c | 10 ++++------ arch/arm/mach-shmobile/platsmp.c | 6 ++++++ arch/arm/mach-tegra/platsmp.c | 8 ++++---- arch/arm/mach-ux500/platsmp.c | 10 ++++------ arch/arm/mach-vexpress/ct-ca9x4.c | 6 ++++++ 8 files changed, 38 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-exynos4/platsmp.c b/arch/arm/mach-exynos4/platsmp.c index df6ef1b2f98..0c90896ad9a 100644 --- a/arch/arm/mach-exynos4/platsmp.c +++ b/arch/arm/mach-exynos4/platsmp.c @@ -193,12 +193,10 @@ void __init smp_init_cpus(void) ncores = scu_base ? scu_get_core_count(scu_base) : 1; /* sanity check */ - if (ncores > NR_CPUS) { - printk(KERN_WARNING - "EXYNOS4: no. of cores (%d) greater than configured " - "maximum of %d - clipping\n", - ncores, NR_CPUS); - ncores = NR_CPUS; + if (ncores > nr_cpu_ids) { + pr_warn("SMP: %u cores greater than maximum (%u), clipping\n", + ncores, nr_cpu_ids); + ncores = nr_cpu_ids; } for (i = 0; i < ncores; i++) diff --git a/arch/arm/mach-msm/platsmp.c b/arch/arm/mach-msm/platsmp.c index 1a1af9e5625..72765952091 100644 --- a/arch/arm/mach-msm/platsmp.c +++ b/arch/arm/mach-msm/platsmp.c @@ -156,6 +156,12 @@ void __init smp_init_cpus(void) { unsigned int i, ncores = get_core_count(); + if (ncores > nr_cpu_ids) { + pr_warn("SMP: %u cores greater than maximum (%u), clipping\n", + ncores, nr_cpu_ids); + ncores = nr_cpu_ids; + } + for (i = 0; i < ncores; i++) set_cpu_possible(i, true); diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c index ce65e9329c7..889464dc7b2 100644 --- a/arch/arm/mach-omap2/omap-smp.c +++ b/arch/arm/mach-omap2/omap-smp.c @@ -109,12 +109,10 @@ void __init smp_init_cpus(void) ncores = scu_get_core_count(scu_base); /* sanity check */ - if (ncores > NR_CPUS) { - printk(KERN_WARNING - "OMAP4: no. of cores (%d) greater than configured " - "maximum of %d - clipping\n", - ncores, NR_CPUS); - ncores = NR_CPUS; + if (ncores > nr_cpu_ids) { + pr_warn("SMP: %u cores greater than maximum (%u), clipping\n", + ncores, nr_cpu_ids); + ncores = nr_cpu_ids; } for (i = 0; i < ncores; i++) diff --git a/arch/arm/mach-realview/platsmp.c b/arch/arm/mach-realview/platsmp.c index 4ae943bafa9..e83c654a58d 100644 --- a/arch/arm/mach-realview/platsmp.c +++ b/arch/arm/mach-realview/platsmp.c @@ -52,12 +52,10 @@ void __init smp_init_cpus(void) ncores = scu_base ? scu_get_core_count(scu_base) : 1; /* sanity check */ - if (ncores > NR_CPUS) { - printk(KERN_WARNING - "Realview: no. 
of cores (%d) greater than configured " - "maximum of %d - clipping\n", - ncores, NR_CPUS); - ncores = NR_CPUS; + if (ncores > nr_cpu_ids) { + pr_warn("SMP: %u cores greater than maximum (%u), clipping\n", + ncores, nr_cpu_ids); + ncores = nr_cpu_ids; } for (i = 0; i < ncores; i++) diff --git a/arch/arm/mach-shmobile/platsmp.c b/arch/arm/mach-shmobile/platsmp.c index 66f980625a3..e4e485fa253 100644 --- a/arch/arm/mach-shmobile/platsmp.c +++ b/arch/arm/mach-shmobile/platsmp.c @@ -56,6 +56,12 @@ void __init smp_init_cpus(void) unsigned int ncores = shmobile_smp_get_core_count(); unsigned int i; + if (ncores > nr_cpu_ids) { + pr_warn("SMP: %u cores greater than maximum (%u), clipping\n", + ncores, nr_cpu_ids); + ncores = nr_cpu_ids; + } + for (i = 0; i < ncores; i++) set_cpu_possible(i, true); diff --git a/arch/arm/mach-tegra/platsmp.c b/arch/arm/mach-tegra/platsmp.c index 0886cbccdde..7d2b5d03c1d 100644 --- a/arch/arm/mach-tegra/platsmp.c +++ b/arch/arm/mach-tegra/platsmp.c @@ -114,10 +114,10 @@ void __init smp_init_cpus(void) { unsigned int i, ncores = scu_get_core_count(scu_base); - if (ncores > NR_CPUS) { - printk(KERN_ERR "Tegra: no. of cores (%u) greater than configured (%u), clipping\n", - ncores, NR_CPUS); - ncores = NR_CPUS; + if (ncores > nr_cpu_ids) { + pr_warn("SMP: %u cores greater than maximum (%u), clipping\n", + ncores, nr_cpu_ids); + ncores = nr_cpu_ids; } for (i = 0; i < ncores; i++) diff --git a/arch/arm/mach-ux500/platsmp.c b/arch/arm/mach-ux500/platsmp.c index a33df5f4c27..eb5199102cf 100644 --- a/arch/arm/mach-ux500/platsmp.c +++ b/arch/arm/mach-ux500/platsmp.c @@ -156,12 +156,10 @@ void __init smp_init_cpus(void) ncores = scu_base ? scu_get_core_count(scu_base) : 1; /* sanity check */ - if (ncores > NR_CPUS) { - printk(KERN_WARNING - "U8500: no. of cores (%d) greater than configured " - "maximum of %d - clipping\n", - ncores, NR_CPUS); - ncores = NR_CPUS; + if (ncores > nr_cpu_ids) { + pr_warn("SMP: %u cores greater than maximum (%u), clipping\n", + ncores, nr_cpu_ids); + ncores = nr_cpu_ids; } for (i = 0; i < ncores; i++) diff --git a/arch/arm/mach-vexpress/ct-ca9x4.c b/arch/arm/mach-vexpress/ct-ca9x4.c index bfd32f52c2d..2b1e836a76e 100644 --- a/arch/arm/mach-vexpress/ct-ca9x4.c +++ b/arch/arm/mach-vexpress/ct-ca9x4.c @@ -221,6 +221,12 @@ static void ct_ca9x4_init_cpu_map(void) { int i, ncores = scu_get_core_count(MMIO_P2V(A9_MPCORE_SCU)); + if (ncores > nr_cpu_ids) { + pr_warn("SMP: %u cores greater than maximum (%u), clipping\n", + ncores, nr_cpu_ids); + ncores = nr_cpu_ids; + } + for (i = 0; i < ncores; ++i) set_cpu_possible(i, true); -- cgit v1.2.3 From d6a615f1c468aa7583b671ab474d7abe25b105c0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 28 Sep 2011 16:44:54 +0100 Subject: apic, i386/bigsmp: Fix false warnings regarding logical APIC ID mismatches commit 838312be46f3abfbdc175f81c3e54a857994476d upstream. These warnings (generally one per CPU) are a result of initializing x86_cpu_to_logical_apicid while apic_default is still in use, but the check in setup_local_APIC() being done when apic_bigsmp was already used as an override in default_setup_apic_routing(): Overriding APIC driver with bigsmp Enabling APIC mode: Physflat. Using 5 I/O APICs ------------[ cut here ]------------ WARNING: at .../arch/x86/kernel/apic/apic.c:1239 ... 
CPU 1 irqstacks, hard=f1c9a000 soft=f1c9c000 Booting Node 0, Processors #1 smpboot cpu 1: start_ip = 9e000 Initializing CPU#1 ------------[ cut here ]------------ WARNING: at .../arch/x86/kernel/apic/apic.c:1239 setup_local_APIC+0x137/0x46b() Hardware name: ... CPU1 logical APIC ID: 2 != 8 ... Fix this (for the time being, i.e. until x86_32_early_logical_apicid() will get removed again, as Tejun says ought to be possible) by overriding the previously stored values at the point where the APIC driver gets overridden. v2: Move this and the pre-existing override logic into arch/x86/kernel/apic/bigsmp_32.c. Signed-off-by: Jan Beulich Acked-by: Tejun Heo Link: http://lkml.kernel.org/r/4E835D16020000780005844C@nat28.tlf.novell.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/apic.h | 2 +- arch/x86/kernel/apic/bigsmp_32.c | 20 ++++++++++++++++---- arch/x86/kernel/apic/probe_32.c | 10 ++-------- 3 files changed, 19 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 7b3ca8324b6..9b7273cb219 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -495,7 +495,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert) return; } -extern struct apic *generic_bigsmp_probe(void); +extern void generic_bigsmp_probe(void); #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index efd737e827f..521bead0113 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -255,12 +255,24 @@ static struct apic apic_bigsmp = { .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, }; -struct apic * __init generic_bigsmp_probe(void) +void __init generic_bigsmp_probe(void) { - if (probe_bigsmp()) - return &apic_bigsmp; + unsigned int cpu; - return NULL; + if (!probe_bigsmp()) + return; + + apic = &apic_bigsmp; + + for_each_possible_cpu(cpu) { + if (early_per_cpu(x86_cpu_to_logical_apicid, + cpu) == BAD_APICID) + continue; + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + bigsmp_early_logical_apicid(cpu); + } + + pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name); } apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index b5254ad044a..0787bb3412f 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -200,14 +200,8 @@ void __init default_setup_apic_routing(void) * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support */ - if (!cmdline_apic && apic == &apic_default) { - struct apic *bigsmp = generic_bigsmp_probe(); - if (bigsmp) { - apic = bigsmp; - printk(KERN_INFO "Overriding APIC driver with %s\n", - apic->name); - } - } + if (!cmdline_apic && apic == &apic_default) + generic_bigsmp_probe(); #endif if (apic->setup_apic_routing) -- cgit v1.2.3 From 715afafe876a366e0d71d98cdee585e0c7f64044 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Tue, 18 Oct 2011 12:27:12 +0200 Subject: KVM: s390: check cpu_id prior to using it commit 4d47555a80495657161a7e71ec3014ff2021e450 upstream. We use the cpu id provided by userspace as array index here. Thus we clearly need to check it first. Ooops. 
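The bug shape is an unchecked array index taken from userspace; a minimal userspace analogue of the added check (MAX_VCPUS standing in for KVM_MAX_VCPUS, -22 for -EINVAL):

#include <stdio.h>

#define MAX_VCPUS 64	/* stand-in for KVM_MAX_VCPUS */

static void *vcpu_slot[MAX_VCPUS];

/* Validate the userspace-supplied id before indexing with it, as
 * the patched kvm_arch_vcpu_create() now does */
static int create_vcpu(unsigned int id)
{
	if (id >= MAX_VCPUS)
		return -22;	/* -EINVAL */
	vcpu_slot[id] = &vcpu_slot[id];	/* placeholder allocation */
	return 0;
}

int main(void)
{
	printf("id  3 -> %d\n", create_vcpu(3));	/* accepted */
	printf("id 99 -> %d\n", create_vcpu(99));	/* rejected */
	return 0;
}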
Signed-off-by: Carsten Otte Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti Signed-off-by: Greg Kroah-Hartman --- arch/s390/kvm/kvm-s390.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index dc2b580e27b..0cba935d128 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -312,11 +312,17 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { - struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); - int rc = -ENOMEM; + struct kvm_vcpu *vcpu; + int rc = -EINVAL; + + if (id >= KVM_MAX_VCPUS) + goto out; + + rc = -ENOMEM; + vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); if (!vcpu) - goto out_nomem; + goto out; vcpu->arch.sie_block = (struct kvm_s390_sie_block *) get_zeroed_page(GFP_KERNEL); @@ -352,7 +358,7 @@ out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); out_free_cpu: kfree(vcpu); -out_nomem: +out: return ERR_PTR(rc); } -- cgit v1.2.3 From 98ef836b07bd10dec2b07d2c11a6357f7909f978 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Sun, 30 Oct 2011 15:16:07 +0100 Subject: user per registers vs. ptrace single stepping commit a45aff5285871bf7be1781d9462d3fdbb6c913f9 upstream. git commit 5e9a2692 "[S390] ptrace cleanup" introduced a regression for the case when both a user PER set (e.g. a storage alteration trace) and PTRACE_SINGLESTEP are active. The new code will overrule the user PER set with a instruction-fetch PER set over the whole address space for ptrace single stepping. The inferior process will be stopped after each instruction with an instruction fetch event. Any other events that may have occurred concurrently are not reported (e.g. storage alteration event) because the control bits for them are not set. The solution is to merge the PER control bits of the user PER set with the PER_EVENT_IFETCH control bit for PTRACE_SINGLESTEP. Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/kernel/ptrace.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index ef86ad24398..ae0e14b8880 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -47,29 +47,31 @@ enum s390_regset { void update_per_regs(struct task_struct *task) { - static const struct per_regs per_single_step = { - .control = PER_EVENT_IFETCH, - .start = 0, - .end = PSW_ADDR_INSN, - }; struct pt_regs *regs = task_pt_regs(task); struct thread_struct *thread = &task->thread; - const struct per_regs *new; - struct per_regs old; - - /* TIF_SINGLE_STEP overrides the user specified PER registers. */ - new = test_tsk_thread_flag(task, TIF_SINGLE_STEP) ? - &per_single_step : &thread->per_user; + struct per_regs old, new; + + /* Copy user specified PER registers */ + new.control = thread->per_user.control; + new.start = thread->per_user.start; + new.end = thread->per_user.end; + + /* merge TIF_SINGLE_STEP into user specified PER registers. */ + if (test_tsk_thread_flag(task, TIF_SINGLE_STEP)) { + new.control |= PER_EVENT_IFETCH; + new.start = 0; + new.end = PSW_ADDR_INSN; + } /* Take care of the PER enablement bit in the PSW. 
*/ - if (!(new->control & PER_EVENT_MASK)) { + if (!(new.control & PER_EVENT_MASK)) { regs->psw.mask &= ~PSW_MASK_PER; return; } regs->psw.mask |= PSW_MASK_PER; __ctl_store(old, 9, 11); - if (memcmp(new, &old, sizeof(struct per_regs)) != 0) - __ctl_load(*new, 9, 11); + if (memcmp(&new, &old, sizeof(struct per_regs)) != 0) + __ctl_load(new, 9, 11); } void user_enable_single_step(struct task_struct *task) -- cgit v1.2.3 From 4dc17f0c422d685fb1e31e5cb6762bc1322843b0 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Sun, 30 Oct 2011 15:16:08 +0100 Subject: memory leak with RCU_TABLE_FREE commit e73b7fffe487c315fd1a4fa22282e3362b440a06 upstream. The rcu page table free code uses a couple of bits in the page table pointer passed to tlb_remove_table to discern the different page table types. __tlb_remove_table extracts the type with an incorrect mask which leads to memory leaks. The correct mask is ((FRAG_MASK << 4) | FRAG_MASK). Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/mm/pgtable.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 5d56c2b95b1..529a0883837 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -662,8 +662,9 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table) void __tlb_remove_table(void *_table) { - void *table = (void *)((unsigned long) _table & PAGE_MASK); - unsigned type = (unsigned long) _table & ~PAGE_MASK; + const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK; + void *table = (void *)((unsigned long) _table & ~mask); + unsigned type = (unsigned long) _table & mask; if (type) __page_table_free_rcu(table, type); -- cgit v1.2.3 From a54059bb11df75d9ddf3c4d98f20b78c5b017bb9 Mon Sep 17 00:00:00 2001 From: Paul Fertser Date: Mon, 10 Oct 2011 11:19:23 +0400 Subject: plat-mxc: iomux-v3.h: implicitly enable pull-up/down when that's desired commit 6571534b600b8ca1936ff5630b9e0947f21faf16 upstream. To configure pads during the initialisation a set of special constants is used, e.g. #define MX25_PAD_FEC_MDIO__FEC_MDIO IOMUX_PAD(0x3c4, 0x1cc, 0x10, 0, 0, PAD_CTL_HYS | PAD_CTL_PUS_22K_UP) The problem is that no pull-up/down is getting activated unless both PAD_CTL_PUE (pull-up enable) and PAD_CTL_PKE (pull/keeper module enable) set. This is clearly stated in the i.MX25 datasheet and is confirmed by the measurements on hardware. This leads to some rather hard to understand bugs such as misdetecting an absent ethernet PHY (a real bug i had), unstable data transfer etc. This might affect mx25, mx35, mx50, mx51 and mx53 SoCs. It's reasonable to expect that if the pullup value is specified, the intention was to have it actually active, so we implicitly add the needed bits. 
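The arithmetic of the fix fits in a few lines; this sketch reuses the bit layout of the patched defines in the diff below:

#include <stdio.h>

/* Bit layout as in the patched iomux-v3.h defines */
#define PAD_CTL_PKE		(1 << 7)
#define PAD_CTL_PUE		(1 << 6 | PAD_CTL_PKE)
#define PAD_CTL_PUS_22K_UP	(3 << 4 | PAD_CTL_PUE)

int main(void)
{
	/* Selecting a pull strength now implies PUE and PKE as well */
	printf("PAD_CTL_PUS_22K_UP = 0x%x\n", PAD_CTL_PUS_22K_UP);
	printf("PKE: %d  PUE: %d\n",
	       !!(PAD_CTL_PUS_22K_UP & (1 << 7)),
	       !!(PAD_CTL_PUS_22K_UP & (1 << 6)));
	return 0;	/* prints 0xf0, PKE: 1  PUE: 1 */
}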
Signed-off-by: Paul Fertser Signed-off-by: Sascha Hauer Signed-off-by: Greg Kroah-Hartman --- arch/arm/plat-mxc/include/mach/iomux-v3.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm/plat-mxc/include/mach/iomux-v3.h b/arch/arm/plat-mxc/include/mach/iomux-v3.h index ebbce33097a..45099566fec 100644 --- a/arch/arm/plat-mxc/include/mach/iomux-v3.h +++ b/arch/arm/plat-mxc/include/mach/iomux-v3.h @@ -89,11 +89,11 @@ typedef u64 iomux_v3_cfg_t; #define PAD_CTL_HYS (1 << 8) #define PAD_CTL_PKE (1 << 7) -#define PAD_CTL_PUE (1 << 6) -#define PAD_CTL_PUS_100K_DOWN (0 << 4) -#define PAD_CTL_PUS_47K_UP (1 << 4) -#define PAD_CTL_PUS_100K_UP (2 << 4) -#define PAD_CTL_PUS_22K_UP (3 << 4) +#define PAD_CTL_PUE (1 << 6 | PAD_CTL_PKE) +#define PAD_CTL_PUS_100K_DOWN (0 << 4 | PAD_CTL_PUE) +#define PAD_CTL_PUS_47K_UP (1 << 4 | PAD_CTL_PUE) +#define PAD_CTL_PUS_100K_UP (2 << 4 | PAD_CTL_PUE) +#define PAD_CTL_PUS_22K_UP (3 << 4 | PAD_CTL_PUE) #define PAD_CTL_ODE (1 << 3) -- cgit v1.2.3 From 859967a0ffdb214e97ea202bcafc7bf41a73bb34 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Sat, 17 Sep 2011 22:15:34 +0800 Subject: ARM: pxa/cm-x300: properly set bt_reset pin commit 1a64200ec5e93be03e27c3b2fc332e04ae443eb2 upstream. Fix below build warning and properly set bt_reset pin. CC arch/arm/mach-pxa/cm-x300.o arch/arm/mach-pxa/cm-x300.c: In function 'cm_x300_init_wi2wi': arch/arm/mach-pxa/cm-x300.c:779: warning: unused variable 'wlan_en' arch/arm/mach-pxa/cm-x300.c:795: warning: 'bt_reset' may be used uninitialized in this function Signed-off-by: Axel Lin Acked-by: Igor Grinberg Signed-off-by: Eric Miao Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-pxa/cm-x300.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-pxa/cm-x300.c b/arch/arm/mach-pxa/cm-x300.c index b6a51340270..3814e124329 100644 --- a/arch/arm/mach-pxa/cm-x300.c +++ b/arch/arm/mach-pxa/cm-x300.c @@ -775,7 +775,6 @@ static struct gpio cm_x300_wi2wi_gpios[] __initdata = { static void __init cm_x300_init_wi2wi(void) { - int bt_reset, wlan_en; int err; if (system_rev < 130) { @@ -791,12 +790,11 @@ static void __init cm_x300_init_wi2wi(void) } udelay(10); - gpio_set_value(bt_reset, 0); + gpio_set_value(cm_x300_wi2wi_gpios[1].gpio, 0); udelay(10); - gpio_set_value(bt_reset, 1); + gpio_set_value(cm_x300_wi2wi_gpios[1].gpio, 1); - gpio_free(wlan_en); - gpio_free(bt_reset); + gpio_free_array(ARRAY_AND_SIZE(cm_x300_wi2wi_gpios)); } /* MFP */ -- cgit v1.2.3 From f9f92901e2810448ed31691613a78cc232e4bc84 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 12 Aug 2011 13:54:42 +0200 Subject: ARM: mach-ux500: unlock I&D l2x0 caches before init commit 1bf6d2c1bb23533af6930581cc39b74685bc29de upstream. Apparently U8500 U-Boot versions may leave the l2x0 locked down before executing the kernel. Make sure we unlock it before we initialize the l2x0. This fixes a performance problem reported by Jan Rinze. The l2x0 core has been modified to unlock the l2x0 by default, but it will not touch the locking registers if the l2x0 was already enabled, as on the ux500, so we need this quirk to make sure it is properly turned off. 
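A quick sketch of the register addressing the unlock loop depends on, assuming the usual PL310 lockdown map (one data/instruction register pair per possible CPU, 8 bytes apart):

#include <stdio.h>

/* Assumed PL310 lockdown register offsets from the l2x0 base */
#define L2X0_LOCKDOWN_WAY_D_BASE	0x900
#define L2X0_LOCKDOWN_WAY_I_BASE	0x904
#define L2X0_LOCKDOWN_STRIDE		8

int main(void)
{
	int i;

	/* The unlock loop in the diff below writes 0 to each of these */
	for (i = 0; i < 8; i++)
		printf("cpu%d: D lockdown @ 0x%03x, I lockdown @ 0x%03x\n",
		       i,
		       L2X0_LOCKDOWN_WAY_D_BASE + i * L2X0_LOCKDOWN_STRIDE,
		       L2X0_LOCKDOWN_WAY_I_BASE + i * L2X0_LOCKDOWN_STRIDE);
	return 0;
}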
Cc: Srinidhi Kasagar Cc: Rabin Vincent Cc: Adrian Bunk Reported-by: Jan Rinze Tested-by: Robert Marklund Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-ux500/cpu.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/mach-ux500/cpu.c b/arch/arm/mach-ux500/cpu.c index 1da23bb87c1..8aa104a4711 100644 --- a/arch/arm/mach-ux500/cpu.c +++ b/arch/arm/mach-ux500/cpu.c @@ -99,7 +99,27 @@ static void ux500_l2x0_inv_all(void) ux500_cache_sync(); } -static int ux500_l2x0_init(void) +static int __init ux500_l2x0_unlock(void) +{ + int i; + + /* + * Unlock Data and Instruction Lock if locked. Ux500 U-Boot versions + * apparently locks both caches before jumping to the kernel. The + * l2x0 core will not touch the unlock registers if the l2x0 is + * already enabled, so we do it right here instead. The PL310 has + * 8 sets of registers, one per possible CPU. + */ + for (i = 0; i < 8; i++) { + writel_relaxed(0x0, l2x0_base + L2X0_LOCKDOWN_WAY_D_BASE + + i * L2X0_LOCKDOWN_STRIDE); + writel_relaxed(0x0, l2x0_base + L2X0_LOCKDOWN_WAY_I_BASE + + i * L2X0_LOCKDOWN_STRIDE); + } + return 0; +} + +static int __init ux500_l2x0_init(void) { if (cpu_is_u5500()) l2x0_base = __io_address(U5500_L2CC_BASE); @@ -108,6 +128,9 @@ static int ux500_l2x0_init(void) else ux500_unknown_soc(); + /* Unlock before init */ + ux500_l2x0_unlock(); + /* 64KB way size, 8 way associativity, force WA */ l2x0_init(l2x0_base, 0x3e060000, 0xc0000fff); -- cgit v1.2.3 From 02e66bee63c63c2bec1947553e6c5b58b3f0c67d Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 2 Nov 2011 13:17:27 +0100 Subject: um: fix ubd cow size commit 8535639810e578960233ad39def3ac2157b0c3ec upstream. ubd_file_size() cannot use ubd_dev->cow.file because at this time ubd_dev->cow.file is not initialized. Therefore, ubd_file_size() will always report a wrong disk size when COW files are used. Reading from /dev/ubd* would crash the kernel. We have to read the correct disk size from the COW file's backing file. Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- arch/um/drivers/ubd_kern.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 620f5b70957..0491e40d696 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -513,8 +513,37 @@ __uml_exitcall(kill_io_thread); static inline int ubd_file_size(struct ubd *ubd_dev, __u64 *size_out) { char *file; + int fd; + int err; + + __u32 version; + __u32 align; + char *backing_file; + time_t mtime; + unsigned long long size; + int sector_size; + int bitmap_offset; + + if (ubd_dev->file && ubd_dev->cow.file) { + file = ubd_dev->cow.file; + + goto out; + } - file = ubd_dev->cow.file ? ubd_dev->cow.file : ubd_dev->file; + fd = os_open_file(ubd_dev->file, global_openflags, 0); + if (fd < 0) + return fd; + + err = read_cow_header(file_reader, &fd, &version, &backing_file, \ + &mtime, &size, §or_size, &align, &bitmap_offset); + os_close_file(fd); + + if(err == -EINVAL) + file = ubd_dev->file; + else + file = backing_file; + +out: return os_file_size(file, size_out); } -- cgit v1.2.3 From 76c125d2658e301689c0b812977eebbef17d8c0c Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 24 Oct 2011 18:15:32 -0400 Subject: um: Fix kmalloc argument order in um/vdso/vma.c commit 0d65ede0a605d6252acc5c8a9c536c4cd0211f3c upstream. kmalloc size is 1st arg, not second. 
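The swapped call compiles silently because both parameters are plain integer types; a userspace analogue of the bug shape (xmalloc and the flag value are inventions of this sketch):

#include <stdio.h>
#include <stdlib.h>

#define FAKE_GFP_KERNEL 0xd0	/* illustrative flag value */

/* kmalloc-shaped helper: size first, flags second */
static void *xmalloc(size_t size, unsigned int flags)
{
	(void)flags;	/* ignored in this sketch */
	return malloc(size);
}

int main(void)
{
	/* Bug shape: allocates 0xd0 bytes, passes sizeof as "flags" */
	void *wrong = xmalloc(FAKE_GFP_KERNEL, sizeof(void *));
	/* Fixed order: allocates sizeof(void *) bytes */
	void *right = xmalloc(sizeof(void *), FAKE_GFP_KERNEL);

	printf("%p %p\n", wrong, right);
	free(wrong);
	free(right);
	return 0;
}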
Signed-off-by: Dave Jones Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- arch/um/sys-x86_64/vdso/vma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/um/sys-x86_64/vdso/vma.c b/arch/um/sys-x86_64/vdso/vma.c index 9495c8d0ce3..91f4ec9a0a5 100644 --- a/arch/um/sys-x86_64/vdso/vma.c +++ b/arch/um/sys-x86_64/vdso/vma.c @@ -28,7 +28,7 @@ static int __init init_vdso(void) um_vdso_addr = task_size - PAGE_SIZE; - vdsop = kmalloc(GFP_KERNEL, sizeof(struct page *)); + vdsop = kmalloc(sizeof(struct page *), GFP_KERNEL); if (!vdsop) goto oom; -- cgit v1.2.3 From 1e565a292a31ca798604b25b780e1878b4bb00cb Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:36:59 -0700 Subject: mm: thp: tail page refcounting fix commit 70b50f94f1644e2aa7cb374819cfd93f3c28d725 upstream. Michel, while working on the working set estimation code, noticed that calling get_page_unless_zero() on a random pfn_to_page(random_pfn) wasn't safe if the pfn ended up being a tail page of a transparent hugepage under splitting by __split_huge_page_refcount(). He then found that the problem could also theoretically materialize with page_cache_get_speculative() during the speculative radix tree lookups that use get_page_unless_zero() in SMP, if the radix tree page is freed and reallocated and get_user_pages is called on it before page_cache_get_speculative has a chance to call get_page_unless_zero(). So the best way to fix the problem is to keep page_tail->_count zero at all times. This will guarantee that get_page_unless_zero() can never succeed on any tail page. page_tail->_mapcount is guaranteed zero and is unused for all tail pages of a compound page, so we can simply account the tail page references there and transfer them to tail_page->_count in __split_huge_page_refcount() (in addition to the head_page->_mapcount). While debugging this s/_count/_mapcount/ change I also noticed that get_page is called by direct-io.c on pages returned by get_user_pages. That wasn't entirely safe because the two atomic_inc in get_page weren't atomic. By contrast, other get_user_pages users, like the secondary-MMU page fault path that establishes the shadow pagetables, never call a superfluous get_page after get_user_pages returns. It's safer to make get_page universally safe for tail pages and to use get_page_foll() within follow_page (inside get_user_pages()). get_page_foll() is safe to do the refcounting for tail pages without taking any locks because it is run within PT lock protected critical sections (PT lock for pte and page_table_lock for pmd_trans_huge). The standard get_page() as invoked by direct-io instead will now take the compound_lock, but still only for tail pages. The direct-io paths are usually I/O bound and the compound_lock is per THP and very fine-grained, so there's no risk of scalability issues with it. A simple direct-io benchmark with all lockdep prove-locking and spinlock debugging infrastructure enabled shows identical performance and no overhead. So it's worth it. Ideally direct-io should stop calling get_page() on pages returned by get_user_pages(). The spinlock in get_page() is already optimized away for no-THP builds, but doing get_page() on tail pages returned by GUP is generally a rare operation and usually only run in I/O paths. 
This new refcounting on page_tail->_mapcount in addition to avoiding new RCU critical sections will also allow the working set estimation code to work without any further complexity associated to the tail page refcounting with THP. Signed-off-by: Andrea Arcangeli Reported-by: Michel Lespinasse Reviewed-by: Michel Lespinasse Reviewed-by: Minchan Kim Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/mm/gup.c | 5 +-- arch/x86/mm/gup.c | 5 +-- include/linux/mm.h | 56 +++++++++++++------------------- include/linux/mm_types.h | 21 +++++++++--- mm/huge_memory.c | 37 ++++++++++++++------- mm/internal.h | 46 +++++++++++++++++++++++++++ mm/memory.c | 2 +- mm/swap.c | 83 +++++++++++++++++++++++++++++++----------------- 8 files changed, 171 insertions(+), 84 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index fec13200868..b9e1c7ff5f6 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -22,8 +22,9 @@ static inline void get_huge_page_tail(struct page *page) * __split_huge_page_refcount() cannot run * from under us. */ - VM_BUG_ON(atomic_read(&page->_count) < 0); - atomic_inc(&page->_count); + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); } /* diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index dbe34b93137..3b5032a62b0 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -114,8 +114,9 @@ static inline void get_huge_page_tail(struct page *page) * __split_huge_page_refcount() cannot run * from under us. */ - VM_BUG_ON(atomic_read(&page->_count) < 0); - atomic_inc(&page->_count); + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); } static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, diff --git a/include/linux/mm.h b/include/linux/mm.h index 7438071b44a..c1ce6c0d7de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -356,36 +356,39 @@ static inline struct page *compound_head(struct page *page) return page; } +/* + * The atomic page->_mapcount, starts from -1: so that transitions + * both from it and to it can be tracked, using atomic_inc_and_test + * and atomic_add_negative(-1). + */ +static inline void reset_page_mapcount(struct page *page) +{ + atomic_set(&(page)->_mapcount, -1); +} + +static inline int page_mapcount(struct page *page) +{ + return atomic_read(&(page)->_mapcount) + 1; +} + static inline int page_count(struct page *page) { return atomic_read(&compound_head(page)->_count); } +extern bool __get_page_tail(struct page *page); + static inline void get_page(struct page *page) { + if (unlikely(PageTail(page))) + if (likely(__get_page_tail(page))) + return; /* * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. Only if - * we're getting a tail page, the elevated page->_count is - * required only in the head page, so for tail pages the - * bugcheck only verifies that the page->_count isn't - * negative. + * requires to already have an elevated page->_count. */ - VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page)); + VM_BUG_ON(atomic_read(&page->_count) <= 0); atomic_inc(&page->_count); - /* - * Getting a tail page will elevate both the head and tail - * page->_count(s). 
- */ - if (unlikely(PageTail(page))) { - /* - * This is safe only because - * __split_huge_page_refcount can't run under - * get_page(). - */ - VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); - atomic_inc(&page->first_page->_count); - } } static inline struct page *virt_to_head_page(const void *x) @@ -803,21 +806,6 @@ static inline pgoff_t page_index(struct page *page) return page->index; } -/* - * The atomic page->_mapcount, like _count, starts from -1: - * so that transitions both from it and to it can be tracked, - * using atomic_inc_and_test and atomic_add_negative(-1). - */ -static inline void reset_page_mapcount(struct page *page) -{ - atomic_set(&(page)->_mapcount, -1); -} - -static inline int page_mapcount(struct page *page) -{ - return atomic_read(&(page)->_mapcount) + 1; -} - /* * Return true if this page is mapped into pagetables. */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 774b8952deb..10a2f62cd56 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -62,10 +62,23 @@ struct page { struct { union { - atomic_t _mapcount; /* Count of ptes mapped in mms, - * to show when page is mapped - * & limit reverse map searches. - */ + /* + * Count of ptes mapped in + * mms, to show when page is + * mapped & limit reverse map + * searches. + * + * Used also for tail pages + * refcounting instead of + * _count. Tail pages cannot + * be mapped and keeping the + * tail page _count zero at + * all times guarantees + * get_page_unless_zero() will + * never succeed on tail + * pages. + */ + atomic_t _mapcount; struct { unsigned inuse:16; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e2d1587be26..d819d938288 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -989,7 +989,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON(!PageCompound(page)); if (flags & FOLL_GET) - get_page(page); + get_page_foll(page); out: return page; @@ -1156,6 +1156,7 @@ static void __split_huge_page_refcount(struct page *page) unsigned long head_index = page->index; struct zone *zone = page_zone(page); int zonestat; + int tail_count = 0; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -1164,11 +1165,27 @@ static void __split_huge_page_refcount(struct page *page) for (i = 1; i < HPAGE_PMD_NR; i++) { struct page *page_tail = page + i; - /* tail_page->_count cannot change */ - atomic_sub(atomic_read(&page_tail->_count), &page->_count); - BUG_ON(page_count(page) <= 0); - atomic_add(page_mapcount(page) + 1, &page_tail->_count); - BUG_ON(atomic_read(&page_tail->_count) <= 0); + /* tail_page->_mapcount cannot change */ + BUG_ON(page_mapcount(page_tail) < 0); + tail_count += page_mapcount(page_tail); + /* check for overflow */ + BUG_ON(tail_count < 0); + BUG_ON(atomic_read(&page_tail->_count) != 0); + /* + * tail_page->_count is zero and not changing from + * under us. But get_page_unless_zero() may be running + * from under us on the tail_page. If we used + * atomic_set() below instead of atomic_add(), we + * would then run atomic_set() concurrently with + * get_page_unless_zero(), and atomic_set() is + * implemented in C not using locked ops. spin_unlock + * on x86 sometime uses locked ops because of PPro + * errata 66, 92, so unless somebody can guarantee + * atomic_set() here would be safe on all archs (and + * not only on x86), it's safer to use atomic_add(). 
+ */ + atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, + &page_tail->_count); /* after clearing PageTail the gup refcount can be released */ smp_mb(); @@ -1186,10 +1203,7 @@ static void __split_huge_page_refcount(struct page *page) (1L << PG_uptodate))); page_tail->flags |= (1L << PG_dirty); - /* - * 1) clear PageTail before overwriting first_page - * 2) clear PageTail before clearing PageHead for VM_BUG_ON - */ + /* clear PageTail before overwriting first_page */ smp_wmb(); /* @@ -1206,7 +1220,6 @@ static void __split_huge_page_refcount(struct page *page) * status is achieved setting a reserved bit in the * pmd, not by clearing the present bit. */ - BUG_ON(page_mapcount(page_tail)); page_tail->_mapcount = page->_mapcount; BUG_ON(page_tail->mapping); @@ -1223,6 +1236,8 @@ static void __split_huge_page_refcount(struct page *page) lru_add_page_tail(zone, page, page_tail); } + atomic_sub(tail_count, &page->_count); + BUG_ON(atomic_read(&page->_count) <= 0); __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); diff --git a/mm/internal.h b/mm/internal.h index d071d380fb4..2189af49178 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -37,6 +37,52 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } +static inline void __get_page_tail_foll(struct page *page, + bool get_page_head) +{ + /* + * If we're getting a tail page, the elevated page->_count is + * required only in the head page and we will elevate the head + * page->_count and tail page->_mapcount. + * + * We elevate page_tail->_mapcount for tail pages to force + * page_tail->_count to be zero at all times to avoid getting + * false positives from get_page_unless_zero() with + * speculative page access (like in + * page_cache_get_speculative()) on tail pages. + */ + VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + VM_BUG_ON(page_mapcount(page) < 0); + if (get_page_head) + atomic_inc(&page->first_page->_count); + atomic_inc(&page->_mapcount); +} + +/* + * This is meant to be called as the FOLL_GET operation of + * follow_page() and it must be called while holding the proper PT + * lock while the pte (or pmd_trans_huge) is still mapping the page. + */ +static inline void get_page_foll(struct page *page) +{ + if (unlikely(PageTail(page))) + /* + * This is safe only because + * __split_huge_page_refcount() can't run under + * get_page_foll() because we hold the proper PT lock. + */ + __get_page_tail_foll(page, true); + else { + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. 
+ */ + VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_inc(&page->_count); + } +} + extern unsigned long highest_memmap_pfn; /* diff --git a/mm/memory.c b/mm/memory.c index a56e3ba816b..b2b87315cdc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1503,7 +1503,7 @@ split_fallthrough: } if (flags & FOLL_GET) - get_page(page); + get_page_foll(page); if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) diff --git a/mm/swap.c b/mm/swap.c index 3a442f18b0b..87627f181c3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -78,39 +78,22 @@ static void put_compound_page(struct page *page) { if (unlikely(PageTail(page))) { /* __split_huge_page_refcount can run under us */ - struct page *page_head = page->first_page; - smp_rmb(); - /* - * If PageTail is still set after smp_rmb() we can be sure - * that the page->first_page we read wasn't a dangling pointer. - * See __split_huge_page_refcount() smp_wmb(). - */ - if (likely(PageTail(page) && get_page_unless_zero(page_head))) { + struct page *page_head = compound_trans_head(page); + + if (likely(page != page_head && + get_page_unless_zero(page_head))) { unsigned long flags; /* - * Verify that our page_head wasn't converted - * to a a regular page before we got a - * reference on it. + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. */ - if (unlikely(!PageHead(page_head))) { - /* PageHead is cleared after PageTail */ - smp_rmb(); - VM_BUG_ON(PageTail(page)); - goto out_put_head; - } - /* - * Only run compound_lock on a valid PageHead, - * after having it pinned with - * get_page_unless_zero() above. - */ - smp_mb(); - /* page_head wasn't a dangling pointer */ flags = compound_lock_irqsave(page_head); if (unlikely(!PageTail(page))) { /* __split_huge_page_refcount run before us */ compound_unlock_irqrestore(page_head, flags); VM_BUG_ON(PageHead(page_head)); - out_put_head: if (put_page_testzero(page_head)) __put_single_page(page_head); out_put_single: @@ -121,16 +104,17 @@ static void put_compound_page(struct page *page) VM_BUG_ON(page_head != page->first_page); /* * We can release the refcount taken by - * get_page_unless_zero now that - * split_huge_page_refcount is blocked on the - * compound_lock. + * get_page_unless_zero() now that + * __split_huge_page_refcount() is blocked on + * the compound_lock. */ if (put_page_testzero(page_head)) VM_BUG_ON(1); /* __split_huge_page_refcount will wait now */ - VM_BUG_ON(atomic_read(&page->_count) <= 0); - atomic_dec(&page->_count); + VM_BUG_ON(page_mapcount(page) <= 0); + atomic_dec(&page->_mapcount); VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); compound_unlock_irqrestore(page_head, flags); if (put_page_testzero(page_head)) { if (PageHead(page_head)) @@ -160,6 +144,45 @@ void put_page(struct page *page) } EXPORT_SYMBOL(put_page); +/* + * This function is exported but must not be called by anything other + * than get_page(). It implements the slow path of get_page(). + */ +bool __get_page_tail(struct page *page) +{ + /* + * This takes care of get_page() if run on a tail page + * returned by one of the get_user_pages/follow_page variants. + * get_user_pages/follow_page itself doesn't need the compound + * lock because it runs __get_page_tail_foll() under the + * proper PT lock that already serializes against + * split_huge_page(). 
+ */ + unsigned long flags; + bool got = false; + struct page *page_head = compound_trans_head(page); + + if (likely(page != page_head && get_page_unless_zero(page_head))) { + /* + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. + */ + flags = compound_lock_irqsave(page_head); + /* here __split_huge_page_refcount won't run anymore */ + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + got = true; + } + compound_unlock_irqrestore(page_head, flags); + if (unlikely(!got)) + put_page(page_head); + } + return got; +} +EXPORT_SYMBOL(__get_page_tail); + /** * put_pages_list() - release a list of pages * @pages: list of pages threaded on page->lru -- cgit v1.2.3 From 5db7016cf280d1505807b013aca0cd4a3f0882a7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:03 -0700 Subject: powerpc: remove superfluous PageTail checks on the pte gup_fast commit 2839bdc1bfc0af76a2f0f11eca011590520a04fa upstream. This part of gup_fast doesn't seem capable of handling hugetlbfs ptes, those should be handled by gup_hugepd only, so these checks are superfluous. Plus if this wasn't a noop, it would have oopsed because, the insistence of using the speculative refcounting would trigger a VM_BUG_ON if a tail page was encountered in the page_cache_get_speculative(). Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Acked-by: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/mm/gup.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index b9e1c7ff5f6..d7efdbf640c 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -16,17 +16,6 @@ #ifdef __HAVE_ARCH_PTE_SPECIAL -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -58,8 +47,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(page); return 0; } - if (PageTail(page)) - get_huge_page_tail(page); pages[*nr] = page; (*nr)++; -- cgit v1.2.3 From 80a76e1db5d707c14b0a8f519ff08d4ee4586482 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:08 -0700 Subject: powerpc: get_hugepte() don't put_page() the wrong page commit 405e44f2e312dd5dd63e5a9f459bffcbcd4368ef upstream. "page" may have changed to point to the next hugepage after the loop completed, The references have been taken on the head page, so the put_page must happen there too. This is a longstanding issue pre-thp inclusion. It's totally unclear how these page_cache_add_speculative and pte_val(pte) != pte_val(*ptep) checks are necessary across all the powerpc gup_fast code, when x86 doesn't need any of that: there's no way the page can be freed with irq disabled so we're guaranteed the atomic_inc will happen on a page with page_count > 0 (so not needing the speculative check). 
The pte check is also meaningless on x86: no need to rollback on x86 if the pte changed, because the pte can still change a CPU tick after the check succeeded and it won't be rolled back in that case. The important thing is we got a reference on a valid page that was mapped there a CPU tick ago. So, not knowing the soft tlb refill code of ppc64 in great detail, I'm not removing the "speculative" page_count increase and the pte checks across all the code, but unless there's a strong reason for it they should be cleaned up later too. If a pte can change from huge to non-huge (like it could happen with THP) passing a pte_t *ptep to gup_hugepte() would also require repeating the is_hugepd check in gup_hugepte(), but that shouldn't happen with hugetlbfs only, so I'm not altering that. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Acked-by: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 0b9a5c1901b..b649c288af9 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -429,7 +429,7 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add if (unlikely(pte_val(pte) != pte_val(*ptep))) { /* Could be optimized better */ while (*nr) { - put_page(page); + put_page(head); (*nr)--; } } -- cgit v1.2.3 From 9aa7c2a5e45e27dafa757989b307df18a8e7309b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:11 -0700 Subject: powerpc: gup_hugepte() avoid freeing the head page too many times commit 8596468487e2062cae2aad56e973784e03959245 upstream. We had only taken "refs" pins on the head page, not "*nr" pins. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Acked-by: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/mm/hugetlbpage.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index b649c288af9..78b14abded6 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -428,10 +428,9 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add if (unlikely(pte_val(pte) != pte_val(*ptep))) { /* Could be optimized better */ - while (*nr) { + *nr -= refs; + while (refs--) put_page(head); - (*nr)--; - } } return 1; -- cgit v1.2.3 From 956b251c2f65362487d23a42c5aa2f7955f8fea7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:15 -0700 Subject: powerpc: gup_hugepte() support THP based tail recounting commit 3526741f0964c88bc2ce511e1078359052bf225b upstream. Up to this point the code assumed old refcounting for hugepages (pre-thp). This updates the code directly to the thp mapcount tail page refcounting. 
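As a toy model of the scheme these patches implement (pins live in head->_count; a pinned tail page additionally bumps its own _mapcount so tail->_count can stay zero forever):

#include <stdio.h>

/* Toy model of compound-page pinning after the THP refcount change;
 * field names mirror struct page, semantics are simplified. */
struct toy_page {
	int _count;
	int _mapcount;	/* kept at -1 when unused, by convention */
	int is_tail;
};

static void gup_pin(struct toy_page *head, struct toy_page *page)
{
	head->_count++;			/* the real reference */
	if (page->is_tail)
		page->_mapcount++;	/* tail bookkeeping only */
}

int main(void)
{
	struct toy_page head = { ._count = 1, ._mapcount = -1 };
	struct toy_page tail = { ._count = 0, ._mapcount = -1, .is_tail = 1 };

	gup_pin(&head, &tail);
	printf("head._count=%d tail._count=%d tail._mapcount=%d\n",
	       head._count, tail._count, tail._mapcount);
	/* tail._count stays 0, so get_page_unless_zero() on a tail
	 * page can never succeed */
	return 0;
}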
Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/mm/hugetlbpage.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 78b14abded6..a618ef01bfa 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -385,12 +385,23 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); +} + static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; unsigned long pte_end; - struct page *head, *page; + struct page *head, *page, *tail; pte_t pte; int refs; @@ -413,6 +424,7 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add head = pte_page(pte); page = head + ((addr & (sz-1)) >> PAGE_SHIFT); + tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -431,6 +443,16 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add *nr -= refs; while (refs--) put_page(head); + } else { + /* + * Any tail page need their mapcount reference taken + * before we return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } } return 1; -- cgit v1.2.3 From ddb693af6f408a08c3e000856ab0e4d3217145f3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:19 -0700 Subject: powerpc: gup_hugepte() return 0 if pte changes commit cf592bf768c4fa40282b8fce58a80820065de2cb upstream. powerpc didn't return 0 in that case; if it rolls back the *nr counter, it should also return zero to avoid adding pages to the array at the wrong offset. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Acked-by: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/mm/hugetlbpage.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a618ef01bfa..1c59d94f594 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -443,16 +443,17 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add *nr -= refs; while (refs--) put_page(head); - } else { - /* - * Any tail page need their mapcount reference taken - * before we return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } + return 0; + } + + /* + * Any tail page need their mapcount reference taken before we + * return.
+ */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; } return 1; -- cgit v1.2.3 From 931064483618f938e2c4e491fd218514da4bc2a9 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:25 -0700 Subject: s390: gup_huge_pmd() support THP tail refcounting commit 220a2eb228d032acde60e9fd044ca802706ff583 upstream. Up to this point the code assumed old refcounting for hugepages (pre-thp). This updates the code directly to the thp mapcount tail page refcounting. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/s390/mm/gup.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 45b405ca256..668dda964f2 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -48,11 +48,22 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, return 1; } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); +} + static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask, result; - struct page *head, *page; + struct page *head, *page, *tail; int refs; result = write ? 0 : _SEGMENT_ENTRY_RO; @@ -64,6 +75,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, refs = 0; head = pmd_page(pmd); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -81,6 +93,16 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, *nr -= refs; while (refs--) put_page(head); + } else { + /* + * Any tail page need their mapcount reference taken + * before we return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } } return 1; -- cgit v1.2.3 From 05ed1d8991ab010c81b2c4018d973752677843d7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:28 -0700 Subject: s390: gup_huge_pmd() return 0 if pte changes commit 0693bc9ce2cc4f6a1b9c3c05790fc149a74c0b87 upstream. s390 didn't return 0 in that case; if it rolls back the *nr counter, it should also return zero to avoid adding pages to the array at the wrong offset.
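Schematically, the fixed rollback path becomes (a sketch of the post-patch shape shown in the hunk below):

	if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) {
		*nr -= refs;		/* undo the pages[] entries we added */
		while (refs--)
			put_page(head);	/* drop exactly the pins we took */
		return 0;		/* tell the caller nothing was pinned */
	}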
Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/s390/mm/gup.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 668dda964f2..da33a0281d9 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -93,16 +93,17 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, *nr -= refs; while (refs--) put_page(head); - } else { - /* - * Any tail page need their mapcount reference taken - * before we return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } + return 0; + } + + /* + * Any tail page need their mapcount reference taken before we + * return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; } return 1; -- cgit v1.2.3 From e852a85448fb1fbf6cd8f8c9cbf0102f9372f45f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:31 -0700 Subject: sparc: gup_pte_range() support THP-based tail refcounting commit e0d85a366c2300efd230ef82a9b22110b0658331 upstream. Up to this point the code assumed old refcounting for hugepages (pre-thp). This updates the code directly to the thp mapcount tail page refcounting. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Acked-by: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/sparc/mm/gup.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch') diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index a986b5d0571..afcebac144f 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -12,6 +12,17 @@ #include #include +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); +} + /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -56,6 +67,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(head); return 0; } + if (head != page) + get_huge_page_tail(page); pages[*nr] = page; (*nr)++; -- cgit v1.2.3 From 58f18f91c6758bfe5375125f63615bbd18ed506b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:36 -0700 Subject: thp: share get_huge_page_tail() commit b35a35b556f5e6b7993ad0baf20173e75c09ce8c upstream. This avoids duplicating the function in every arch gup_fast.
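The end state, sketched (the real hunks follow below): the helper moves into include/linux/mm.h and every arch gup_fast leaf calls the same copy:

	/* include/linux/mm.h */
	static inline void get_huge_page_tail(struct page *page)
	{
		/* __split_huge_page_refcount() cannot run from under us. */
		VM_BUG_ON(page_mapcount(page) < 0);
		VM_BUG_ON(atomic_read(&page->_count) != 0);
		atomic_inc(&page->_mapcount);
	}

	/* any arch gup leaf, e.g. sparc gup_pte_range() */
	if (head != page)
		get_huge_page_tail(page);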
Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/mm/hugetlbpage.c | 11 ----------- arch/s390/mm/gup.c | 11 ----------- arch/sparc/mm/gup.c | 11 ----------- arch/x86/mm/gup.c | 11 ----------- include/linux/mm.h | 11 +++++++++++ 5 files changed, 11 insertions(+), 44 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1c59d94f594..da5eb388570 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -385,17 +385,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index da33a0281d9..65cb06e2af4 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -48,17 +48,6 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, return 1; } -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index afcebac144f..42c55df3aec 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -12,17 +12,6 @@ #include #include -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 3b5032a62b0..ea305856151 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -108,17 +108,6 @@ static inline void get_head_page_multiple(struct page *page, int nr) SetPageReferenced(page); } -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { diff --git a/include/linux/mm.h b/include/linux/mm.h index c1ce6c0d7de..fedc5f0e62e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -376,6 +376,17 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. 
+ */ + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); +} + extern bool __get_page_tail(struct page *page); static inline void get_page(struct page *page) -- cgit v1.2.3 From 9d99c1d2bd6aedb02a70d2cb9f1edb0b416a443e Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 10 Aug 2011 20:44:21 +0000 Subject: powerpc/numa: Remove double of_node_put in hot_add_node_scn_to_nid commit 6083184269fd723affca4f6340e491950267622a upstream. During memory hotplug testing, I got the following warning: ERROR: Bad of_node_put() on /memory@0 of_node_release kref_put of_node_put of_find_node_by_type hot_add_node_scn_to_nid hot_add_scn_to_nid memory_add_physaddr_to_nid ... The of_find_node_by_type() loop does the of_node_put() for us, so we only need to handle the case where we terminate the loop early. As suggested by Stephen Rothwell, we can do the of_node_put() unconditionally outside of the loop, since of_node_put() handles a NULL argument fine. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/mm/numa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 2164006fe17..2c1ae7a5fb5 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1214,11 +1214,12 @@ int hot_add_node_scn_to_nid(unsigned long scn_addr) break; } - of_node_put(memory); if (nid >= 0) break; } + of_node_put(memory); + return nid; } -- cgit v1.2.3 From 7bd46f22875431f4ec2a21cefd1aecf80193b6d7 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 10 Aug 2011 20:44:24 +0000 Subject: powerpc: Fix oops when echoing bad values to /sys/devices/system/memory/probe commit a11940978bd598e65996b4f807cf4904793f7025 upstream. If we echo an address the hypervisor doesn't like to /sys/devices/system/memory/probe, we oops the box: # echo 0x10000000000 > /sys/devices/system/memory/probe kernel BUG at arch/powerpc/mm/hash_utils_64.c:541! The backtrace is: create_section_mapping arch_add_memory add_memory memory_probe_store sysdev_class_store sysfs_write_file vfs_write SyS_write In create_section_mapping() we BUG if htab_bolt_mapping() returns an error. A better approach is to return an error which will propagate back to userspace.
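With the BUG_ON gone, the failure simply propagates up to the sysfs write (a sketch of the resulting flow; the real hunks are below):

	/* arch/powerpc/mm/hash_utils_64.c */
	int create_section_mapping(unsigned long start, unsigned long end)
	{
		return htab_bolt_mapping(start, end, __pa(start),
					 pgprot_val(PAGE_KERNEL),
					 mmu_linear_psize, mmu_kernel_ssize);
	}

	/* arch/powerpc/mm/mem.c, in arch_add_memory() */
	if (create_section_mapping(start, start + size))
		return -EINVAL;		/* surfaces as a failed write(2) */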
Rerunning the test with this patch applied: # echo 0x10000000000 > /sys/devices/system/memory/probe -bash: echo: write error: Invalid argument Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/sparsemem.h | 2 +- arch/powerpc/mm/hash_utils_64.c | 6 +++--- arch/powerpc/mm/mem.c | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h index 54a47ea2c3a..0c5fa314561 100644 --- a/arch/powerpc/include/asm/sparsemem.h +++ b/arch/powerpc/include/asm/sparsemem.h @@ -16,7 +16,7 @@ #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_MEMORY_HOTPLUG -extern void create_section_mapping(unsigned long start, unsigned long end); +extern int create_section_mapping(unsigned long start, unsigned long end); extern int remove_section_mapping(unsigned long start, unsigned long end); #ifdef CONFIG_NUMA extern int hot_add_scn_to_nid(unsigned long scn_addr); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 26b2872b3d0..07f9e9f0d87 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -534,11 +534,11 @@ static unsigned long __init htab_get_table_size(void) } #ifdef CONFIG_MEMORY_HOTPLUG -void create_section_mapping(unsigned long start, unsigned long end) +int create_section_mapping(unsigned long start, unsigned long end) { - BUG_ON(htab_bolt_mapping(start, end, __pa(start), + return htab_bolt_mapping(start, end, __pa(start), pgprot_val(PAGE_KERNEL), mmu_linear_psize, - mmu_kernel_ssize)); + mmu_kernel_ssize); } int remove_section_mapping(unsigned long start, unsigned long end) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index c781bbcf733..95985f286e3 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -123,7 +123,8 @@ int arch_add_memory(int nid, u64 start, u64 size) pgdata = NODE_DATA(nid); start = (unsigned long)__va(start); - create_section_mapping(start, start + size); + if (create_section_mapping(start, start + size)) + return -EINVAL; /* this should work for most non-highmem platforms */ zone = pgdata->node_zones; -- cgit v1.2.3 From 488d4d2d0a7078b89ac898606cfce65ea93aadd2 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 14 Aug 2011 14:30:30 +0000 Subject: powerpc/pseries: Avoid spurious error during hotplug CPU add commit 9c740025c51a26ab00192cfc464064d4ccbfe3fc upstream. During hotplug CPU add we get the following error: "Unexpected Error (0) returned from configure-connector". ibm,configure-connector returns 0 to indicate that configuration is complete, so catch this and avoid the error.
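In outline, the rc handling becomes (a sketch; the real hunk is below):

	#define COMPLETE	0	/* configuration finished, not an error */
	#define NEXT_SIBLING	1
	#define NEXT_CHILD	2

	switch (rc) {
	case COMPLETE:
		break;			/* done -- nothing left to parse */
	case NEXT_SIBLING:
		dn = dlpar_parse_cc_node(ccwa);
		break;
	/* remaining cases unchanged */
	}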
Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/dlpar.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index e9be25bc571..0f1b706506e 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -112,6 +112,7 @@ void dlpar_free_cc_nodes(struct device_node *dn) dlpar_free_one_cc_node(dn); } +#define COMPLETE 0 #define NEXT_SIBLING 1 #define NEXT_CHILD 2 #define NEXT_PROPERTY 3 @@ -158,6 +159,9 @@ struct device_node *dlpar_configure_connector(u32 drc_index) spin_unlock(&rtas_data_buf_lock); switch (rc) { + case COMPLETE: + break; + case NEXT_SIBLING: dn = dlpar_parse_cc_node(ccwa); if (!dn) -- cgit v1.2.3 From 85e71ae3431f786adea84c0f879dc02d7474c566 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 26 Aug 2011 10:36:31 +0000 Subject: powerpc/eeh: Fix /proc/ppc64/eeh creation commit 8feaa43494cee5e938fd5a57b9e9bf1c827e6ccd upstream. Since commit 188917e183cf9ad0374b571006d0fc6d48a7f447, /proc/ppc64 is a symlink to /proc/powerpc/. That means that creating /proc/ppc64/eeh will end up with an inaccessible file that is not listed under /proc/powerpc/ and, therefore, not listed under /proc/ppc64/ either. Creating /proc/powerpc/eeh fixes that problem and maintains the compatibility intended with the ppc64 symlink. Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/eeh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index ada6e07532e..d42f37d8a44 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -1338,7 +1338,7 @@ static const struct file_operations proc_eeh_operations = { static int __init eeh_init_proc(void) { if (machine_is(pseries)) - proc_create("ppc64/eeh", 0, NULL, &proc_eeh_operations); + proc_create("powerpc/eeh", 0, NULL, &proc_eeh_operations); return 0; } __initcall(eeh_init_proc); -- cgit v1.2.3 From 3493ce844f598f2b732a7e831afb6b1d5be4296f Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 14 Sep 2011 09:43:15 +0000 Subject: powerpc: Fix deadlock in icswx code commit 8bdafa39a47265bc029838b35cc6585f69224afa upstream. The icswx code introduced an A-B B-A deadlock:

	CPU0				CPU1
	----				----
	lock(&anon_vma->mutex);
					lock(&mm->mmap_sem);
					lock(&anon_vma->mutex);
	lock(&mm->mmap_sem);

Instead of using the mmap_sem to keep mm_users constant, take the page table spinlock.
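The resulting critical section, sketched (the full hunks follow below):

	/* use_cop()/drop_cop() after this patch */
	spin_lock(&mm->page_table_lock);	/* keeps mm_users stable */
	spin_lock(mm->context.cop_lockp);
	/* ... manipulate mm->context.acop and cop_pid ... */
	spin_unlock(mm->context.cop_lockp);
	spin_unlock(&mm->page_table_lock);

Dropping mmap_sem from this path removes the mmap_sem <-> anon_vma->mutex ordering edge entirely.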
Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/mm/mmu_context_hash64.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c index 3bafc3deca6..4ff587e3825 100644 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ b/arch/powerpc/mm/mmu_context_hash64.c @@ -136,8 +136,8 @@ int use_cop(unsigned long acop, struct mm_struct *mm) if (!mm || !acop) return -EINVAL; - /* We need to make sure mm_users doesn't change */ - down_read(&mm->mmap_sem); + /* The page_table_lock ensures mm_users won't change under us */ + spin_lock(&mm->page_table_lock); spin_lock(mm->context.cop_lockp); if (mm->context.cop_pid == COP_PID_NONE) { @@ -164,7 +164,7 @@ int use_cop(unsigned long acop, struct mm_struct *mm) out: spin_unlock(mm->context.cop_lockp); - up_read(&mm->mmap_sem); + spin_unlock(&mm->page_table_lock); return ret; } @@ -185,8 +185,8 @@ void drop_cop(unsigned long acop, struct mm_struct *mm) if (WARN_ON_ONCE(!mm)) return; - /* We need to make sure mm_users doesn't change */ - down_read(&mm->mmap_sem); + /* The page_table_lock ensures mm_users won't change under us */ + spin_lock(&mm->page_table_lock); spin_lock(mm->context.cop_lockp); mm->context.acop &= ~acop; @@ -213,7 +213,7 @@ void drop_cop(unsigned long acop, struct mm_struct *mm) } spin_unlock(mm->context.cop_lockp); - up_read(&mm->mmap_sem); + spin_unlock(&mm->page_table_lock); } EXPORT_SYMBOL_GPL(drop_cop); -- cgit v1.2.3